In [None]:
import numpy as np
import pandas as pd
import random


def corrupt_dataframe(df, noise_ratio=0.05, missing_ratio=0.05, noise_std=0.1):
    df = df.copy()


    numeric_cols = df.select_dtypes(include=[np.number]).columns

    for col in numeric_cols:
        noise_mask = np.random.rand(len(df)) < noise_ratio
        noise = np.random.normal(0, noise_std * df[col].std(), size=noise_mask.sum())
        df.loc[noise_mask, col] += noise

    total_cells = df.size
    num_missing = int(total_cells * missing_ratio)

    for _ in range(num_missing):
        row = random.randint(0, df.shape[0] - 1)
        col = random.choice(df.columns)
        df.at[row, col] = np.nan

    return df



df = pd.read_csv({
    "product_id": [1001, 1002, 1003, 1004],
    "name": ["Laptop", "Shampoo", "Pasta", "Smartwatch"],
    "price": [799.99, 12.50, 4.99, 199.99],
})

corrupted_df = corrupt_dataframe(
    df,
    noise_ratio=0.2,      # 20% noisy values
    missing_ratio=0.1,    # 10% missing values
    noise_std=0.2         # 20% std noise intensity
)

print(corrupted_df)

In [57]:
import pandas as pd
df = pd.read_csv("/content/final_customer_df.csv")
df.head()

Unnamed: 0,customer_id,gender,age,loyalty_tier,preferred_channel,city,region,store_type,avg_discount,avg_unit_price,...,unique_categories,unique_channels,frequency,monetary,recency,spend_next_30_days,spend_last_30_days,spend_last_90_days,spend_velocity,velocity_segment
0,C00001,M,41,Bronze,Mobile,Baytown,South,OnlineHub,0.034821,99.003661,...,6,3,112,14071.856,3,3362.33,2109.3975,11216.626,0.18806,Inactive
1,C00002,M,33,Bronze,InStore,Southport,South,OnlineHub,0.017539,81.12089,...,6,3,191,19481.543,1,3066.2355,6508.9615,16059.2315,0.40531,Declining
2,C00003,M,43,Platinum,InStore,Easton,East,Street,0.027447,254.423106,...,6,3,235,73037.3575,1,20788.1415,13737.749,54901.063,0.250227,Inactive
3,C00004,F,53,Gold,InStore,Southport,South,OnlineHub,0.027876,103.938628,...,6,3,226,29091.614,1,8838.996,6027.001,21331.8865,0.282535,Inactive
4,C00005,F,32,Bronze,Mobile,Frostford,North,Outlet,0.027586,83.861034,...,6,3,145,14481.376,1,2013.356,4065.3905,11498.4005,0.353561,Declining




In [5]:
import pandas as pd

# --- Load your dataset ---
df['transaction_date'] = pd.to_datetime(df['transaction_date'])

# --- Create a year-month column ---
df['year_month'] = df['transaction_date'].dt.to_period('M')

# --- Compute total amount spent ---
df['amount_spent'] = df['unit_price'] * df['quantity'] * (1 - df['discount_pct'])

# --- Select customer-level attributes (assumed constant per customer) ---
customer_features = df[['customer_id', 'gender', 'age', 'loyalty_tier', 'preferred_channel']].drop_duplicates()

# --- Group by customer, category, and month ---
monthly_summary = df.groupby(
    ['customer_id', 'category', 'year_month']
).agg(
    total_quantity=('quantity', 'sum'),
    total_spent=('amount_spent', 'sum'),
).reset_index()

# --- Merge static customer features back ---
final_df = monthly_summary.merge(customer_features, on='customer_id', how='left')

# Optional: convert 'year_month' to datetime
final_df['year_month'] = final_df['year_month'].astype(str)

print(final_df.head())


  customer_id  category year_month  total_quantity  total_spent gender  age  \
0      C00001    Beauty    2025-08               2       93.770      M   41   
1      C00001    Beauty    2025-09               2       65.066      M   41   
2      C00001    Beauty    2025-10               5      236.890      M   41   
3      C00001    Beauty    2025-12               2       37.020      M   41   
4      C00001  Clothing    2025-08               1      120.000      M   41   

  loyalty_tier preferred_channel  
0       Bronze            Mobile  
1       Bronze            Mobile  
2       Bronze            Mobile  
3       Bronze            Mobile  
4       Bronze            Mobile  


In [16]:
from sklearn.preprocessing import OrdinalEncoder

final_df['loyalty_tier_encoded'] = OrdinalEncoder(
    categories=[['Bronze', 'Silver', 'Gold', 'Platinum']]
).fit_transform(final_df[['loyalty_tier']])
final_df.head()
df =final_df

In [17]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

# ----- 1. Ordinal encode loyalty_tier -----
df['loyalty_tier_encoded'] = OrdinalEncoder(
    categories=[['Bronze', 'Silver', 'Gold', 'Platinum']]
).fit_transform(df[['loyalty_tier']])

# ----- 2. Label encode category (NO one-hot) -----
le_cat = LabelEncoder()
df['category_encoded'] = le_cat.fit_transform(df['category'])

# ----- 3. One-hot encode gender + preferred_channel -----
df = pd.get_dummies(df,
                    columns=['gender', 'preferred_channel'],
                    drop_first=False)

df.head()


Unnamed: 0,customer_id,category,year_month,total_quantity,total_spent,age,loyalty_tier,loyalty_tier_encoded,category_encoded,gender_F,gender_M,gender_O,preferred_channel_InStore,preferred_channel_Mobile,preferred_channel_Online
0,C00001,Beauty,2025-08,2,93.77,41,Bronze,0.0,0,False,True,False,False,True,False
1,C00001,Beauty,2025-09,2,65.066,41,Bronze,0.0,0,False,True,False,False,True,False
2,C00001,Beauty,2025-10,5,236.89,41,Bronze,0.0,0,False,True,False,False,True,False
3,C00001,Beauty,2025-12,2,37.02,41,Bronze,0.0,0,False,True,False,False,True,False
4,C00001,Clothing,2025-08,1,120.0,41,Bronze,0.0,1,False,True,False,False,True,False


In [18]:
df = df.drop(columns=['loyalty_tier'])

In [20]:
df.head()

Unnamed: 0,customer_id,category,year_month,total_quantity,total_spent,age,loyalty_tier_encoded,category_encoded,gender_F,gender_M,gender_O,preferred_channel_InStore,preferred_channel_Mobile,preferred_channel_Online
0,C00001,Beauty,2025-08,2,93.77,41,0.0,0,False,True,False,False,True,False
1,C00001,Beauty,2025-09,2,65.066,41,0.0,0,False,True,False,False,True,False
2,C00001,Beauty,2025-10,5,236.89,41,0.0,0,False,True,False,False,True,False
3,C00001,Beauty,2025-12,2,37.02,41,0.0,0,False,True,False,False,True,False
4,C00001,Clothing,2025-08,1,120.0,41,0.0,1,False,True,False,False,True,False


In [21]:
import pandas as pd

# Ensure proper sorting
df['year_month'] = pd.to_datetime(df['year_month'].astype(str) + '-01')
df = df.sort_values(['customer_id', 'category', 'year_month'])

# Compute cumulative spend **excluding current month**
df['cum_spent_till_this_month'] = (
    df.groupby(['customer_id', 'category'])['total_spent']
      .cumsum() - df['total_spent']
)

# First month will automatically be 0
print(df[['customer_id','category','year_month','total_spent','cum_spent_till_this_month']].head(10))


  customer_id     category year_month  total_spent  cum_spent_till_this_month
0      C00001       Beauty 2025-08-01      93.7700                     0.0000
1      C00001       Beauty 2025-09-01      65.0660                    93.7700
2      C00001       Beauty 2025-10-01     236.8900                   158.8360
3      C00001       Beauty 2025-12-01      37.0200                   395.7260
4      C00001     Clothing 2025-08-01     120.0000                     0.0000
5      C00001     Clothing 2025-10-01     890.1495                   120.0000
6      C00001     Clothing 2025-11-01      71.9800                  1010.1495
7      C00001     Clothing 2025-12-01     103.8700                  1082.1295
8      C00001  Electronics 2025-08-01     325.2800                     0.0000
9      C00001  Electronics 2025-09-01    1133.1600                   325.2800


In [25]:
df

Unnamed: 0,customer_id,category,year_month,total_quantity,total_spent,age,loyalty_tier_encoded,category_encoded,gender_F,gender_M,gender_O,preferred_channel_InStore,preferred_channel_Mobile,preferred_channel_Online,cum_spent_till_this_month
0,C00001,Beauty,2025-08-01,2,93.770,41,0.0,0,False,True,False,False,True,False,0.000
1,C00001,Beauty,2025-09-01,2,65.066,41,0.0,0,False,True,False,False,True,False,93.770
2,C00001,Beauty,2025-10-01,5,236.890,41,0.0,0,False,True,False,False,True,False,158.836
3,C00001,Beauty,2025-12-01,2,37.020,41,0.0,0,False,True,False,False,True,False,395.726
4,C00001,Clothing,2025-08-01,1,120.000,41,0.0,1,False,True,False,False,True,False,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6682,C00250,Home,2025-12-01,16,2177.038,40,0.0,4,False,True,False,False,True,False,7186.025
6683,C00250,Sports,2025-09-01,5,336.530,40,0.0,5,False,True,False,False,True,False,0.000
6684,C00250,Sports,2025-10-01,1,180.000,40,0.0,5,False,True,False,False,True,False,336.530
6685,C00250,Sports,2025-11-01,1,72.230,40,0.0,5,False,True,False,False,True,False,516.530


In [26]:
df = df.sort_values(['customer_id', 'category', 'year_month'])
# Create lag features
df['spend_lag_1'] = df.groupby(['customer_id', 'category'])['total_spent'].shift(1)
df['spend_lag_2'] = df.groupby(['customer_id', 'category'])['total_spent'].shift(2)
df['spend_lag_3'] = df.groupby(['customer_id', 'category'])['total_spent'].shift(3)

# Drop rows where any lag is NaN (first 3 months per customer-category)
df_model = df.dropna(subset=['spend_lag_1','spend_lag_2','spend_lag_3']).copy()

# Target is current month spend
y = df_model['total_spent']

# Features include lag features + customer static features
features = ['spend_lag_1','spend_lag_2','spend_lag_3','age','loyalty_tier_encoded','category_encoded'] + \
           [col for col in df_model.columns if col.startswith('gender_') or col.startswith('preferred_channel_')]

X = df_model[features]

print(X.head())
print(y.head())


    spend_lag_1  spend_lag_2  spend_lag_3  age  loyalty_tier_encoded  \
3        236.89      65.0660       93.770   41                   0.0   
7         71.98     890.1495      120.000   41                   0.0   
11      1800.00    1133.1600      325.280   41                   0.0   
12       835.41    1800.0000     1133.160   41                   0.0   
16        96.17     156.0970      267.692   41                   0.0   

    category_encoded  gender_F  gender_M  gender_O  preferred_channel_InStore  \
3                  0     False      True     False                      False   
7                  1     False      True     False                      False   
11                 2     False      True     False                      False   
12                 2     False      True     False                      False   
16                 3     False      True     False                      False   

    preferred_channel_Mobile  preferred_channel_Online  
3                      

In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
# -----------------------------
# 1️⃣ Ensure dataset is sorted
# -----------------------------
df['year_month'] = pd.to_datetime(df['year_month'].astype(str) + '-01')
df = df.sort_values(['customer_id', 'category', 'year_month'])

# -----------------------------
# 2️⃣ Create lag features (previous 3 months)
# -----------------------------
df['spend_lag_1'] = df.groupby(['customer_id', 'category'])['total_spent'].shift(1)
df['spend_lag_2'] = df.groupby(['customer_id', 'category'])['total_spent'].shift(2)
df['spend_lag_3'] = df.groupby(['customer_id', 'category'])['total_spent'].shift(3)

# Drop rows without 3 months history
df_model = df.dropna(subset=['spend_lag_1','spend_lag_2','spend_lag_3']).copy()

# -----------------------------
# 3️⃣ Define target and features
# -----------------------------
y = df_model['total_spent']

# Numeric + categorical features
numeric_features = ['spend_lag_1','spend_lag_2','spend_lag_3','age','loyalty_tier_encoded','category_encoded']
one_hot_features = [col for col in df_model.columns if col.startswith('gender_') or col.startswith('preferred_channel_')]
features = numeric_features + one_hot_features

X = df_model[features]

# -----------------------------
# 4️⃣ Train/test split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -----------------------------
# 5️⃣ Train XGBoost regressor
# -----------------------------
reg = xgb.XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

reg.fit(X_train, y_train)

# -----------------------------
# 6️⃣ Evaluate model
# -----------------------------
y_pred = reg.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# MAE
mae = mean_absolute_error(y_test, y_pred)

# R²
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R² Score: {r2:.2f}")


  df['year_month'] = pd.to_datetime(df['year_month'].astype(str) + '-01')


RMSE: 1014.95
MAE: 487.00
R² Score: 0.76


In [42]:
row = {
    'spend_lag_1': 395.726,
    'spend_lag_2': 236.890,
    'spend_lag_3': 65.066,
    'age': 41,
    'loyalty_tier_encoded': 0,
    'category_encoded': 0,
    'gender_F': 0,
    'gender_M': 1,
    'gender_O': 0,
    'preferred_channel_InStore': 0,
    'preferred_channel_Mobile': 1,
    'preferred_channel_Online': 0
}

X_new = pd.DataFrame([row])

# Make prediction
pred_next_month = reg.predict(X_new)

print(f"Predicted next month spend: {pred_next_month[0]:.2f}")

Predicted next month spend: 62.91


In [44]:
df_model = df.copy()
bool_cols = ['gender_F','gender_M','gender_O','preferred_channel_InStore','preferred_channel_Mobile','preferred_channel_Online']
df_model[bool_cols] = df_model[bool_cols].astype(int)

In [45]:
feature_cols = ['total_quantity','age','loyalty_tier_encoded','category_encoded'] + bool_cols

# Add lag features (past 3 months total_spent)
seq_len = 3
X_seq = []
y_seq = []

# Group by customer-category
grouped = df_model.groupby(['customer_id','category'])

for _, group in grouped:
    group = group.sort_values('year_month')
    values = group[feature_cols + ['total_spent']].values  # features + spend
    for i in range(seq_len, len(values)):
        X_seq.append(values[i-seq_len:i, :-1])  # previous 3 months features (exclude current spend)
        y_seq.append(values[i, -1])             # current month spend

X_seq = np.array(X_seq)  # shape: (samples, seq_len, num_features)
y_seq = np.array(y_seq)  # shape: (samples,)


In [46]:
num_samples = X_seq.shape[0]

train_size = int(0.7 * num_samples)   # 70% for training
val_size = int(0.15 * num_samples)    # 15% for validation
test_size = num_samples - train_size - val_size  # 15% for test

X_train = X_seq[:train_size]
y_train = y_seq[:train_size]

X_val = X_seq[train_size:train_size+val_size]
y_val = y_seq[train_size:train_size+val_size]

X_test = X_seq[train_size+val_size:]
y_test = y_seq[train_size+val_size:]

print(X_train.shape, X_val.shape, X_test.shape)


(1587, 3, 10) (340, 3, 10) (341, 3, 10)


In [52]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization

num_features = X_seq.shape[2]

model = Sequential()

# 1st LSTM layer: high units, returns sequences for stacking
model.add(LSTM(256, input_shape=(seq_len, num_features), return_sequences=True))
model.add(Dropout(0.3))
model.add(BatchNormalization())

# 2nd LSTM layer: medium units, returns sequences for stacking
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.25))
model.add(BatchNormalization())

# 3rd LSTM layer: smaller units, returns last output
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# Dense layers for extra representation
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.1))

# Output layer
model.add(Dense(1))  # Predict spend

# Compile model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

model.summary()


In [55]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=16,
    verbose=1
)

Epoch 1/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - loss: 1250565.6250 - mae: 639.5130 - val_loss: 1980721.3750 - val_mae: 618.1617
Epoch 2/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - loss: 1397018.2500 - mae: 588.8960 - val_loss: 2918805.7500 - val_mae: 725.6212
Epoch 3/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - loss: 1388291.6250 - mae: 638.8496 - val_loss: 2714864.0000 - val_mae: 644.3001
Epoch 4/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - loss: 1258330.8750 - mae: 625.1365 - val_loss: 1866144.3750 - val_mae: 615.5721
Epoch 5/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - loss: 1244920.1250 - mae: 596.3274 - val_loss: 1914913.5000 - val_mae: 609.3405
Epoch 6/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 1713263.5000 - mae: 665.7056 - val_loss: 1477638.0000 - val_

In [56]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_pred = model.predict(X_test).flatten()

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Test RMSE: {rmse:.2f}")
print(f"Test MAE: {mae:.2f}")
print(f"Test R²: {r2:.2f}")


[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
Test RMSE: 1003.70
Test MAE: 467.69
Test R²: 0.83


In [60]:
sample_data = pd.DataFrame({
    'total_quantity': [2, 2, 5],
    'age': [41, 41, 41],
    'loyalty_tier_encoded': [0, 0, 0],
    'category_encoded': [0, 0, 0],
    'gender_F': [0, 0, 0],
    'gender_M': [1, 1, 1],
    'gender_O': [0, 0, 0],
    'preferred_channel_InStore': [0, 0, 0],
    'preferred_channel_Mobile': [1, 1, 1],
    'preferred_channel_Online': [0, 0, 0]
})
X_input_seq = sample_data.values[np.newaxis, :, :]  # shape (1, 3, 10)
print("LSTM input shape:", X_input_seq.shape)

LSTM input shape: (1, 3, 10)


In [61]:
# Predict
pred_next_month = model.predict(X_input_seq)
print(f"Predicted spend for next month: {pred_next_month[0][0]:.2f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 744ms/step
Predicted spend for next month: 51.56


In [62]:
# Save full model
model.save("customer_lstm_model.h5")



In [66]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout, BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

feature_cols = ['total_quantity','age','loyalty_tier_encoded','category_encoded',
                'gender_F','gender_M','gender_O','preferred_channel_InStore','preferred_channel_Mobile','preferred_channel_Online']
seq_len = 3

# Ensure boolean columns are numeric
bool_cols = ['gender_F','gender_M','gender_O','preferred_channel_InStore','preferred_channel_Mobile','preferred_channel_Online']
df_model[bool_cols] = df_model[bool_cols].astype(int)

# Build sequences
X_seq = []
y_seq = []

grouped = df_model.groupby(['customer_id','category'])
for _, group in grouped:
    group = group.sort_values('year_month')
    values = group[feature_cols + ['total_spent']].values
    for i in range(seq_len, len(values)):
        X_seq.append(values[i-seq_len:i, :-1])  # past 3 months features
        y_seq.append(values[i, -1])             # current month spend

X_seq = np.array(X_seq)  # shape: (samples, seq_len, num_features)
y_seq = np.array(y_seq)
print("X_seq shape:", X_seq.shape, "y_seq shape:", y_seq.shape)


X_seq shape: (2268, 3, 10) y_seq shape: (2268,)


In [65]:
num_samples = X_seq.shape[0]

train_size = int(0.7 * num_samples)
val_size = int(0.15 * num_samples)

X_train = X_seq[:train_size]
y_train = y_seq[:train_size]

X_val = X_seq[train_size:train_size+val_size]
y_val = y_seq[train_size:train_size+val_size]

X_test = X_seq[train_size+val_size:]
y_test = y_seq[train_size+val_size:]

print(X_train.shape, X_val.shape, X_test.shape)


(1587, 3, 10) (340, 3, 10) (341, 3, 10)


In [76]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout, BatchNormalization

num_features = X_seq.shape[2]

model = Sequential()

# 1st GRU layer: large units, return sequences to stack
model.add(GRU(256, input_shape=(seq_len, num_features), return_sequences=True))
model.add(Dropout(0.3))
model.add(BatchNormalization())

# 2nd GRU layer: medium units, return sequences
model.add(GRU(128, return_sequences=True))
model.add(Dropout(0.25))
model.add(BatchNormalization())

# 3rd GRU layer: smaller units, final output
model.add(GRU(64, return_sequences=False))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# Dense layers for richer representation
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.1))

# Output layer
model.add(Dense(1))  # Predict next month spend

# Compile
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()


In [80]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=16,
    verbose=1
)


Epoch 1/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 934798.6250 - mae: 540.5110 - val_loss: 1673918.8750 - val_mae: 576.4323
Epoch 2/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - loss: 1286987.1250 - mae: 593.5850 - val_loss: 1701966.1250 - val_mae: 584.6757
Epoch 3/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 1233052.0000 - mae: 575.1141 - val_loss: 1610996.3750 - val_mae: 575.1168
Epoch 4/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 1084129.5000 - mae: 540.9100 - val_loss: 2170352.0000 - val_mae: 615.2782
Epoch 5/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 869322.8750 - mae: 527.4205 - val_loss: 1707347.6250 - val_mae: 577.0239
Epoch 6/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - loss: 1069703.8750 - mae: 529.1770 - val_loss: 2052214.7500 - val_ma

In [81]:
y_pred = model.predict(X_test).flatten()

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Test RMSE: {rmse:.2f}")
print(f"Test MAE: {mae:.2f}")
print(f"Test R²: {r2:.2f}")


[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
Test RMSE: 891.20
Test MAE: 439.56
Test R²: 0.87


In [86]:
sample_data = pd.DataFrame({
    'total_quantity': [2, 2, 5],
    'age': [41, 41, 41],
    'loyalty_tier_encoded': [0, 0, 0],
    'category_encoded': [0, 0, 0],
    'gender_F': [0, 0, 0],
    'gender_M': [1, 1, 1],
    'gender_O': [0, 0, 0],
    'preferred_channel_InStore': [0, 0, 0],
    'preferred_channel_Mobile': [1, 1, 1],
    'preferred_channel_Online': [0, 0, 0]
})
# Predict next month spend
pred_next_month = model.predict(X_input_seq)
print(f"Predicted next month spend: {pred_next_month[0][0]:.2f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
Predicted next month spend: 96.16


In [85]:
model.save("customer_gru_model.h5")
print("Model saved as customer_gru_model.h5")



Model saved as customer_gru_model.h5


In [88]:
df_model

Unnamed: 0,customer_id,category,year_month,total_quantity,total_spent,age,loyalty_tier_encoded,category_encoded,gender_F,gender_M,gender_O,preferred_channel_InStore,preferred_channel_Mobile,preferred_channel_Online,cum_spent_till_this_month,spend_lag_1,spend_lag_2,spend_lag_3
0,C00001,Beauty,2025-08-01 01:00:00-01:00,2,93.770,41,0.0,0,0,1,0,0,1,0,0.000,,,
1,C00001,Beauty,2025-09-01 01:00:00-01:00,2,65.066,41,0.0,0,0,1,0,0,1,0,93.770,93.7700,,
2,C00001,Beauty,2025-10-01 01:00:00-01:00,5,236.890,41,0.0,0,0,1,0,0,1,0,158.836,65.0660,93.770,
3,C00001,Beauty,2025-12-01 01:00:00-01:00,2,37.020,41,0.0,0,0,1,0,0,1,0,395.726,236.8900,65.066,93.7700
4,C00001,Clothing,2025-08-01 01:00:00-01:00,1,120.000,41,0.0,1,0,1,0,0,1,0,0.000,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6682,C00250,Home,2025-12-01 01:00:00-01:00,16,2177.038,40,0.0,4,0,1,0,0,1,0,7186.025,2175.9495,962.050,2317.4775
6683,C00250,Sports,2025-09-01 01:00:00-01:00,5,336.530,40,0.0,5,0,1,0,0,1,0,0.000,,,
6684,C00250,Sports,2025-10-01 01:00:00-01:00,1,180.000,40,0.0,5,0,1,0,0,1,0,336.530,336.5300,,
6685,C00250,Sports,2025-11-01 01:00:00-01:00,1,72.230,40,0.0,5,0,1,0,0,1,0,516.530,180.0000,336.530,


In [90]:
import gradio as gr
import numpy as np
import pandas as pd

# --- Feature Columns (must match the training order) ---
feature_cols = [
    'total_quantity','age','loyalty_tier_encoded','category_encoded',
    'gender_F','gender_M','gender_O',
    'preferred_channel_InStore','preferred_channel_Mobile','preferred_channel_Online'
]

seq_len = 3  # LSTM sequence length

# --- Extract dropdown values from df_model ---
customer_list = sorted(df_model["customer_id"].unique().tolist())
category_list = sorted(df_model["category"].unique().tolist())


# -------------------------------
# FUNCTION: prepare last 3-month sequence
# -------------------------------
def prepare_sequence(customer_id, category):
    cust_cat_df = df_model[
        (df_model['customer_id'] == customer_id) &
        (df_model['category'] == category)
    ].sort_values("year_month")

    if len(cust_cat_df) < seq_len:
        return None, f"Not enough monthly history for {customer_id} in {category} (need at least 3 months)."

    last_3 = cust_cat_df.tail(seq_len)
    X = last_3[feature_cols].values     # shape = (3, num_features)
    X = X[np.newaxis, :, :]             # shape = (1, 3, num_features)

    return X, None


# -------------------------------
# FUNCTION: Prediction
# -------------------------------
def predict_spend(customer_id, category):
    X_input, err = prepare_sequence(customer_id, category)

    if err:
        return err

    pred = model.predict(X_input)[0][0]
    return f"Predicted next-month spend for {customer_id} in {category}: ₹{pred:.2f}"


# -------------------------------
# GRADIO UI
# -------------------------------
ui = gr.Interface(
    fn=predict_spend,
    inputs=[
        gr.Dropdown(choices=customer_list, label="Select Customer ID"),
        gr.Dropdown(choices=category_list, label="Select Category")
    ],
    outputs=gr.Textbox(label="Predicted Spend"),
    title="LSTM/GRU Customer Spend Prediction",
    description="Select a Customer ID and Category to predict the next month's spend."
)

ui.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://6d0001dbe3ca187e0e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [96]:
import gradio as gr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

seq_len = 3

feature_cols = [
    'total_quantity','age','loyalty_tier_encoded','category_encoded',
    'gender_F','gender_M','gender_O',
    'preferred_channel_InStore','preferred_channel_Mobile','preferred_channel_Online'
]

customer_list = sorted(df_model["customer_id"].unique().tolist())
category_list = sorted(df_model["category"].unique().tolist())


def format_month(col):
    """Convert full timestamp into Month-Year like 'Aug 2025'"""
    return pd.to_datetime(col).dt.strftime("%b %Y")


# --------------------------
# Prepare sequence for one category
# --------------------------
def prepare_sequence(customer_id, category):
    data = df_model[
        (df_model['customer_id'] == customer_id) &
        (df_model['category'] == category)
    ].sort_values("year_month")

    if len(data) < seq_len:
        return None, None, f"Not enough data for {category}"

    X = data.tail(seq_len)[feature_cols].values
    X = X[np.newaxis, :, :]
    return X, data, None


# --------------------------
# Prediction + Graphs
# --------------------------
def predict_for_customer(customer_id):

    category_preds = {}
    history_dict = {}

    for cat in df_model['category'].unique():
        X_input, hist_df, err = prepare_sequence(customer_id, cat)

        if err:
            continue

        pred = model.predict(X_input)[0][0]
        category_preds[cat] = pred

        # Format month names for trend graph
        month_names = format_month(hist_df["year_month"]).tolist()

        history_dict[cat] = {
            "months": month_names,
            "spend": hist_df["total_spent"].tolist(),
            "pred": pred
        }

    # --------------------------
    # Bar Plot (Category Predictions)
    # --------------------------
    fig_bar = plt.figure(figsize=(7,5))
    plt.bar(category_preds.keys(), category_preds.values())
    plt.title(f"Category-wise Predicted Spend for {customer_id}")
    plt.xlabel("Category")
    plt.ylabel("Predicted Spend")
    plt.xticks(rotation=45)
    plt.tight_layout()

    # --------------------------
    # Line Plot (Trend + Prediction)
    # --------------------------
    fig_line = plt.figure(figsize=(10,6))  # Increased size

    for cat, info in history_dict.items():

      # Convert to datetime for sorting
      m = pd.to_datetime(info["months"], format="%b %Y")

      # Sort by actual time order
      sorted_idx = np.argsort(m)

      sorted_months = [info["months"][i] for i in sorted_idx]
      sorted_spend  = [info["spend"][i]  for i in sorted_idx]

      # Append the predicted "Next Month"
      sorted_months.append("Next Month")
      sorted_spend.append(info["pred"])

      plt.plot(sorted_months, sorted_spend, marker="o", linewidth=2.5, label=cat)


    plt.title(f"Spending Trend + Next Month Prediction ({customer_id})", fontsize=14)
    plt.xlabel("Month", fontsize=12)
    plt.ylabel("Spend", fontsize=12)
    plt.xticks(rotation=45)
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.4)
    plt.tight_layout()

    total_pred = sum(category_preds.values())

    return (
        f"Total Predicted Spend Next Month: ₹{total_pred:.2f}",
        fig_bar,
        fig_line,
    )


# --------------------------
# Gradio UI
# --------------------------
ui = gr.Interface(
    fn=predict_for_customer,
    inputs=gr.Dropdown(choices=customer_list, label="Select Customer ID"),
    outputs=[
        gr.Textbox(label="Predicted Total Spend"),
        gr.Plot(label="Category-wise Next Month Prediction (Bar Chart)"),
        gr.Plot(label="Previous Trend + Next Prediction (Line Chart)")
    ],
    title="Customer Spend Prediction (LSTM/GRU)",
    description="Select a customer to see category-level predictions and spending trends."
)

ui.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://247275de56faa7a2ca.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


