In [2]:
import pandas as pd
df = pd.read_csv("/content/merged_transactions.csv")
df.head()

Unnamed: 0,product_id,category,unit_price,is_discountable,store_id,store_type,region,city,customer_id,gender,age,loyalty_tier,preferred_channel,transaction_id,transaction_date,channel,quantity,discount_pct
0,P0200,Home,106.31,1,S007,Outlet,Central,Grandview,C00010,M,42,Gold,Mobile,T0042714,2025-08-01,InStore,2,0.0
1,P0140,Grocery,9.13,1,S017,Street,Central,Centrum,C00094,F,31,Silver,InStore,T0024445,2025-08-01,InStore,1,0.0
2,P0225,Beauty,16.93,0,S038,Mall,South,Southport,C00164,M,25,Silver,InStore,T0008413,2025-08-01,Online,2,0.0
3,P0105,Grocery,13.08,1,S039,Street,East,Rivermouth,C00197,F,24,Silver,Online,T0037032,2025-08-01,Mobile,2,0.0
4,P0103,Home,143.33,1,S023,Mall,East,Rivermouth,C00104,M,25,Platinum,InStore,T0024521,2025-08-01,InStore,1,0.0


In [3]:
import pandas as pd

# --- Load your dataset ---
df['transaction_date'] = pd.to_datetime(df['transaction_date'])

# --- Create a year-month column ---
df['year_month'] = df['transaction_date'].dt.to_period('M')

# --- Compute total amount spent ---
df['amount_spent'] = df['unit_price'] * df['quantity'] * (1 - df['discount_pct'])

# --- Select customer-level attributes (assumed constant per customer) ---
customer_features = df[['customer_id', 'gender', 'age', 'loyalty_tier', 'preferred_channel']].drop_duplicates()

# --- Group by customer, category, and month ---
monthly_summary = df.groupby(
    ['customer_id', 'category', 'year_month']
).agg(
    total_quantity=('quantity', 'sum'),
    total_spent=('amount_spent', 'sum'),
).reset_index()

# --- Merge static customer features back ---
final_df = monthly_summary.merge(customer_features, on='customer_id', how='left')

# Optional: convert 'year_month' to datetime
final_df['year_month'] = final_df['year_month'].astype(str)

print(final_df.head())


  customer_id  category year_month  total_quantity  total_spent gender  age  \
0      C00001    Beauty    2025-08               2      93.7700      M   41   
1      C00001    Beauty    2025-09               2      65.0660      M   41   
2      C00001    Beauty    2025-10               5     236.8900      M   41   
3      C00001    Beauty    2025-12               5     105.3336      M   41   
4      C00001  Clothing    2025-08               1     120.0000      M   41   

  loyalty_tier preferred_channel  
0       Silver            Mobile  
1       Silver            Mobile  
2       Silver            Mobile  
3       Silver            Mobile  
4       Silver            Mobile  


In [4]:
from sklearn.preprocessing import OrdinalEncoder

final_df['loyalty_tier_encoded'] = OrdinalEncoder(
    categories=[['Bronze', 'Silver', 'Gold', 'Platinum']]
).fit_transform(final_df[['loyalty_tier']])
final_df.head()
df =final_df

In [5]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

# ----- 1. Ordinal encode loyalty_tier -----
df['loyalty_tier_encoded'] = OrdinalEncoder(
    categories=[['Bronze', 'Silver', 'Gold', 'Platinum']]
).fit_transform(df[['loyalty_tier']])

# ----- 2. Label encode category (NO one-hot) -----
le_cat = LabelEncoder()
df['category_encoded'] = le_cat.fit_transform(df['category'])

# ----- 3. One-hot encode gender + preferred_channel -----
df = pd.get_dummies(df,
                    columns=['gender', 'preferred_channel'],
                    drop_first=False)

df.head()


Unnamed: 0,customer_id,category,year_month,total_quantity,total_spent,age,loyalty_tier,loyalty_tier_encoded,category_encoded,gender_F,gender_M,gender_O,preferred_channel_InStore,preferred_channel_Mobile,preferred_channel_Online
0,C00001,Beauty,2025-08,2,93.77,41,Silver,1.0,0,False,True,False,False,True,False
1,C00001,Beauty,2025-09,2,65.066,41,Silver,1.0,0,False,True,False,False,True,False
2,C00001,Beauty,2025-10,5,236.89,41,Silver,1.0,0,False,True,False,False,True,False
3,C00001,Beauty,2025-12,5,105.3336,41,Silver,1.0,0,False,True,False,False,True,False
4,C00001,Clothing,2025-08,1,120.0,41,Silver,1.0,1,False,True,False,False,True,False


In [6]:
df = df.drop(columns=['loyalty_tier'])

In [7]:
# Ensure proper sorting
df['year_month'] = pd.to_datetime(df['year_month'].astype(str) + '-01')
df = df.sort_values(['customer_id', 'category', 'year_month'])

# Compute cumulative spend **excluding current month**
df['cum_spent_till_this_month'] = (
    df.groupby(['customer_id', 'category'])['total_spent']
      .cumsum() - df['total_spent']
)

# First month will automatically be 0
print(df[['customer_id','category','year_month','total_spent','cum_spent_till_this_month']].head(10))


  customer_id     category year_month  total_spent  cum_spent_till_this_month
0      C00001       Beauty 2025-08-01      93.7700                     0.0000
1      C00001       Beauty 2025-09-01      65.0660                    93.7700
2      C00001       Beauty 2025-10-01     236.8900                   158.8360
3      C00001       Beauty 2025-12-01     105.3336                   395.7260
4      C00001     Clothing 2025-08-01     120.0000                     0.0000
5      C00001     Clothing 2025-10-01     890.1495                   120.0000
6      C00001     Clothing 2025-11-01      71.9800                  1010.1495
7      C00001     Clothing 2025-12-01     103.8700                  1082.1295
8      C00001  Electronics 2025-08-01     325.2800                     0.0000
9      C00001  Electronics 2025-09-01    1133.1600                   325.2800


In [8]:
df.head()

Unnamed: 0,customer_id,category,year_month,total_quantity,total_spent,age,loyalty_tier_encoded,category_encoded,gender_F,gender_M,gender_O,preferred_channel_InStore,preferred_channel_Mobile,preferred_channel_Online,cum_spent_till_this_month
0,C00001,Beauty,2025-08-01,2,93.77,41,1.0,0,False,True,False,False,True,False,0.0
1,C00001,Beauty,2025-09-01,2,65.066,41,1.0,0,False,True,False,False,True,False,93.77
2,C00001,Beauty,2025-10-01,5,236.89,41,1.0,0,False,True,False,False,True,False,158.836
3,C00001,Beauty,2025-12-01,5,105.3336,41,1.0,0,False,True,False,False,True,False,395.726
4,C00001,Clothing,2025-08-01,1,120.0,41,1.0,1,False,True,False,False,True,False,0.0


In [10]:
df_model=df
df_model['year_month'] = pd.to_datetime(df_model['year_month'])
snapshot_date = df_model['year_month'].max() + pd.DateOffset(days=1)

# Group by customer
rfm = df_model.groupby('customer_id').agg({
    'year_month': lambda x: (snapshot_date - x.max()).days,  # Recency
    'total_quantity': 'count',                               # Frequency
    'total_spent': 'sum'                                     # Monetary
}).reset_index()

rfm.rename(columns={
    'year_month': 'recency_days',
    'total_quantity': 'frequency',
    'total_spent': 'monetary'
}, inplace=True)

print(rfm.head())
df_model = df_model.merge(rfm, on='customer_id', how='left')


  customer_id  recency_days  frequency    monetary
0      C00001             1         27  17512.3896
1      C00002             1         29  22592.9985
2      C00003             1         29  94081.6890
3      C00004             1         30  37965.7790
4      C00005             1         28  16494.7320


In [11]:
df_model.head()

Unnamed: 0,customer_id,category,year_month,total_quantity,total_spent,age,loyalty_tier_encoded,category_encoded,gender_F,gender_M,gender_O,preferred_channel_InStore,preferred_channel_Mobile,preferred_channel_Online,cum_spent_till_this_month,recency_days,frequency,monetary
0,C00001,Beauty,2025-08-01,2,93.77,41,1.0,0,False,True,False,False,True,False,0.0,1,27,17512.3896
1,C00001,Beauty,2025-09-01,2,65.066,41,1.0,0,False,True,False,False,True,False,93.77,1,27,17512.3896
2,C00001,Beauty,2025-10-01,5,236.89,41,1.0,0,False,True,False,False,True,False,158.836,1,27,17512.3896
3,C00001,Beauty,2025-12-01,5,105.3336,41,1.0,0,False,True,False,False,True,False,395.726,1,27,17512.3896
4,C00001,Clothing,2025-08-01,1,120.0,41,1.0,1,False,True,False,False,True,False,0.0,1,27,17512.3896


In [20]:
from sklearn.preprocessing import MinMaxScaler

import numpy as np
bool_cols = ['gender_F','gender_M','gender_O','preferred_channel_InStore','preferred_channel_Mobile','preferred_channel_Online']
df_model[bool_cols] = df_model[bool_cols].astype(int)

# Numeric + RFM features
numeric_features = ['total_quantity','age','loyalty_tier_encoded','category_encoded',
                    'recency_days','frequency','monetary']

scaler = MinMaxScaler()

# Fit and transform numerical features
df_model[numeric_features] = scaler.fit_transform(df_model[numeric_features])

# Check
df_model[numeric_features].head()
# All features for LSTM
feature_cols = numeric_features + bool_cols
seq_len = 3  # previous 3 months
X_seq = []
y_seq = []

grouped = df_model.groupby(['customer_id','category'])

for _, group in grouped:
    group = group.sort_values('year_month')
    values = group[feature_cols + ['total_spent']].values  # features + target
    for i in range(seq_len, len(values)):
        # previous 3 months features
        X_seq.append(values[i-seq_len:i, :-1])
        # current month spend as target
        y_seq.append(values[i, -1])

X_seq = np.array(X_seq)  # shape: (samples, seq_len, num_features)
y_seq = np.array(y_seq)  # shape: (samples,)
print("X_seq shape:", X_seq.shape, "y_seq shape:", y_seq.shape)


X_seq shape: (2268, 3, 13) y_seq shape: (2268,)


In [21]:
# Simple chronological split
num_samples = X_seq.shape[0]
train_size = int(0.7 * num_samples)
val_size = int(0.15 * num_samples)

X_train = X_seq[:train_size]
y_train = y_seq[:train_size]

X_val = X_seq[train_size:train_size+val_size]
y_val = y_seq[train_size:train_size+val_size]

X_test = X_seq[train_size+val_size:]
y_test = y_seq[train_size+val_size:]

print(X_train.shape, X_val.shape, X_test.shape)


(1587, 3, 13) (340, 3, 13) (341, 3, 13)


In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization

num_features = X_seq.shape[2]

model = Sequential()

# 1st LSTM layer: large units, return sequences to stack
model.add(LSTM(256, input_shape=(seq_len, num_features), return_sequences=True))
model.add(Dropout(0.3))
model.add(BatchNormalization())

# 2nd LSTM layer: medium units, return sequences
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.25))
model.add(BatchNormalization())

# 3rd LSTM layer: smaller units, final output sequence
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# Dense layers for richer representation
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.1))

# Output layer
model.add(Dense(1))  # Predict next month spend

# Compile model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()


  super().__init__(**kwargs)


In [23]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=16,
    callbacks=[early_stop],
    verbose=1
)


Epoch 1/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 31ms/step - loss: 7392685.5000 - mae: 1298.1089 - val_loss: 12296401.0000 - val_mae: 1513.0376
Epoch 2/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - loss: 5959222.0000 - mae: 1154.0969 - val_loss: 11236834.0000 - val_mae: 1339.4760
Epoch 3/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - loss: 4060400.2500 - mae: 1005.7312 - val_loss: 10000090.0000 - val_mae: 1318.3508
Epoch 4/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - loss: 4432231.0000 - mae: 1148.1305 - val_loss: 8161519.5000 - val_mae: 1233.7394
Epoch 5/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 29ms/step - loss: 3272614.2500 - mae: 997.6605 - val_loss: 7336392.5000 - val_mae: 1251.8048
Epoch 6/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - loss: 3691672.2500 - mae: 1048.0122 - val_loss:

In [26]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

# Evaluate model on test set
loss, mae = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss (MSE): {loss:.2f}")
print(f"Test MAE: {mae:.2f}")

# Predict on test set
y_pred = model.predict(X_test).flatten()

# Compute R² score
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Test RMSE: {rmse:.2f}")
print(f"Test R²: {r2:.2f}")


Test Loss (MSE): 634048.50
Test MAE: 448.34
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
Test RMSE: 796.27
Test R²: 0.90
