# Product Recommendation Model

This notebook implements product category prediction using:
- **LightGBM Classifier** (Gradient Boosting)
- **Random Forest Classifier**
- **XGBoost Classifier**

Models are evaluated on accuracy, F1-score, precision, and recall metrics.

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import joblib

In [182]:
dataset = pd.read_csv('../data/merged_customer_data_detailed.csv')
print(f"Dataset shape: {dataset.shape}")
dataset.head()

Dataset shape: (213, 11)


Unnamed: 0,customer_id_new,social_media_platform,engagement_score,purchase_interest_score,review_sentiment,customer_id_numeric,transaction_id,purchase_amount,purchase_date,product_category,customer_rating
0,A190,Twitter,82,4.8,Neutral,190,1031,333,2024-01-31,Groceries,3.8
1,A190,Twitter,82,4.8,Neutral,190,1140,401,2024-05-19,Sports,4.9
2,A150,Facebook,96,1.6,Positive,150,1042,389,2024-02-11,Sports,
3,A150,Facebook,96,1.6,Positive,150,1046,177,2024-02-15,Books,3.6
4,A162,Twitter,89,2.6,Positive,162,1079,101,2024-03-19,Books,4.2


In [183]:
# remove leading/trailing spaces
dataset.columns = dataset.columns.str.strip()

# Preprocessing - Drop ID columns that are not useful for prediction
columns_to_drop = ['customer_id_numeric', 'transaction_id', 'customer_id_new']
reduced_dataset = dataset.drop([col for col in columns_to_drop if col in dataset.columns], axis=1)

print(f"Dataset shape after dropping ID columns: {reduced_dataset.shape}")
print(f"Remaining columns: {list(reduced_dataset.columns)}")

Dataset shape after dropping ID columns: (213, 8)
Remaining columns: ['social_media_platform', 'engagement_score', 'purchase_interest_score', 'review_sentiment', 'purchase_amount', 'purchase_date', 'product_category', 'customer_rating']


In [184]:
X = reduced_dataset.drop('product_category', axis=1)
Y = reduced_dataset['product_category'].str.strip()  # Clean product category values

print(f"Features shape: {X.shape}")
print(f"Target shape: {Y.shape}")
print(f"\nProduct categories: {Y.unique()}")

Features shape: (213, 7)
Target shape: (213,)

Product categories: ['Groceries' 'Sports' 'Books' 'Electronics' 'Clothing']


In [185]:
# Extract date features from purchase_date
le = LabelEncoder()

if 'purchase_date' in X.columns:
    X['purchase_date'] = pd.to_datetime(X['purchase_date'])
    X['purchase_month'] = X['purchase_date'].dt.month
    X['purchase_day'] = X['purchase_date'].dt.day
    X['purchase_weekday'] = X['purchase_date'].dt.weekday
    
    # Convert to float
    X = X.astype({
        'purchase_month': float,
        'purchase_day': float,
        'purchase_weekday': float
    })
    
    # Drop the original date column
    X = X.drop('purchase_date', axis=1)
    print("✓ Date features extracted: purchase_month, purchase_day, purchase_weekday")

# Ensure numeric columns are float (clean string values first)
numeric_cols = ['purchase_amount', 'customer_rating', 'engagement_score', 'purchase_interest_score']
for col in numeric_cols:
    if col in X.columns:
        # Convert to string, strip spaces, then to numeric (coerce errors to NaN)
        X[col] = pd.to_numeric(X[col].astype(str).str.strip(), errors='coerce')

# Fill any NaN values that resulted from conversion errors with the column median
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())

print(f"\nFeatures after date extraction: {X.shape}")
X.head()

✓ Date features extracted: purchase_month, purchase_day, purchase_weekday

Features after date extraction: (213, 9)


Unnamed: 0,social_media_platform,engagement_score,purchase_interest_score,review_sentiment,purchase_amount,customer_rating,purchase_month,purchase_day,purchase_weekday
0,Twitter,82,4.8,Neutral,333,3.8,1.0,31.0,2.0
1,Twitter,82,4.8,Neutral,401,4.9,5.0,19.0,6.0
2,Facebook,96,1.6,Positive,389,3.2,2.0,11.0,6.0
3,Facebook,96,1.6,Positive,177,3.6,2.0,15.0,3.0
4,Twitter,89,2.6,Positive,101,4.2,3.0,19.0,1.0


In [186]:
# Encode categorical variables
if 'social_media_platform' in X.columns:
    X['social_media_platform'] = X['social_media_platform'].astype('category')
    X['social_media_platform'] = le.fit_transform(X['social_media_platform'])
    print("✓ Encoded: social_media_platform")

if 'review_sentiment' in X.columns:
    X['review_sentiment'] = le.fit_transform(X['review_sentiment'])
    print("✓ Encoded: review_sentiment")

# Encode target variable
Y = le.fit_transform(Y)
print(f"✓ Encoded target variable: {len(np.unique(Y))} classes")

print(f"\nFinal features shape: {X.shape}")
print(f"Feature columns: {list(X.columns)}")

✓ Encoded: social_media_platform
✓ Encoded: review_sentiment
✓ Encoded target variable: 5 classes

Final features shape: (213, 9)
Feature columns: ['social_media_platform', 'engagement_score', 'purchase_interest_score', 'review_sentiment', 'purchase_amount', 'customer_rating', 'purchase_month', 'purchase_day', 'purchase_weekday']


## Feature Scaling

In [187]:
# Scale numerical columns
nums_cols = ['purchase_amount', 'customer_rating', 'engagement_score', 'purchase_interest_score']
nums_cols = [col for col in nums_cols if col in X.columns]

scaler = StandardScaler()
X[nums_cols] = scaler.fit_transform(X[nums_cols])

print(f"✓ Scaled {len(nums_cols)} numerical columns")
print(f"Scaled columns: {nums_cols}")

✓ Scaled 4 numerical columns
Scaled columns: ['purchase_amount', 'customer_rating', 'engagement_score', 'purchase_interest_score']


In [188]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print(f"✓ Training samples: {X_train.shape[0]}")
print(f"✓ Test samples: {X_test.shape[0]}")

✓ Training samples: 170
✓ Test samples: 43


## Model 1: XGBoost Classifier

In [189]:
xgb_model = XGBClassifier(n_estimators=100, random_state=42)

print("Training XGBoost model...")
xgb_model.fit(X_train, y_train)
print("✓ XGBoost model trained successfully")

Training XGBoost model...


✓ XGBoost model trained successfully


In [190]:
y_pred_xgb = xgb_model.predict(X_test) # type: ignore

print("XGBoost Model Performance")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print(f"F1-score (weighted): {f1_score(y_test, y_pred_xgb, average='weighted'):.4f}")
print(f"F1-score (macro): {f1_score(y_test, y_pred_xgb, average='macro'):.4f}")
print(f"Precision (weighted): {precision_score(y_test, y_pred_xgb, average='weighted'):.4f}")
print(f"Recall (weighted): {recall_score(y_test, y_pred_xgb, average='weighted'):.4f}")

XGBoost Model Performance
Accuracy: 0.6744
F1-score (weighted): 0.6648
F1-score (macro): 0.6655
Precision (weighted): 0.7026
Recall (weighted): 0.6744


## Model 2: Random Forest Classifier

In [191]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

print("Training Random Forest model...")
rf_model.fit(X_train, y_train)
print("✓ Random Forest model trained successfully")

Training Random Forest model...
✓ Random Forest model trained successfully


In [192]:
y_pred_rf = rf_model.predict(X_test) # type: ignore

print("Random Forest MOdel Performance")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(f"F1-score (weighted): {f1_score(y_test, y_pred_rf, average='weighted'):.4f}")
print(f"F1-score (macro): {f1_score(y_test, y_pred_rf, average='macro'):.4f}")
print(f"Precision (weighted): {precision_score(y_test, y_pred_rf, average='weighted'):.4f}")
print(f"Recall (weighted): {recall_score(y_test, y_pred_rf, average='weighted'):.4f}")

Random Forest MOdel Performance
Accuracy: 0.6279
F1-score (weighted): 0.6315
F1-score (macro): 0.6312
Precision (weighted): 0.6802
Recall (weighted): 0.6279


## Model 3: LightGBM Classifier

In [193]:
# Evaluate LightGBM
y_pred_lgbm = lgbm_model.predict(X_test) # type: ignore

print("LIGHTGBM Model Performance")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lgbm):.4f}")
print(f"F1-score (weighted): {f1_score(y_test, y_pred_lgbm, average='weighted'):.4f}")
print(f"F1-score (macro): {f1_score(y_test, y_pred_lgbm, average='macro'):.4f}")
print(f"Precision (weighted): {precision_score(y_test, y_pred_lgbm, average='weighted'):.4f}")
print(f"Recall (weighted): {recall_score(y_test, y_pred_lgbm, average='weighted'):.4f}")

LIGHTGBM Model Performance
Accuracy: 0.5581
F1-score (weighted): 0.5509
F1-score (macro): 0.5434
Precision (weighted): 0.5712
Recall (weighted): 0.5581


In [194]:
# Train LightGBM model
num_classes = len(np.unique(Y))
lgbm_model = LGBMClassifier(
    learning_rate=0.05,
    num_leaves=31,
    max_depth=7,
    n_estimators=100,
    lambda_l1=0.1,
    lambda_l2=0.1,
    objective='multiclass',
    num_class=num_classes,
    random_state=42
)

print("Training LightGBM model...")
lgbm_model.fit(X_train, y_train)
print("✓ LightGBM model trained successfully")

Training LightGBM model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000024 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 186
[LightGBM] [Info] Number of data points in the train set: 170, number of used features: 9
[LightGBM] [Info] Start training from score -1.803594
[LightGBM] [Info] Start training from score -1.803594
[LightGBM] [Info] Start training from score -1.580450
[LightGBM] [Info] Start training from score -1.701811
[LightGBM] [Info] Start training from score -1.264597
✓ LightGBM model trained successfully


In [195]:
# Save all trained models
joblib.dump(xgb_model, '../models/xgb_model.joblib')
joblib.dump(rf_model, '../models/rf_model.joblib')
joblib.dump(lgbm_model, '../models/lgbm_model.joblib')

print("MOdels Saved")
print("✓ XGBoost model saved to: ../models/xgb_model.joblib")
print("✓ Random Forest model saved to: ../models/rf_model.joblib")
print("✓ LightGBM model saved to: ../models/lgbm_model.joblib")

MOdels Saved
✓ XGBoost model saved to: ../models/xgb_model.joblib
✓ Random Forest model saved to: ../models/rf_model.joblib
✓ LightGBM model saved to: ../models/lgbm_model.joblib


In [196]:
print("Model Comparison Summary")

models_performance = pd.DataFrame({
    'Model': ['LightGBM', 'Random Forest', 'XGBoost'],
    'Accuracy': [
        accuracy_score(y_test, y_pred_lgbm),
        accuracy_score(y_test, y_pred_rf),
        accuracy_score(y_test, y_pred_xgb)
    ],
    'F1-Score (Weighted)': [
        f1_score(y_test, y_pred_lgbm, average='weighted'),
        f1_score(y_test, y_pred_rf, average='weighted'),
        f1_score(y_test, y_pred_xgb, average='weighted')
    ],
    'F1-Score (Macro)': [
        f1_score(y_test, y_pred_lgbm, average='macro'),
        f1_score(y_test, y_pred_rf, average='macro'),
        f1_score(y_test, y_pred_xgb, average='macro')
    ],
    'Precision': [
        precision_score(y_test, y_pred_lgbm, average='weighted'),
        precision_score(y_test, y_pred_rf, average='weighted'),
        precision_score(y_test, y_pred_xgb, average='weighted')
    ],
    'Recall': [
        recall_score(y_test, y_pred_lgbm, average='weighted'),
        recall_score(y_test, y_pred_rf, average='weighted'),
        recall_score(y_test, y_pred_xgb, average='weighted')
    ]
})

print(models_performance.to_string(index=False))

# Determine best model
best_model_idx = models_performance['F1-Score (Weighted)'].idxmax()
best_model_name = models_performance.loc[best_model_idx, 'Model']
print(f"\n✓ Best performing model: {best_model_name}")
print(f"  F1-Score (Weighted): {models_performance.loc[best_model_idx, 'F1-Score (Weighted)']:.4f}")

Model Comparison Summary
        Model  Accuracy  F1-Score (Weighted)  F1-Score (Macro)  Precision   Recall
     LightGBM  0.558140             0.550880          0.543386   0.571221 0.558140
Random Forest  0.627907             0.631475          0.631217   0.680233 0.627907
      XGBoost  0.674419             0.664787          0.665471   0.702639 0.674419

✓ Best performing model: XGBoost
  F1-Score (Weighted): 0.6648
