# Phase 1: Baseline Model Training (Optimized & Balanced)

This notebook implements the baseline training for the **OP-ECOM** project. It establishes a comparison between several traditional models on the UCI Online Shoppers Purchasing Intention dataset.

### 💎 Key Features:
1. **80/10/10 Split**: Training (80%), Testing (10%), and Validation (10%).
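2. **Training Set Balancing**: Manual upsampling of the minority (buyer) class, applied to the training split only, so the models learn from an equal number of buyers and non-buyers while the test and validation splits keep their original class distribution.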
3. **Premium Visuals**: Lapis Lazuli themed performance charts.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    roc_auc_score, f1_score, precision_score, recall_score,
    classification_report, confusion_matrix, roc_curve, precision_recall_curve
)
import xgboost as xgb
import joblib
import json
from datetime import datetime
from sklearn.utils import resample

# Premium styling (lapis lazuli theme)
plt.style.use('seaborn-v0_8-whitegrid')
PRIMARY_COLOR = '#1E4FA8'    # dark blue accent
SECONDARY_COLOR = '#E8F0FF'  # pale blue companion colour

# Input dataset and output folders, all given relative to the working directory
DATA_PATH = "../data/raw/online_shoppers_intention.csv"
MODELS_PATH = "../backend/models"
REPORTS_PATH = "../reports/metrics"

# Create both output folders up front; existing folders are left untouched
for _output_dir in (MODELS_PATH, REPORTS_PATH):
    os.makedirs(_output_dir, exist_ok=True)

print("‚úÖ Environment Ready!")

## 1. Data Loading & Distribution
We start by loading the UCI dataset and verifying the class imbalance.

In [None]:
# Read the raw dataset from the configured CSV path
df = pd.read_csv(DATA_PATH)
print(f"Dataset shape: {df.shape}")

# Count plot of the target column, drawn on an explicitly created Axes
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Revenue', palette=[SECONDARY_COLOR, PRIMARY_COLOR], ax=ax)
ax.set_title('Purchase Intent Distribution (Original)')
plt.show()

# Last expression of the cell: preview of the first rows
df.head()

## 2. Advanced Preprocessing
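Label-encode the categorical columns, apply the 80/10/10 split, scale the numerical columns using statistics fitted on the training split only, and balance the training set via upsampling.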

In [None]:
# Separate the target (cast to integers) from the feature columns
y = df['Revenue'].astype(int)
X = df.drop(columns='Revenue')

# 1. Feature handling: these three columns are label-encoded, every other
#    column is treated as numerical and standardised further down
categorical_cols = ['Month', 'VisitorType', 'Weekend']
numerical_cols = [name for name in X.columns if name not in categorical_cols]

# One encoder per categorical column, stored in a dict keyed by column name
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    X[col] = le.fit_transform(X[col].astype(str))

# 2. 80/10/10 split: hold out 20% first, then cut that hold-out in half.
#    Both cuts are stratified on the target and use the same seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# 3. Scaling: the scaler is fitted on the training split only and then
#    applied unchanged to all three splits
scaler = StandardScaler()
scaler.fit(X_train[numerical_cols])


def _scale_numeric(frame):
    """Return a copy of ``frame`` whose numerical columns are transformed by ``scaler``."""
    scaled = frame.copy()
    scaled[numerical_cols] = scaler.transform(frame[numerical_cols])
    return scaled


X_train_scaled = _scale_numeric(X_train)
X_test_scaled = _scale_numeric(X_test)
X_val_scaled = _scale_numeric(X_val)

# 4. Training-set balancing: rows with Revenue == 1 are resampled with
#    replacement until they match the number of Revenue == 0 rows
train_data = pd.concat([X_train_scaled, y_train], axis=1)
not_buy = train_data[train_data['Revenue'].eq(0)]
buy = train_data[train_data['Revenue'].eq(1)]

buy_upsampled = resample(buy, replace=True, n_samples=len(not_buy), random_state=42)
train_balanced = pd.concat([not_buy, buy_upsampled])

y_train_final = train_balanced['Revenue']
X_train_final = train_balanced.drop(columns='Revenue')

print(f"‚úÖ Preprocessing Complete!")
print(f"   ‚Üí Training: {len(X_train_final)} (Balanced)")
print(f"   ‚Üí Test:     {len(X_test)}")
print(f"   ‚Üí Val:      {len(X_val)}")

## 3. Multi-Model Training Pipeline
We train baseline models and collect metrics for a comprehensive comparison.

In [None]:
def _fit_and_predict(model):
    """Fit ``model`` on the balanced training set and predict on the scaled test set.

    Returns a tuple ``(model, probabilities, labels)`` where ``probabilities``
    is the column at index 1 of ``predict_proba`` and ``labels`` is the output
    of ``predict``.
    """
    model.fit(X_train_final, y_train_final)
    class_one_prob = model.predict_proba(X_test_scaled)[:, 1]
    predicted_labels = model.predict(X_test_scaled)
    return model, class_one_prob, predicted_labels


# 3.1 Logistic Regression
lr_model, y_prob_lr, y_pred_lr = _fit_and_predict(
    LogisticRegression(max_iter=1000, random_state=42)
)

# 3.2 XGBoost Classifier
xgb_model, y_prob_xgb, y_pred_xgb = _fit_and_predict(
    xgb.XGBClassifier(n_estimators=100, max_depth=6, random_state=42, eval_metric='logloss')
)

# 3.3 Decision Tree
dt_model, y_prob_dt, y_pred_dt = _fit_and_predict(
    DecisionTreeClassifier(random_state=42)
)

# 3.4 Random Forest
rf_model, y_prob_rf, y_pred_rf = _fit_and_predict(
    RandomForestClassifier(n_estimators=100, random_state=42)
)

# 3.5 SVM (Support Vector Machine); probability=True is required for predict_proba
svm_model, y_prob_svm, y_pred_svm = _fit_and_predict(
    SVC(probability=True, random_state=42)
)

def get_metrics(name, y_true, y_pred, y_prob):
    """Collect one model's evaluation scores into a flat record.

    Args:
        name: Label stored under the ``'model'`` key.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, used for F1, precision and recall.
        y_prob: Scores handed to ``roc_auc_score``.

    Returns:
        dict with the keys ``model``, ``auc_roc``, ``f1``, ``precision``
        and ``recall``, in that order.
    """
    record = {'model': name, 'auc_roc': roc_auc_score(y_true, y_prob)}
    # The remaining metrics all compare predicted labels against the truth
    label_scorers = (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    for key, scorer in label_scorers:
        record[key] = scorer(y_true, y_pred)
    return record

# Each entry pairs a display label with that model's test-set labels and probabilities
model_outputs = (
    ('Logistic Regression', y_pred_lr, y_prob_lr),
    ('XGBoost', y_pred_xgb, y_prob_xgb),
    ('Decision Tree', y_pred_dt, y_prob_dt),
    ('Random Forest', y_pred_rf, y_prob_rf),
    ('SVM', y_pred_svm, y_prob_svm),
)

# One metrics record per model, all scored against the test labels
results = [
    get_metrics(label, y_test, predicted, probability)
    for label, predicted, probability in model_outputs
]

# Last expression of the cell: comparison table indexed by model name
pd.DataFrame(results).set_index('model')

## 4. Visual Comparison
Let's see the performance spread and feature importance across models.

In [None]:
# 4.1 Combined ROC Curves
# Test-set probabilities per model, keyed by the label shown in the legend
probs = {
    'Logistic Regression': y_prob_lr,
    'XGBoost': y_prob_xgb,
    'Decision Tree': y_prob_dt,
    'Random Forest': y_prob_rf,
    'SVM': y_prob_svm
}

fig, ax = plt.subplots(figsize=(10, 7))

# One curve per model, with its AUC written into the legend entry
for name, prob in probs.items():
    fpr, tpr, _ = roc_curve(y_test, prob)
    auc = roc_auc_score(y_test, prob)
    ax.plot(fpr, tpr, label=f"{name} (AUC = {auc:.3f})")

# Dashed diagonal as a visual reference line
ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Baseline Model Performance: ROC Comparison')
ax.legend()
plt.show()

# 4.2 Feature Importance (Comparison)
# Importances of each model, labelled with the feature names and sorted descending
importance_xgb = pd.Series(
    xgb_model.feature_importances_, index=X_train.columns
).sort_values(ascending=False)
importance_rf = pd.Series(
    rf_model.feature_importances_, index=X_train.columns
).sort_values(ascending=False)

# Side-by-side horizontal bar charts, one panel per model
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
panels = (
    (ax1, importance_xgb, 'XGBoost Feature Importance'),
    (ax2, importance_rf, 'Random Forest Feature Importance'),
)
for axis, importance, title in panels:
    sns.barplot(x=importance.values, y=importance.index, ax=axis, palette='Blues_r')
    axis.set_title(title)

plt.tight_layout()
plt.show()

# 5. Saving Results
# The "best" model is the one with the highest AUC in `results`, which holds
# test-set metrics.
# NOTE(review): X_val_scaled / y_val are created during preprocessing but are
# not consulted here; consider selecting on the validation split and keeping
# the test split for the final report only.
best_result = pd.DataFrame(results).set_index('model')['auc_roc'].idxmax()
print(f"‚≠ê Best Baseline Model: {best_result}")

# Save metrics: one JSON record per model
with open(f"{REPORTS_PATH}/baseline_comparison.json", "w") as f:
    json.dump(results, f, indent=4)

# Save core components
joblib.dump(xgb_model, f"{MODELS_PATH}/xgb_baseline.joblib")
joblib.dump(rf_model, f"{MODELS_PATH}/rf_baseline.joblib")
joblib.dump(lr_model, f"{MODELS_PATH}/lr_baseline.joblib")
joblib.dump(scaler, f"{MODELS_PATH}/scaler.joblib")
# The models were trained on label-encoded Month / VisitorType / Weekend
# values, so the fitted encoders are persisted next to the scaler; without
# them the saved models cannot be given consistently encoded inputs later.
joblib.dump(label_encoders, f"{MODELS_PATH}/label_encoders.joblib")