# Cell 1 : Setup and Data

In [1]:
import sys
import os
import joblib
from sklearn.model_selection import train_test_split

# 1. Build the bridge
sys.path.append(os.path.abspath('..'))

# 2. Import the NEW functions
from src.data_loader import load_raw_data
from src.preprocessing import clean_and_feature_engineer
from src.models import get_classification_models, run_model_tournament
from src.evaluation import evaluate_classification

# 3. Load, Clean, and SPLIT (This defines X_train)
df = clean_and_feature_engineer(load_raw_data())

features_num = ['Income', 'Age', 'Children', 'Recency', 'TotalSpending']
features_cat = ['Education', 'Marital_Status']
X = df[features_num + features_cat]
y = df['Response']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Bridge built and Data Split! X_train is now defined.")

Bridge built and Data Split! X_train is now defined.


# Step 2: The Tournament Cell

In [2]:
# 1. Initialize the Tournament
classification_tourney = get_classification_models(features_num, features_cat)

# 2. Run the competition
print("Running Tournament... please wait.")
results = run_model_tournament(classification_tourney, X_train, y_train, scoring='roc_auc')

print("--- Model Competition Results ---")
results

Running Tournament... please wait.
--- Model Competition Results ---


Unnamed: 0,Model,Mean Score,Std Dev
2,Gradient Boosting,0.805697,0.028259
1,Random Forest,0.798396,0.028952
0,Logistic,0.77595,0.035478


# Step 3: Training the Winner

In [3]:
# 1. Pick the winner from your tournament dictionary
best_model_pipeline = classification_tourney['Random Forest'] 

# 2. Train it on the full training set
best_model_pipeline.fit(X_train, y_train)

# 3. Predict and Evaluate
y_pred = best_model_pipeline.predict(X_test)
y_probs = best_model_pipeline.predict_proba(X_test)[:, 1]

evaluate_classification(y_test, y_pred, y_probs)

# 4. Save
joblib.dump(best_model_pipeline, '../models/best_classifier.joblib')
print("Winner trained and saved!")

--- Classification Report ---
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       376
           1       0.68      0.25      0.37        67

    accuracy                           0.87       443
   macro avg       0.78      0.62      0.65       443
weighted avg       0.85      0.87      0.84       443

ROC-AUC Score: 0.8297
Winner trained and saved!


In [4]:
# 1. Select the winner from the tournament dictionary
# Using the exact name from your table: 'Gradient Boosting'
best_clf_pipeline = classification_tourney['Gradient Boosting']

# 2. Train it on the full training set
best_clf_pipeline.fit(X_train, y_train)

# 3. Final Evaluation on the test set (the data it has never seen)
y_pred = best_clf_pipeline.predict(X_test)
y_probs = best_clf_pipeline.predict_proba(X_test)[:, 1]

print("--- FINAL CHAMPION EVALUATION ---")
evaluate_classification(y_test, y_pred, y_probs)

# 4. Save the professional pipeline
import joblib
os.makedirs('../models', exist_ok=True)
joblib.dump(best_clf_pipeline, '../models/best_classifier.joblib')

print("\nSuccess! The Gradient Boosting champion is saved to models/best_classifier.joblib")

--- FINAL CHAMPION EVALUATION ---
--- Classification Report ---
              precision    recall  f1-score   support

           0       0.87      0.96      0.91       376
           1       0.45      0.19      0.27        67

    accuracy                           0.84       443
   macro avg       0.66      0.58      0.59       443
weighted avg       0.81      0.84      0.81       443

ROC-AUC Score: 0.8124

Success! The Gradient Boosting champion is saved to models/best_classifier.joblib
