# Cell 1: Setup & Data

In [1]:
import sys
import os
import joblib
from sklearn.model_selection import train_test_split

# 1. Build the bridge: put the parent of the current working directory on
#    sys.path so the project's `src` package can be imported from here.
#    The membership check keeps this cell idempotent -- re-running it does not
#    append a duplicate entry each time.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

# 2. Import the project helpers (must run after the sys.path update above)
from src.data_loader import load_raw_data
from src.preprocessing import clean_and_feature_engineer
from src.models import get_regression_models, run_model_tournament
from src.evaluation import evaluate_regression

# 3. Load the raw data and pass it through the cleaning / feature-engineering step
df = clean_and_feature_engineer(load_raw_data())

# Define features: numeric and categorical column names are kept in separate
# lists because get_regression_models() takes them as two arguments later on.
features_num = ['Income', 'Age', 'Children', 'Recency']
features_cat = ['Education', 'Marital_Status']
X = df[features_num + features_cat]
y = df['TotalSpending']  # regression target

# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Setup complete! X_train and X_test are ready.")

Setup complete! X_train and X_test are ready.


# Cell 2: Run the Regression Model Tournament

In [2]:
# Build the collection of candidate regressors from the numeric and
# categorical feature lists defined in the setup cell.
regression_tourney = get_regression_models(
    features_num,
    features_cat,
)

# Score every candidate on the training split, using R2 as the metric.
results = run_model_tournament(
    regression_tourney,
    X_train,
    y_train,
    scoring='r2',
)

print("--- Regression Model Competition ---")
# Bare last expression so the notebook renders the results table.
results

--- Regression Model Competition ---


Unnamed: 0,Model,Mean Score,Std Dev
2,Random Forest,0.754036,0.032446
1,Ridge,0.689226,0.030705
0,Linear,0.689217,0.030753


# Cell 3: Train and Save the Winning Regressor

In [3]:
# 1. Pick the winner: take the model with the highest mean score from the
#    tournament table rather than hard-coding its name, so this cell stays
#    correct if the ranking changes after a data or model update.
#    (Relies on the 'Model' and 'Mean Score' columns of `results`; higher is
#    better for the R2 scoring used in the tournament.)
winner_name = results.sort_values('Mean Score', ascending=False).iloc[0]['Model']
best_reg_pipeline = regression_tourney[winner_name]

# 2. Train on full training set
best_reg_pipeline.fit(X_train, y_train)

# 3. Evaluate on the held-out test split
y_pred = best_reg_pipeline.predict(X_test)
evaluate_regression(y_test, y_pred)

# 4. Save -- create the target directory first so joblib.dump does not fail
#    on a fresh checkout where '../models' does not exist yet.
model_path = '../models/best_regressor.joblib'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(best_reg_pipeline, model_path)
print(f"Success! {winner_name} saved as the champion regressor.")

R² Score: 0.7779
Mean Absolute Error: $198.83
Root Mean Squared Error: $293.08
Success! Random Forest saved as the champion regressor.
