# Model Training & Evaluation
This notebook:
- Builds ML pipelines with preprocessing + SMOTE
- Trains multiple classifiers
- Compares performance using ROC-AUC and classification metrics

In [1]:
from pathlib import Path

import joblib
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC

## Load the train/test split

In [2]:
# Feature matrices stay as DataFrames: the preprocessing step selects columns by name.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/X_train.csv', '../data/X_test.csv')
)

# Targets are flattened to 1-D NumPy arrays via .values.ravel().
y_train, y_test = (
    pd.read_csv(csv_path).values.ravel()
    for csv_path in ('../data/y_train.csv', '../data/y_test.csv')
)

## Define feature groups

In [3]:
# Columns that are standardised by the preprocessing step.
numeric_features = [
    "tenure",
    "MonthlyCharges",
    "TotalCharges",
]

# Columns one-hot encoded with drop="if_binary" by the preprocessing step.
binary_features = [
    "gender",
    "Partner",
    "Dependents",
    "PhoneService",
    "PaperlessBilling",
    "MultipleLines",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
]

# Columns one-hot encoded with drop="first" by the preprocessing step.
categorical_features = [
    "InternetService",
    "Contract",
    "PaymentMethod",
]

## Preprocessing pipeline

In [4]:
# (name, transformer, columns) triples consumed by ColumnTransformer:
#   - "num": standardise the numeric columns
#   - "bin": one-hot encode, dropping one column only for two-category features
#   - "cat": one-hot encode, always dropping the first category
column_transformers = [
    ("num", StandardScaler(), numeric_features),
    ("bin", OneHotEncoder(drop="if_binary"), binary_features),
    ("cat", OneHotEncoder(drop="first"), categorical_features),
]

preprocessor = ColumnTransformer(transformers=column_transformers)

## Train & evaluate models

In [5]:
# Candidate classifiers; each is seeded so repeated runs give the same results.
# probability=True makes the SVC expose predict_proba.
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Support Vector Machine': SVC(probability=True, random_state=42)
}

# One (model name, mean CV ROC-AUC, std of CV ROC-AUC) tuple per model.
results = []
for name, model in models.items():

    # Pipeline.fit fits its steps in place, so every pipeline gets its own
    # copies. The shared `preprocessor` and the estimators in `models` then
    # stay unfitted templates instead of being refit on each iteration.
    # clone() copies hyper-parameters (including random_state), so the scores
    # are unaffected.
    pipeline = Pipeline(steps=[
        ("preprocessing", clone(preprocessor)),
        # SMOTE is a pipeline step so that oversampling is applied when the
        # pipeline is fitted, i.e. per training fold during cross-validation.
        ("smote", SMOTE(random_state=42)),
        ("model", clone(model))
    ])

    # 5-fold cross-validated ROC-AUC, computed on the training data only.
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='roc_auc')
    results.append((name, cv_scores.mean(), cv_scores.std()))

    # Refit on the full training set and report on the held-out test set.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    print(f"\n{name}")
    print(classification_report(y_test, y_pred))


Logistic Regression
              precision    recall  f1-score   support

           0       0.90      0.71      0.80      1033
           1       0.50      0.78      0.61       374

    accuracy                           0.73      1407
   macro avg       0.70      0.75      0.70      1407
weighted avg       0.79      0.73      0.75      1407


Random Forest
              precision    recall  f1-score   support

           0       0.85      0.84      0.85      1033
           1       0.58      0.60      0.59       374

    accuracy                           0.78      1407
   macro avg       0.72      0.72      0.72      1407
weighted avg       0.78      0.78      0.78      1407


Gradient Boosting
              precision    recall  f1-score   support

           0       0.88      0.79      0.83      1033
           1       0.55      0.71      0.62       374

    accuracy                           0.77      1407
   macro avg       0.72      0.75      0.73      1407
weighted avg       

## Model comparison

In [6]:
# Tabulate the cross-validation results and rank the models, best mean ROC-AUC first.
results_df = (
    pd.DataFrame(
        results,
        columns=["Model", "CV ROC-AUC Mean", "CV ROC-AUC Std"],
    )
    .sort_values(by="CV ROC-AUC Mean", ascending=False)
)
print(results_df)

                    Model  CV ROC-AUC Mean  CV ROC-AUC Std
0     Logistic Regression         0.844847        0.019098
2       Gradient Boosting         0.844313        0.017230
3  Support Vector Machine         0.829299        0.020242
1           Random Forest         0.823887        0.013623


## Save best model pipeline (Logistic Regression)

In [7]:
# Final pipeline: same preprocessing + SMOTE steps as in the comparison loop,
# with Logistic Regression as the classifier. The preprocessor is cloned so the
# saved pipeline owns its own fitted transformer and is not altered if the
# shared `preprocessor` object is fitted again later in the session.
best_pipeline = Pipeline(steps=[
    ("preprocessing", clone(preprocessor)),
    ("smote", SMOTE(random_state=42)),
    ("model", LogisticRegression(random_state=42))
])

best_pipeline.fit(X_train, y_train)

# joblib.dump does not create missing directories, so make sure the target
# folder exists first (e.g. on a fresh checkout). The path is passed to
# joblib.dump as the original string so the cell output is unchanged.
model_path = "../models/base_pipeline.pkl"
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_pipeline, model_path)

['../models/base_pipeline.pkl']