In [None]:
# verstack requirements.txt file:

# !pip install \
#     "numpy>=1.26.4,<=2.1.1" \
#     "pandas==2.2.2" \
#     "scikit-learn>=1.3.2,<=1.5.1" \
#     "lightgbm>=4.4.0,<=4.5.0" \
#     "optuna>=3.5.0,<=4.0.0" \
#     "optuna-integration>=3.2.0,<=4.0.0" \
#     "plotly>=5.11.0,<=5.24.0" \
#     "matplotlib==3.9.2" \
#     "seaborn==0.13.2" \
#     "python-dateutil==2.9.0" \
#     "holidays==0.56" \
#     "mlxtend==0.23.1" \
#     "category_encoders>=2.5.1,<=2.6.3" \
#     "verstack"

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_validate
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression # Baseline
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC 
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import joblib
from verstack import NaNImputer

### Load preprocessed data (ready for modeling)

In [None]:
# Preprocessed training split, hosted in the project's GitHub repository under
# the Milestone2_FeatureEng_AdvancedAnalysis/data directory.
TRAIN_PREPROCESSED_URL = (
    'https://raw.githubusercontent.com/MohamedMostafa259/'
    'Customer-Churn-Prediction-and-Analysis/refs/heads/main/'
    'Milestone2_FeatureEng_AdvancedAnalysis/data/train_split_preprocessed.csv'
)
train_preprocessed = pd.read_csv(TRAIN_PREPROCESSED_URL)
train_preprocessed.head()

In [None]:
# Number of missing values per column of the preprocessed training data.
train_preprocessed.isnull().sum()

In [None]:
# Separate the target column ('churn_risk_score') from the feature matrix.
y_train = train_preprocessed['churn_risk_score']
X_train_preprocessed = train_preprocessed.drop(columns=['churn_risk_score'])

### Selecting promising models

In [None]:
# Candidate classifiers, compared with (mostly) default hyperparameters.
# The seed is pinned explicitly on the learners that accept one so that the
# comparison is reproducible after a kernel restart; an unseeded
# RandomForestClassifier in particular gives different scores on every run.
# 42 is the same seed the hyperparameter-tuning estimators use.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),  # baseline
    'RandomForest': RandomForestClassifier(random_state=42),
    'SVC': SVC(),
    'XGBoost': XGBClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
    'LightGBM': LGBMClassifier(verbose=0, random_state=42)
}


# 4-fold cross-validation of every candidate model, printing the mean and
# standard deviation of the train and validation accuracy.
#
# The target labels in `y_train` start from 1 ([1, 2, 3, 4, 5]), but XGBoost
# expects them to start from 0 ([0, 1, 2, 3, 4]), so every model is trained on
# `y_train - 1`. Don't forget to add one back at prediction time, e.g.
# `y_pred = xgb_clf.predict(X_test_preprocessed) + 1`.
for name, model in models.items():
    print(f"CV on {name}")
    cv_results = cross_validate(
        model,
        X_train_preprocessed,
        y_train - 1,
        scoring='accuracy',
        cv=4,
        return_train_score=True,
    )
    for label, key in (('Train', 'train_score'), ('Validation', 'test_score')):
        fold_scores = cv_results[key]
        print(f"{label} Accuracy: {fold_scores.mean():.4f} (+/- {fold_scores.std():.4f})")
    print('-' * 30)


It's clear that XGBoost, CatBoost, and LightGBM are the best models, so we will take them to the next step: hyperparameter tuning.

### Hyperparameter Tuning

In [None]:
# Single-step pipeline. The LogisticRegression is only a placeholder: the
# search replaces the 'model' step with each candidate estimator listed under
# the 'model' key of the parameter grid.
pipe = Pipeline(steps=[('model', LogisticRegression())])

# One sub-grid per candidate model. The 'model' key swaps the pipeline's
# 'model' step; the remaining 'model__*' keys are forwarded to that estimator.
# All three sub-grids explore the same four knobs (number of trees, learning
# rate, tree depth, row subsampling) under each library's own parameter names.
param_grid = [
    {
        'model': [XGBClassifier(verbosity=0, random_state=42)],
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.01, 0.1],
        'model__max_depth': [6, 10, 13],
        'model__subsample': [0.5, 0.75, 1],
    },
    {
        # `bootstrap_type='Bernoulli'` is required for `subsample` parameter to work
        'model': [CatBoostClassifier(verbose=0, random_state=42, bootstrap_type='Bernoulli')],
        'model__iterations': [100, 200],  # CatBoost uses iterations instead of n_estimators
        'model__learning_rate': [0.01, 0.1],
        'model__depth': [6, 10, 13],  # CatBoost uses depth instead of max_depth
        'model__subsample': [0.5, 0.75, 1],
    },
    {
        # LightGBM only applies `subsample` (bagging_fraction) when
        # `subsample_freq` (bagging_freq) is > 0; with the default of 0 the
        # three `subsample` values below would all train the same model.
        'model': [LGBMClassifier(verbose=-1, random_state=42, subsample_freq=1)],
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.01, 0.1],
        'model__max_depth': [6, 10, 13],
        'model__subsample': [0.5, 0.75, 1],
    }
]

# Randomized search: 60 parameter settings sampled from `param_grid`, each
# scored by 4-fold cross-validated accuracy. `error_score='raise'` makes a
# failing fit abort the search instead of being recorded as a NaN score.
grid = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_grid,
    n_iter=60,
    scoring='accuracy',
    cv=4,
    return_train_score=True,
    error_score='raise',
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

# Labels are shifted by one so that they start from 0.
grid.fit(X_train_preprocessed, y_train - 1)

In [None]:
# Parameter setting (including the estimator chosen for the 'model' step)
# that achieved the best mean cross-validated accuracy in the search.
grid.best_params_

In [None]:
# Mean cross-validated accuracy of the best parameter setting.
grid.best_score_

In [None]:
# Pipeline configured with the best parameter setting found by the search
# (refit on the whole training data, since `refit` is left at its default).
best_model = grid.best_estimator_
best_model

In [None]:
# Tabulate every sampled candidate, best mean validation accuracy first.
cv_results_df = pd.DataFrame(grid.cv_results_).sort_values(by='mean_test_score', ascending=False)
print(cv_results_df.shape)

# Score summary columns followed by the searched parameters.
score_cols = [
    "rank_test_score", "mean_test_score", "std_test_score",
    "mean_train_score", "std_train_score",
]
param_cols = [
    "param_model", "param_model__subsample", "param_model__n_estimators",
    "param_model__max_depth", "param_model__learning_rate",
]
cols_to_show = score_cols + param_cols
cv_results_df[cols_to_show].head(10)

In [None]:
# Sampled candidates with subsample=0.75, n_estimators=100, max_depth=6 and
# learning_rate=0.01 (the filter does not restrict the model type).
has_subsample = cv_results_df['param_model__subsample'] == 0.750
has_n_estimators = cv_results_df['param_model__n_estimators'] == 100
has_max_depth = cv_results_df['param_model__max_depth'] == 6
has_learning_rate = cv_results_df['param_model__learning_rate'] == 0.01
cv_results_df[has_subsample & has_n_estimators & has_max_depth & has_learning_rate]

The best model was `XGBClassifier` with these parameters:
- 'subsample': 0.75,
- 'n_estimators': 200,
- 'max_depth': 6,
- 'learning_rate': 0.01

Let's see how the same model performs with 'n_estimators'=100.

In [None]:
# Same configuration as the best XGBoost candidate, but with 100 estimators.
xgb_clf = XGBClassifier(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=6,
    subsample=0.75,
    random_state=42,
    verbosity=0,
)

# 4-fold CV accuracy on the zero-based labels, keeping the train scores too.
xgb_cv_results = cross_validate(
    xgb_clf,
    X_train_preprocessed,
    y_train - 1,
    scoring='accuracy',
    cv=4,
    return_train_score=True,
)

In [None]:
# Report the cross-validated accuracy of the 100-estimator XGBoost model.
# Both the mean and the spread must come from `xgb_cv_results`; `cv_results`
# is a leftover from the model-comparison loop and holds the fold scores of
# the last model evaluated there, not of this XGBoost run.
xgb_train_scores = xgb_cv_results['train_score']
xgb_val_scores = xgb_cv_results['test_score']
print(f"XGB Train Accuracy: {xgb_train_scores.mean():.4f} (+/- {xgb_train_scores.std():.4f})")
print(f"XGB Validation Accuracy: {xgb_val_scores.mean():.4f} (+/- {xgb_val_scores.std():.4f})")

# For comparison: mean train/validation accuracy of the top-ranked candidate
# from the randomized search (first row after sorting by mean_test_score).
print('\nBest model from cross validation:')
cv_results_df[['mean_train_score', 'mean_test_score']].iloc[0]

It seems that the best model from the randomized search above, XGBClassifier, performs just as well (or better) with fewer estimators: n_estimators=100!

So, let's stick with this simpler version.

#### Merging the selected ml model with the previous pipelines & transformers into one pipeline


In [None]:
# `DataCleaner` and `NaNImputerWrapper` are needed for `cleaning_pipeline`
from custom_transformers import DataCleaner, NaNImputerWrapper, FeatureEng

In [None]:
# Cleaned train/validation splits, hosted in the project's GitHub repository
# under the Milestone1_DataCollection_EDA_DataCleaning/data directory.
CLEANED_DATA_URL = (
    'https://raw.githubusercontent.com/MohamedMostafa259/'
    'Customer-Churn-Prediction-and-Analysis/main/'
    'Milestone1_DataCollection_EDA_DataCleaning/data/'
)
train_split_cleaned = pd.read_csv(CLEANED_DATA_URL + 'train_split_cleaned.csv')
val_split_cleaned = pd.read_csv(CLEANED_DATA_URL + 'validation_split_cleaned.csv')

In [None]:
# Separate the target column from the cleaned training features.
y_train_split = train_split_cleaned['churn_risk_score']
X_train_split = train_split_cleaned.drop(columns=['churn_risk_score'])

In [None]:
# Separate the target column from the cleaned validation features.
y_val_split = val_split_cleaned['churn_risk_score']
X_val_split = val_split_cleaned.drop(columns=['churn_risk_score'])

In [None]:
# Saved cleaning pipeline from Milestone 1. The path is relative to the
# notebook's working directory: each `../` goes up one directory.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
cleaning_pipeline_path = "../../Milestone1_DataCollection_EDA_DataCleaning/pipelines/cleaning_pipeline.joblib"
cleaning_pipeline = joblib.load(cleaning_pipeline_path)
cleaning_pipeline

In [None]:
# End-to-end pipeline: data cleaning -> feature engineering -> XGBoost model.
full_pipeline_steps = [
    ('cleaning_pipeline', cleaning_pipeline),
    ('feature_engineering', FeatureEng()),
    ('model', xgb_clf),
]
pipeline = Pipeline(steps=full_pipeline_steps)

# Train on zero-based labels (the original labels start from 1).
pipeline.fit(X_train_split, y_train_split - 1)

In [None]:
# The model was trained on labels shifted down by one, so shift its
# predictions back up to the original label range.
train_pred_zero_based = pipeline.predict(X_train_split)
y_train_pred = train_pred_zero_based + 1

In [None]:
# Accuracy and per-class precision/recall/F1 on the training split.
train_accuracy = accuracy_score(y_train_split, y_train_pred)
train_report = classification_report(y_train_split, y_train_pred)
print('train_accuracy:', train_accuracy, '\n')
print('train_classification_report:\n', train_report, '\n')

In [None]:
# The model was trained on labels shifted down by one, so shift its
# predictions back up to the original label range.
val_pred_zero_based = pipeline.predict(X_val_split)
y_val_pred = val_pred_zero_based + 1

In [None]:
# Accuracy and per-class precision/recall/F1 on the held-out validation split.
val_accuracy = accuracy_score(y_val_split, y_val_pred)
val_report = classification_report(y_val_split, y_val_pred)
print('test_accuracy:', val_accuracy, '\n')
print('test_classification_report:\n', val_report, '\n')

### Summary of the model results

* **Overall test accuracy**: **0.788** → The model correctly predicts the class **\~79% of the time** across the entire test set, which is higher than the score of the first-place winner of the competition held on this dataset! [He built a model with **77%** accuracy.](https://www.hackerearth.com/challenges/new/competitive/hackerearth-machine-learning-challenge-predict-customer-churn/#:~:text=Machine%20Learning%20practice-,Winners,-Adarsh%20Wase)

* **F1-score (macro avg) = F1-score (weighted avg) = 0.77** → The model performs well overall despite imbalanced data.

---

**One Limitation of the model:**

It has low recall (0.43) on class 4, leading to a low f1-score (0.57) on that class as well. 

As a result, many real class 4s are getting misclassified. 

**Solution** may be:

* More examples 

* Better features focusing on separating class 4

* Lowering the decision threshold for predicting class 4

### Saving the final ML pipeline

In [None]:
# Persist the fitted end-to-end pipeline.
# On Kaggle the artifact goes to /kaggle/working (unchanged behavior). That
# directory does not exist elsewhere, so fall back to the current working
# directory instead of failing with FileNotFoundError.
from pathlib import Path

output_dir = Path('/kaggle/working')
if not output_dir.is_dir():
    output_dir = Path.cwd()
joblib.dump(pipeline, output_dir / 'full_xgb_pipeline.joblib')