## 1. Importing the dependencies

In [None]:
import pickle
from pathlib import Path

import numpy as np
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBRFClassifier

In [None]:
# Names of the pickled training splits; each one lives in ../data/<name>.pkl
names = ["X_train", "y_train", "X_train_smote", "y_train_smote"]


def _read_pickle(name):
    """Return the object stored in ../data/<name>.pkl."""
    # NOTE: pickle.load can execute arbitrary code during deserialization;
    # only point this at files produced by this project.
    with open(f"../data/{name}.pkl", "rb") as f:
        return pickle.load(f)


# Map every split name to its deserialized object
loaded_data = {name: _read_pickle(name) for name in names}

# Expose each split under its own variable name
X_train, y_train = loaded_data["X_train"], loaded_data["y_train"]
X_train_smote, y_train_smote = loaded_data["X_train_smote"], loaded_data["y_train_smote"]

## 2. Model Selection

In [8]:
# Candidate classifiers to compare, keyed by the label used when reporting results.
# Each estimator is given the same random_state so repeated runs are comparable.
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    # XGBRFClassifier is XGBoost's random-forest estimator, not the gradient-boosted
    # XGBClassifier, so the label names it explicitly instead of plain "XGBoost".
    "XGBoost Random Forest": XGBRFClassifier(random_state=42),
}

In [9]:
# Per-model array of fold accuracies, keyed by the model's label
cv_scores = {}

# Score every candidate with 5-fold cross-validation on the SMOTE training set.
# NOTE(review): the data appears to be oversampled before the folds are split, so
# validation folds may contain synthetic samples and these accuracies may be
# optimistic -- confirm against how X_train_smote was produced.
for model_name, model in models.items():
    print(f"Training {model_name} with default parameters")
    scores = cross_val_score(
        model,
        X_train_smote,
        y_train_smote,
        cv=5,
        scoring="accuracy",
    )
    cv_scores[model_name] = scores
    # Report the mean accuracy across the five folds
    print(f"{model_name} cross-validation accuracy:{np.mean(scores):.2f}")
    print("-" * 70)

Training Decision Tree with default parameters
Decision Tree cross-validation accuracy:0.79
----------------------------------------------------------------------
Training Random Forest with default parameters
Random Forest cross-validation accuracy:0.84
----------------------------------------------------------------------
Training XGBoost with default parameters
XGBoost cross-validation accuracy:0.82
----------------------------------------------------------------------


With default parameters, Random Forest achieves the highest mean cross-validation accuracy (0.84) of the three models, so it is the model tuned in the next section.

## 3. Hyperparameter Tuning

In [None]:
def objective(trial):
    """Score one Random Forest configuration proposed by an Optuna trial.

    Parameters
    ----------
    trial
        Object exposing ``suggest_int`` and ``suggest_categorical``; it is asked
        for ``n_estimators``, ``max_depth``, ``min_samples_split``,
        ``min_samples_leaf`` and ``max_features``, in that order.

    Returns
    -------
    Mean accuracy over 5-fold cross-validation on ``X_train_smote`` /
    ``y_train_smote`` (read from the notebook's global scope).
    """
    # Dict literals evaluate top to bottom, so the suggestions are requested
    # in the order listed here.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
    }

    # Fixed seed keeps the score of a given configuration repeatable;
    # n_jobs=-1 asks the forest to use all available cores.
    candidate = RandomForestClassifier(**search_space, random_state=42, n_jobs=-1)

    # Evaluate the candidate on the SMOTE-processed training data
    fold_accuracies = cross_val_score(
        candidate, X_train_smote, y_train_smote, cv=5, scoring='accuracy'
    )
    return np.mean(fold_accuracies)


In [17]:
# Seed the sampler so the search proposes the same sequence of trials on every run;
# with an unseeded sampler the reported "best" hyperparameters change each time the
# notebook is re-executed.
study = optuna.create_study(
    direction='maximize',  # the objective returns accuracy, so higher is better
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)  # Try 50 sets of hyperparameters

print("Best hyperparameters:", study.best_params)
print("Best CV accuracy:", study.best_value)


[I 2025-09-17 14:02:43,572] A new study created in memory with name: no-name-3394ef2e-2858-42df-b198-94ba35973a2d
[I 2025-09-17 14:02:44,744] Trial 0 finished with value: 0.8060733102733628 and parameters: {'n_estimators': 124, 'max_depth': 4, 'min_samples_split': 11, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.8060733102733628.
[I 2025-09-17 14:02:47,584] Trial 1 finished with value: 0.8389474874848577 and parameters: {'n_estimators': 250, 'max_depth': 16, 'min_samples_split': 18, 'min_samples_leaf': 6, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.8389474874848577.
[I 2025-09-17 14:02:48,776] Trial 2 finished with value: 0.8159828947560459 and parameters: {'n_estimators': 122, 'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 7, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.8389474874848577.
[I 2025-09-17 14:02:49,463] Trial 3 finished with value: 0.8295177109330532 and parameters: {'n_estimators': 59, 'max_depth': 7, 'min

Best hyperparameters: {'n_estimators': 206, 'max_depth': 11, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Best CV accuracy: 0.8468010128873125


In [18]:
# Hyperparameters of the best trial found by the study
best_params = study.best_params

# Rebuild the forest with those hyperparameters, keeping the same seed and
# parallelism settings that the objective used, then train it on the
# SMOTE training set. The fit call is the last expression so the fitted
# estimator is displayed as the cell output.
best_model = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    **best_params,
)
best_model.fit(X_train_smote, y_train_smote)

0,1,2
,n_estimators,206
,criterion,'gini'
,max_depth,11
,min_samples_split,3
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [22]:
# Bundle the fitted model with the column names of the frame it was trained on,
# so whoever loads the pickle can line input columns up with the model.
model_data = {"model":best_model,"features_names":X_train_smote.columns.tolist()}

model_path = Path("../models/customer_churn_model.pkl")
# Create ../models when it does not exist yet (e.g. on a fresh checkout);
# opening the file for writing would otherwise raise FileNotFoundError.
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open("wb") as f:
    pickle.dump(model_data,f)