## CUSTOMER CHURN PREDICTIVE MODEL

In [None]:
# importing the dependencies
!pip install catboost
!pip install xgboost
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer

# modeling
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score # For evaluation
from sklearn.linear_model import LogisticRegression # For linear classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from catboost import CatBoostRegressor # parameters for classification
from xgboost import XGBClassifier # Convenience class for XGBoost classification
import warnings 
import logging

In [None]:
# Read the churn CSV (path is relative to the working directory) into a
# DataFrame and display it as the cell output.
df = pd.read_csv(filepath_or_buffer='02 churn-dataset.csv')
df

### Data Cleaning

In [None]:
# Count the missing (NaN/None) entries in every column and express each count
# as a percentage of the total number of rows.
# NOTE(review): this only detects real nulls; blank or whitespace-only strings
# in text columns would not be counted here - worth confirming separately.
missing_values = df.isna().sum()
missing_value_percentage = missing_values.div(len(df)).mul(100)
print(missing_values)
print(missing_value_percentage)
df.shape

### Splitting the Dataset

In [None]:
# Feature frame: everything except 'Churn' (used as the target further down)
# and 'customerID'.
x = df.drop(columns=['Churn', 'customerID'])
x

In [None]:
# Show the names of the feature columns left in `x` after 'Churn' and
# 'customerID' were dropped.
x.columns

In [None]:
# Print the distinct values found in each of the columns inspected as
# categorical. One loop replaces sixteen copy-pasted print pairs; the printed
# output is unchanged, and adding a column only means extending this list.
columns_to_inspect = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

for column_name in columns_to_inspect:
    # end=" " keeps the label and the values on the same output line.
    print(f"The categories in '{column_name}' variables;.......", end=" ")
    print(df[column_name].unique())

In [None]:
# Target vector: the 'Churn' column of the full dataset, displayed as the
# cell output.
y = df.loc[:, 'Churn']
y

### Creating Column Transformer

In [None]:
# Preprocessing pipeline step: object-typed feature columns are one-hot
# encoded, all remaining columns are standardised.
cat_features = x.select_dtypes(include="object").columns
num_features = x.select_dtypes(exclude="object").columns

# handle_unknown='ignore' lets the encoder transform categories it did not
# see while fitting instead of raising.
categorical_transforemer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", categorical_transforemer, cat_features),
        ("StandardScaler", numerical_transformer, num_features),
    ]
)


In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible. The cell output shows the shapes of both feature frames.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
(x_train.shape, x_test.shape)

In [None]:
# Learn the encoding/scaling from the training rows only, then apply the
# fitted transformer unchanged to the test rows.
x_train_processed = preprocessor.fit_transform(x_train)
x_test_processed = preprocessor.transform(x_test)

# Encode the target labels as integers; the mapping is learned from the
# training labels and reused for the test labels.
le = LabelEncoder()
y_train_encoded = le.fit(y_train).transform(y_train)
y_test_encoded = le.transform(y_test)

### Hyperparameter Tuning

In [None]:
import os
import tempfile

print(os.getcwd())

# Directory handed to CatBoost as `train_dir`. The original Windows location is
# kept on Windows. On other systems that string is not an absolute path and
# would create a folder literally named "C:\CatBoost_Temp" inside the working
# directory, so the system temp directory is used there instead.
if os.name == "nt":
    CATBOOST_LOG_DIR = "C:\\CatBoost_Temp"
else:
    CATBOOST_LOG_DIR = os.path.join(tempfile.gettempdir(), "CatBoost_Temp")

if not os.path.exists(CATBOOST_LOG_DIR):
    os.makedirs(CATBOOST_LOG_DIR)
    print(f"Created CatBoost log directory: {CATBOOST_LOG_DIR}")
else:
    print(f"CatBoost log directory already exists: {CATBOOST_LOG_DIR}")

# AdaBoost's weak-learner argument is called `estimator` in current
# scikit-learn and `base_estimator` in older releases; use whichever name the
# installed version exposes so the grid below is valid either way.
_ADABOOST_WEAK_LEARNER_PARAM = (
    'estimator'
    if 'estimator' in AdaBoostClassifier().get_params()
    else 'base_estimator'
)

# Hyperparameter grids, keyed by EXACTLY the same names as `models` below.
# The tuning loop looks grids up with `params.get(name, {})`, so a key that
# does not match a model name silently disables tuning for that model.
#
# NOTE: several grids are large (the XGBoost grid alone has 4**8 = 65,536
# combinations), so an exhaustive search over all of them is very expensive.
params = {
    # 'elasticnet' is only valid together with `l1_ratio`, so it gets its own
    # sub-grid. The unpenalised option is left out: its spelling differs
    # between scikit-learn versions ('none' vs None), and C=100 already covers
    # the nearly unregularised case.
    "Logistic Regression": [
        {
            'penalty': ['l1', 'l2'],
            'C': [0.001, 0.01, 0.1, 1, 10, 100],
            'solver': ['saga'],  # 'saga' supports 'l1', 'l2' and 'elasticnet'
            'max_iter': [1000],
        },
        {
            'penalty': ['elasticnet'],
            'l1_ratio': [0.25, 0.5, 0.75],
            'C': [0.001, 0.01, 0.1, 1, 10, 100],
            'solver': ['saga'],
            'max_iter': [1000],
        },
    ],

    "K-Neighbors Classifier": {
        'n_neighbors': [3, 5, 7, 9, 11, 15],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    },

    "Decision Tree Classifier": {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 5, 10, 15, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None],
    },

    "Random Forest Classifier": {
        'n_estimators': [100, 200, 300],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [True, False],
    },

    "AdaBoost Classifier": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.5, 1.0],
        # Shallow trees as weak learners.
        _ADABOOST_WEAK_LEARNER_PARAM: [
            DecisionTreeClassifier(max_depth=1),
            DecisionTreeClassifier(max_depth=2),
        ],
    },

    "Gradient Boosting Classifier": {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7, 9],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'subsample': [0.7, 0.8, 0.9, 1.0],
    },

    # `degree` is not searched; it would only matter for kernel='poly'.
    "Support Vector Classifier": {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': ['scale', 'auto', 0.1, 1],
    },

    "CatBoost Classifier": {
        # 'Logloss' for binary classification ('MultiClass' for multi-class).
        'loss_function': ['Logloss'],
        'iterations': [100, 200, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'depth': [4, 6, 8, 10],
        'l2_leaf_reg': [1, 3, 5, 7],
    },

    "XGB Classifier": {
        # 'binary:logistic' for binary classification
        # ('multi:softmax' for multi-class).
        'objective': ['binary:logistic'],
        'n_estimators': [100, 200, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7, 9],
        'subsample': [0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
        'gamma': [0, 0.1, 0.2, 0.3],
        'reg_alpha': [0, 0.005, 0.01, 0.1],
        'reg_lambda': [0, 0.005, 0.01, 0.1],
    },
}

# Candidate estimators; seeds are fixed wherever the estimator accepts one.
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "AdaBoost Classifier": AdaBoostClassifier(random_state=42),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42),
    "Support Vector Classifier": SVC(random_state=42),
    "CatBoost Classifier": CatBoostClassifier(random_state=42, train_dir=CATBOOST_LOG_DIR),
    "XGB Classifier": XGBClassifier(random_state=42),
}

# Fail fast if the two dictionaries drift apart again: a grid without a model
# is dead configuration, and a model without a grid would go untuned.
_mismatched_names = sorted(set(params) ^ set(models))
if _mismatched_names:
    raise KeyError(
        f"`params` and `models` must use identical keys; mismatched: {_mismatched_names}"
    )

# Filled by the tuning loop: model name -> score on the test set.
model_report = {}

In [None]:
from sklearn.model_selection import ParameterGrid

CLASSIFICATION_SCORING_METRIC = 'accuracy'

# Largest number of hyperparameter combinations tried exhaustively per model.
# Grids with more combinations are sampled at random (this many candidates,
# fixed seed) instead. Set to None to always search exhaustively.
MAX_SEARCH_CANDIDATES = 50


def _score_on_test_set(estimator, y_hat, metric):
    """Score a fitted estimator on the held-out test set.

    `y_hat` are the estimator's hard class predictions for `x_test_processed`.
    `metric` is 'accuracy', 'f1_weighted' or 'roc_auc'; any other value falls
    back to accuracy.
    """
    if metric == 'f1_weighted':
        return f1_score(y_test_encoded, y_hat, average='weighted')
    if metric == 'roc_auc':
        # ROC AUC ranks samples, so feed it continuous scores when the
        # estimator can produce them; hard labels are only the last resort.
        if hasattr(estimator, 'predict_proba'):
            ranking_scores = estimator.predict_proba(x_test_processed)[:, 1]
        elif hasattr(estimator, 'decision_function'):
            ranking_scores = estimator.decision_function(x_test_processed)
        else:
            ranking_scores = y_hat
        return roc_auc_score(y_test_encoded, ranking_scores)
    return accuracy_score(y_test_encoded, y_hat)


best_estimators_per_model = {}
for name, model in models.items():
    print(f"--- Tuning hyperparameters for {name} ---")

    # An absent grid means the model is fitted with its defaults only; say so
    # instead of silently pretending it was tuned.
    if name not in params:
        logging.warning(
            "No hyperparameter grid found in `params` for %r; "
            "evaluating it with default settings only.",
            name,
        )
    param_grid_for_model = params.get(name, {})

    shared_search_options = dict(
        estimator=model,
        cv=3,
        scoring=CLASSIFICATION_SCORING_METRIC,
        n_jobs=-1,
        verbose=1,
        error_score='raise',
    )
    n_combinations = len(ParameterGrid(param_grid_for_model))
    if MAX_SEARCH_CANDIDATES is not None and n_combinations > MAX_SEARCH_CANDIDATES:
        print(
            f"{n_combinations} combinations in the grid; "
            f"sampling {MAX_SEARCH_CANDIDATES} of them at random."
        )
        grid_search = RandomizedSearchCV(
            param_distributions=param_grid_for_model,
            n_iter=MAX_SEARCH_CANDIDATES,
            random_state=42,
            **shared_search_options,
        )
    else:
        grid_search = GridSearchCV(
            param_grid=param_grid_for_model,
            **shared_search_options,
        )

    grid_search.fit(x_train_processed, y_train_encoded)

    best_model_for_fold = grid_search.best_estimator_
    best_estimators_per_model[name] = best_model_for_fold  # Store the best estimator

    y_pred = best_model_for_fold.predict(x_test_processed)
    score = _score_on_test_set(best_model_for_fold, y_pred, CLASSIFICATION_SCORING_METRIC)

    model_report[name] = score
    print(f"{name} best {CLASSIFICATION_SCORING_METRIC} on test set: {score:.4f}")

# NOTE(review): the winner is chosen by its test-set score, so the reported
# figure is an optimistic estimate; selecting on `grid_search.best_score_`
# (cross-validation) and using the test set only once would be stricter.
# max() returns the first model with the highest score, as before.
best_model_name = max(model_report, key=model_report.get)
best_model_score = model_report[best_model_name]

print(f"\nBest Model: {best_model_name}, {CLASSIFICATION_SCORING_METRIC}: {best_model_score:.4f}")

if best_model_score < 0.6:
    logging.warning(f"No significantly good model found. Best {CLASSIFICATION_SCORING_METRIC} is {best_model_score:.4f}.")
else:
    print(f"\nFinal evaluation of the overall best model ({best_model_name})...")
    final_best_model = best_estimators_per_model[best_model_name]

    predicted = final_best_model.predict(x_test_processed)
    final_score = _score_on_test_set(final_best_model, predicted, CLASSIFICATION_SCORING_METRIC)

    print(f"\nFinal {CLASSIFICATION_SCORING_METRIC} of the best model ({best_model_name}) on the test set: {final_score:.4f}")

### Difference between Actual and Predicted values

In [None]:
# Compare the true test labels with the predictions of the selected best
# model. `y_pred` cannot be used for this: after the tuning loop it holds the
# predictions of whichever model was processed last, not of the best one.
# ravel() flattens the result in case an estimator returns a column vector.
best_model_predictions = np.asarray(
    best_estimators_per_model[best_model_name].predict(x_test_processed)
).ravel()

pred_df = pd.DataFrame({
    'Actual Value': y_test_encoded,
    'Predicted Value': best_model_predictions,
    # 0 = correct; non-zero = misclassified (labels are integer-encoded).
    'Difference': y_test_encoded - best_model_predictions,
})
pred_df