In [191]:
import os
print(os.getcwd())
os.chdir('d:\\vscode_machineLearning\\internship\\Customer-Churn-Prediction')
print(os.getcwd())

d:\vscode_machineLearning\internship\Customer-Churn-Prediction
d:\vscode_machineLearning\internship\Customer-Churn-Prediction


In [192]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [193]:
df = pd.read_csv(f'artifacts/raw_data/customer_churn_removed_col.csv')

In [194]:
df.isnull().sum()

Age                           0
Gender                        0
Location                      0
Subscription_Length_Months    0
Monthly_Bill                  0
Total_Usage_GB                0
Churn                         0
dtype: int64

## Transformation

In [195]:
df['Gender']=df['Gender'].replace({'Male':0,'Female':1})

In [196]:
X = df.drop(columns='Churn')
y = df['Churn']

In [197]:
cat_cols =[feature for feature in df.columns if df[feature].dtype == 'O']
num_cols =[feature for feature in df.columns if df[feature].dtype != 'O']
print(cat_cols)
print(num_cols)

['Location']
['Age', 'Gender', 'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB', 'Churn']


In [198]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (OneHotEncoder,
                                   MinMaxScaler)
import numpy as np
from sklearn.model_selection import train_test_split

In [199]:
X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.30, random_state=42)

In [200]:
df['Gender']=df['Gender'].replace({'Male':0,'Female':1})

In [201]:
X_train

Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB
76513,25,0,Los Angeles,13,98.13,468
60406,54,1,Houston,2,71.90,383
27322,32,0,Chicago,20,65.65,427
53699,61,1,Houston,12,83.20,64
65412,70,1,New York,5,41.93,99
...,...,...,...,...,...,...
6265,35,0,Miami,21,67.33,235
54886,56,0,Chicago,13,85.40,347
76820,69,0,Houston,2,76.24,321
860,55,0,Chicago,12,89.19,315


In [202]:
preprocessing = ColumnTransformer(transformers=[
    ('OHE',OneHotEncoder(drop='first',sparse=False,dtype=np.int64),['Location']),
    ('scaling',MinMaxScaler(),['Age', 'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB'])
],remainder='passthrough')

In [203]:
X_train= preprocessing.fit_transform(X_train)
X_test = preprocessing.transform(X_test)

In [204]:
X_train[0]

array([0.        , 1.        , 0.        , 0.        , 0.13461538,
       0.52173913, 0.97342477, 0.92888889, 0.        ])

## model

In [153]:
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, precision_score, classification_report

In [154]:
rfc = RandomForestClassifier(n_estimators=200,oob_score=True)

In [155]:
rfc.fit(X_train,y_train)

In [162]:
def evaluate(true,pred):
    
    cm = confusion_matrix(true, pred)
    accuracy = accuracy_score(true, pred)
    recall = recall_score(true, pred)
    precision = precision_score(true, pred)
    
    report = classification_report(true, pred)

    evaluation_report = {
        'confusion_matrix': cm,
        'accuracy': accuracy,
        'recall': recall,
        'precision': precision,
        'classification_report': report
    }
    print('confusion metrics: \n',cm)
    print('accuracy: \n',accuracy)
    print('precision: \n',precision)
    print('recall: \n',recall)
    print('classification_report: \n',report)
    
    # return evaluation_report


    

In [163]:
y_pred = rfc.predict(X_test)

In [164]:
from pprint import pprint

In [165]:
evaluate(true=y_test,pred=y_pred)

confusion metrics: 
 [[7729 7423]
 [7670 7178]]
accuracy: 
 0.4969
precision: 
 0.49161016368741867
recall: 
 0.4834321120689655
classification_report: 
               precision    recall  f1-score   support

           0       0.50      0.51      0.51     15152
           1       0.49      0.48      0.49     14848

    accuracy                           0.50     30000
   macro avg       0.50      0.50      0.50     30000
weighted avg       0.50      0.50      0.50     30000



## training all aglorithms

In [205]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

In [206]:
def evaluate(true,pred):
    accuracy = accuracy_score(true, pred)
    recall = recall_score(true, pred)
    precision = precision_score(true, pred)
    return accuracy , recall , precision


In [207]:
params = {
    "Gradient Boosting Classifier": {
        'loss': ['deviance', 'exponential'],  # Classification loss functions
        'learning_rate': [0.1, 0.01, 0.05, 0.001],
        'criterion': ['friedman_mse'],
        'n_estimators': [8, 16, 32, 64, 128, 256],
        'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
        'max_features': ['auto', 'sqrt', 'log2']
    },
    "XGBoost Classifier": {
        'learning_rate': [0.1, 0.01, 0.05, 0.001],
        'n_estimators': [8, 16, 32, 64, 128, 256],
        'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
        'max_depth': [3, 4, 5, 6, 7, 8],
        'min_child_weight': [1, 2, 3, 4]
    },
    "CatBoost Classifier": {
        'depth': [6, 8, 10],
        'learning_rate': [0.01, 0.05, 0.1],
        'iterations': [30, 50, 100],
        'loss_function': ['Logloss', 'CrossEntropy'],
        'eval_metric': ['Logloss', 'AUC'],
    },
    "AdaBoost Classifier": {
        'learning_rate': [0.1, 0.01, 0.5, 0.001],
        'n_estimators': [8, 16, 32, 64, 128, 256],
        'algorithm': ['SAMME', 'SAMME.R'],
    }
}

In [208]:
models = {
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "XGBoost Classifier": XGBClassifier(),
    "CatBoost Classifier": CatBoostClassifier(),
    "AdaBoost Classifier": AdaBoostClassifier(),
}


In [212]:

class ModelTraining:
    def __init__(self):
        self.model_list = []
        self.accuracy_list = []
        self.best_params_dict = {}

    def train_model(self, models, params, X_train, y_train, X_test, y_test):
        for i, model_name in enumerate(models.keys()):
            model = models[model_name]
            param = params[model_name]

            grid_search_cv = RandomizedSearchCV(estimator=model, param_distributions=param, cv=5)
            grid_search_cv.fit(X_train, y_train)

            best_params = grid_search_cv.best_params_
            model.set_params(**best_params)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            accuracy, recall, precision = evaluate(true=y_test, pred=y_pred)

            print(model_name)
            self.model_list.append(model_name)

            print("- precision : {:.4f}".format(precision))
            print("- recall : {:.4f}".format(recall))
            print("- accuracy : {:.4f}".format(accuracy))
            print('\n')

            self.best_params_dict[model_name] = best_params

            self.accuracy_list.append(accuracy)

    def return_accuracy_list(self):
        return self.accuracy_list
    
    def best_params(self):
        return self.best_params_dict


In [213]:
train_model = ModelTraining()

In [214]:
train_model.train_model(models=models,params=params,X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test)

Gradient Boosting Classifier
- precision : 0.5009
- recall : 0.3351
- accuracy : 0.5057


XGBoost Classifier
- precision : 0.4943
- recall : 0.3970
- accuracy : 0.5005


0:	total: 35.9ms	remaining: 1.76s
1:	total: 69.1ms	remaining: 1.66s
2:	total: 99.9ms	remaining: 1.56s
3:	total: 129ms	remaining: 1.48s
4:	total: 159ms	remaining: 1.43s
5:	total: 188ms	remaining: 1.38s
6:	total: 218ms	remaining: 1.34s
7:	total: 247ms	remaining: 1.29s
8:	total: 275ms	remaining: 1.25s
9:	total: 301ms	remaining: 1.2s
10:	total: 329ms	remaining: 1.17s
11:	total: 357ms	remaining: 1.13s
12:	total: 381ms	remaining: 1.08s
13:	total: 413ms	remaining: 1.06s
14:	total: 443ms	remaining: 1.03s
15:	total: 467ms	remaining: 993ms
16:	total: 496ms	remaining: 963ms
17:	total: 532ms	remaining: 945ms
18:	total: 564ms	remaining: 920ms
19:	total: 594ms	remaining: 891ms
20:	total: 620ms	remaining: 857ms
21:	total: 644ms	remaining: 819ms
22:	total: 670ms	remaining: 787ms
23:	total: 695ms	remaining: 753ms
24:	total: 721ms	remai

In [215]:
train_model.best_params_dict

{'Gradient Boosting Classifier': {'subsample': 0.75,
  'n_estimators': 16,
  'max_features': 'log2',
  'loss': 'exponential',
  'learning_rate': 0.05,
  'criterion': 'friedman_mse'},
 'XGBoost Classifier': {'subsample': 0.85,
  'n_estimators': 8,
  'min_child_weight': 1,
  'max_depth': 6,
  'learning_rate': 0.1},
 'CatBoost Classifier': {'loss_function': 'Logloss',
  'learning_rate': 0.1,
  'iterations': 100,
  'eval_metric': 'Logloss',
  'depth': 8},
 'AdaBoost Classifier': {'n_estimators': 32,
  'learning_rate': 0.01,
  'algorithm': 'SAMME'}}

In [216]:
model_accuracy=pd.DataFrame(list(zip(list(models.keys()),train_model.accuracy_list)),columns=['model','accuracy'])
model_accuracy

Unnamed: 0,model,accuracy
0,Gradient Boosting Classifier,0.505667
1,XGBoost Classifier,0.500533
2,CatBoost Classifier,0.499667
3,AdaBoost Classifier,0.503067
