In [3]:
import os
print(os.getcwd())
os.chdir('d:\\vscode_machineLearning\\internship\\Customer-Churn-Prediction')
print(os.getcwd())

d:\vscode_machineLearning\internship\Customer-Churn-Prediction
d:\vscode_machineLearning\internship\Customer-Churn-Prediction


In [4]:
import pandas as pd

In [5]:
df = pd.read_csv(f'artifacts/raw_data/customer_churn_removed_col.csv')

In [6]:
df.isnull().sum()

Age                           0
Gender                        0
Location                      0
Subscription_Length_Months    0
Monthly_Bill                  0
Total_Usage_GB                0
Churn                         0
dtype: int64

## Transformation

In [7]:
df['Gender']=df['Gender'].replace({'Male':0,'Female':1})

In [8]:
X = df.drop(columns='Churn')
y = df['Churn']

In [9]:
cat_cols =[feature for feature in df.columns if df[feature].dtype == 'O']
num_cols =[feature for feature in df.columns if df[feature].dtype != 'O']
print(cat_cols)
print(num_cols)

['Location']
['Age', 'Gender', 'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB', 'Churn']


In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (OneHotEncoder,
                                   MinMaxScaler)
import numpy as np
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.30, random_state=42)

In [12]:
df['Gender']=df['Gender'].replace({'Male':0,'Female':1})

In [13]:
X_train

Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB
76513,25,0,Los Angeles,13,98.13,468
60406,54,1,Houston,2,71.90,383
27322,32,0,Chicago,20,65.65,427
53699,61,1,Houston,12,83.20,64
65412,70,1,New York,5,41.93,99
...,...,...,...,...,...,...
6265,35,0,Miami,21,67.33,235
54886,56,0,Chicago,13,85.40,347
76820,69,0,Houston,2,76.24,321
860,55,0,Chicago,12,89.19,315


In [14]:
preprocessing = ColumnTransformer(transformers=[
    ('OHE',OneHotEncoder(drop='first',sparse=False,dtype=np.int64),['Location']),
    ('scaling',MinMaxScaler(),['Age', 'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB'])
],remainder='passthrough')

In [15]:
X_train= preprocessing.fit_transform(X_train)
X_test = preprocessing.transform(X_test)



In [16]:
X_train.shape

(70000, 9)

## model

In [17]:
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, precision_score, classification_report

In [18]:
rfc = RandomForestClassifier(n_estimators=200,oob_score=True)

In [19]:
rfc.fit(X_train,y_train)

In [26]:
def evaluate(true,pred):
    
    cm = confusion_matrix(true, pred)
    accuracy = accuracy_score(true, pred)
    recall = recall_score(true, pred)
    precision = precision_score(true, pred)
    
    report = classification_report(true, pred)

    evaluation_report = {
        'confusion_matrix': cm,
        'accuracy': accuracy,
        'recall': recall,
        'precision': precision,
        'classification_report': report
    }
    print('confusion metrics: \n',cm)
    print('accuracy: \n',accuracy)
    print('precision: \n',precision)
    print('recall: \n',recall)
    print('classification_report: \n',report)
    
    return evaluation_report

In [21]:
y_pred = rfc.predict(X_test)

In [22]:
from pprint import pprint

In [23]:
evaluate(true=y_test,pred=y_pred)

confusion metrics: 
 [[7830 7322]
 [7664 7184]]
accuracy: 
 0.5004666666666666
precision: 
 0.4952433475803116
recall: 
 0.4838362068965517
classification_report: 
               precision    recall  f1-score   support

           0       0.51      0.52      0.51     15152
           1       0.50      0.48      0.49     14848

    accuracy                           0.50     30000
   macro avg       0.50      0.50      0.50     30000
weighted avg       0.50      0.50      0.50     30000



## training all aglorithms

In [35]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

ImportError: cannot import name 'XGBClassifier' from 'xgboost' (unknown location)

In [28]:
params = {
    "Gradient Boosting Classifier": {
        'loss': ['deviance', 'exponential'],  # Classification loss functions
        'learning_rate': [0.1, 0.01, 0.05, 0.001],
        'criterion': ['friedman_mse'],
        'n_estimators': [8, 16, 32, 64, 128, 256],
        'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
        'max_features': ['auto', 'sqrt', 'log2']
    },
    "XGBoost Classifier": {
        'learning_rate': [0.1, 0.01, 0.05, 0.001],
        'n_estimators': [8, 16, 32, 64, 128, 256],
        'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
        'max_depth': [3, 4, 5, 6, 7, 8],
        'min_child_weight': [1, 2, 3, 4]
    },
    "CatBoost Classifier": {
        'depth': [6, 8, 10],
        'learning_rate': [0.01, 0.05, 0.1],
        'iterations': [30, 50, 100],
        'loss_function': ['Logloss', 'CrossEntropy'],
        'eval_metric': ['Logloss', 'AUC'],
    },
    "AdaBoost Classifier": {
        'learning_rate': [0.1, 0.01, 0.5, 0.001],
        'n_estimators': [8, 16, 32, 64, 128, 256],
        'algorithm': ['SAMME', 'SAMME.R'],
    }
}

In [None]:
models = {
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "XGBoost Classifier": XGBClassifier(),
    "CatBoost Classifier": CatBoostClassifier(),
    "AdaBoost Classifier": AdaBoostClassifier(),
}


In [30]:

class ModelTraining:
    def __init__(self):
        self.model_list = []
        self.accuracy_list = []
        self.best_params_dict = {}

    def train_model(self, models, params, X_train, y_train, X_test, y_test):
        for i, model_name in enumerate(models.keys()):
            model = models[model_name]
            param = params[model_name]

            grid_search_cv = RandomizedSearchCV(estimator=model, param_distributions=param, cv=5)
            grid_search_cv.fit(X_train, y_train)
            
            best_params = grid_search_cv.best_params_
            model.set_params(**best_params)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            self.report = evaluate(true=y_test,pred=y_pred)

            print(model_name)
            self.model_list.append(model_name)

            self.best_params_dict[model_name] = best_params
            self.accuracy_list.append(report['accuracy'])

    def return_accuracy_list(self):
        return self.accuracy_list
    
    def best_params(self):
        return self.best_params_dict


In [31]:
train_model = ModelTraining()

In [None]:
models = {
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(),
}


In [32]:
train_model.train_model(models=models,params=params,X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test)

NameError: name 'models' is not defined