In [188]:
# import necessary libraries

# data manipulation
import pandas as pd
import numpy as np

# Exploratory analysis
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


# model training
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import auc, precision_recall_curve, accuracy_score, roc_auc_score, precision_score, recall_score, classification_report
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# data peprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# for saving model
import joblib
import pickle


In [172]:
df = pd.read_csv('churn.csv') # import data
df.head() # showing first 5 rows of the data

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Feature and Target data

In [173]:
X = df.drop(columns=['RowNumber', 'CustomerId', 'Surname','Exited'], axis=1) # creating feature space
y = df['Exited'] # target data

In [174]:
# converting int into ordinal category

X['Tenure'] = X['Tenure'].astype(str) 
X['NumOfProducts'] = X['NumOfProducts'].astype(str) 
X['HasCrCard'] = X['HasCrCard'].astype(str)
X['IsActiveMember'] = X['IsActiveMember'].astype(str)

In [175]:
# Create Column Transformers
numerical_features = X.select_dtypes(exclude="object").columns
categorical_features = X.select_dtypes(include="object").columns

In [176]:
numerical_features, categorical_features

(Index(['CreditScore', 'Age', 'Balance', 'EstimatedSalary'], dtype='object'),
 Index(['Geography', 'Gender', 'Tenure', 'NumOfProducts', 'HasCrCard',
        'IsActiveMember'],
       dtype='object'))

In [177]:
# categorical_df = pd.get_dummies(df[categorical_features], prefix_sep="__",
#                                   columns=categorical_features)  
# categorical_df = categorical_df.astype(int)                          
# corr_df = pd.concat([df[numerical_features],categorical_df, df['Exited'] ] , axis=1) 

# corr_mat = corr_df.corr()
# corr_target = abs(corr_mat["Exited"])

# # Finding relevant features by filtering our correlation matrix with the features which has greater than 0.2 correlation with target data
# relevant_features = corr_target[corr_target>0.2]
# relevant_features =pd.DataFrame(relevant_features)
# # plotting the heat map of relevant features

# plt.figure(figsize=(25,20))
# sns.heatmap(corr_mat, annot=True, cmap=plt.cm.Reds)
# plt.title(' Correlation of relevant features ' , size = 15)
# plt.xlabel('Target data',  size = 15)
# plt.ylabel('Features',  size = 15)
# plt.xticks(size=15)
# plt.yticks(size=15)
# plt.show()

### Train and Test split

In [178]:
# split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=42,
                                                    stratify=y,
                                                    test_size=0.2)

In [179]:
X.columns

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'],
      dtype='object')

### Data Transformation

In [180]:
''' standard scaling numeric features and one hot encoding categorical features'''

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, categorical_features),
         ("StandardScaler", numeric_transformer, numerical_features),        
    ]
)

In [181]:
# transforming train and test data
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)


### Saving Preprocessor for future data

In [190]:
# joblib.dump(preprocessor, "preprocessor.pkl")

In [182]:
X_train_scaled.shape, X_test_scaled.shape # checking shape of train and test set

((8000, 28), (2000, 28))

In [183]:
y.value_counts() # checking counts of classes

Exited
0    7963
1    2037
Name: count, dtype: int64

As we observe, our data is imbalanced, with class 0 counts being almost four times those of class 1. To address this issue, we can employ balancing techniques such as oversampling, undersampling, or a combination of both.

However, using undersampling alone has a significant drawback, as it leads to information loss. Hence, I typically avoid relying solely on undersampling.

To balance the data, we can utilize Synthetic Minority Oversampling Technique (SMOTE) for oversampling class 1. Additionally, altering class weights during model implementation is another approach.

### However, for this study, we won't be implementing any sampling methods as our primary goal is model deployment on GCP. Instead, we'll focus on proper model evaluation metrics.
### Given the data's nature, it's crucial to emphasize class 1, which corresponds to customer churning. Thus, optimizing the model's Recall, aimed at reducing false negatives, is our priority.
### Considering the data imbalance, the Precision-Recall curve is also more suitable evaluation tool for different models.

## Model Implementation

In [184]:
# model implementation

models = {
    "Logistic Regression": LogisticRegression(), 
    "Support Vector Classifier": SVC(),
    "Random Forest Classifier": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(), 
    "CatBoosting Classifier": CatBoostClassifier(verbose=False)
}
model_list = [] # creating empty list to add model name
recall = [] # creating empty list to store test recall
precision_recall_curve_auc = [] # creating empty list to store test auc


for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train_scaled, y_train) # model fitting


    # predictions
    y_train_pred = model.predict(X_train_scaled) # train prediction
    y_test_pred = model.predict(X_test_scaled) # test prediction

    # Evaluate Models

    pr_train, re_train, th_train = precision_recall_curve(y_train, y_train_pred) 
    pr_test, re_test, th_test = precision_recall_curve(y_test, y_test_pred)
    recall_train = recall_score(y_train, y_train_pred)
    recall_test = recall_score(y_test, y_test_pred)
    auc_train = auc(re_train, pr_train)
    auc_test = auc(re_test, pr_test)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Performance for Training set:")
    print('Recall: {:.2f}'.format(recall_train))
    print('Precision Recall Curve AUC: {:.2f}'.format(auc_train))

    print("-"*20)

    print("Model Performance for Testing set:")
    print('Recall: {:.2f}'.format(recall_test))
    print('Precision Recall Curve AUC: {:.2f}'.format(auc_test))
    recall.append(recall_test)
    precision_recall_curve_auc.append(auc_test)
    print("#"*20)
    print('\n')

Logistic Regression
Model Performance for Training set:
Recall: 0.38
Precision Recall Curve AUC: 0.61
--------------------
Model Performance for Testing set:
Recall: 0.36
Precision Recall Curve AUC: 0.59
####################


Support Vector Classifier
Model Performance for Training set:
Recall: 0.43
Precision Recall Curve AUC: 0.70
--------------------
Model Performance for Testing set:
Recall: 0.41
Precision Recall Curve AUC: 0.68
####################


Random Forest Classifier
Model Performance for Training set:
Recall: 1.00
Precision Recall Curve AUC: 1.00
--------------------
Model Performance for Testing set:
Recall: 0.45
Precision Recall Curve AUC: 0.65
####################


XGBClassifier
Model Performance for Training set:
Recall: 0.83
Precision Recall Curve AUC: 0.92
--------------------
Model Performance for Testing set:
Recall: 0.50
Precision Recall Curve AUC: 0.65
####################


CatBoosting Classifier
Model Performance for Training set:
Recall: 0.64
Precision Recal

### Creating DataFrame of models' results

In [185]:
pd.DataFrame(list(zip(model_list, recall, precision_recall_curve_auc)), columns=['Model Name', 'Recall', 'AUC']).sort_values(by=["Recall", "AUC"],ascending=False)

Unnamed: 0,Model Name,Recall,AUC
4,CatBoosting Classifier,0.496314,0.69088
3,XGBClassifier,0.496314,0.652554
2,Random Forest Classifier,0.447174,0.651266
1,Support Vector Classifier,0.405405,0.679869
0,Logistic Regression,0.358722,0.593893


### As Catboost Classifier has the highets recall and AUC. We will implement Catboost. 

In [186]:
# model implementation
cat_model = CatBoostClassifier(verbose= False)
cat_model.fit(X_train_scaled, y_train) # model fit
y_pred_test = cat_model.predict(X_test_scaled) # prediction in test data

recall_test = recall_score(y_test, y_pred_test) # recall score
print("Recall:", round(recall_test,3))

Recall: 0.496


## Since our primary goal is to deploy the application on GCP, I'm currently not focusing on tuning the model's hyperparameters. We can do it later using gridserchcv and using cross validation.

## Saving Catboost Classifier Trained model in pickle file

In [187]:
# with open('customer_churn_catboost_model.pkl', 'wb') as pickle_file:
#     pickle.dump(cat_model, pickle_file)