# 1 Getting Ready

## 1.1 Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from numpy import arange
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from fpdf import FPDF 
from sklearn.pipeline import Pipeline
import pickle
import datetime
from datetime import datetime as dt
from datetime import timedelta
from keras.models import model_from_json

from imblearn.over_sampling import SMOTE

ModuleNotFoundError: No module named 'imblearn'

## 1.2 Setting up Environment

In [16]:
%matplotlib inline
plt.style.use('fivethirtyeight')
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# 2 Load Preprocessed Data for Model Building

## 2.1 Load from pickle

In [17]:
df_churn = pd.read_pickle('./data/training/churn.pickle')

## 2.2 Have a Look into the Training Data

In [18]:
df_churn.head()

Unnamed: 0,TENURE,MONTHLY_CHARGES,CHURN_STATUS,GENDER_F,GENDER_M
0,1,29.85,0,1,0
1,34,56.95,0,0,1
2,2,53.85,1,0,1
3,45,42.3,0,0,1
4,2,70.7,1,1,0


## 2.3 Make a copy of Training Data to work on

In [19]:
df = df_churn.copy()

# 3 Learn more about the Dataset - Meta Info

In [20]:
df.shape

(7043, 5)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 5 columns):
TENURE             7043 non-null int64
MONTHLY_CHARGES    7043 non-null float64
CHURN_STATUS       7043 non-null int64
GENDER_F           7043 non-null uint8
GENDER_M           7043 non-null uint8
dtypes: float64(1), int64(2), uint8(2)
memory usage: 233.8 KB


In [22]:
df.columns

Index(['TENURE', 'MONTHLY_CHARGES', 'CHURN_STATUS', 'GENDER_F', 'GENDER_M'], dtype='object')

In [23]:
df.describe()

Unnamed: 0,TENURE,MONTHLY_CHARGES,CHURN_STATUS,GENDER_F,GENDER_M
count,7043.0,7043.0,7043.0,7043.0,7043.0
mean,32.371149,64.761692,0.26537,0.495244,0.504756
std,24.559481,30.090047,0.441561,0.500013,0.500013
min,0.0,18.25,0.0,0.0,0.0
25%,9.0,35.5,0.0,0.0,0.0
50%,29.0,70.35,0.0,0.0,1.0
75%,55.0,89.85,1.0,1.0,1.0
max,72.0,118.75,1.0,1.0,1.0


# 4 Prepare X (Set of Independent Variables or Features) and y (Dependent or Target Variable)

In [24]:
X = df.drop('CHURN_STATUS', axis=1)
y = df['CHURN_STATUS'].values

# 5 Splitting Data into Train, Validation and Test Set

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train,test_size=0.25, random_state=0)

# 6 Oversampling for balancing data

NOT NOW!

# 7 Logistic  Regression

## 7.1 Hyper Params Estimation: Logistic Regression

In [26]:
print("Starting ------ Logistic Regression")

pipe = Pipeline(steps=[
    ('logistic', LogisticRegression())
])

param_grid ={
    'logistic__penalty':('l2', 'elasticnet', 'none'),
    'logistic__solver':('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
    'logistic__max_iter':[50,100,500]
}

model=GridSearchCV(estimator=pipe,
                         param_grid=param_grid,
                         scoring='roc_auc', 
                         n_jobs=-1,
                         pre_dispatch='2*n_jobs', 
                         cv=5, 
                         verbose=1,
                         return_train_score=False)

model.fit(X_train,y_train)

pkl_filename = "./models/logistic_regression.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

X_val_np = X_val.to_numpy()
predicted = model.predict(X_val_np)
proba = model.predict_proba(X_val_np)

accuracy=accuracy_score(y_val, predicted)

CM = confusion_matrix(y_val, predicted)
(TN,FN,TP,FP) = (CM[0][0],CM[1][0],CM[1][1],CM[0][1])
FPR = FP/(FP+TN)



print("Best Params-")
print(model.best_params_)

print("\n")
print("Algorithm: ", 'Logistic Regression')
print("Accuracy: ",accuracy)
# print("Recall: ", recall)
# print("F1-support: ", f1)
# print("FPR: ", FPR)
# print("Runtime: ", delta)

print("Best Params-")
print(model.best_params_)

Starting ------ Logistic Regression
Fitting 5 folds for each of 45 candidates, totalling 225 fits
Best Params-
{'logistic__max_iter': 50, 'logistic__penalty': 'l2', 'logistic__solver': 'newton-cg'}


Algorithm:  Logistic Regression
Accuracy:  0.7757274662881476
Best Params-
{'logistic__max_iter': 50, 'logistic__penalty': 'l2', 'logistic__solver': 'newton-cg'}


        nan        nan        nan        nan 0.81392592 0.81392592
        nan 0.81101193 0.80528974 0.81392734 0.81390881 0.81390732
 0.81346267 0.81096632        nan        nan        nan        nan
        nan 0.81392592 0.81392592        nan 0.81344269 0.81096491
 0.81392734 0.81390881 0.81390732 0.81391301 0.81388164        nan
        nan        nan        nan        nan 0.81392592 0.81392592
        nan 0.81390164 0.81389024]


## 7.2 Build Baseline Model: Logistic Regression

DO IT YOURSELF

# 8 Try to Do the following!

1. Logistic Regression
2. XG Boost Classifier
3. Support Vector Classifier
4. SGD Classifier
5. Decision Tree
6. Random Forest Tree
7. Extra-Tree Classifier
8. Gaussian Naïve Bayes Classifier
9. Linear Discriminant Model
10. Gradient Boosting Classifier
11. Bagging Classifier
12. Ada-Boost Classifier
13. Hist-Gradient Boosting Classifier
14. Stacking Classifier
15. Voting Classifier
16. Neural Networks
17. Deep Neural Networks

# Thank You