# 1 Getting Ready

## 1.1 Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from numpy import arange
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from fpdf import FPDF 
from sklearn.pipeline import Pipeline
import pickle
import datetime
from datetime import datetime as dt
from datetime import timedelta
from keras.models import model_from_json

Using TensorFlow backend.


## 1.2 Setting up Environment

In [2]:
%matplotlib inline
plt.style.use('fivethirtyeight')
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# 2 Load Preprocessed Data for Model Building

## 2.1 Load from pickle

In [3]:
df_churn = pd.read_pickle('./data/training/churn.pickle')

## 2.2 Have a Look into the Training Data

In [4]:
df_churn.head()

Unnamed: 0,TENURE,MONTHLY_CHARGES,CHURN_STATUS
0,1,29.85,0
1,34,56.95,0
2,2,53.85,1
3,45,42.3,0
4,2,70.7,1


## 2.3 Make a copy of Training Data to work on

In [5]:
df = df_churn.copy()

# 3 Learn more about the Dataset - Meta Info

In [6]:
df.shape

(7043, 3)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 3 columns):
TENURE             7043 non-null int64
MONTHLY_CHARGES    7043 non-null float64
CHURN_STATUS       7043 non-null int64
dtypes: float64(1), int64(2)
memory usage: 165.2 KB


In [8]:
df.columns

Index(['TENURE', 'MONTHLY_CHARGES', 'CHURN_STATUS'], dtype='object')

In [9]:
df.describe()

Unnamed: 0,TENURE,MONTHLY_CHARGES,CHURN_STATUS
count,7043.0,7043.0,7043.0
mean,32.371149,64.761692,0.26537
std,24.559481,30.090047,0.441561
min,0.0,18.25,0.0
25%,9.0,35.5,0.0
50%,29.0,70.35,0.0
75%,55.0,89.85,1.0
max,72.0,118.75,1.0


# 4 Prepare X (Set of Independent Variables or Features) and y (Dependent or Target Variable)

In [10]:
X = df.drop('CHURN_STATUS', axis=1)
y = df['CHURN_STATUS'].values

# 5 Splitting Data into Train, Validation and Test Set

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train,test_size=0.25, random_state=0)

# 6 Oversampling for balancing data

NOT NOW!

# 7 Logistic  Regression

## 7.1 Hyper Params Estimation: Logistic Regression

In [13]:
print("Starting ------ Logistic Regression")
start_ts=datetime.datetime.now() 
pipe = Pipeline(steps=[
    ('logistic', LogisticRegression())
])

param_grid ={
    'logistic__penalty':('l1', 'l2', 'elasticnet', 'none'),
    'logistic__solver':('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
    'logistic__max_iter':[50,100,500]
}

model=GridSearchCV(estimator=pipe,
                         param_grid=param_grid,
                         scoring='roc_auc', 
                         n_jobs=-1,
                         pre_dispatch='2*n_jobs', 
                         cv=5, 
                         verbose=1,
                         return_train_score=False)

model.fit(X_train,y_train)

pkl_filename = "./models/logistic_regression.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

X_val_np = X_val.to_numpy()
predicted = model.predict(X_val_np)
proba = model.predict_proba(X_val_np)

precision=precision_score(y_val, predicted, average='weighted')
recall=recall_score(y_val, predicted, average='weighted')
f1=f1_score(y_val, predicted, average='weighted')
accuracy=accuracy_score(y_val, predicted)

CM = confusion_matrix(y_val, predicted)
(TN,FN,TP,FP) = (CM[0][0],CM[1][0],CM[1][1],CM[0][1])
FPR = FP/(FP+TN)

end_ts=datetime.datetime.now()
delta=(end_ts-start_ts)

print("Best Params-")
print(model.best_params_)

print("\n")
print("Algorithm: ", 'Logistic Regression')
print("Accuracy: ",accuracy)
print("Recall: ", recall)
print("F1-support: ", f1)
print("FPR: ", FPR)
print("Runtime: ", str(delta))

print("Best Params-")
print(model.best_params_)

Starting ------ Logistic Regression
Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.1s


Best Params-
{'logistic__max_iter': 100, 'logistic__penalty': 'l1', 'logistic__solver': 'liblinear'}


Algorithm:  Logistic Regression
Accuracy:  0.7764371894960965
Recall:  0.7764371894960965
F1-support:  0.7613701373778752
FPR:  0.09578544061302682
Runtime:  0:00:03.855622
Best Params-
{'logistic__max_iter': 100, 'logistic__penalty': 'l1', 'logistic__solver': 'liblinear'}


[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    3.8s finished


## 7.2 Build Baseline Model: Logistic Regression

DO IT YOURSELF

# Thank You