In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from numpy import arange
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from fpdf import FPDF 
from sklearn.pipeline import Pipeline
import pickle
import datetime
from datetime import datetime as dt
from datetime import timedelta
from keras.models import model_from_json

In [59]:
%matplotlib inline
plt.style.use('fivethirtyeight')
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [60]:
df_churn = pd.read_pickle('./data/training/churn.pickle')

In [61]:
df_churn.head()

Unnamed: 0,TENURE,MONTHLY_CHARGES,CHURN_STATUS
0,1,29.85,0
1,34,56.95,0
2,2,53.85,1
3,45,42.3,0
4,2,70.7,1


In [62]:
df = df_churn.copy()

In [63]:
df.shape

(7043, 3)

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 3 columns):
TENURE             7043 non-null int64
MONTHLY_CHARGES    7043 non-null float64
CHURN_STATUS       7043 non-null int64
dtypes: float64(1), int64(2)
memory usage: 165.1 KB


In [65]:
df.describe()

Unnamed: 0,TENURE,MONTHLY_CHARGES,CHURN_STATUS
count,7043.0,7043.0,7043.0
mean,32.371149,64.761692,0.26537
std,24.559481,30.090047,0.441561
min,0.0,18.25,0.0
25%,9.0,35.5,0.0
50%,29.0,70.35,0.0
75%,55.0,89.85,1.0
max,72.0,118.75,1.0


In [66]:
df.columns

Index(['TENURE', 'MONTHLY_CHARGES', 'CHURN_STATUS'], dtype='object')

In [67]:
X = df.drop('CHURN_STATUS', axis = 1)
y = df['CHURN_STATUS'].values

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.20, random_state = 0)

In [69]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 0)

## Hyper params Estimation: logistic Regression

In [78]:
tstart = dt.now()
print("Starting----------------- logistic Regression")

pipe = Pipeline(steps=[
    ('logistic', LogisticRegression())
])

param_grid ={
    'logistic__penalty':('l1', 'l2', 'elasticnet', 'none'),
    'logistic__solver':('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
    'logistic__max_iter':[50,100,500]
}

model=GridSearchCV(estimator=pipe,
                         param_grid=param_grid,
                         scoring='roc_auc', 
                         n_jobs=-1,
                         pre_dispatch='2*n_jobs', 
                         cv=5, 
                         verbose=1,
                         return_train_score=False)

model.fit(X_train, y_train)

pkl_filename = './models/logistic_regression.pkl'

with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

X_val_np = X_val.to_numpy()
predicted = model.predict(X_val_np)
proba = model.predict_proba(X_val_np)

accuracy = accuracy_score(y_val, predicted)

CM = confusion_matrix(y_val, predicted)
(TN, FN, TP, FP) = (CM[0][0], CM[1][0], CM[1][1], CM[0][1])

FPR = FP/(FP+TN)
recall = TP/(TP+FN)
precision = TP / (TP+FP)
f1 = 2*(recall*precision)/(recall+precision)
tend = dt.now()
delta = tend-tstart

print("Best Params-")
print(model.best_params_)

print("\n")
print("Algorithm: ", 'Logistic Regression')
print("Accuracy: ",accuracy)
print("Recall: ", recall)
print("F1-support: ", f1)
print("FPR: ", FPR)
print("Runtime: ", delta)

print("Best Params-")
#print(model.best_params_)


Starting----------------- logistic Regression
Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Params-
{'logistic__max_iter': 500, 'logistic__penalty': 'l1', 'logistic__solver': 'saga'}


Algorithm:  Logistic Regression
Accuracy:  0.7785663591199432
Recall:  0.41643835616438357
F1-support:  0.4935064935064935
FPR:  0.09482758620689655
Runtime:  0:00:03.700895
Best Params-
{'logistic__max_iter': 500, 'logistic__penalty': 'l1', 'logistic__solver': 'saga'}


 0.81406292 0.81405719 0.80788737 0.80157259        nan        nan
        nan        nan        nan 0.81406292 0.81406292        nan
 0.80792447 0.80159682        nan        nan 0.81410849        nan
 0.80793298 0.81406292 0.81406292 0.81405719 0.81266232 0.80792016
        nan        nan        nan        nan        nan 0.81406292
 0.81406292        nan 0.81264811 0.80793441        nan        nan
 0.81410849        nan 0.81412698 0.81406292 0.81406292 0.81405719
 0.81404296 0.81408284        nan        nan        nan        nan
        nan 0.81406292 0.81406292        nan 0.81403298 0.81408715]


In [80]:
base_model = LogisticRegression(penalty = 'l1', solver='saga', max_iter = 500, verbose = 1)

In [81]:
base_model.fit(X_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 378 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished


LogisticRegression(max_iter=500, penalty='l1', solver='saga', verbose=1)

In [83]:
predicted = base_model.predict(X_val_np)
accuracy = accuracy_score(y_val, predicted)
print(f"Accuracy is:{accuracy}")

Accuracy is:0.7785663591199432
