<p style="font-size:300%; text-align:center"> Telco Customer Churn modeling</p>
<p style="font-size:150%; text-align:center"> Focused customer retention programs <br> MOD3 Project - 4. Modeling</p>

In [1]:
# import important libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import itertools

In [2]:
# import important cleaned data
df = pd.read_csv("data/telco_clean.csv") # replace this 
#df.sample(6).T

In [3]:
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

## Seperate x, y and make dummy vatriables 

In [4]:
df = pd.get_dummies(df)
display(df.columns)

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'Churn',
       'InternetService_0', 'InternetService_DSL',
       'InternetService_Fiber_optic', 'Contract_Month_to_month',
       'Contract_One_year', 'Contract_Two_year', 'PaymentMethod_Bank_transfer',
       'PaymentMethod_Credit_card', 'PaymentMethod_Electronic_check',
       'PaymentMethod_Mailed_check'],
      dtype='object')

In [5]:
# I manually droped the selected columns was to removes columns like "OnlineSecurity_No_internet" so the model results is more 
# interpretable 
df.drop(['InternetService_0','Contract_One_year','PaymentMethod_Bank_transfer'], axis=1, inplace=True)
display(df.columns)

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'Churn',
       'InternetService_DSL', 'InternetService_Fiber_optic',
       'Contract_Month_to_month', 'Contract_Two_year',
       'PaymentMethod_Credit_card', 'PaymentMethod_Electronic_check',
       'PaymentMethod_Mailed_check'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 24 columns):
gender                            7032 non-null int64
SeniorCitizen                     7032 non-null int64
Partner                           7032 non-null int64
Dependents                        7032 non-null int64
tenure                            7032 non-null int64
PhoneService                      7032 non-null int64
MultipleLines                     7032 non-null int64
OnlineSecurity                    7032 non-null int64
OnlineBackup                      7032 non-null int64
DeviceProtection                  7032 non-null int64
TechSupport                       7032 non-null int64
StreamingTV                       7032 non-null int64
StreamingMovies                   7032 non-null int64
PaperlessBilling                  7032 non-null int64
MonthlyCharges                    7032 non-null float64
TotalCharges                      7032 non-null float64
Churn                    

In [7]:
X = df.drop(['Churn'], axis = 1)
y = df['Churn']

## Baseline model

__which evaluation metric is most essential for this project ?__ <br>
$$ \text{Precision} = \frac{\text{Number of True Positives}}{\text{Number of Predicted Positives}} = \frac{\text{TP}}{\text{TP+FP}} $$    

$$ \text{Recall} = \frac{\text{Number of True Positives}}{\text{Number of Actual Total Positives}} = \frac{\text{TP}}{\text{TP+FN}}$$  
  
$$ \text{Accuracy} = \frac{\text{Number of True Positives + True Negatives}}{\text{Total Observations}} = \frac{\text{TP + TN}}{\text{TP+TN+FP+FN}}  $$

0 - not churned
1 - churned.
We want to predict churned (1) as accurately as possible. So, __FN__ is bad for us. Meaning model predict not churn but the customer actually churned. Then __recall__ is the important score for us. When evaluating between models we should pay more attention to recall and try to reduce FN as much as possible.

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import roc_curve, auc
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

import plotly.graph_objs as go
import plotly.offline as py#visualization
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)#visualization



In [9]:
def results(model, X_train, y_train, X_test, Y_test, cf, notfig):
    # fit model and get predictions 
    model.fit(X_train, y_train)
    #predictions
    prediction_train = model.predict(X_train)
    prediction_test = model.predict(X_test)
    
    print('-'*90)
    print(model)
    
    print('-'*90)
    print('Training accuracy Score:', accuracy_score(y_train, prediction_train))
    print('Model accuracy Score:', accuracy_score(y_test, prediction_test))

    print('-'*90)
    print('Classification Matrix:')
    print(classification_report(y_test, prediction_test))    
    
    print('-'*90)
    #print('Confusion Matrix:\n', confusion_matrix(y_test, prediction_test))
    print('Confusion Matrix:\n', pd.crosstab(y_test, prediction_test, rownames=['Actual'], colnames=['Predicted'],margins = True))
    conf_matrix = confusion_matrix(y_test, prediction_test)
    z=[list(conf_matrix[i]) for i in [1,0]]
    x = ["not Churn", "churn"]
    y = ["Churn", "not churn"]
    # change each element of z to type string for annotations
    z_text = [[str(y) for y in x] for x in z]
    # set up figure 
    fig1 = ff.create_annotated_heatmap(z, x=["not Churn", "churn"], y=["Churn", "not churn"],
                                      annotation_text=z_text, colorscale='Viridis')
    fig1.update_layout( autosize=False, width=350, height=350,
        yaxis=dict( title_text="Actual", tickmode="array", titlefont=dict(size=21)),
        xaxis=dict(title_text="Predics", tickmode="array", titlefont=dict(size=21)),
        )
    fig1.show()    
    
    #print('-'*60)    
    #data = {'y_Actual':    list(y_test.values),
    #        'y_Predicted': list(prediction_test)}
    #df = pd.DataFrame(data, columns=['y_Actual','y_Predicted'])
    #confusion_matrix = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'])
    ## this code is edited rest us untouched
    #fig, ax = plt.subplots(figsize=(4,4))
    #ax = sns.heatmap(confusion_matrix, annot=True, fmt='g')
    #bottom, top = ax.get_ylim()
    #ax.set_ylim(bottom+0.5, top-0.5)
    #plt.show()
    
    #roc_auc_score
    if notfig != 'fig2':
        # 
        y_score_test = model.decision_function(X_test)
        test_fpr, test_tpr, test_thresholds = roc_curve(y_test, y_score_test)
        test_model_roc_auc = roc_auc_score(y_test,y_score_test) 

        y_score_train = model.decision_function(X_train)
        train_fpr, train_tpr, train_thresholds = roc_curve(y_train, y_score_train)
        train_model_roc_auc = roc_auc_score(y_train,y_score_train) 

        #plot roc curve          #rgb(31, 119, 180)   #rgb(22, 96, 167)
        trace1 = go.Scatter(x = train_fpr,y = train_tpr,
                            name = "Train_Roc : " + str(train_model_roc_auc),
                            line = dict(color = ('blue'),width = 2))
        trace2 = go.Scatter(x = test_fpr,y = test_tpr,
                            name = "test_Roc : " + str(test_model_roc_auc),
                            line = dict(color = ('orange'),width = 2))
        trace3 = go.Scatter(x = [0,1],y=[0,1],
                            line = dict(color = ('rgb(205, 12, 24)'),width = 2,
                            dash = 'dot'))
        data2 = [trace1,trace2,trace3]
        fig2 = go.Figure(data=data2)
        fig2.update_layout( autosize=False, width=700, height=500, title='Receiver operating characteristic',
            yaxis=dict(title_text="True positive rate"),
            xaxis=dict(title_text="False positive rate"),
            )
        fig2.show()

    #best coefficients and important features
    if notfig != 'fig3':
        #
        if   cf == "coefficients" :
            coefficients  = pd.DataFrame(model.coef_.ravel())
        elif cf == "features" :
            coefficients  = pd.DataFrame(model.feature_importances_)

        cols    = [i for i in X.columns]
        column_df     = pd.DataFrame(cols)
        coef_sumry    = (pd.merge(coefficients,column_df,left_index= True,
                                  right_index= True, how = "left"))
        coef_sumry.columns = ["coefficients","features"]
        coef_sumry    = coef_sumry.sort_values(by = "coefficients",ascending = False)
        trace4 = go.Bar(x = coef_sumry["features"],y = coef_sumry["coefficients"],
                        name = "coefficients",
                        marker = dict(color = coef_sumry["coefficients"],
                                      colorscale = "Picnic",
                                      line = dict(width = .6,color = "black")))
        data = [trace4]
        fig3  = go.Figure(data=data)
        fig3.update_layout(title_text='important '+cf)
        #fig.update_xaxes(tickangle=45)
        fig3.show()

In [10]:
#model_performances
def output_tracer(model_performances, metric,color) :
    tracer = go.Bar(y = model_performances["Model"] ,
                    x = model_performances[metric],
                    orientation = "h",name = metric ,
                    marker = dict(line = dict(width =.7),
                                  color = color)
                   )
    return tracer
def plot_tracers(model_performances):
    layout = go.Layout(dict(title = "Model performances",
                            plot_bgcolor  = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                         title = "metric",
                                         zerolinewidth=1,
                                         ticklen=5,gridwidth=2),
                            yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                         zerolinewidth=1,ticklen=5,gridwidth=2),
                            margin = dict(l = 250),
                            height = 780
                           )
                      )


    trace1  = output_tracer(model_performances, "Accuracy_score","#6699FF")
    trace2  = output_tracer(model_performances, 'Recall_score',"red")
    trace3  = output_tracer(model_performances, 'Precision',"#33CC99")
    trace4  = output_tracer(model_performances, 'f1_score',"lightgrey")

    data = [trace1,trace2,trace3,trace4]
    fig = go.Figure(data=data,layout=layout)
    fig.show()


In [11]:
#gives model report in dataframe
def model_report(model,training_x,testing_x,training_y,testing_y,name) :
    model.fit(training_x,training_y)
    predictions  = model.predict(testing_x)
    accuracy     = accuracy_score(testing_y,predictions)
    recallscore  = recall_score(testing_y,predictions)
    precision    = precision_score(testing_y,predictions)
    roc_auc      = roc_auc_score(testing_y,predictions)
    f1score      = f1_score(testing_y,predictions) 

    
    df = pd.DataFrame({"Model"           : [name],
                       "Accuracy_score"  : [accuracy],
                       "Recall_score"    : [recallscore],
                       "Precision"       : [precision],
                       "f1_score"        : [f1score],
                       "Area_under_curve": [roc_auc],
                      })
    return df


In [12]:
# make a test-train split
split_size=0.5
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=split_size,random_state=42) #stratify=y

In [13]:
# What does the split data diemention look like 
print("Train dataset: {0}{1}".format(X_train.shape, y_train.shape))
print("Test dataset: {0}{1}".format(X_test.shape, y_test.shape))

Train dataset: (3516, 23)(3516,)
Test dataset: (3516, 23)(3516,)


In [14]:
#scale features 
ss = StandardScaler()
# Scale the train and test data
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [15]:
# Let's create a baseline model for all models. Let's statr with as minimum amount of parameters as possible 

# LogisticRegression
name = "Logistic Regression"
logit = LogisticRegression()
model1 = model_report(logit,X_train,X_test,y_train,y_test,name)

# DecisionTree Classifier
name = "Decision Tree"
decision_tree = DecisionTreeClassifier(random_state = 0)
# X_train_resampled,X_test,y_train_resampled,y_test,name
model2 = model_report(decision_tree,X_train,X_test,y_train,y_test,name)

# KNeighborsClassifier
name = "KNN Classifier"
knn = KNeighborsClassifier(n_neighbors=3)
model3 = model_report(knn,X_train,X_test,y_train,y_test,name)

name = "Naive Bayes"
gnb = GaussianNB(priors=None)
model4 = model_report(gnb,X_train,X_test,y_train,y_test,name)

name = "Random Forest Classifier"
rfc = RandomForestClassifier(random_state=0)
model5 = model_report(rfc,X_train,X_test,y_train,y_test,name)

name = "SVM Classifier Linear"
svc  = SVC(gamma='auto', kernel='linear')
model6 = model_report(svc,X_train,X_test,y_train,y_test,name)

name = "XGBoost Classifier"
xgc = XGBClassifier()
model7 = model_report(xgc,X_train,X_test,y_train,y_test,name)

#concat all models
model_performances = pd.concat([model1,model2,model3,model4,
                                model5,model6,model7],axis = 0).reset_index()

model_performances = model_performances.drop(columns = "index",axis =1)
display(model_performances)

plot_tracers(model_performances)




The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



Unnamed: 0,Model,Accuracy_score,Recall_score,Precision,f1_score,Area_under_curve
0,Logistic Regression,0.802901,0.539278,0.662321,0.5945,0.719328
1,Decision Tree,0.727816,0.495754,0.492097,0.493919,0.654248
2,KNN Classifier,0.762799,0.5138,0.562791,0.537181,0.683862
3,Naive Bayes,0.732651,0.782378,0.500679,0.610605,0.748415
4,Random Forest Classifier,0.788111,0.463907,0.645495,0.539839,0.685333
5,SVM Classifier Linear,0.801195,0.532909,0.659658,0.589548,0.716144
6,XGBoost Classifier,0.794937,0.501062,0.652835,0.566967,0.701774


### Improving model with SMOTE

In [16]:
'''
smote
refre to install docs 
https://imbalanced-learn.readthedocs.io/en/stable/install.html
'''
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_sample(X_train, y_train) 
print(pd.Series(y_train_smote).value_counts())

1    2589
0    2589
dtype: int64


In [17]:
# LogisticRegression
name = "Logistic Regression"
logit = LogisticRegression()
model1 = model_report(logit,X_train_smote,X_test,y_train_smote,y_test,name)

# DecisionTree Classifier
name = "Decision Tree"
decision_tree = DecisionTreeClassifier(random_state = 0)
# X_train_resampled,X_test,y_train_resampled,y_test,name
model2 = model_report(decision_tree,X_train_smote,X_test,y_train_smote,y_test,name)

# KNeighborsClassifier
name = "KNN Classifier"
knn = KNeighborsClassifier(n_neighbors=3)
model3 = model_report(knn,X_train_smote,X_test,y_train_smote,y_test,name)

name = "Naive Bayes"
gnb = GaussianNB(priors=None)
model4 = model_report(gnb,X_train_smote,X_test,y_train_smote,y_test,name)

name = "Random Forest Classifier"
rfc = RandomForestClassifier(random_state=0)
model5 = model_report(rfc,X_train_smote,X_test,y_train_smote,y_test,name)

name = "SVM Classifier Linear"
svc  = SVC(gamma='auto', kernel='linear')
model6 = model_report(svc,X_train_smote,X_test,y_train_smote,y_test,name)

name = "XGBoost Classifier"
xgc = XGBClassifier()
model7 = model_report(xgc,X_train_smote,X_test,y_train_smote,y_test,name)

#concat all models
model_performances = pd.concat([model1,model2,model3,model4,
                                model5,model6,model7],axis = 0).reset_index()

model_performances = model_performances.drop(columns = "index",axis =1)
display(model_performances)

plot_tracers(model_performances)





The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



Unnamed: 0,Model,Accuracy_score,Recall_score,Precision,f1_score,Area_under_curve
0,Logistic Regression,0.758532,0.788747,0.533381,0.636403,0.768111
1,Decision Tree,0.721274,0.55414,0.48244,0.51581,0.66829
2,KNN Classifier,0.706485,0.676221,0.467009,0.552472,0.696891
3,Naive Bayes,0.713026,0.81104,0.478997,0.602286,0.744098
4,Random Forest Classifier,0.775313,0.536093,0.588578,0.561111,0.699476
5,SVM Classifier Linear,0.7281,0.81104,0.49546,0.615137,0.754394
6,XGBoost Classifier,0.787258,0.670913,0.590654,0.628231,0.750375


with SMOTE we can see that the model accuray score reduces a little but the __recall__ improves considerablly. As recall is the main evaluation metric of interest for churn dataset it is better to use SMOTE. 
From the evaluation comparison above clearly Logistic regression and SVM classifiers give the best performance and worth exploring further. However, since XGboots is been known to perform well with the correct set of hyper parameters let's explot XGboost further as well. 

## Improving model accuracy 

In [18]:
def get_pl(name, classifier, parms, score='recall'):
    # function
    pl = Pipeline([(name, classifier)])

    gs = GridSearchCV(pl, parms, scoring=score, cv=5, n_jobs=-1)

    gs.fit(X_train_smote, y_train_smote)

    best_parameters = gs.best_params_
    print("Grid Search found the following optimal parameters: ")
    for param_name in sorted(best_parameters.keys()):
        print("%s: %r" % (param_name, best_parameters[param_name]))

    if score == 'accuracy':
        # optimize for accuracy
        training_preds = gs.predict(X_train_smote)
        val_preds = gs.predict(X_test)
        training_accuracy = accuracy_score(y_train_smote, training_preds)
        val_accuracy = accuracy_score(y_test, val_preds)

        print("")
        print("Training accuracy : {:.4}%".format(training_accuracy * 100))
        print("Validation accuracy: {:.4}%".format(val_accuracy * 100))
    elif score == 'recall':
        # optimize for recall
        training_preds = gs.predict(X_train_smote)
        val_preds = gs.predict(X_test)
        training_recall = recall_score(y_train_smote, training_preds)
        val_recall = recall_score(y_test, val_preds)

        print("")
        print("Training recall: {:.4}%".format(training_recall * 100))
        print("Validation recall: {:.4}%".format(val_recall * 100))
    return None

### Logistic Regression

Following parameters maybe set and changed to get better accuracy 

* "penalty": 'l2' => Churn dataset have lot of multicoliniarity
* 'C': [100, 10, 1, 0.1, 0.01, 0.001]  => have to test strenth of regularization is best
* 'fit_intercept': True,   
* 'random_state': [11],
* 'solver': 'liblinear',
* 'max_iter': [200]


In [19]:
# call function 
param_grid = {    
    "logit__penalty": ['l2'],
    'logit__C': [100, 10, 1, 0.1, 0.01, 0.001],
    'logit__fit_intercept': [True, False],
    'logit__random_state': [11],
    'logit__solver': ['liblinear'],
    'logit__max_iter': [200],
}

get_pl('logit', LogisticRegression(), param_grid, 'recall')

Grid Search found the following optimal parameters: 
logit__C: 0.01
logit__fit_intercept: False
logit__max_iter: 200
logit__penalty: 'l2'
logit__random_state: 11
logit__solver: 'liblinear'

Training recall: 91.12%
Validation recall: 87.79%


In [20]:
# after Gridsearch
logreg_1 = LogisticRegression(C=0.01, fit_intercept= False, max_iter= 200, penalty= 'l2', 
                              random_state= 11, solver= 'liblinear')
# model results
results(logreg_1, X_train_smote, y_train_smote, X_test, y_test, 'coefficients', None)

------------------------------------------------------------------------------------------
LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=False,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=11, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
------------------------------------------------------------------------------------------
Training accuracy Score: 0.7641946697566628
Model accuracy Score: 0.689419795221843
------------------------------------------------------------------------------------------
Classification Matrix:
              precision    recall  f1-score   support

           0       0.93      0.62      0.75      2574
           1       0.46      0.88      0.60       942

    accuracy                           0.69      3516
   macro avg       0.70      0.75      0.67      3516
weighted avg      

In [21]:
# call function 
param_grid = {    
    "logit__penalty": ['l2'],
    'logit__C': [100, 10, 1, 0.1, 0.01, 0.001],
    'logit__fit_intercept': [True, False],
    'logit__random_state': [11],
    'logit__solver': ['liblinear'],
    'logit__max_iter': [200],
}

get_pl('logit', LogisticRegression(), param_grid, 'accuracy')

Grid Search found the following optimal parameters: 
logit__C: 100
logit__fit_intercept: True
logit__max_iter: 200
logit__penalty: 'l2'
logit__random_state: 11
logit__solver: 'liblinear'

Training accuracy : 77.91%
Validation accuracy: 75.85%


### SVM

Following parameters maybe set and changed to get better accuracy 

* kernal: [‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’]
* C:  [100, 10, 1, 0.1, 0.01, 0.001] => how soft is the soft margin. low C means errors matter less
* degree: [2, 3] => highest order of poly kernal. ignored for other kernals 
* 'tol': [1e-4] => stopping criterion increased a little here



In [22]:
# Pipeline par grid for SVC
param_grid = {    
    'svm__kernel': ['linear', 'poly', 'rbf'],
    'svm__C': [100, 0.1, 0.01, 0.001],
    'svm__degree': [2],
    'svm__tol': [1e-4],
    'svm__gamma': ['auto'],
}

get_pl('svm', SVC(), param_grid, 'recall')

Grid Search found the following optimal parameters: 
svm__C: 0.001
svm__degree: 2
svm__gamma: 'auto'
svm__kernel: 'poly'
svm__tol: 0.0001

Training recall: 91.12%
Validation recall: 89.7%


In [23]:
# accuracy matric of the best set of parameters  
clf_svm_1 = SVC(kernel ='poly', C=0.1, degree= 3,  tol = 0.0001, gamma = 'auto')

results(clf_svm_1, X_train_smote, y_train_smote, X_test, y_test, 'coefficients', 'fig3')

------------------------------------------------------------------------------------------
SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.0001, verbose=False)
------------------------------------------------------------------------------------------
Training accuracy Score: 0.7945152568559289
Model accuracy Score: 0.7306598407281001
------------------------------------------------------------------------------------------
Classification Matrix:
              precision    recall  f1-score   support

           0       0.91      0.70      0.79      2574
           1       0.50      0.82      0.62       942

    accuracy                           0.73      3516
   macro avg       0.71      0.76      0.71      3516
weighted avg       0.80      0.73      0.75      3516

-----------------------------------------------------------

In [24]:
# Pipeline par grid for SVC
param_grid = {    
    'svm__kernel': ['linear', 'poly', 'rbf'],
    'svm__C': [100, 0.1, 0.01, 0.001],
    'svm__degree': [2],
    'svm__tol': [1e-4],
    'svm__gamma': ['auto'],
}

get_pl('svm', SVC(), param_grid, 'accuracy')

Grid Search found the following optimal parameters: 
svm__C: 100
svm__degree: 2
svm__gamma: 'auto'
svm__kernel: 'rbf'
svm__tol: 0.0001

Training accuracy : 95.93%
Validation accuracy: 73.32%


In [25]:
# accuracy matric of the best set of parameters  
clf_svm_1 = SVC(kernel ='rbf', C=0.001, degree= 2,  tol = 0.0001, gamma = 'auto')

results(clf_svm_1, X_train_smote, y_train_smote, X_test, y_test,'coefficients', 'fig3')

------------------------------------------------------------------------------------------
SVC(C=0.001, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=2, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.0001, verbose=False)
------------------------------------------------------------------------------------------
Training accuracy Score: 0.761297798377752
Model accuracy Score: 0.7778725824800911
------------------------------------------------------------------------------------------
Classification Matrix:
              precision    recall  f1-score   support

           0       0.88      0.81      0.84      2574
           1       0.57      0.70      0.63       942

    accuracy                           0.78      3516
   macro avg       0.73      0.75      0.74      3516
weighted avg       0.80      0.78      0.78      3516

-----------------------------------------------------------

### XGBoost

In [26]:
param_grid = {    
    'xgb__learning_rate': [0.1],
    'xgb__max_depth': [3, 9, 12],
    'xgb__min_child_weight': [10, 18],
    'xgb__subsample': [0.3, 0.9],
    'xgb__n_estimators': [5, 30, 100, 250],
}

get_pl('xgb', XGBClassifier(), param_grid, 'recall')

Grid Search found the following optimal parameters: 
xgb__learning_rate: 0.1
xgb__max_depth: 3
xgb__min_child_weight: 18
xgb__n_estimators: 5
xgb__subsample: 0.3

Training recall: 86.37%
Validation recall: 82.27%


In [27]:
# Create XBoost classifer object
clf_xgb = XGBClassifier(learning_rate=0.1, max_depth=3,min_child_weight=10,n_estimators=5,subsample= 0.3)
results(clf_xgb, X_train_smote, y_train_smote, X_test, y_test, 'features', 'fig2')

# Plot features importances
#imp = pd.Series(data=clf_xgb.feature_importances_, index=X.columns).sort_values(ascending=False)
#plt.figure(figsize=(10,12))
#plt.title("Feature importance")
#ax = sns.barplot(y=imp.index, x=imp.values,palette="coolwarm", orient='h')

------------------------------------------------------------------------------------------
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=10, missing=None, n_estimators=5, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.3, verbosity=1)
------------------------------------------------------------------------------------------
Training accuracy Score: 0.7800308999613751
Model accuracy Score: 0.7229806598407281
------------------------------------------------------------------------------------------
Classification Matrix:
              precision    recall  f1-score   support

           0       0.91      0.69      0.78      2574
           1       0.49      0.82      

### Random Forrest

parameters: 
* criterion':  loss function. gini  entropy
* n_estimator: The number of trees in the forrest.
* max_depth:  depth of a given tree 
* max_features: every split takes only subset of features. Thus, this will pick how many number of features 
               are tacken at every split [auto, none, sqrt, log2, 0.9....etc]
* min_samples_leaf: The minimum number of samples in newly created tree. 
* oob_score: out of bag score improves accuracy. 

param_grid = {    
    'rf__criterion': ['gini','entropy'],
    'rf__max_depth': [5, 9, 15, 20],
    'rf__max_features':['auto', 0.9, 0.5],
    'rf__min_samples_split': [5,20,50],
    'rf__min_samples_leaf': [15,20,30],
    'rf__n_estimators': [1,5,10]
}


In [28]:
# Pipeline par grid for SVC
param_grid = {    
    'rf__criterion': ['gini','entropy'],
    'rf__max_depth': [5, 9, 15, 20],
    'rf__max_features':['auto', 0.9, 0.5],
    'rf__min_samples_split': [5,20,50],
    'rf__min_samples_leaf': [3, 5, 10],
    'rf__n_estimators': [1,5,10]
}

get_pl('rf', RandomForestClassifier(), param_grid, 'recall')

Grid Search found the following optimal parameters: 
rf__criterion: 'gini'
rf__max_depth: 5
rf__max_features: 'auto'
rf__min_samples_leaf: 3
rf__min_samples_split: 20
rf__n_estimators: 10

Training recall: 85.86%
Validation recall: 76.33%


In [29]:
# let's try the best random forrest 
clf_rf = RandomForestClassifier(criterion='gini' , max_depth =5, max_features= 'auto', 
                                min_samples_leaf = 10, min_samples_split= 50, n_estimators=10)
results(clf_rf, X_train_smote, y_train_smote, X_test, y_test, 'features', 'fig2')

# Plot features importances
#imp = pd.Series(data=clf_rf.feature_importances_, index=X.columns).sort_values(ascending=False)
#plt.figure(figsize=(10,12))
#plt.title("Feature importance")
#ax = sns.barplot(y=imp.index, x=imp.values,palette="coolwarm", orient='h')

------------------------------------------------------------------------------------------
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=50,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
------------------------------------------------------------------------------------------
Training accuracy Score: 0.7993433758207802
Model accuracy Score: 0.7352104664391353
------------------------------------------------------------------------------------------
Classification Matrix:
              precision    recall  f1-score   support

           0       0.91      0.71      0.80      2574
           1       0.

In [None]:
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from sklearn import tree
from IPython.display import Image  
import pydotplus
from IPython.display import SVG,display
from graphviz import Source

clf_rf = RandomForestClassifier(criterion='gini' , max_depth =5, max_features= 'auto', 
                                min_samples_leaf = 10, min_samples_split= 50, n_estimators=10)
clf_rf.fit(X_train_smote, y_train_smote)
estimated_tree = clf_rf.estimators_[9]
graph = Source(tree.export_graphviz(estimated_tree,out_file=None,
                                    rounded=True,proportion = False,
                        feature_names = X.columns, 
                        precision  = 2,
                        class_names=["0","1"],
                        filled = True))
#graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
#graph.write_png('img/rf.png')
#Image(graph.create_png())
display(graph)

## Decision Tree

In [30]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier(criterion="entropy", max_depth=5)
results(clf, X_train_smote, y_train_smote, X_test, y_test, 'features', 'fig2')

------------------------------------------------------------------------------------------
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
------------------------------------------------------------------------------------------
Training accuracy Score: 0.7999227500965623
Model accuracy Score: 0.7482935153583617
------------------------------------------------------------------------------------------
Classification Matrix:
              precision    recall  f1-score   support

           0       0.89      0.75      0.81      2574
           1       0.52      0.74      0.61       942

    accuracy                           0.75      3

In [None]:
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image  
import pydotplus

col_names = list(X.columns)
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True,feature_names = col_names,class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('img/tree1.png')
Image(graph.create_png())

## Feature engineering
Can we change or remove columns to remove colliniarity and improve accuracy 

### MLE

In [31]:
import statsmodels.api as sm
# statsmodels has inbuilt MLE methods and easy to get the most important features based on p-values
# Create intercept term required for sm.Logit, see documentation for more information
X_sm = X
y_sm = y
X_sm = sm.add_constant(X_sm)

# Fit model
logit_model = sm.Logit(y_sm, X_sm)

# Get results of the fit
result = logit_model.fit()
result.summary()


Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.



Optimization terminated successfully.
         Current function value: 0.414269
         Iterations 8


0,1,2,3
Dep. Variable:,Churn,No. Observations:,7032.0
Model:,Logit,Df Residuals:,7008.0
Method:,MLE,Df Model:,23.0
Date:,"Tue, 02 Jun 2020",Pseudo R-squ.:,0.2845
Time:,12:22:58,Log-Likelihood:,-2913.1
converged:,True,LL-Null:,-4071.7
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.2818,0.232,-5.528,0.000,-1.736,-0.827
gender,-0.0218,0.065,-0.337,0.736,-0.149,0.105
SeniorCitizen,0.2168,0.085,2.564,0.010,0.051,0.382
Partner,-0.0004,0.078,-0.005,0.996,-0.153,0.152
Dependents,-0.1485,0.090,-1.655,0.098,-0.324,0.027
tenure,-0.0606,0.006,-9.716,0.000,-0.073,-0.048
PhoneService,0.1715,0.649,0.264,0.792,-1.100,1.443
MultipleLines,0.4484,0.177,2.530,0.011,0.101,0.796
OnlineSecurity,-0.2054,0.179,-1.150,0.250,-0.556,0.145


In [None]:
X_sm = X_sm[['gender', 'Partner', 'Dependents', 'PhoneService', 'OnlineSecurity', 
             'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
             'StreamingMovies', 'MonthlyCharges','PaymentMethod_Credit_card',
            'PaymentMethod_Mailed_check']]
display(X_sm.columns)

In [None]:
X_sm_train, X_sm_test, y_sm_train, y_sm_test = train_test_split(X_sm,y_sm,test_size=split_size,random_state=42)
smote = SMOTE()
X_sm_train_smote, y_sm_train_smote = smote.fit_sample(X_sm_train, y_sm_train) 
print(pd.Series(y_train_smote).value_counts())

In [None]:
# LogisticRegression
name = "Logistic Regression"
logit = LogisticRegression(C=0.01, fit_intercept= False, max_iter= 200, penalty= 'l2', 
                              random_state= 11, solver= 'liblinear')
model1 = model_report(logit, X_sm_train_smote,X_sm_test,y_sm_train_smote,y_sm_test,name)

name = "Random Forest Classifier"
rfc = RandomForestClassifier(criterion='gini' , max_depth =5, max_features= 'auto', 
                                min_samples_leaf = 10, min_samples_split= 50, n_estimators=10)
model2 = model_report(rfc,X_sm_train_smote,X_sm_test,y_sm_train_smote,y_sm_test,name)

name = "SVM Classifier Linear"
svc  = SVC(kernel ='poly', C=0.1, degree= 3,  tol = 0.0001, gamma = 'auto')
model3 = model_report(svc,X_sm_train_smote,X_sm_test,y_sm_train_smote,y_sm_test,name)

#name = "XGBoost Classifier"
#xgc = XGBClassifier(learning_rate=0.1, max_depth=3,min_child_weight=10,n_estimators=5,subsample= 0.3)
#model4 = model_report(xgc,X_sm_train_smote,X_sm_test,y_sm_train_smote,y_sm_test,name)

#concat all models ,model4
model_performances = pd.concat([model1,model2,model3],axis = 0).reset_index()

model_performances = model_performances.drop(columns = "index",axis =1)
display(model_performances)

plot_tracers(model_performances)


In [None]:
# Create Decision Tree classifer object
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(criterion='gini' , max_depth =5, max_features= 'auto', 
                                min_samples_leaf = 10, min_samples_split= 50, n_estimators=10)
results(forest, X_sm_train_smote, y_sm_train_smote, X_sm_test, y_sm_test, 'features', 'fig2')

### Recursive feature elimination

In [None]:
X.columns

In [None]:
from sklearn.feature_selection import RFE

logit = LogisticRegression()

rfe = RFE(logit,10)
rfe = rfe.fit(X_train_smote, y_train_smote.ravel()) # .values.ravel()

rfe.support_
rfe.ranking_

#identified columns Recursive Feature Elimination
idc_rfe = pd.DataFrame({"rfe_support" :rfe.support_,
                       "columns" : list(X.columns),
                       "ranking" : rfe.ranking_,
                      })
display(idc_rfe.sort_values(by=['ranking']))
cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist()
#display(X[cols])

#separating train and test data for selected cols 
X_rfe = X[cols]
                    
X_rfe_train, X_rfe_test, y_rfe_train, y_rfe_test = train_test_split(X_rfe,y,test_size=split_size,random_state=42)
smote = SMOTE()
X_rfe_train_smote, y_rfe_train_smote = smote.fit_sample(X_rfe_train, y_rfe_train) 

logit_rfe = LogisticRegression(C=0.001, fit_intercept= False, max_iter= 200, penalty= 'l2', 
                              random_state= 11, solver= 'liblinear')
#applying model
results(logit_rfe, X_rfe_train_smote, y_rfe_train_smote, X_rfe_test, y_rfe_test, 'coefficients', None)
