# Modeling

Here I'll try to create a model which can predict a better results.<br>
**Better results** ==
1. The model should predict as many churned customers as possible.
2. FN (False Negative) must be low
3. Recall is important here and must be high
* If model's F1 score is high, then the model is doing well all around.

In [1]:
# import important libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# for modeling
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import roc_curve, auc, accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import plot_confusion_matrix
#import xgboost as xgb
from sklearn.svm import SVC


# if we want to see all columns, we set this parametr on
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# use custom function
%run -i 'py/dataframecheck.py'

# set style for plots
sns.set(style="whitegrid")

In [2]:
# read dataset
df = pd.read_csv("data/mod_churn.csv")

To prepare the dataset for modeling, we need to encode categorical features to numbers. This means encoding "Yes", "No" to 0 and 1 so that algorithm can work with the data. This process is called onehot encoding.

In [3]:
# define caterogical columns and numeric columns
cat_cols=['gender', 'seniorcitizen', 'partner', 'dependents',
       'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity',
       'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv',
       'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod']
numeric_cols=['tenure','monthlycharges', 'totalcharges']

In [4]:
# One-hot encode
df = pd.get_dummies(df)

In [5]:
df

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges,churn,partner_No,partner_Yes,dependents_No,dependents_Yes,multiplelines_No,multiplelines_No phone service,multiplelines_Yes,internetservice_DSL,internetservice_Fiber optic,internetservice_No,onlinebackup_No,onlinebackup_No internet service,onlinebackup_Yes,deviceprotection_No,deviceprotection_No internet service,deviceprotection_Yes,techsupport_No,techsupport_No internet service,techsupport_Yes,streamingtv_No,streamingtv_No internet service,streamingtv_Yes,contract_Month-to-month,contract_One year,contract_Two year,paperlessbilling_No,paperlessbilling_Yes,paymentmethod_Bank transfer (automatic),paymentmethod_Credit card (automatic),paymentmethod_Electronic check,paymentmethod_Mailed check
0,0,1,29.85,29.85,0,0,1,1,0,0,1,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0
1,0,34,56.95,1889.50,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,0,0,1
2,0,2,53.85,108.15,1,1,0,1,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1
3,0,45,42.30,1840.75,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,0,1,0,1,0,1,0,0,0
4,0,2,70.70,151.65,1,1,0,1,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,24,84.80,1990.50,0,0,1,0,1,0,0,1,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,0,1
7028,0,72,103.20,7362.90,0,0,1,0,1,0,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,0,1,0,1,0,0,1,0,1,0,0
7029,0,11,29.60,346.45,0,0,1,0,1,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0
7030,1,4,74.40,306.60,1,0,1,1,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1


In [6]:
# z-score normalizing: SScaler
df_z = df.fillna(value=0) 
for col in df_z.columns:
    df_z[col] = (df - df.mean())/df.std()

df_z.head()

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges,churn,partner_No,partner_Yes,dependents_No,dependents_Yes,multiplelines_No,multiplelines_No phone service,multiplelines_Yes,internetservice_DSL,internetservice_Fiber optic,internetservice_No,onlinebackup_No,onlinebackup_No internet service,onlinebackup_Yes,deviceprotection_No,deviceprotection_No internet service,deviceprotection_Yes,techsupport_No,techsupport_No internet service,techsupport_Yes,streamingtv_No,streamingtv_No internet service,streamingtv_Yes,contract_Month-to-month,contract_One year,contract_Two year,paperlessbilling_No,paperlessbilling_Yes,paymentmethod_Bank transfer (automatic),paymentmethod_Credit card (automatic),paymentmethod_Electronic check,paymentmethod_Mailed check
0,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296
1,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296
2,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296
3,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296
4,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296,-0.440296


How we can see this normalization can't be helpfull

In [7]:
# min-max scaling for normalize columns MinMaxScaler
df_m = df.fillna(value=0) 
for col in df_m.columns:
    df_m[col] = (df_m[col] - min(df_m[col]))/ (max(df_m[col]) - min(df_m[col])) 

df_m.head()

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges,churn,partner_No,partner_Yes,dependents_No,dependents_Yes,multiplelines_No,multiplelines_No phone service,multiplelines_Yes,internetservice_DSL,internetservice_Fiber optic,internetservice_No,onlinebackup_No,onlinebackup_No internet service,onlinebackup_Yes,deviceprotection_No,deviceprotection_No internet service,deviceprotection_Yes,techsupport_No,techsupport_No internet service,techsupport_Yes,streamingtv_No,streamingtv_No internet service,streamingtv_Yes,contract_Month-to-month,contract_One year,contract_Two year,paperlessbilling_No,paperlessbilling_Yes,paymentmethod_Bank transfer (automatic),paymentmethod_Credit card (automatic),paymentmethod_Electronic check,paymentmethod_Mailed check
0,0.0,0.0,0.115423,0.001275,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.464789,0.385075,0.215867,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.014085,0.354229,0.01031,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.0,0.619718,0.239303,0.210241,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.014085,0.521891,0.01533,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [8]:
# log transformation foe numeric columns
#df_l = df.fillna(value=0)

#for col in numeric_cols:
#    df_l[col] = np.diff(np.log(df_l.loc[col])) 

#df_l.head() 
    
#    np.diff(np.log(df.price))

## Create a baseline model

In [9]:
def evaluate_predictions(model, X_test, y_train, y_hat_train,y_test, y_hat_test):
    # Evaluate predictions
    print('-'*40)
    print('Accuracy score for Training Dataset = ', accuracy_score(y_train, y_hat_train))
    print('Accuracy score for Testing Dataset = ', accuracy_score(y_test, y_hat_test))


    print('-'*40)
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_hat_test))

    print('-'*40)
    print('Classification Matrix:')
    print(classification_report(y_test, y_hat_test))
    
    plot_confusion_matrix(model, X_test, y_test,cmap='bone_r')  
    
    fpr, tpr, thresholds = roc_curve(y_test, y_hat_test)
    print('AUC: {}'.format(auc(fpr, tpr)))

In [10]:
# Split data into X and y
y=df['churn'].copy()

# Define X
X = df.drop(columns=['churn'], axis=1)

# Split the data into a training and a test set and set stratify=y to help with imbalance data
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4,
                                                 random_state=42)

In [11]:
classifier_list = [ LogisticRegression(solver='lbfgs'),
                    KNeighborsClassifier(),
                    GaussianNB(priors=None),
                    RandomForestClassifier()]

for clf in classifier_list:
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    precision = precision_score(y_test, predictions) 
    recall=recall_score(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)
    
    
# Precision_score = tp / (tp + fp)
# Accuracy_score = (# of correctly assigned rows) / (All rows)

    print(#clf, '\n \n',classification_report(y_test, predictions), 
          #'\n \nPrecision Score: ' , precision,
          #'\nRecall Score: ' , recall,
          '\nAccuracy Score: ', accuracy)#,
          #'\n\n----------------------------------------------------------------\n\n')



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



Accuracy Score:  0.8023462495556346

Accuracy Score:  0.7721293992179168

Accuracy Score:  0.710273729114824

Accuracy Score:  0.7817276928546036


In [12]:

LR: 0.802803 (0.014983)
LDA: 0.794035 (0.018261)
KNN: 0.766537 (0.022790)
CART: 0.730984 (0.014947)
NB: 0.752548 (0.017705)
SVM: 0.767720 (0.010604)

TypeError: 'float' object is not callable

### Build a logistic regression base model using statsmodels

In [13]:
# Split data into X and y
y=df_m['churn'].copy()

# Define X
X = df_m.drop(columns=['churn'], axis=1)

# Split the data into a training and a test set and set stratify=y to help with imbalance data
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4,
                                                 random_state=42)

In [14]:
# define y
y=df_m['churn'].copy()

# Define X
X = df_m.drop(columns=['churn'], axis=1)

# Create intercept term required for sm.Logit, see documentation for more information
X = sm.add_constant(X)

# Fit model
logit_model = sm.Logit(y, X)

# Get results of the fit
result = logit_model.fit()
result.summary()

NameError: name 'sm' is not defined

In [None]:
# modeling with LogesticREgression
#create an instance and fit the model 
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)

#predictions
y_hat_train=logmodel.predict(X_train)
y_hat_test = logmodel.predict(X_test)

evaluate_predictions(logmodel, X_test, y_train, y_hat_train,y_test, y_hat_test)


## Try SMOTE for normalize dataset

In [None]:
print('Original class distribution: \n')
print(y.value_counts())
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_sample(X_train, y_train) 
# Preview synthetic sample class distribution
print('-----------------------------------------')
print('Synthetic sample class distribution: \n')
print(pd.Series(y_train_resampled).value_counts())

In [None]:
logmodel = LogisticRegression()
logmodel.fit(X_train_resampled, y_train_resampled)

#predictions
y_hat_train=logmodel.predict(X_train_resampled)
y_hat_test = logmodel.predict(X_test)

evaluate_predictions(logmodel, X_test, y_train_resampled, y_hat_train,y_test, y_hat_test)


In [None]:
logmodel = LogisticRegression(fit_intercept=False, C=1e12, solver='liblinear')
logmodel.fit(X_train_resampled, y_train_resampled)

#predictions
y_hat_train=logmodel.predict(X_train_resampled)
y_hat_test = logmodel.predict(X_test)

# Evaluate predictions
print('-'*40)
print('Accuracy score for Training Dataset = ', accuracy_score(y_train_resampled, y_hat_train))
print('Accuracy score for Testing Dataset = ', accuracy_score(y_test, y_hat_test))


print('-'*40)
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_hat_test))

print('-'*40)
print('Classification Matrix:')
print(classification_report(y_test, y_hat_test))

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_hat_test)
print('AUC: {}'.format(auc(fpr, tpr)))

In [None]:
# To get the weights of all the variables
weights_LogReg = pd.Series(logmodel.coef_[0], index=X.columns.values)
print(weights_LogReg)
plt.figure(figsize=(6, 15))
weights_LogReg.sort_values().plot(kind='barh')

As we can see, some variables are negatively correlated with the predicted variable (Churn), while some have positively. Negative correlation means that likeliness of churn decreases with that variable.<br>
Let me interpret some findings from above plot:<br>
As we have seen in our EDA, having a 2 month contract reduces chances of churn. 2 month contract along with tenure have the most negative relation with Churn as predicted by logistic regressions. Total charges, monthly contracts, fibre optic internet services and seniority can lead to higher churn rates. This is interesting because although fibre optic services are faster, customers are likely to churn because of it. I don't understad why this is happening.<br>
Till the time let's have a look at other algorithms.

## Conclusions

Using SMOTE we can improve our score
__________

## Try to Decision Tree Model
Our dayta set nas a lot of Binary columns (0/1).

In [None]:
# modeling with Decision Tree
#create an instance and fit the model 
treemodel = DecisionTreeClassifier(random_state=123)
treemodel.fit(X_train, y_train)

#predictions
y_hat_train= treemodel.predict(X_train)
y_hat_test = treemodel.predict(X_test)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Evaluate predictions
print('-'*40)
print('Accuracy score for Training Dataset = ', accuracy_score(y_train, y_hat_train))
print('Accuracy score for Testing Dataset = ', accuracy_score(y_test, y_hat_test))


print('-'*40)
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_hat_test))

print('-'*40)
print('Classification Matrix:')
print(classification_report(y_test, y_hat_test))

Accuracy score for Training Dataset =  0.9992889310263096 >> then Accuracy score for Testing Dataset =  0.7163170991823676 
It show us oversampling

In [None]:
print('Original class distribution: \n')
print(y.value_counts())
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_sample(X_train, y_train) 
# Preview synthetic sample class distribution
print('-----------------------------------------')
print('Synthetic sample class distribution: \n')
print(pd.Series(y_train_resampled).value_counts())

In [None]:
treemodel = DecisionTreeClassifier(random_state=123)
treemodel.fit(X_train_resampled, y_train_resampled)

#predictions
y_hat_train=treemodel.predict(X_train_resampled)
y_hat_test = treemodel.predict(X_test)

# Evaluate predictions
print('-'*40)
print('Accuracy score for Training Dataset = ', accuracy_score(y_train_resampled, y_hat_train))
print('Accuracy score for Testing Dataset = ', accuracy_score(y_test, y_hat_test))


print('-'*40)
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_hat_test))

print('-'*40)
print('Classification Matrix:')
print(classification_report(y_test, y_hat_test))

In [None]:
The same ploblem. 

In [None]:
randommodel = RandomForestClassifier(random_state=123)
randommodel.fit(X_train, y_train)

#predictions
y_hat_train=randommodel.predict(X_train_resampled)
y_hat_test = randommodel.predict(X_test)

# Evaluate predictions
print('-'*40)
print('Accuracy score for Training Dataset = ', accuracy_score(y_train_resampled, y_hat_train))
print('Accuracy score for Testing Dataset = ', accuracy_score(y_test, y_hat_test))


print('-'*40)
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_hat_test))

print('-'*40)
print('Classification Matrix:')
print(classification_report(y_test, y_hat_test))

In [None]:
rfc = RandomForestClassifier()
param_grid = {
     'criterion':['gini','entropy'],
    'max_depth':[2,3,4,5,20],
    'min_samples_split':[5,20,50],
    'min_samples_leaf':[15,20,30],
    'n_estimators': [1,5,10]
}
gs = GridSearchCV(rfc, param_grid, cv=3, n_jobs=-1)
gs.fit(X_train, y_train)

gs.best_params_

In [None]:
 'n_estimators': [1,5,10]

In [None]:
rfc = RandomForestClassifier(criterion='entropy', random_state=44, min_samples_leaf = 20, min_samples_split = 20, max_depth = 20,n_estimators=10)
rfc.fit(X_train, y_train)

In [None]:
#predictions
y_hat_train=rfc.predict(X_train)
y_hat_test = rfc.predict(X_test)

# Evaluate predictions
print('-'*40)
print('Accuracy score for Training Dataset = ', accuracy_score(y_train, y_hat_train))
print('Accuracy score for Testing Dataset = ', accuracy_score(y_test, y_hat_test))


print('-'*40)
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_hat_test))

print('-'*40)
print('Classification Matrix:')
print(classification_report(y_test, y_hat_test))

In [None]:
rfc = RandomForestClassifier(criterion='entropy', random_state=44, min_samples_leaf = 3, min_samples_split = 10, max_depth = None)
rfc.fit(X_train_resampled, y_train_resampled)

In [None]:
#predictions
y_hat_train=rfc.predict(X_train_resampled)
y_hat_test = rfc.predict(X_test)

# Evaluate predictions
print('-'*40)
print('Accuracy score for Training Dataset = ', accuracy_score(y_train_resampled, y_hat_train))
print('Accuracy score for Testing Dataset = ', accuracy_score(y_test, y_hat_test))


print('-'*40)
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_hat_test))

print('-'*40)
print('Classification Matrix:')
print(classification_report(y_test, y_hat_test))

# SVC

In [None]:
from sklearn.svm import SVC
svcmodel = SVC(gamma='auto')
svcmodel.fit(X_train, y_train)

#predictions
y_hat_train= svcmodel.predict(X_train_resampled)
y_hat_test = svcmodel.predict(X_test)

# Evaluate predictions
print('-'*40)
print('Accuracy score for Training Dataset = ', accuracy_score(y_train_resampled, y_hat_train))
print('Accuracy score for Testing Dataset = ', accuracy_score(y_test, y_hat_test))


print('-'*40)
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_hat_test))

print('-'*40)
print('Classification Matrix:')
print(classification_report(y_test, y_hat_test))

In [None]:
print('Original class distribution: \n')
print(y.value_counts())
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_sample(X_train, y_train) 
# Preview synthetic sample class distribution
print('-----------------------------------------')
print('Synthetic sample class distribution: \n')
print(pd.Series(y_train_resampled).value_counts())

In [None]:
from sklearn.svm import SVC
svcmodel = SVC(gamma='auto')
svcmodel.fit(X_train_resampled, y_train_resampled)

#predictions
y_hat_train= svcmodel.predict(X_train_resampled)
y_hat_test = svcmodel.predict(X_test)

# Evaluate predictions
print('-'*40)
print('Accuracy score for Training Dataset = ', accuracy_score(y_train_resampled, y_hat_train))
print('Accuracy score for Testing Dataset = ', accuracy_score(y_test, y_hat_test))


print('-'*40)
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_hat_test))

print('-'*40)
print('Classification Matrix:')
print(classification_report(y_test, y_hat_test))

In [None]:
svcmodel = SVC()
param_grid = {
    'C': [1,10,20],
    'kernel': ['rbf','linear']
}
gs = GridSearchCV(svcmodel, param_grid)
gs.fit(X_train, y_train)

gs.best_params_

In [None]:
svc_ = SVC(C=1, kernel='linear')
svc_.fit(X_train, y_train)

In [None]:
#predictions
y_hat_train= svc_.predict(X_train)
y_hat_test = svc_.predict(X_test)

# Evaluate predictions
print('-'*40)
print('Accuracy score for Training Dataset = ', accuracy_score(y_train, y_hat_train))
print('Accuracy score for Testing Dataset = ', accuracy_score(y_test, y_hat_test))


print('-'*40)
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_hat_test))

print('-'*40)
print('Classification Matrix:')
print(classification_report(y_test, y_hat_test))

In [None]:
from sklearn.model_selection import train_test_split

# Split data into X and y
y=df['churn_Yes'].copy()

# Define X

X = df.drop(columns=['churn_Yes'], axis=1)


# Split the data into a training and a test set and set stratify=y to help with imbalance data
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4,
                                                 random_state=42,stratify=y)

df.head()

In [None]:
print(len(X_train), len(X_test))

In [None]:
# modeling with LogesticREgression
#create an instance and fit the model 
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)

#predictions
y_hat_train=logmodel.predict(X_train)
y_hat_test = logmodel.predict(X_test)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Evaluate predictions
print('-'*40)
print('Accuracy score for Training Dataset = ', accuracy_score(y_train, y_hat_train))
print('Accuracy score for Testing Dataset = ', accuracy_score(y_test, y_hat_test))


print('-'*40)
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_hat_test))

print('-'*40)
print('Classification Matrix:')
print(classification_report(y_test, y_hat_test))

In [None]:
#Converting Gender column to numeric value
#datset_churn['Gender'].unique() # Print unique values in the column
df['gender_num'] = df['gender'].map( {'Female': 1, 'Male': 0} ).astype(int) #Map Categorical to Numerical Values
# For Partner & Dependant , we created Family Column . Converting Family column to numeric value
#datset_churn['Family'].unique() # Print unique values in the column
df['family_num'] = df['partner'].map( {'Yes': 1, 'No': 0} ).astype(int) #Map Categorical to Numerical Values
df.head()

In [None]:
params = {'random_state': 0, 'n_jobs': 4, 'n_estimators': 5000, 'max_depth': 8}
# One-hot encode
df = pd.get_dummies(df)

df

In [None]:
# Split data into X and y
y=df['churn_Yes'].copy()

# Define X

x = df.drop(columns=['churn_Yes'], axis=1)

# Fit RandomForest Classifier
clf = RandomForestClassifier(**params)
clf = clf.fit(x, y)

# Plot features importances
imp = pd.Series(data=clf.feature_importances_, index=x.columns).sort_values(ascending=False)
plt.figure(figsize=(10,12))
plt.title("Feature importance")
ax = sns.barplot(y=imp.index, x=imp.values, palette="Blues_d", orient='h')



In [None]:
# Split data into X and y
y=df['churn_Yes'].copy()

# Define X

x = df.drop(columns=['churn_Yes'], axis=1)

# Fit RandomForest Classifier
clf = RandomForestClassifier()
clf = clf.fit(x, y)

# Plot features importances
imp = pd.Series(data=clf.feature_importances_, index=x.columns).sort_values(ascending=False)
plt.figure(figsize=(10,12))
plt.title("Feature importance")
ax = sns.barplot(y=imp.index, x=imp.values, palette="Blues_d", orient='h')



In [None]:
# Evaluate predictions
print('-'*40)
print('Accuracy score for Training Dataset = ', accuracy_score(y_train_resampled, y_hat_train))
print('Accuracy score for Testing Dataset = ', accuracy_score(y_test, y_hat_test))


print('-'*40)
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_hat_test))

print('-'*40)
print('Classification Matrix:')
print(classification_report(y_test, y_hat_test))

In [None]:
imp

In [None]:
df2 = pd.read_csv("data/clean_churn.csv")

df2.replace(to_replace='Yes', value=1, inplace=True)
df2.replace(to_replace='No', value=0, inplace=True)
df2['gender'].replace(to_replace='Male', value=1, inplace=True)
df2['gender'].replace(to_replace='Female', value=0, inplace=True)

In [None]:
df2.tail(3)

In [None]:
new_df = df2.iloc[:, :-3]
df_cat = pd.get_dummies(new_df)
df_cat.head(7)

In [None]:
df_cat.shape

In [None]:
# Sklearn imports
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

#from xgboost import XGBClassifier

In [None]:
X = df_cat
y = df2['churn'].values

features = X.columns.values
scaler = MinMaxScaler(feature_range = (0, 1))
scaler.fit(X)
X = pd.DataFrame(scaler.transform(X))
X.columns = features

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

pred_log_reg = log_reg.predict(X_test)
print('Accuracy:', metrics.accuracy_score(y_test, pred_log_reg), '\n')

# To get the weights of all the variables
weights_LogReg = pd.Series(log_reg.coef_[0], index=X.columns.values)
print(weights_LogReg)
plt.figure(figsize=(6, 15))
weights_LogReg.sort_values().plot(kind='barh')

As we can see, some variables are negatively correlated with the predicted variable (Churn), while some have positively. Negative correlation means that likeliness of churn decreases with that variable.

Let me interpret some findings from above plot,

As we have seen in our EDA, having a 2 month contract reduces chances of churn. 2 month contract along with tenure have the most negative relation with Churn as predicted by logistic regressions.
Total charges, monthly contracts, fibre optic internet services and seniority can lead to higher churn rates. This is interesting because although fibre optic services are faster, customers are likely to churn because of it.
I don't understad why this is happening.

Till the time let's have a look at other algorithms.

In [None]:
rf = RandomForestClassifier(n_estimators=1000,
                            oob_score = True,
                            n_jobs = -1,
                            random_state =50,
                            max_features = "auto",
                            max_leaf_nodes = 30)
rf.fit(X_train, y_train)

# Make predictions
pred_rf = rf.predict(X_test)
print('Accuracy:', metrics.accuracy_score(y_test, pred_rf), '\n')
importances = rf.feature_importances_
weights_RanFor = pd.Series(importances,
                 index=X.columns.values)
print(weights_RanFor)
plt.figure(figsize=(6, 15))
weights_RanFor.sort_values().plot(kind = 'barh')

From random forest algorithm, monthly contract, tenure and total charges are the most important predictor variables to predict churn.
The results from random forest are very similar to that of the logistic regression and in line to what we had expected from our EDA

In [None]:
model_AdaB = AdaBoostClassifier()
model_AdaB.fit(X_train,y_train)
preds_adaB = model_AdaB.predict(X_test)
metrics.accuracy_score(y_test, preds_adaB)
print('Accuracy:', metrics.accuracy_score(y_test, preds_adaB), '\n')
importances = model_AdaB.feature_importances_
weights_AdaB = pd.Series(importances,
                 index=X.columns.values)
print(weights_AdaB)
plt.figure(figsize=(6, 15))
weights_RanFor.sort_values().plot(kind = 'barh')

In [None]:
model_svm = SVC(kernel='linear') 
model_svm.fit(X_train, y_train)
pred_svm = model_svm.predict(X_test)
print('Accuracy:', metrics.accuracy_score(y_test, pred_svm), '\n')

feature_importance = list(zip(model_svm.coef_[0], X.columns.values))
print(feature_importance)

With SVM-Linear Kernel, I am able to achieve the slightly higher than all accuracy on test data to almost 80.19%.

In [None]:
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
#from io import StringIO
from IPython.display import Image
import pydotplus
clf= DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_hat_test = clf.predict(X_test)
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True,feature_names = df[X_train].columns)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('churn.png')
Image(graph.create_png())

In [None]:
!pip3 install sklearn.externals.six

In [None]:
!pip install scikit-learn==0.22

In [None]:
#Convertin the predictor variable in a binary numeric variable
df['churn'].replace(to_replace='Yes', value=1, inplace=True)
df['churn'].replace(to_replace='No',  value=0, inplace=True)

#Let's convert all the categorical variables into dummy variables
df_dummies = pd.get_dummies(df)
df_dummies.head()

# We will use the data frame where we had created dummy variables
y = df_dummies['churn'].values
X = df_dummies.drop(columns = ['churn'])

# Scaling all the variables to a range of 0 to 1
from sklearn.preprocessing import MinMaxScaler
features = X.columns.values
scaler = MinMaxScaler(feature_range = (0,1))
scaler.fit(X)
X = pd.DataFrame(scaler.transform(X))
X.columns = features

In [None]:
# Create Train & Test Data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [None]:
# Running logistic regression model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear')
result = model.fit(X_train, y_train)

In [None]:
result

In [None]:
from sklearn import metrics
prediction_test = model.predict(X_test)
# Print the prediction accuracy
print (metrics.accuracy_score(y_test, prediction_test))

In [None]:
#predictions
y_hat_train=model.predict(X_train)
y_hat_test = model.predict(X_test)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Evaluate predictions
print('-'*40)
print('Accuracy score for Training Dataset = ', accuracy_score(y_train, y_hat_train))
print('Accuracy score for Testing Dataset = ', accuracy_score(y_test, y_hat_test))


print('-'*40)
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_hat_test))

print('-'*40)
print('Classification Matrix:')
print(classification_report(y_test, y_hat_test))

In [None]:

#scale numeric features 
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
# Scale the train and test data
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [None]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 41)

smote = SMOTE()

X_train_resampled, y_train_resampled = sm.fit_sample(X_train,y_train)
X_train_resampled = pd.DataFrame(data=X_train_resampled,columns=columns )
y_train_resampled = pd.DataFrame(data=y_train_resampled,columns=["churn"])
# we can Check the numbers of our data 

X_train_resampled, y_train_resampled = smote.fit_sample(X_train, y_train) 
print(pd.Series(y_train_resampled).value_counts())

In [None]:
# read dataset
df = pd.read_csv("data/clean_churn.csv")

In [None]:
X = df.iloc[:,1:16].values # Feature Variable
y = df.iloc[:,16].values # Target Variable

#Dividing data into test & train splitting 70% data for training anf 30% for test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
print('There are {} samples in the training set and {} samples in the test set'.format(X_train.shape[0], X_test.shape[0]))

In [None]:
# Making a list of all classifiers
classifier_model = [LogisticRegression(),KNeighborsClassifier(),GaussianNB(),SVC(),DecisionTreeClassifier(),RandomForestClassifier(), SGDClassifier(), AdaBoostClassifier()]

# Creating empty list to store the performance details
classifier_model_list= []
classifier_accuracy_test = []
classifier_accuracy_train = []
f1score = []
precisionscore = []
recallscore = []
avg_pre_rec_score = []
cv_score = []

for classifier_list in classifier_model:
    classifier = classifier_list
 
    # Fitting the training set into classification model
    classifier.fit(X_train,y_train)
    
    # Predicting the output on test datset
    y_pred_test = classifier.predict(X_test)    
    score_test = accuracy_score(y_test, y_pred_test)
    
    # Predicting the output on training datset
    y_pred_train = classifier.predict(X_train) 
    score_train = accuracy_score(y_train, y_pred_train)
    
    # Cross Validation Score on training test
    scores = cross_val_score(classifier, X_train,y_train, cv=10)
    cv_score.append(scores.mean())
    
    #Keeping the model and accuracy score into a list
    classifier_model_list.append(classifier_list.__class__.__name__)
    classifier_accuracy_test.append(round(score_test,4))
    classifier_accuracy_train.append(round(score_train,4))
    
    #Precision, Recall and F1 score
    f1score.append(f1_score(y_test, y_pred_test))
    precisionscore.append(precision_score(y_test, y_pred_test))
    recallscore.append(recall_score(y_test, y_pred_test))
    
    #Calculating Average Precision Recall Score
    try:
        y_pred_score = classifier.decision_function(X_test)
    except:
        y_pred_score = classifier.predict_proba(X_test)[:,1]
    
    from sklearn.metrics import average_precision_score
    average_precision = average_precision_score(y_test, y_pred_score)
    avg_pre_rec_score.append(average_precision)
    
    
    #Confusion Matrix
    plot_confusion_matrix(classifier_list.__class__.__name__, y_test, y_pred_test)
    plot_prec_rec_curve(classifier_list.__class__.__name__, y_test, y_pred_score)