# Notebook - Modelling

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import env
import acquire

-  Acquire telco_churn data

In [2]:
df = acquire.get_telco_churn_data()

- Set index to customer_id in order to retain individual customer information

In [3]:
df = df.set_index('customer_id')

In [4]:
df.head()

Unnamed: 0_level_0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract_type_id,paperless_billing,payment_type_id,monthly_charges,total_charges,churn
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,1,No,Yes,No,Yes,Yes,No,2,Yes,2,65.6,593.3,No
0003-MKNFE,Male,0,No,No,9,Yes,Yes,1,No,No,No,No,No,Yes,1,No,2,59.9,542.4,No
0004-TLHLJ,Male,0,No,No,4,Yes,No,2,No,No,Yes,No,No,No,1,Yes,1,73.9,280.85,Yes
0011-IGKFF,Male,1,Yes,No,13,Yes,No,2,No,Yes,Yes,No,Yes,Yes,1,Yes,1,98.0,1237.85,Yes
0013-EXCHZ,Female,1,Yes,No,3,Yes,No,2,No,No,No,Yes,Yes,No,1,Yes,2,83.9,267.4,Yes


- Explore data

In [5]:
df.isnull().sum()

gender                      0
senior_citizen              0
partner                     0
dependents                  0
tenure                      0
phone_service               0
multiple_lines              0
internet_service_type_id    0
online_security             0
online_backup               0
device_protection           0
tech_support                0
streaming_tv                0
streaming_movies            0
contract_type_id            0
paperless_billing           0
payment_type_id             0
monthly_charges             0
total_charges               0
churn                       0
dtype: int64

In [6]:
df.describe()

Unnamed: 0,senior_citizen,tenure,internet_service_type_id,contract_type_id,payment_type_id,monthly_charges
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,0.162147,32.371149,1.872923,1.690473,2.315633,64.761692
std,0.368612,24.559481,0.737796,0.833755,1.148907,30.090047
min,0.0,0.0,1.0,1.0,1.0,18.25
25%,0.0,9.0,1.0,1.0,1.0,35.5
50%,0.0,29.0,2.0,1.0,2.0,70.35
75%,0.0,55.0,2.0,2.0,3.0,89.85
max,1.0,72.0,3.0,3.0,4.0,118.75


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7043 entries, 0002-ORFBO to 9995-HOTOH
Data columns (total 20 columns):
gender                      7043 non-null object
senior_citizen              7043 non-null int64
partner                     7043 non-null object
dependents                  7043 non-null object
tenure                      7043 non-null int64
phone_service               7043 non-null object
multiple_lines              7043 non-null object
internet_service_type_id    7043 non-null int64
online_security             7043 non-null object
online_backup               7043 non-null object
device_protection           7043 non-null object
tech_support                7043 non-null object
streaming_tv                7043 non-null object
streaming_movies            7043 non-null object
contract_type_id            7043 non-null int64
paperless_billing           7043 non-null object
payment_type_id             7043 non-null int64
monthly_charges             7043 non-null float64
total

In [8]:
df.size

140860

In [9]:
df.shape

(7043, 20)

- Change churn column from yes and no, to 1 and 0

In [10]:
df['churn'] = df['churn'].replace('Yes', 1)
df['churn'] = df['churn'].replace('No', 0)

In [11]:
df.head()

Unnamed: 0_level_0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract_type_id,paperless_billing,payment_type_id,monthly_charges,total_charges,churn
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,1,No,Yes,No,Yes,Yes,No,2,Yes,2,65.6,593.3,0
0003-MKNFE,Male,0,No,No,9,Yes,Yes,1,No,No,No,No,No,Yes,1,No,2,59.9,542.4,0
0004-TLHLJ,Male,0,No,No,4,Yes,No,2,No,No,Yes,No,No,No,1,Yes,1,73.9,280.85,1
0011-IGKFF,Male,1,Yes,No,13,Yes,No,2,No,Yes,Yes,No,Yes,Yes,1,Yes,1,98.0,1237.85,1
0013-EXCHZ,Female,1,Yes,No,3,Yes,No,2,No,No,No,Yes,Yes,No,1,Yes,2,83.9,267.4,1


- Change the total charges column from an object to a float, replace erroneous information with nan, and remove rows with nans

In [12]:
df.replace(r'^\s*$', np.nan, regex=True, inplace=True)
df["total_charges"] = df["total_charges"].astype('float')
df = df.dropna()

- Check to see how many rows were dropped.  10 Dropped.

In [13]:
df.shape

(7032, 20)

## Baseline Model (unscaled data)

- Split data between X and y

In [14]:
X = df[['senior_citizen', 'tenure', 'internet_service_type_id', 'contract_type_id', 'payment_type_id', 'monthly_charges', 'total_charges']]

In [15]:
y = df[['churn']]

- Split X and y into Train and Test data sets

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 123)

X_train.head()

Unnamed: 0_level_0,senior_citizen,tenure,internet_service_type_id,contract_type_id,payment_type_id,monthly_charges,total_charges
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
7718-RXDGG,0,15,2,1,3,68.6,1108.6
7649-SIJJF,0,71,1,3,2,80.1,5585.4
5061-PBXFW,0,32,1,1,3,61.4,1864.65
3011-WQKSZ,0,19,3,1,1,20.0,377.55
9959-WOFKT,0,71,2,3,3,106.7,7382.25


In [17]:
y_train.head()

Unnamed: 0_level_0,churn
customer_id,Unnamed: 1_level_1
7718-RXDGG,0
7649-SIJJF,0
5061-PBXFW,0
3011-WQKSZ,0
9959-WOFKT,0


- Create Decision Tree Classifier? object

In [18]:
clf = DecisionTreeClassifier(max_depth=4, random_state=123)

- Fit X and y train data to the Decision Tree Classifier object

In [19]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

- Predictions of churn or not churn based on X_train model

In [20]:
y_pred = clf.predict(X_train)
y_pred[0:50]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1])

- Prediction probabilities of churn or not churn based on X_train model

In [21]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba

array([[0.76290323, 0.23709677],
       [0.9888664 , 0.0111336 ],
       [0.76290323, 0.23709677],
       ...,
       [0.52147239, 0.47852761],
       [0.925     , 0.075     ],
       [0.925     , 0.075     ]])

- Accuracy of model

In [22]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.80


- Confusion Matrix (predictions on column heads, actuals on row)

In [23]:
confusion_matrix(y_train, y_pred)

array([[3742,  384],
       [ 760,  739]])

In [24]:
sorted(y_train.churn.unique())

[0, 1]

In [25]:
y_train.churn.value_counts()

0    4126
1    1499
Name: churn, dtype: int64

- Pretty version of confusion matrix

In [26]:
labels = sorted(y_train.churn.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,3742,384
1,760,739


Classification Report

Precision: the higher this number is, the more you were able to pinpoint all positives correctly. If this is a low score, you predicted a lot of positives where there were none. tp / (tp + fp)

Recall: if this score is high, you didn’t miss a lot of positives. But as it gets lower, you are not predicting the positives that are actually there. tp / (tp + fn)

f1-score: The balanced harmonic mean of Recall and Precision, giving both metrics equal weight. The higher the F-Measure is, the better.

Support: number of occurrences of each class in where y is true.

In [27]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.91      0.87      4126
           1       0.66      0.49      0.56      1499

    accuracy                           0.80      5625
   macro avg       0.74      0.70      0.72      5625
weighted avg       0.79      0.80      0.79      5625



In [28]:
cm = pd.DataFrame(confusion_matrix(y_train, y_pred))
FP = cm.sum(axis=0) - np.diag(cm)  
FN = cm.sum(axis=1) - np.diag(cm)
TP = np.diag(cm)
TN = cm.values.sum() - (FP + FN + TP)
# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or false positive rate
FPR = FP/(FP+TN)
# False negative rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)
# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)

In [29]:
print(FP)
print(FN)
print(TP)
print(TN)

0    760
1    384
dtype: int64
0    384
1    760
dtype: int64
[3742  739]
0     739
1    3742
dtype: int64


- Accuracy of Decision Tree Classifier: 79% of variance explained

In [30]:
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on test set: 0.79


## Modelling with scaled and encoded data

In [31]:
import pandas as pd
import numpy as np
import scipy as sp 
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

import env
import acquire
import prepare

- Make sure to re-acquire new dataset in order to model again.

In [32]:
df = acquire.get_telco_churn_data()

In [33]:
df.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,...,device_protection,tech_support,streaming_tv,streaming_movies,contract_type_id,paperless_billing,payment_type_id,monthly_charges,total_charges,churn
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,1,No,...,No,Yes,Yes,No,2,Yes,2,65.6,593.3,No
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,1,No,...,No,No,No,Yes,1,No,2,59.9,542.4,No
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,2,No,...,Yes,No,No,No,1,Yes,1,73.9,280.85,Yes
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,2,No,...,Yes,No,Yes,Yes,1,Yes,1,98.0,1237.85,Yes
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,2,No,...,No,Yes,Yes,No,1,Yes,2,83.9,267.4,Yes


- Set index to customer_id in order to retain individual customer information

In [34]:
df = df.set_index('customer_id')

In [35]:
df.head()

Unnamed: 0_level_0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract_type_id,paperless_billing,payment_type_id,monthly_charges,total_charges,churn
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,1,No,Yes,No,Yes,Yes,No,2,Yes,2,65.6,593.3,No
0003-MKNFE,Male,0,No,No,9,Yes,Yes,1,No,No,No,No,No,Yes,1,No,2,59.9,542.4,No
0004-TLHLJ,Male,0,No,No,4,Yes,No,2,No,No,Yes,No,No,No,1,Yes,1,73.9,280.85,Yes
0011-IGKFF,Male,1,Yes,No,13,Yes,No,2,No,Yes,Yes,No,Yes,Yes,1,Yes,1,98.0,1237.85,Yes
0013-EXCHZ,Female,1,Yes,No,3,Yes,No,2,No,No,No,Yes,Yes,No,1,Yes,2,83.9,267.4,Yes


## Code Below:

Handles erroneous data and/or outliers by replacing with nans.

    Dropped nans.
    Replaced all 'No', 'Yes' with 0 and 1
    Replaced all 'No phone service' with 0
    Replaced all 'No internet service' with 0

  Created new feature that represents tenure in years.
  
    Combines:
    
        'phone_service' and 'multiple_lines' AS 'has_phone_service'
        'online_security', 'online_backup', 'device_protection', 'tech_support' AS 'security_package'
        
    Drops:
    
        All columns combined into new column
        'payment_type_id', 'paperless_billing'

##### Reference prepare.py for functions:
- clean_data
- combine_and_clean_variables
- encode


In [36]:
df = prepare.clean_data(df)

df = prepare.combine_and_clean_variables(df)

X_train, X_test, y_train, y_test = prepare.split_data(df)

X_train, X_test = prepare.encode(X_train, X_test, 'internet_service_type_id')

X_train['DSL'] = X_train[1]
X_train['Fiber Optic'] = X_train[2]
X_train['None'] = X_train[3]

X_test['DSL'] = X_test[1]
X_test['Fiber Optic'] = X_test[2]
X_test['None'] = X_test[3]

X_train = X_train.drop(columns='internet_service_type_id')
X_test = X_test.drop(columns='internet_service_type_id')
X_train = X_train.drop(columns=[1, 2, 3])
X_test = X_test.drop(columns=[1, 2, 3])

X_train, X_test = prepare.encode(X_train, X_test, 'contract_type_id')

X_train['Month-to-Month'] = X_train[1]
X_train['One Year'] = X_train[2]
X_train['Two Year'] = X_train[3]

X_test['Month-to-Month'] = X_test[1]
X_test['One Year'] = X_test[2]
X_test['Two Year'] = X_test[3]

X_train = X_train.drop(columns='contract_type_id')
X_test = X_test.drop(columns='contract_type_id')
X_train = X_train.drop(columns=[1, 2, 3])
X_test = X_test.drop(columns=[1, 2, 3])

X_train, X_test, scaler = prepare.scale_minmax(X_train, X_test, column_list=['tenure', 'monthly_charges', 'total_charges'])

X_train = X_train.drop(columns=['tenure', 'monthly_charges', 'total_charges', 'tenure_years'])
X_test = X_test.drop(columns=['tenure', 'monthly_charges', 'total_charges', 'tenure_years'])

In [37]:
X_train.head()

Unnamed: 0_level_0,gender,senior_citizen,streaming_tv,streaming_movies,has_phone_service,family,security_package,DSL,Fiber Optic,None,Month-to-Month,One Year,Two Year,tenure_scaled,monthly_charges_scaled,total_charges_scaled
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
7718-RXDGG,1,0,0,0,True,True,False,0.0,1.0,0.0,1.0,0.0,0.0,0.197183,0.500249,0.125756
7649-SIJJF,1,0,0,1,True,True,True,1.0,0.0,0.0,0.0,0.0,1.0,0.985915,0.614848,0.642349
5061-PBXFW,0,0,0,0,True,True,True,1.0,0.0,0.0,1.0,0.0,0.0,0.43662,0.4285,0.212999
3011-WQKSZ,1,0,0,0,True,True,False,0.0,0.0,1.0,1.0,0.0,0.0,0.253521,0.015944,0.041397
9959-WOFKT,1,0,1,1,True,True,True,0.0,1.0,0.0,0.0,0.0,1.0,0.985915,0.87992,0.849694


- Decision Tree Classifier Object with hyper parameter max_depth set at 4.  Any higher could lead to overfitting.

In [38]:
clf = DecisionTreeClassifier(max_depth=4, random_state=123)
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

In [39]:
y_pred = clf.predict(X_train)
y_pred[0:50]

array([1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1])

In [40]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba

array([[0.36480687, 0.63519313],
       [0.9888664 , 0.0111336 ],
       [0.83664773, 0.16335227],
       ...,
       [0.75625   , 0.24375   ],
       [0.925     , 0.075     ],
       [0.925     , 0.075     ]])

### Accuracy Increased by one point, from .79 to .80, with the scaled and encoded data

In [41]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.80


# Create CSV file with customer ID, probability of churn, and prediction of churn

In [42]:
df_final = pd.concat([X_train, X_test])

In [43]:
df_final.shape

(7032, 16)

- Fit original dataframe with clf model to predict churn or not churn

In [44]:
prediction = clf.predict(df_final)
y_pred[0:50]

array([1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1])

- Fit original dataframe with clf model to predict probability churn or not churn

In [45]:
y_pred_proba = clf.predict_proba(df_final)
y_pred_proba[:,1]

array([0.63519313, 0.0111336 , 0.16335227, ..., 0.12903226, 0.54421769,
       0.63519313])

- Put probabilities and predictions into a pandas dataframe

In [46]:
y_predictions = pd.DataFrame({'probability': y_pred_proba[:,1], 'predictions': (y_pred_proba[:,1] >= .5) == 1})

- Bring back customer_ID in order to assign predictions to each customer  

In [47]:
y_predictions = y_predictions.set_index(df.index)

- Read pandas dataframe to a csv file to be delivered to Development Team

In [48]:
y_predictions.to_csv(r'probability_and_predictions.csv')

In [49]:
y_predictions.head()

Unnamed: 0_level_0,probability,predictions
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0002-ORFBO,0.635193,True
0003-MKNFE,0.011134,False
0004-TLHLJ,0.163352,False
0011-IGKFF,0.163352,False
0013-EXCHZ,0.075,False
