In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from ipywidgets import interactive, FloatSlider
import imblearn.over_sampling

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve 

%matplotlib inline

In [29]:
data = pd.read_csv('/Users/jennihawk/Documents/Data Science/Classification/Churn Project/Models/chatr_clean.csv')

In [30]:
data.shape

(7032, 32)

Use grid search or random search to arrive at best possible values of max_depth

### Train - Test Setup
- Utilize Stratify Argument

In [31]:
features_in = ['SeniorCitizen', 'tenure', 'MonthlyCharges',
       'TotalCharges', 'InternetService_Fiber', 'InternetService_No',
       'Contract_One_Year', 'Contract_Two_year', 'PaymentMethod_Crcard',
       'PaymentMethod_Electr_Check', 'PaymentMethod_Mailed_check',
       'MultipleLines_No_phone_serv', 'MultipleLines_Yes', 'Dependents_Yes',
       'gender_Male', 'Partner_Yes', 'PhoneService_Yes',
       'OnlineSecurity_No_internet_serv', 'OnlineSecurity_Yes',
       'OnlineBackup_No_Internet_Serv', 'OnlineBackup_Yes',
       'DeviceProtection_No_internet_serv', 'DeviceProtection_Yes',
       'TechSupport_No_internet_serv', 'TechSupport_Yes',
       'StreamingTV_No_internet_serv', 'StreamingTV_Yes',
       'StreamingMovies_No_internet_serv', 'StreamingMovies_Yes',
       'PaperlessBilling_Yes']

y = data['Churn_Yes']
X = data[features_in]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42,stratify=y)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5625, 30)
(1407, 30)
(5625,)
(1407,)


### Class Imbalance
Average rate of churn on target columm

In [32]:
np.mean(y_train)

0.2657777777777778

### Setup Oversampling
- 3x positive samples. Rationale: We have 3 to 1 ratio of negative to positive observations. 3 makes sense as a starting point 
- Cross validation confirmed choice. 

In [33]:
# setup for the ratio argument of RandomOverSampler initialization
n_pos = np.sum(y_train == 1)
n_neg = np.sum(y_train == 0)
ratio = {1 : n_pos * 3, 0 : n_neg} 

# randomly oversample positive samples
ROS = imblearn.over_sampling.RandomOverSampler(sampling_strategy = ratio, random_state=42) 

#use the fit_sample method to create the dataset with a desired proportion
#the variables X_tr_rs, y_tr_rs hold the training data with the extra samples we just created
X_tr_oversample, y_tr_oversample = ROS.fit_resample(X_train, y_train)



### Oversampling Cross Validation - Determine how much to increase sample size 
- cross validation strategy for any parameter you want to tune
- here we're doing cross validation on the oversampling to select which number would be best to increase the oversampling. The winner is 3. We're not gaining anything more with higher number. 
- calling cv=10 will stratify behind the scenes

In [34]:
# n_pos = np.sum(y_train == 1)
# n_neg = np.sum(y_train == 0)


# # search for an optimal value 
# oversample_range = [1,2,3,4,5,6]
# oversample_acc_scores = []
# oversample_rec_scores = []
# for oversample_weight in oversample_range:
#     # oversample your data
#     ratio = {1 : n_pos * oversample_weight, 0 : n_neg} 
#     # randomly oversample positive samples: create 4x as many 
#     ROS = imblearn.over_sampling.RandomOverSampler(sampling_strategy = ratio, random_state=42)

#     #use the fit_sample method to create the dataset with a desired proportion
#     #the variables X_tr_rs, y_tr_rs hold the training data with the extra samples we just created
#     X_tr_oversample, y_tr_oversample = ROS.fit_resample(X_train, y_train)
    
#     # cross validate it
#     d_tree = DecisionTreeClassifier()
#     acc_scores = cross_val_score(d_tree, X_tr_oversample, y_tr_oversample, cv=10, scoring='accuracy')
#     rec_scores = cross_val_score(d_tree, X_tr_oversample, y_tr_oversample, cv=10, scoring='recall')
#     oversample_acc_scores.append(acc_scores.mean())
#     oversample_rec_scores.append(rec_scores.mean())


# print("Oversample Recall Scores")
# print(list(zip(oversample_range, oversample_rec_scores)))
# print("Oversample Accuracy Scores")
# print(list(zip(oversample_range, oversample_acc_scores)))

### Decision Tree

##### Model Setup

In [36]:
#scaled data to have apples to apples comparison with logistic regression
tree_pipe = Pipeline([('scaler', StandardScaler()), ('DecTree', DecisionTreeClassifier())])

In [37]:
#Fit Decision Tree Model to Training Data
tree_pipe.fit(X_train, y_train)

#### Hard Class Predictions - No Oversample Train Data

In [38]:
#predict churn / not churn

In [39]:
y_pred = tree_pipe.predict(X_test)

#### Setup up Oversample on Decision Tree Model

In [40]:
oversamp_tree_pipe = Pipeline([('scaler', StandardScaler()), ('DecTree', DecisionTreeClassifier())])

#### Fit the decision tree oversample model

In [41]:
#'X_tr_oversample', 'y_tr_oversample' hold the training data that contains the extra samples
oversamp_tree_pipe.fit(X_tr_oversample, y_tr_oversample)

#### Hard Class Predictions based on oversampling training data

In [42]:
#we keep same X_test
y_pred_oversamp_tree = oversamp_tree_pipe.predict(X_test)

### Random Forest - Setup Oversample
- Selects random samples from a given dataset
- Constructs a decision tree for each sample and get a prediction result from each decision tree
- Perform a vote for each predicted result
- Select the prediction result with the most votes as the final prediction

In [43]:
#estimators = number of trees in the forest - use default 100
randforest_oversamp = Pipeline([('scaler', StandardScaler()), ('RandForest', RandomForestClassifier(n_estimators=100))])

In [45]:
#fit model on oversample train data
randforest_oversamp.fit(X_tr_oversample, y_tr_oversample)

#### Random Forest - Hard Class Predictions for Oversample 

In [46]:
y_pred_randfor_oversamp = randforest_oversamp.predict(X_test)

### Hyperparameter tuning on Random Forest
- n_estimators and max_features

In [71]:
#setup separate initialization because pipeline doesn't work with gridsearch
#Don't need to scale featuers for Random Forest
rf = RandomForestClassifier()

In [55]:
#random forest try n_estimators (number of trees) and max_features
# define the parameter values that should be searched
estimator_range = list(range(90, 120))
print(estimator_range)

[90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119]


In [56]:
# input for max_features
feature_range = list(range(4, 31))
print(feature_range)

[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]


In [57]:
# makes sense multiple paramters. Test combinations. 
# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_estimators = estimator_range, max_features = feature_range)
print(param_grid)

{'n_estimators': [90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119], 'max_features': [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]}


In [58]:
# instantiate the grid
# knows to strategy - binary knows to stratify. 
# 10 fold validation on each combination
grid = GridSearchCV(rf, param_grid, cv=10, scoring='recall')

In [59]:
grid

In [60]:
grid.fit(X_tr_oversample, y_tr_oversample)

In [61]:
# view the complete results
grid.cv_results_

{'mean_fit_time': array([0.56088858, 0.56488104, 0.57188759, 0.57443979, 0.58272586,
        0.59153864, 0.59309039, 0.60109293, 0.60738783, 0.61337206,
        0.62363105, 0.62261882, 0.6349879 , 0.63765302, 0.64517658,
        0.65298152, 0.65692003, 0.66149144, 0.66709664, 0.67484283,
        0.68186724, 0.68434675, 0.69117663, 0.70242944, 0.70812604,
        0.71319058, 0.72422135, 0.72317679, 0.72831526, 0.73603621,
        0.6081115 , 0.61672852, 0.62089171, 0.62664037, 0.63873551,
        0.6452122 , 0.65270309, 0.65573452, 0.66013269, 0.66837451,
        0.67406003, 0.68054945, 0.6865979 , 0.69539721, 0.69591625,
        0.7020225 , 0.71263275, 0.72095447, 0.72526667, 0.73136442,
        0.73534458, 0.74630935, 0.7491354 , 0.75887551, 0.76439638,
        0.77054267, 0.77998276, 0.78472309, 0.78900669, 0.8005542 ,
        0.65626702, 0.66240845, 0.67552969, 0.67889869, 0.68501611,
        0.69483888, 0.69764581, 0.70679939, 0.71518016, 0.71947885,
        0.728088  , 0.73377101,

#### Results Random Forest Oversampled, Hyperparameters tuned

In [63]:
# examine the best model
print("Best params: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)

Best params:  {'max_features': 6, 'n_estimators': 115}
Best estimator:  RandomForestClassifier(max_features=6, n_estimators=115)
Best score:  0.9781722279669106


#### Results Random Forest Oversampled Train Data

In [27]:
print("Random Forest Oversample")
print(f"Precision: {precision_score(y_test, y_pred_randfor_oversamp)}")
print(f"Recall: {recall_score(y_test, y_pred_randfor_oversamp)}")
print(f"F1: {f1_score(y_test, y_pred_randfor_oversamp)}")

Random Forest Oversample
Precision: 0.5728900255754475
Recall: 0.5989304812834224
F1: 0.5856209150326798


### Logistic Regression Model - Setup Oversample

In [64]:
logreg_oversamp = Pipeline([('scaler', StandardScaler()), ('LogReg', LogisticRegression())])

In [65]:
#fit model on oversample train data
logreg_oversamp.fit(X_tr_oversample, y_tr_oversample)

#### Logistic Regression - Hard Class Predictions for Oversample 

In [66]:
y_pred_lr_oversamp = logreg_oversamp.predict(X_test)

#### Results Logistic Regression Oversampled Train Data

In [67]:
print("Logistic Regression Oversample")
print(f"Precision: {precision_score(y_test, y_pred_lr_oversamp)}")
print(f"Recall: {recall_score(y_test, y_pred_lr_oversamp)}")
print(f"F1: {f1_score(y_test, y_pred_lr_oversamp)}")

Logistic Regression Oversample
Precision: 0.4838709677419355
Recall: 0.8021390374331551
F1: 0.6036217303822937


#### Results Decision Tree Oversampled Train Data

In [68]:
print("Decision Tree Model Oversample")
print(f"Precision: {precision_score(y_test, y_pred_oversamp_tree)}")
print(f"Recall: {recall_score(y_test, y_pred_oversamp_tree)}")
print(f"F1: {f1_score(y_test, y_pred_oversamp_tree)}")

Decision Tree Model Oversample
Precision: 0.47757255936675463
Recall: 0.4839572192513369
F1: 0.4807436918990704


#### Results Decision Tree No Oversample Train Data
- The reason that these aren't drastically better than with oversample train data is because it can handle feature imbalance better

In [69]:
print("Decision Tree No Oversample ")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1: {f1_score(y_test, y_pred)}")

Decision Tree No Oversample 
Precision: 0.44533333333333336
Recall: 0.446524064171123
F1: 0.4459279038718291
