Work on this next: In `train_test_split`, you can use `stratify=y`. For cross-validation, you can use `StratifiedKFold`.
stratify is used to ensure that both the train and test split sets have the same proportion of labels as the original dataset. More reading here:
https://machinelearningmastery.com/train-test-split-for-evaluating-machine-learning-algorithms/
`StratifiedKFold` ensures that each fold of the dataset has the same proportion of observations with a given label. This is especially helpful when dealing with imbalanced datasets in classification.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from ipywidgets import interactive, FloatSlider
import imblearn.over_sampling

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve 

%matplotlib inline

In [2]:
# Location of the cleaned churn dataset.
# NOTE(review): this is an absolute, machine-specific path; consider a path relative to the project.
CHURN_CSV_PATH = '/Users/jennihawk/Documents/Data Science/Classification/Churn Project/Models/chatr_clean.csv'

# Load the cleaned churn data into a DataFrame
data = pd.read_csv(CHURN_CSV_PATH)

In [3]:
# Show the (rows, columns) dimensions of the loaded DataFrame
data.shape

(7032, 32)

Use grid search or random search to arrive at the best possible value of `max_depth`.

### Train - Test Setup
- Utilize Stratify Argument

In [4]:
# Predictor columns used by every model in this notebook
features_in = [
    'SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
    'InternetService_Fiber', 'InternetService_No',
    'Contract_One_Year', 'Contract_Two_year',
    'PaymentMethod_Crcard', 'PaymentMethod_Electr_Check', 'PaymentMethod_Mailed_check',
    'MultipleLines_No_phone_serv', 'MultipleLines_Yes',
    'Dependents_Yes', 'gender_Male', 'Partner_Yes', 'PhoneService_Yes',
    'OnlineSecurity_No_internet_serv', 'OnlineSecurity_Yes',
    'OnlineBackup_No_Internet_Serv', 'OnlineBackup_Yes',
    'DeviceProtection_No_internet_serv', 'DeviceProtection_Yes',
    'TechSupport_No_internet_serv', 'TechSupport_Yes',
    'StreamingTV_No_internet_serv', 'StreamingTV_Yes',
    'StreamingMovies_No_internet_serv', 'StreamingMovies_Yes',
    'PaperlessBilling_Yes',
]

# Feature matrix and churn target
X = data[features_in]
y = data['Churn_Yes']

# 50/50 split with a fixed seed; stratify=y keeps the label proportions
# the same in the train and test halves.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.50,
    random_state=42,
    stratify=y,
)

# Print the shape of each piece of the split
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(3516, 30)
(3516, 30)
(3516,)
(3516,)


### Class Imbalance
Average rate of churn in the target column

In [5]:
# Mean of the training target, i.e. the share of positive (churn) labels
y_train.mean()

0.2659271899886234

### Setup Oversampling

In [6]:
# Count each class in the training split; these feed the sampling_strategy
# dict passed to RandomOverSampler below.
n_pos = (y_train == 1).sum()
n_neg = (y_train == 0).sum()

# Target counts after resampling: 4x the original positives, negatives unchanged
ratio = {1: 4 * n_pos, 0: n_neg}

# Random oversampler with a fixed seed
ROS = imblearn.over_sampling.RandomOverSampler(
    sampling_strategy=ratio,
    random_state=42,
)

# fit_resample returns the training data with the extra positive samples added;
# X_tr_oversample / y_tr_oversample hold that resampled training set.
X_tr_oversample, y_tr_oversample = ROS.fit_resample(X_train, y_train)



### Decision Tree

##### Model Setup

In [7]:
# Scale the features so the comparison with logistic regression is apples to apples.
# random_state is fixed (matching the seed used for the split and the oversampler)
# so the fitted tree, and therefore the reported metrics, are reproducible.
tree_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('DecTree', DecisionTreeClassifier(random_state=42)),
])

In [8]:
# Fit the scaler + decision tree pipeline on the original (non-oversampled) training split
tree_pipe.fit(X_train, y_train)

#### Hard Class Predictions - No Oversample Train Data

In [9]:
#predict churn / not churn

In [10]:
# Class-label predictions on the test split from the tree trained without oversampling
y_pred = tree_pipe.predict(X_test)

#### Set Up Oversampling on Decision Tree Model

In [11]:
# Same scaler + decision tree pipeline, to be trained on the oversampled data.
# random_state is fixed (matching the seed used for the split and the oversampler)
# so the fitted tree, and therefore the reported metrics, are reproducible.
oversamp_tree_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('DecTree', DecisionTreeClassifier(random_state=42)),
])

#### Fit the decision tree oversample model

In [12]:
# Fit the decision tree pipeline on the oversampled training data:
# X_tr_oversample / y_tr_oversample hold the training split plus the extra positive samples.
oversamp_tree_pipe.fit(X_tr_oversample, y_tr_oversample)

#### Hard Class Predictions based on oversampling training data

In [13]:
# Predict on the same, untouched X_test; only the training data was oversampled
y_pred_oversamp_tree = oversamp_tree_pipe.predict(X_test)

### Random Forest - Setup Oversample
- Selects random samples from a given dataset
- Constructs a decision tree for each sample and gets a prediction result from each decision tree
- Performs a vote across the predicted results
- Selects the prediction result with the most votes as the final prediction

In [14]:
# n_estimators = number of trees in the forest (100 is also the library default).
# random_state is fixed (matching the seed used for the split and the oversampler)
# so the fitted forest, and therefore the reported metrics, are reproducible.
randforest_oversampe = Pipeline([
    ('scaler', StandardScaler()),
    ('RandForest', RandomForestClassifier(n_estimators=100, random_state=42)),
])

In [15]:
# Fit the scaler + random forest pipeline on the oversampled training data
randforest_oversampe.fit(X_tr_oversample, y_tr_oversample)

#### Random Forest - Hard Class Predictions for Oversample 

In [16]:
# Class-label predictions on the test split from the random forest trained on oversampled data
y_pred_randfor_oversamp = randforest_oversampe.predict(X_test)

### Logistic Regression Model - Setup Oversample

In [17]:
# Scaler followed by logistic regression, to be trained on the oversampled data
logreg_oversamp = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('LogReg', LogisticRegression()),
    ]
)

In [18]:
# Fit the scaler + logistic regression pipeline on the oversampled training data
logreg_oversamp.fit(X_tr_oversample, y_tr_oversample)

#### Logistic Regression - Hard Class Predictions for Oversample 

In [19]:
# Class-label predictions on the test split from the logistic regression trained on oversampled data
y_pred_lr_oversamp = logreg_oversamp.predict(X_test)

#### Results Logistic Regression Oversampled Train Data

In [20]:
# Report precision, recall and F1 on the test split for the
# logistic regression trained on oversampled data.
print("Logistic Regression Oversample")
for metric_label, metric_fn in (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1", f1_score),
):
    print(f"{metric_label}: {metric_fn(y_test, y_pred_lr_oversamp)}")

Logistic Regression Oversample
Precision: 0.4581196581196581
Recall: 0.860813704496788
F1: 0.5979918185198958


#### Results Random Forest Oversampled Train Data

In [21]:
# Report precision, recall and F1 on the test split for the
# random forest trained on oversampled data.
print("Random Forest Oversample")
for metric_label, metric_fn in (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1", f1_score),
):
    print(f"{metric_label}: {metric_fn(y_test, y_pred_randfor_oversamp)}")

Random Forest Oversample
Precision: 0.5665961945031712
Recall: 0.5738758029978587
F1: 0.5702127659574467


#### Results Decision Tree Oversampled Train Data

In [22]:
# Report precision, recall and F1 on the test split for the
# decision tree trained on oversampled data.
print("Decision Tree Model Oversample")
for metric_label, metric_fn in (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1", f1_score),
):
    print(f"{metric_label}: {metric_fn(y_test, y_pred_oversamp_tree)}")

Decision Tree Model Oversample
Precision: 0.49889135254988914
Recall: 0.4817987152034261
F1: 0.4901960784313726


#### Results Decision Tree No Oversample Train Data

In [23]:
# Report precision, recall and F1 on the test split for the
# decision tree trained on the original (non-oversampled) data.
print("Decision Tree No Oversample ")
for metric_label, metric_fn in (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1", f1_score),
):
    print(f"{metric_label}: {metric_fn(y_test, y_pred)}")

Decision Tree No Oversample 
Precision: 0.49690721649484537
Recall: 0.5160599571734475
F1: 0.5063025210084034
