### Decision Tree,  Random Forest, XGBoost
- Explore how Decision Tree, Random Forest and XGBoost perform without hyperparamter tuning
- Cross validation to determine optimal oversample value based on recall metric
- Create model with and without oversample
- While scaling not required for tree based models it was implemented in the workflow

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier


from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from ipywidgets import interactive, FloatSlider
import imblearn.over_sampling

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve 

import time

%matplotlib inline

In [2]:
data = pd.read_csv('/Users/jennihawk/Documents/Data Science Projects/Churn Project/Models/chatr_clean.csv')

In [3]:
data.head()

Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,InternetService_Fiber,InternetService_No,Contract_One_Year,Contract_Two_year,PaymentMethod_Crcard,...,DeviceProtection_No_internet_serv,DeviceProtection_Yes,TechSupport_No_internet_serv,TechSupport_Yes,StreamingTV_No_internet_serv,StreamingTV_Yes,StreamingMovies_No_internet_serv,StreamingMovies_Yes,PaperlessBilling_Yes,Churn_Yes
0,7590-VHVEG,0,1.0,29.85,29.85,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,5575-GNVDE,0,34.0,56.95,1889.5,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
2,3668-QPYBK,0,2.0,53.85,108.15,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
3,7795-CFOCW,0,45.0,42.3,1840.75,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,0
4,9237-HQITU,0,2.0,70.7,151.65,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [4]:
data.shape

(7032, 32)

### Train - Test Setup
- Utilize 'stratify = y' so class proportions are preserved

In [5]:
y = data['Churn_Yes']
X = data.drop(['Churn_Yes', 'customerID'], axis =1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42,stratify=y)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5625, 30)
(1407, 30)
(5625,)
(1407,)


### Class Imbalance
Average rate of churn on target columm

In [6]:
np.mean(y_train)

0.2657777777777778

### Oversampling Cross Validation -  How much to increase sample size? 
- cross validation strategy for any parameter to tune
- optimal oversample value = 3
- Fit cross validation on DecisionTreeClassifier
- cv=10 knows to stratify when binary numbers

In [7]:
n_pos = np.sum(y_train == 1)
n_neg = np.sum(y_train == 0)


# search for an optimal oversample value
oversample_range = [1,2,3,4,5,6]
oversample_acc_scores = []
oversample_rec_scores = []
for oversample_weight in oversample_range:
    # oversample your data
    ratio = {1 : n_pos * oversample_weight, 0 : n_neg} 
    # randomly oversample positive samples
    ROS = imblearn.over_sampling.RandomOverSampler(sampling_strategy = ratio, random_state=42)

    #use the fit_resample method to create the dataset with a desired proportion
    #the variables X_tr_rs, y_tr_rs hold the training data with the extra samples we just created
    X_tr_oversample, y_tr_oversample = ROS.fit_resample(X_train, y_train)
    
    # cross validate it
    d_tree = DecisionTreeClassifier()
    acc_scores = cross_val_score(d_tree, X_tr_oversample, y_tr_oversample, cv=10, scoring='accuracy')
    rec_scores = cross_val_score(d_tree, X_tr_oversample, y_tr_oversample, cv=10, scoring='recall')
    oversample_acc_scores.append(acc_scores.mean())
    oversample_rec_scores.append(rec_scores.mean())


print("Oversample Recall Scores")
print(list(zip(oversample_range, oversample_rec_scores)))
print("Oversample Accuracy Scores")
print(list(zip(oversample_range, oversample_acc_scores)))



Oversample Recall Scores
[(1, 0.49896196868008946), (2, 0.8993311036789298), (3, 0.973271953547566), (4, 0.9914715719063546), (5, 0.9969244178138579), (6, 0.9983277591973243)]
Oversample Accuracy Scores
[(1, 0.7281682395403375), (2, 0.8414325842696628), (3, 0.8972782956202117), (4, 0.9175074183976261), (5, 0.9339934360984883), (6, 0.9403053435114505)]


### Setup Oversampling
- Used 3x positive samples
- Rationale: There's 3 to 1 ratio of neg to pos observations. Cross validation supported this for decision tree model

In [8]:
# setup for the ratio argument of RandomOverSampler initialization
n_pos = np.sum(y_train == 1)
n_neg = np.sum(y_train == 0)
ratio = {1 : n_pos * 3, 0 : n_neg} 

# randomly oversample positive samples
ROS = imblearn.over_sampling.RandomOverSampler(sampling_strategy = ratio, random_state=42) 

#use the fit_resample method to create the dataset with a desired proportion
#the variables X_tr__oversample, y_tr_oversample hold the training data with the extra samples we just created
X_tr_oversample, y_tr_oversample = ROS.fit_resample(X_train, y_train)



### Decision Tree

##### Model Setup

In [9]:
tree_pipe = Pipeline([('scaler', StandardScaler()), ('DecTree', DecisionTreeClassifier())])

In [10]:
#Fit model to the training data
tree_pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('DecTree', DecisionTreeClassifier())])

#### Hard Class Predictions with no oversample - results on test data

In [11]:
y_pred = tree_pipe.predict(X_test)

#### Setup up Oversample on Decision Tree Model

In [12]:
oversamp_tree_pipe = Pipeline([('scaler', StandardScaler()), ('DecTree', DecisionTreeClassifier())])

In [13]:
# Fit the decision tree oversample model
#'X_tr_oversample', 'y_tr_oversample' hold the training data that contains the extra samples
oversamp_tree_pipe.fit(X_tr_oversample, y_tr_oversample)

Pipeline(steps=[('scaler', StandardScaler()),
                ('DecTree', DecisionTreeClassifier())])

#### Hard Class Predictions on oversampled training data

In [14]:
#we keep same X_test
y_pred_oversamp_tree = oversamp_tree_pipe.predict(X_test)

#### TEST SCORES: Decision Tree Oversampled Model

In [15]:
print("Decision Tree Model Oversample")
print(f"Precision: {precision_score(y_test, y_pred_oversamp_tree)}")
print(f"Recall: {recall_score(y_test, y_pred_oversamp_tree)}")
print(f"F1: {f1_score(y_test, y_pred_oversamp_tree)}")

Decision Tree Model Oversample
Precision: 0.47619047619047616
Recall: 0.48128342245989303
F1: 0.4787234042553191


#### TEST SCORES: Decision Tree No Oversample
- The likely reason that these aren't drastically better than oversampled scores is because it can handle feature imbalance better

In [16]:
print("Decision Tree No Oversample ")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1: {f1_score(y_test, y_pred)}")

Decision Tree No Oversample 
Precision: 0.44935064935064933
Recall: 0.4625668449197861
F1: 0.45586297760210803


### Random Forest - Setup Oversample

In [17]:
#n_estimators = number of trees in the forest - use default 100
randforest_oversamp = Pipeline([('scaler', StandardScaler()), ('RandForest', RandomForestClassifier(n_estimators=100))])

In [18]:
#fit model on oversample train data
randforest_oversamp.fit(X_tr_oversample, y_tr_oversample)

Pipeline(steps=[('scaler', StandardScaler()),
                ('RandForest', RandomForestClassifier())])

#### Random Forest - Hard Class Predictions for Oversample 

In [19]:
y_pred_randfor_oversamp = randforest_oversamp.predict(X_test)

#### TEST SCORES:  Random Forest Oversampled Model 

In [22]:
print("Random Forest Oversample")
print(f"Precision: {precision_score(y_test, y_pred_randfor_oversamp)}")
print(f"Recall: {recall_score(y_test, y_pred_randfor_oversamp)}")
print(f"F1: {f1_score(y_test, y_pred_randfor_oversamp)}")

Random Forest Oversample
Precision: 0.5871313672922251
Recall: 0.5855614973262032
F1: 0.5863453815261044


### XGBoost Model Setup

In [20]:
xgboost = XGBClassifier()
xgboost.fit(X_tr_oversample, y_tr_oversample)

y_pred_xgboost = xgboost.predict(X_test)

#### TEST SCORES: XGBoost

In [21]:
print("XG Boost Oversample")
print(f"Recall: {recall_score(y_test, y_pred_xgboost)}")
print(f"Precision: {precision_score(y_test, y_pred_xgboost)}")
print(f"F1: {f1_score(y_test, y_pred_xgboost)}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgboost)}")

XG Boost Oversample
Recall: 0.6818181818181818
Precision: 0.5323590814196242
F1: 0.5978898007033997
Accuracy: 0.7562189054726368
