### Decision Tree and Random Forest 
- Setup oversampling
- Hyperparameters not tuned

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from ipywidgets import interactive, FloatSlider
import imblearn.over_sampling

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve 

import time

%matplotlib inline

In [2]:
data = pd.read_csv('/Users/jennihawk/Documents/Data Science Projects/Churn Project/Models/chatr_clean.csv')

In [3]:
data.head()

Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,InternetService_Fiber,InternetService_No,Contract_One_Year,Contract_Two_year,PaymentMethod_Crcard,...,DeviceProtection_No_internet_serv,DeviceProtection_Yes,TechSupport_No_internet_serv,TechSupport_Yes,StreamingTV_No_internet_serv,StreamingTV_Yes,StreamingMovies_No_internet_serv,StreamingMovies_Yes,PaperlessBilling_Yes,Churn_Yes
0,7590-VHVEG,0,1.0,29.85,29.85,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,5575-GNVDE,0,34.0,56.95,1889.5,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
2,3668-QPYBK,0,2.0,53.85,108.15,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
3,7795-CFOCW,0,45.0,42.3,1840.75,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,0
4,9237-HQITU,0,2.0,70.7,151.65,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [4]:
data.shape

(7032, 32)

### Train - Test Setup
- Utilize 'stratify = y' so class proportions are preserved

In [5]:
y = data['Churn_Yes']
X = data.drop(['Churn_Yes', 'customerID'], axis =1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42,stratify=y)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5625, 30)
(1407, 30)
(5625,)
(1407,)


### Class Imbalance
Average rate of churn on target columm

In [6]:
np.mean(y_train)

0.2657777777777778

### Setup Oversampling
- Used 3x positive samples. Rationale: We have 3 to 1 ratio of negative to positive observations.
- Cross validation confirmed choice. 

In [7]:
# setup for the ratio argument of RandomOverSampler initialization
n_pos = np.sum(y_train == 1)
n_neg = np.sum(y_train == 0)
ratio = {1 : n_pos * 3, 0 : n_neg} 

# randomly oversample positive samples
ROS = imblearn.over_sampling.RandomOverSampler(sampling_strategy = ratio, random_state=42) 

#use the fit_resample method to create the dataset with a desired proportion
#the variables X_tr__oversample, y_tr_oversample hold the training data with the extra samples we just created
X_tr_oversample, y_tr_oversample = ROS.fit_resample(X_train, y_train)



### Oversampling Cross Validation - Determine how much to increase sample size 
- cross validation strategy for any parameter you want to tune
- doing cross validation on the oversampling to select which number would be best to increase the oversampling. The winner is 3.
- cv=10 knows to stratify when binary numbers

In [8]:
n_pos = np.sum(y_train == 1)
n_neg = np.sum(y_train == 0)


# search for an optimal oversample value
oversample_range = [1,2,3,4,5,6]
oversample_acc_scores = []
oversample_rec_scores = []
for oversample_weight in oversample_range:
    # oversample your data
    ratio = {1 : n_pos * oversample_weight, 0 : n_neg} 
    # randomly oversample positive samples
    ROS = imblearn.over_sampling.RandomOverSampler(sampling_strategy = ratio, random_state=42)

    #use the fit_resample method to create the dataset with a desired proportion
    #the variables X_tr_rs, y_tr_rs hold the training data with the extra samples we just created
    X_tr_oversample, y_tr_oversample = ROS.fit_resample(X_train, y_train)
    
    # cross validate it
    d_tree = DecisionTreeClassifier()
    acc_scores = cross_val_score(d_tree, X_tr_oversample, y_tr_oversample, cv=10, scoring='accuracy')
    rec_scores = cross_val_score(d_tree, X_tr_oversample, y_tr_oversample, cv=10, scoring='recall')
    oversample_acc_scores.append(acc_scores.mean())
    oversample_rec_scores.append(rec_scores.mean())


print("Oversample Recall Scores")
print(list(zip(oversample_range, oversample_rec_scores)))
print("Oversample Accuracy Scores")
print(list(zip(oversample_range, oversample_acc_scores)))



Oversample Recall Scores
[(1, 0.48627293064876953), (2, 0.9016722408026755), (3, 0.9721583678014636), (4, 0.9914715719063546), (5, 0.9967907279742857), (6, 0.9983277591973243)]
Oversample Accuracy Scores
[(1, 0.7288806154118443), (2, 0.842556179775281), (3, 0.8969290551374192), (4, 0.9169139465875371), (5, 0.9347698921856902), (6, 0.9403816793893129)]


### Decision Tree
- While Scaling not required for tree based algorithms it was conducted in the workflow 

##### Model Setup

In [9]:
tree_pipe = Pipeline([('scaler', StandardScaler()), ('DecTree', DecisionTreeClassifier())])

In [10]:
#Fit Decision Tree Model to Training Data
tree_pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('DecTree', DecisionTreeClassifier())])

#### Hard Class Predictions on no oversample test data

In [12]:
y_pred = tree_pipe.predict(X_test)

#### Setup up Oversample on Decision Tree Model

In [13]:
oversamp_tree_pipe = Pipeline([('scaler', StandardScaler()), ('DecTree', DecisionTreeClassifier())])

#### Fit the decision tree oversample model

In [14]:
#'X_tr_oversample', 'y_tr_oversample' hold the training data that contains the extra samples
oversamp_tree_pipe.fit(X_tr_oversample, y_tr_oversample)

Pipeline(steps=[('scaler', StandardScaler()),
                ('DecTree', DecisionTreeClassifier())])

#### Hard Class Predictions based on oversampling training data

In [15]:
#we keep same X_test
y_pred_oversamp_tree = oversamp_tree_pipe.predict(X_test)

### Random Forest - Setup Oversample
- Selects random samples from a given dataset
- Constructs a decision tree for each sample and get a prediction result from each decision tree
- Perform a vote for each predicted result
- Select the prediction result with the most votes as the final prediction

In [16]:
#estimators = number of trees in the forest - use default 100
randforest_oversamp = Pipeline([('scaler', StandardScaler()), ('RandForest', RandomForestClassifier(n_estimators=100))])

In [17]:
#fit model on oversample train data
randforest_oversamp.fit(X_tr_oversample, y_tr_oversample)

Pipeline(steps=[('scaler', StandardScaler()),
                ('RandForest', RandomForestClassifier())])

#### Random Forest - Hard Class Predictions for Oversample 

In [18]:
y_pred_randfor_oversamp = randforest_oversamp.predict(X_test)

#### Results Random Forest Oversampled Model -  TEST Results

In [19]:
print("Random Forest Oversample")
print(f"Precision: {precision_score(y_test, y_pred_randfor_oversamp)}")
print(f"Recall: {recall_score(y_test, y_pred_randfor_oversamp)}")
print(f"F1: {f1_score(y_test, y_pred_randfor_oversamp)}")

Random Forest Oversample
Precision: 0.5631313131313131
Recall: 0.5962566844919787
F1: 0.5792207792207793


### Logistic Regression Model - Setup Oversample

In [20]:
logreg_oversamp = Pipeline([('scaler', StandardScaler()), ('LogReg', LogisticRegression())])

In [21]:
#fit model on oversample train data
logreg_oversamp.fit(X_tr_oversample, y_tr_oversample)

Pipeline(steps=[('scaler', StandardScaler()), ('LogReg', LogisticRegression())])

#### Logistic Regression - Hard Class Predictions for Oversample 

In [22]:
y_pred_lr_oversamp = logreg_oversamp.predict(X_test)

#### Logistic Regression Oversampled Model - TEST SCORES

In [23]:
print("Logistic Regression Oversample")
print(f"Precision: {precision_score(y_test, y_pred_lr_oversamp)}")
print(f"Recall: {recall_score(y_test, y_pred_lr_oversamp)}")
print(f"F1: {f1_score(y_test, y_pred_lr_oversamp)}")

Logistic Regression Oversample
Precision: 0.41818181818181815
Recall: 0.9224598930481284
F1: 0.5754795663052543


#### Decision Tree Oversampled Model - TEST SCORES

In [24]:
print("Decision Tree Model Oversample")
print(f"Precision: {precision_score(y_test, y_pred_oversamp_tree)}")
print(f"Recall: {recall_score(y_test, y_pred_oversamp_tree)}")
print(f"F1: {f1_score(y_test, y_pred_oversamp_tree)}")

Decision Tree Model Oversample
Precision: 0.47493403693931396
Recall: 0.48128342245989303
F1: 0.4780876494023904


#### Decision Tree No Oversample - TEST SCORES
- The reason that these aren't drastically better than with oversample train data is because it can handle feature imbalance better

In [25]:
print("Decision Tree No Oversample ")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1: {f1_score(y_test, y_pred)}")

Decision Tree No Oversample 
Precision: 0.44050632911392407
Recall: 0.46524064171123
F1: 0.4525357607282185
