It’s not dissimilar to setting up a LogisticRegression model. You split your data into train/test and then fit the classifiers. Then you are able to evaluate and compare the metrics.

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from ipywidgets import interactive, FloatSlider
import imblearn.over_sampling

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve 

%matplotlib inline

In [2]:
# Load the CSV into `data`.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only runs where this file exists.
CHURN_CSV_PATH = '/Users/jennihawk/Documents/Data Science/Classification/Churn Project/Models/chatr_clean.csv'
data = pd.read_csv(CHURN_CSV_PATH)

In [27]:
#data.head()

Use grid search or randomized search to find the best value of `max_depth`.

### Model Setup - Decision Tree

In [4]:
#Defaults Decision Tree
#sklearn.tree.DecisionTreeClassifier(*, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0)

In [5]:
# Predictor columns selected from `data` to build the feature matrix X.
features_in = ['SeniorCitizen', 'tenure', 'MonthlyCharges',
       'TotalCharges', 'InternetService_Fiber', 'InternetService_No',
       'Contract_One_Year', 'Contract_Two_year', 'PaymentMethod_Crcard',
       'PaymentMethod_Electr_Check', 'PaymentMethod_Mailed_check',
       'MultipleLines_No_phone_serv', 'MultipleLines_Yes', 'Dependents_Yes',
       'gender_Male', 'Partner_Yes', 'PhoneService_Yes',
       'OnlineSecurity_No_internet_serv', 'OnlineSecurity_Yes',
       'OnlineBackup_No_Internet_Serv', 'OnlineBackup_Yes',
       'DeviceProtection_No_internet_serv', 'DeviceProtection_Yes',
       'TechSupport_No_internet_serv', 'TechSupport_Yes',
       'StreamingTV_No_internet_serv', 'StreamingTV_Yes',
       'StreamingMovies_No_internet_serv', 'StreamingMovies_Yes',
       'PaperlessBilling_Yes']

# Target is the 'Churn_Yes' column; predictors are the columns listed above.
y = data['Churn_Yes']
X = data[features_in]

# Seeded split so the train/test partition is repeatable; no test_size is given,
# so scikit-learn's default split proportion applies.
# NOTE(review): the split is not stratified on y even though the classes are imbalanced;
# consider stratify=y (this would change the split and every metric below).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

#scaled data to have apples to apples comparison with logistic regression
# random_state pins the tree's internal randomness so a rerun reproduces the same fitted tree and metrics.
tree_pipe = Pipeline([('scaler', StandardScaler()), ('DecTree', DecisionTreeClassifier(random_state=42))])

### Fit Decision Tree Model to Training Data

In [6]:
# Fit the scaler + decision tree pipeline on the original (non-resampled) training split.
tree_pipe.fit(X_train, y_train)

### Class Imbalance
Average rate of churn in the target column

In [7]:
# Mean of the training target, i.e. the share of positive (1) labels when the target is 0/1.
y_train.mean()

0.26753886992794845

#### Hard Class Predictions - No Oversample Train Data

In [8]:
#predict churn / not churn

In [9]:
# Hard class predictions on the held-out test split from the tree fit without oversampling.
y_pred = tree_pipe.predict(X_test)

#### Setup Oversampling

In [11]:
# Class counts in the training labels, used to build the sampling_strategy dict for RandomOverSampler
n_pos = np.sum(y_train == 1)
n_neg = np.sum(y_train == 0)
# Target sample counts per class: 4x the positives, negatives unchanged.
# NOTE(review): with ~27% positives in y_train, 4x makes class 1 larger than class 0 — confirm this is intended.
ratio = {1 : n_pos * 4, 0 : n_neg} 

# randomly oversample the positive class up to the counts in `ratio`; seeded for repeatability
ROS = imblearn.over_sampling.RandomOverSampler(sampling_strategy = ratio, random_state=42) 

#use the fit_resample method to create the dataset with the desired class counts
#the variables X_tr_oversample, y_tr_oversample hold the training data with the extra samples we just created
#only the training split is resampled here; X_test / y_test are not passed to the sampler
X_tr_oversample, y_tr_oversample = ROS.fit_resample(X_train, y_train)



#### Set up Oversample on Decision Tree Model

In [12]:
# Scaler + decision tree pipeline to be fit on the oversampled training data.
# random_state pins the tree's internal randomness so a rerun reproduces the same fitted tree and metrics.
oversamp_tree_pipe = Pipeline([('scaler', StandardScaler()), ('DecTree', DecisionTreeClassifier(random_state=42))])

#### Fit the decision tree oversample model

In [13]:
# Fit the oversample tree pipeline on the resampled training set:
# 'X_tr_oversample', 'y_tr_oversample' hold the training data that contains the extra samples.
oversamp_tree_pipe.fit(X_tr_oversample, y_tr_oversample)

#### Hard Class Predictions based on oversampling training data

In [14]:
# We keep the same X_test as the non-oversampled model; only the training data was resampled.
y_pred_oversamp_tree = oversamp_tree_pipe.predict(X_test)

#### Set up Oversample on Logistic Regression Model

In [22]:
# Scaler + logistic regression pipeline (default hyperparameters) for the oversampled training data.
logreg_oversamp = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('LogReg', LogisticRegression()),
    ]
)

In [23]:
# Fit the logistic regression pipeline on the oversampled training set.
logreg_oversamp.fit(X_tr_oversample, y_tr_oversample)

#### Hard Class Predictions for Oversample Logistic Regression

In [24]:
# Hard class predictions on the unchanged test split from the oversample-trained logistic regression.
y_pred_lr_oversamp = logreg_oversamp.predict(X_test)

#### Results Logistic Regression Oversampled Train Data

In [25]:
# Score the oversample-trained logistic regression's hard predictions against the test labels.
lr_os_precision = precision_score(y_test, y_pred_lr_oversamp)
lr_os_recall = recall_score(y_test, y_pred_lr_oversamp)
lr_os_f1 = f1_score(y_test, y_pred_lr_oversamp)

print("Logistic Regression Oversample")
print(f"Precision: {lr_os_precision}")
print(f"Recall: {lr_os_recall}")
print(f"F1: {lr_os_f1}")

Logistic Regression Oversample
Precision: 0.46162927981109797
Recall: 0.8537117903930131
F1: 0.5992337164750958


#### Results Decision Tree Oversampled Train Data

In [18]:
# Score the oversample-trained decision tree's hard predictions against the test labels.
tree_os_precision = precision_score(y_test, y_pred_oversamp_tree)
tree_os_recall = recall_score(y_test, y_pred_oversamp_tree)
tree_os_f1 = f1_score(y_test, y_pred_oversamp_tree)

print("Decision Tree Model Oversample")
print(f"Precision: {tree_os_precision}")
print(f"Recall: {tree_os_recall}")
print(f"F1: {tree_os_f1}")

Decision Tree Model Oversample
Precision: 0.48237885462555063
Recall: 0.4781659388646288
F1: 0.48026315789473684


#### Results Decision Tree No Oversample Train Data

In [26]:
# Score the baseline (no oversampling) decision tree's hard predictions against the test labels.
tree_base_precision = precision_score(y_test, y_pred)
tree_base_recall = recall_score(y_test, y_pred)
tree_base_f1 = f1_score(y_test, y_pred)

print("Decision Tree No Oversample ")
print(f"Precision: {tree_base_precision}")
print(f"Recall: {tree_base_recall}")
print(f"F1: {tree_base_f1}")

Decision Tree No Oversample 
Precision: 0.45435684647302904
Recall: 0.4781659388646288
F1: 0.46595744680851064
