# Random Forest, Gradient Boosting, Neural Network

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import * #import all metrics
from sklearn.neural_network import MLPClassifier

In [4]:
# Load the churn dataset from the Excel workbook in the working directory.
data = pd.read_excel(io="Churn.xls")

In [5]:
# Print the column names, non-null counts, and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
Account Length    3333 non-null int64
VMail Message     3333 non-null int64
Day Mins          3333 non-null float64
Eve Mins          3333 non-null float64
Night Mins        3333 non-null float64
Intl Mins         3333 non-null float64
CustServ Calls    3333 non-null int64
Churn             3333 non-null int64
Int'l Plan        3333 non-null int64
VMail Plan        3333 non-null int64
Day Calls         3333 non-null int64
Day Charge        3333 non-null float64
Eve Calls         3333 non-null int64
Eve Charge        3333 non-null float64
Night Calls       3333 non-null int64
Night Charge      3333 non-null float64
Intl Calls        3333 non-null int64
Intl Charge       3333 non-null float64
State             3333 non-null object
Area Code         3333 non-null int64
Phone             3333 non-null object
dtypes: float64(8), int64(11), object(2)
memory usage: 546.9+ KB


In [6]:
# Flag columns whose variance is exactly zero (i.e. constant columns).
# numeric_only=True limits the check to numeric columns: the frame still holds
# object columns at this point, and on pandas >= 2.0 var() raises on them
# instead of silently skipping them.
# Every flag is False, so no numeric column has zero variance.
data.var(numeric_only=True) == 0

Account Length    False
VMail Message     False
Day Mins          False
Eve Mins          False
Night Mins        False
Intl Mins         False
CustServ Calls    False
Churn             False
Int'l Plan        False
VMail Plan        False
Day Calls         False
Day Charge        False
Eve Calls         False
Eve Charge        False
Night Calls       False
Night Charge      False
Intl Calls        False
Intl Charge       False
Area Code         False
dtype: bool

In [7]:
# With many columns, counting the True flags is easier to read than the
# full per-column listing.
# numeric_only=True restricts var() to numeric columns (needed on pandas >= 2.0,
# where object columns raise instead of being skipped).
int((data.var(numeric_only=True) == 0).sum())

0

In [8]:
# Number of columns that contain at least one missing value.
# (Leaving off the final .sum() shows the per-column flags instead.)
data.isna().any().sum()

0

In [9]:
# Print the name of every column that holds a single distinct value.
# Nothing is printed when each column has at least two distinct values,
# i.e. there is no categorical variable with only one category.
for column_name in data.columns:
    distinct_values = data[column_name].nunique(dropna=False)
    if distinct_values == 1:
        print(column_name)

In [10]:
# Drop Phone and Area Code (State is already available as the location
# feature); neither column is used as a predictor.
# Reassigning instead of mutating in place, together with errors="ignore",
# keeps this cell re-runnable: a second run on the already-trimmed frame is a
# no-op rather than a KeyError.
data = data.drop(columns=["Area Code", "Phone"], errors="ignore")

In [11]:
# Sanity check: print the (rows, columns) of the frame after the two columns were dropped.
print(data.shape)

(3333, 19)


In [12]:
# Create dummy variables. The object columns do not need to be separated out
# first: pd.get_dummies() only converts object/category columns and leaves the
# numeric ones as they are. drop_first=True omits one level per encoded column.
data = pd.get_dummies(data=data, drop_first=True)

In [13]:
print(data.shape) # now 68 columns instead of 19, because the dummy variables were added

(3333, 68)


In [14]:
# Separate the predictors (X) from the target column Churn (Y).
X = data.drop(columns="Churn")
Y = data["Churn"]

In [15]:
# Hold out 25% of the rows for testing with a fixed seed.
# X0/Y0 receive the training part, X1/Y1 the test part.
X0, X1, Y0, Y1 = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)

In [16]:
# Baseline estimators with their default settings; only the seed is fixed so
# that repeated runs build the same models.
DT, RF, GB = (
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
)

In [17]:
# Lookup tables used by the evaluation loop so that the results of all models
# can be printed together.

# Display name -> metric function, called as metric(y_true, y_pred).
scores = {
    "Accuracy": accuracy_score,
    "ROC AUC": roc_auc_score,
    "Recall": recall_score,
}

# Display name -> estimator to fit and evaluate.
models = {
    "Decision Tree": DT,
    "Random Forest": RF,
    "GradientBoosting": GB,
}

In [18]:
# Fit each model on the training split and print every metric, as a
# percentage, computed on the test split.
# NOTE(review): roc_auc_score receives the hard class labels from predict(),
# not predicted probabilities, so the "ROC AUC" figure reflects a single
# decision threshold -- confirm whether predict_proba scores were intended.
for model_name, model in models.items():
    print("\n", model_name)
    model.fit(X0, Y0)
    prediction = model.predict(X1)
    for score_name, score in scores.items():
        # Scale to percent first and round afterwards: rounding first and then
        # multiplying by 100 reintroduces float noise (e.g. 56.99999999999999).
        # float() plus the built-in round() also works when a metric returns a
        # plain Python float, which has no .round() method.
        percent = round(100 * float(score(Y1, prediction)), 0)
        print(score_name, percent)


 Decision Tree
Accuracy 92.0
ROC AUC 85.0
Recall 74.0

 Random Forest
Accuracy 93.0
ROC AUC 78.0
Recall 56.99999999999999

 GradientBoosting
Accuracy 95.0
ROC AUC 85.0
Recall 72.0


In [19]:
# Second round: class-weighted, depth-limited tree models plus a neural
# network. The hyperparameters are set by hand here (GridSearchCV could be
# used to search for them instead).
DT2 = DecisionTreeClassifier(
    class_weight="balanced",
    max_depth=5,
    random_state=42,
)
RF2 = RandomForestClassifier(
    n_estimators=300,
    class_weight="balanced",
    max_depth=5,
    random_state=42,
)
# Gradient boosting is created without a class weight.
GB2 = GradientBoostingClassifier(random_state=42)
# NOTE(review): the MLP is fitted on the features as-is (no scaling step is
# applied anywhere above) -- confirm whether standardising the inputs first
# was intended.
NN = MLPClassifier(random_state=42)

In [20]:
# Lookup tables for the second evaluation round.

# Display name -> metric function, called as metric(y_true, y_pred).
scores2 = {
    "Accuracy": accuracy_score,
    "ROC AUC": roc_auc_score,
    "Recall": recall_score,
}

# Display name -> estimator to fit and evaluate.
models2 = {
    "Decision Tree": DT2,
    "Random Forest": RF2,
    "GradientBoosting": GB2,
    "Neural network": NN,
}

In [21]:
# Fit each second-round model on the training split and print every metric,
# as a percentage, computed on the test split.
# NOTE(review): roc_auc_score receives the hard class labels from predict(),
# not predicted probabilities, so the "ROC AUC" figure reflects a single
# decision threshold -- confirm whether predict_proba scores were intended.
for model_name, model in models2.items():
    print("\n", model_name)
    model.fit(X0, Y0)
    prediction = model.predict(X1)
    for score_name, score in scores2.items():
        # Scale to percent first and round afterwards: rounding first and then
        # multiplying by 100 can reintroduce float noise in the printed value.
        # float() plus the built-in round() also works when a metric returns a
        # plain Python float, which has no .round() method.
        percent = round(100 * float(score(Y1, prediction)), 0)
        print(score_name, percent)


 Decision Tree
Accuracy 94.0
ROC AUC 88.0
Recall 80.0

 Random Forest
Accuracy 91.0
ROC AUC 88.0
Recall 82.0

 GradientBoosting
Accuracy 95.0
ROC AUC 85.0
Recall 72.0

 Neural network
Accuracy 85.0
ROC AUC 54.0
Recall 10.0
