# RANDOM FOREST MODEL

# Libraries and data

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("cancer_reg_refined.csv", encoding='latin-1')
cancer_df = data.copy()
cancer_df.head(2)
# Without the encoding paremeter, this error presents itself:
# UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf1 in position 41137: invalid continuation byte

Unnamed: 0,avgAnnCount,incidenceRate,studyPerCap,AvgHouseholdSize,PctNoHS18_24,PctHS18_24,PctUnemployed16_Over,PctBlack,PctAsian,PctOtherRace,BirthRate,TARGET_deathRate,binnedInc,Geography
0,1397.0,489.8,499.748,2.54,11.5,39.5,8.0,2.595,4.822,1.843,6.119,164.9,BIN_5,WEST
1,173.0,411.6,23.111,2.34,6.1,22.4,7.8,0.969,2.246,3.741,4.333,161.3,BIN_4,WEST


# Random Forest

In [3]:
# Getting Dummy variables, dropping one to avoid the dummy variable trap
cancer_df = pd.get_dummies(data = cancer_df, drop_first = True)
cancer_df.head(2)

Unnamed: 0,avgAnnCount,incidenceRate,studyPerCap,AvgHouseholdSize,PctNoHS18_24,PctHS18_24,PctUnemployed16_Over,PctBlack,PctAsian,PctOtherRace,BirthRate,TARGET_deathRate,binnedInc_BIN_2,binnedInc_BIN_3,binnedInc_BIN_4,binnedInc_BIN_5,Geography_EAST,Geography_WEST
0,1397.0,489.8,499.748,2.54,11.5,39.5,8.0,2.595,4.822,1.843,6.119,164.9,0,0,0,1,0,1
1,173.0,411.6,23.111,2.34,6.1,22.4,7.8,0.969,2.246,3.741,4.333,161.3,0,0,1,0,0,1


In [4]:
# Isolating x and y
y = cancer_df['TARGET_deathRate']
x = cancer_df.drop(columns = ['TARGET_deathRate'])
x.head(1)

Unnamed: 0,avgAnnCount,incidenceRate,studyPerCap,AvgHouseholdSize,PctNoHS18_24,PctHS18_24,PctUnemployed16_Over,PctBlack,PctAsian,PctOtherRace,BirthRate,binnedInc_BIN_2,binnedInc_BIN_3,binnedInc_BIN_4,binnedInc_BIN_5,Geography_EAST,Geography_WEST
0,1397.0,489.8,499.748,2.54,11.5,39.5,8.0,2.595,4.822,1.843,6.119,0,0,0,1,0,1


In [5]:
# Training and test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 666 )
x_train.head(2)

Unnamed: 0,avgAnnCount,incidenceRate,studyPerCap,AvgHouseholdSize,PctNoHS18_24,PctHS18_24,PctUnemployed16_Over,PctBlack,PctAsian,PctOtherRace,BirthRate,binnedInc_BIN_2,binnedInc_BIN_3,binnedInc_BIN_4,binnedInc_BIN_5,Geography_EAST,Geography_WEST
781,782.0,457.0,73.537,2.5,18.0,33.7,8.5,10.273,1.669,0.653,5.939,0,0,0,1,1,0
1210,58.0,405.4,0.0,2.44,15.6,13.1,9.7,0.662,0.441,1.537,6.036,1,0,0,0,0,1


In [6]:
# Model
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators = 500, random_state = 666,
                              criterion = 'squared_error',
                              max_depth = 12, 
                               max_features = 'sqrt') 
model.fit(x_train, y_train)

RandomForestRegressor(max_depth=12, max_features='sqrt', n_estimators=500,
                      random_state=666)

# Assessing Model Accuracy

In [7]:
predictions = model.predict(x_test)
predictions[:5]

array([184.22395197, 155.60916297, 186.68979181, 177.71001774,
       195.11580337])

In [8]:
# Accuracy Assessment
from sklearn.metrics import mean_absolute_error, mean_squared_error
print("MAE:", round(mean_absolute_error(y_test, predictions),2))
print("RMSE:", round(np.sqrt(mean_squared_error(y_test, predictions)),2))
MAPE = mean_absolute_error(y_test, predictions)/np.average(y_test)
print('MAPE:', round(MAPE * 100,3),'%')

MAE: 15.39
RMSE: 20.51
MAPE: 8.609 %


# Hyperparameter Tuning

In [9]:
# The hyperparameters we are going to tune are:
    # Number of Decision Trees in the Forest
    # The maximum depth of the individual trees
    # The number of random features to consider are each split

In [10]:
param_grid = {
    'n_estimators': [100, 300, 500, 800, 1100],
    'criterion' : ["squared_error"],
    'max_features': ['sqrt', 'auto', 0.33],
    'max_depth': [3,6,9,12],
    'random_state' : [666]}

In [11]:
# Running the piece of code below, determines which parameters conjure the best results for 
# the RF Regressor.

# Because it takes some time to run, we run it only once.


# from sklearn.model_selection import GridSearchCV
# grid_search = GridSearchCV(RandomForestRegressor(),
                           # param_grid=param_grid, cv = 5)
# grid_search.fit(x_train, y_train)

# print(grid_search.best_params_)

# The results :
# {'criterion': 'squared_error', 'max_depth': 12, 'max_features': 'sqrt', 
# 'n_estimators': 1100, 'random_state': 666}

# Rerunning the model


In [12]:
model = RandomForestRegressor(n_estimators = 1100,
                              random_state = 666,
                              criterion = 'squared_error',
                              max_depth = 12, 
                               max_features = 'sqrt') 
model.fit(x_train, y_train)

RandomForestRegressor(max_depth=12, max_features='sqrt', n_estimators=1100,
                      random_state=666)

In [13]:
predictions = model.predict(x_test)
predictions[:5]

array([184.43669278, 156.81151691, 186.78678878, 177.18052671,
       193.92175792])

In [14]:
print("MAE:", round(mean_absolute_error(y_test, predictions),2))
print("RMSE:", round(np.sqrt(mean_squared_error(y_test, predictions)),2))
MAPE = mean_absolute_error(y_test, predictions)/np.average(y_test)
print('MAPE:', round(MAPE * 100,3),'%')

MAE: 15.38
RMSE: 20.49
MAPE: 8.602 %
