# Random Forest Demo

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.decomposition import PCA

In [None]:
# Read the feature-engineered dataset (the file name indicates it carries
# the category interaction terms) and preview the first three rows.
df = pd.read_csv(filepath_or_buffer="../engineered_df_with_category_interactions.csv")
df.head(n=3)

Unnamed: 0,age,height,weight,ap_hi,ap_lo,gender_1,gender_2,cholesterol_1,cholesterol_2,cholesterol_3,...,alco_1 active_0,alco_1 active_1,alco_1 BMI,alco_1 MAP,active_0 BMI,active_0 MAP,active_1 BMI,active_1 MAP,BMI MAP,cardio
0,18393.0,168.0,62.0,110.0,80.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,21.967,90.0,1977.03,0
1,20228.0,156.0,85.0,140.0,90.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,34.928,106.667,3725.664976,1
2,18857.0,165.0,64.0,130.0,70.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,23.508,90.0,0.0,0.0,2115.72,1


In [None]:
# Split the frame into the target column ("cardio") and the predictors
# (every remaining column).
y = df["cardio"]
X = df.drop(columns=["cardio"])

In [None]:
# Hold out 20% of the rows for the prediction demo. The split is stratified
# on y and seeded so that it can be reproduced later in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=0,
)
X_train.head(n=3)

Unnamed: 0,age,height,weight,ap_hi,ap_lo,gender_1,gender_2,cholesterol_1,cholesterol_2,cholesterol_3,...,alco_0 MAP,alco_1 active_0,alco_1 active_1,alco_1 BMI,alco_1 MAP,active_0 BMI,active_0 MAP,active_1 BMI,active_1 MAP,BMI MAP
17802,17532.0,154.0,70.0,180.0,110.0,1.0,0.0,1.0,0.0,0.0,...,133.333,0.0,0.0,0.0,0.0,0.0,0.0,29.516,133.333,3935.456828
26207,22219.0,160.0,60.0,120.0,80.0,1.0,0.0,1.0,0.0,0.0,...,93.333,0.0,0.0,0.0,0.0,23.437,93.333,0.0,0.0,2187.445521
15874,22741.0,174.0,69.0,125.0,80.0,0.0,1.0,1.0,0.0,0.0,...,95.0,0.0,0.0,0.0,0.0,0.0,0.0,22.79,95.0,2165.05


In [None]:
# Train a random forest with the chosen hyper-parameters
# (NOTE(review): presumably selected by an earlier grid search -- no search
# is run in this notebook, so confirm where 210 / 10 came from).
# random_state is fixed so that the fitted forest, and therefore every
# prediction shown in the demo below, is identical on each Restart & Run All;
# without it the bootstrap samples and feature sub-sampling differ per run.
rf = RandomForestClassifier(n_estimators=210, max_depth=10, random_state=0)
rf.fit(X_train, y_train)

## Demo

In [None]:
# Read the un-engineered feature set; its raw columns are easier to read
# than the one-hot / interaction columns the model was trained on.
df2 = pd.read_csv(filepath_or_buffer="./cleaned_df.csv")
df2.head(n=3)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI,MAP
0,18393,2,168,62.0,110,80,1,1,0,0,1,0,21.967,90.0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1,34.928,106.667
2,18857,1,165,64.0,130,70,3,1,0,0,0,1,23.508,90.0


In [None]:
# Repeat the same seeded, stratified 80/20 split on the un-engineered frame.
# The demo relies on this producing the same test rows, in the same order,
# as the split of the engineered frame above.
Xc = df2.drop(columns=["cardio"])
yc = df2["cardio"]
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    Xc, yc, train_size=0.8, random_state=0, stratify=yc
)

# Give all three test objects a fresh 0..n-1 index so that position i refers
# to the same patient in each of them. Reassigning (rather than inplace=True)
# keeps the cell safe to re-run and avoids mutating split results in place.
Xc_test = Xc_test.reset_index(drop=True)
yc_test = yc_test.reset_index(drop=True)
Xidx_test = X_test.reset_index(drop=True)

# Sanity check: the two splits are only aligned if both CSVs hold the same
# rows in the same order. Equal length and equal labels are necessary
# conditions; fail loudly here instead of showing mismatched rows below.
assert len(Xc_test) == len(Xidx_test), (
    "engineered and un-engineered test sets differ in length"
)
assert np.array_equal(y_test.to_numpy(), yc_test.to_numpy()), (
    "engineered and un-engineered test labels are not aligned"
)

#### Predictions

In [None]:
# Example of a correctly predicted positive case: show its raw features.
# (The test index was reset to 0..n-1, so position 1 is label 1.)
Xc_test.iloc[1]

age            19570.000
gender             1.000
height           156.000
weight            78.000
ap_hi            140.000
ap_lo             80.000
cholesterol        1.000
gluc               1.000
smoke              0.000
alco               0.000
active             1.000
BMI               32.051
MAP              100.000
Name: 1, dtype: float64

In [None]:
# Raw features of the next patient to classify -- what will the model say?
Xc_test.iloc[7]

age            23391.000
gender             1.000
height           155.000
weight            70.000
ap_hi            120.000
ap_lo             80.000
cholesterol        1.000
gluc               1.000
smoke              0.000
alco               0.000
active             1.000
BMI               29.136
MAP               93.333
Name: 7, dtype: float64

In [None]:
# Select test row 7 from the engineered features as a one-row DataFrame.
# Indexing with a list of labels keeps the result two-dimensional and
# preserves the training column order and dtypes; it replaces
# DataFrame.append, which is deprecated and removed in pandas 2.0.
toPredict = Xidx_test.loc[[7]]
pred = rf.predict(toPredict)

# Compare the model's output with the true label for the same row.
print('Prediction was', pred)
print('Actual was', yc_test.loc[7])

Prediction was [1]
Actual was 1


  toPredict = toPredict.append(Xidx_test.loc[7])


In [None]:
# Raw features of another patient to classify.
Xc_test.iloc[70]

age            16737.000
gender             2.000
height           170.000
weight            70.000
ap_hi            140.000
ap_lo             90.000
cholesterol        1.000
gluc               1.000
smoke              1.000
alco               0.000
active             1.000
BMI               24.221
MAP              106.667
Name: 70, dtype: float64

In [None]:
# Select test row 70 from the engineered features as a one-row DataFrame.
# Indexing with a list of labels keeps the result two-dimensional and
# preserves the training column order and dtypes; it replaces
# DataFrame.append, which is deprecated and removed in pandas 2.0.
toPredict = Xidx_test.loc[[70]]
pred = rf.predict(toPredict)

# Compare the model's output with the true label for the same row.
print('Prediction was', pred)
print('Actual was', yc_test.loc[70])

Prediction was [1]
Actual was 1


  toPredict = toPredict.append(Xidx_test.loc[70])


In [None]:
# Example of a correctly predicted negative case: show its raw features.
Xc_test.iloc[2]

age            15164.000
gender             1.000
height           158.000
weight            57.000
ap_hi            100.000
ap_lo             70.000
cholesterol        1.000
gluc               1.000
smoke              0.000
alco               0.000
active             0.000
BMI               22.833
MAP               80.000
Name: 2, dtype: float64

In [None]:
# Raw features of the next patient to classify -- what will the model say?
Xc_test.iloc[3]

age            20555.000
gender             1.000
height           162.000
weight            83.000
ap_hi            120.000
ap_lo             80.000
cholesterol        1.000
gluc               3.000
smoke              0.000
alco               0.000
active             1.000
BMI               31.626
MAP               93.333
Name: 3, dtype: float64

In [None]:
# Select test row 3 from the engineered features as a one-row DataFrame.
# Indexing with a list of labels keeps the result two-dimensional and
# preserves the training column order and dtypes; it replaces
# DataFrame.append, which is deprecated and removed in pandas 2.0.
toPredict = Xidx_test.loc[[3]]
pred = rf.predict(toPredict)

# Compare the model's output with the true label for the same row.
print('Prediction was', pred)
print('Actual was', yc_test.loc[3])

Prediction was [0]
Actual was 1


  toPredict = toPredict.append(Xidx_test.loc[3])
