# Lab | Random Forests

- For this lab, you will be using the CSV files provided in the files_for_lab folder.

In [1]:
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
target = pd.read_csv('target.csv')

In [3]:
target.shape

(95412, 2)

In [4]:
target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95412 entries, 0 to 95411
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   TARGET_B  95412 non-null  int64  
 1   TARGET_D  95412 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 1.5 MB


In [5]:
for col in target:
        print (target[col].nunique(), '\n')
        print(target[col].value_counts(), '\n')

2 

0    90569
1     4843
Name: TARGET_B, dtype: int64 

71 

0.00      90569
10.00       941
15.00       591
20.00       577
5.00        503
25.00       392
12.00       161
7.00        126
6.00        124
11.00       118
8.00        114
30.00        99
21.00        92
3.00         86
16.00        85
14.00        81
4.00         76
9.00         71
50.00        68
13.00        64
17.00        63
23.00        40
35.00        39
18.00        37
19.00        27
40.00        27
26.00        26
22.00        23
100.00       22
2.00         21
24.00        13
32.00        12
12.50        11
36.00         9
27.00         9
45.00         8
38.00         8
1.00          7
75.00         7
28.00         6
37.00         6
47.00         5
200.00        4
51.00         3
31.00         3
33.00         3
60.00         3
46.00         2
42.00         2
34.00         2
44.00         2
41.00         2
53.00         2
43.00         2
29.00         2
44.21         1
7.50          1
10.70         1
13.92     

In [6]:
# drop 'target D' column since we don't need it in this lab
target = target.drop(['TARGET_D'], axis = 1)

### 1. Apply the Random Forests algorithm but this time only by upscaling the data using SMOTE.

### 2. Note that since SMOTE works on numerical data only, we will first encode the categorical variables in this case.

In [7]:
#encoding categorical then concat to numerical and target

encoder = OneHotEncoder(drop='first').fit(categorical)
encoded_categorical = encoder.transform(categorical).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)

df = pd.concat([numerical, encoded_categorical, target], axis = 1)

In [8]:
# SMOTE
# Uses knn to create rows with similar features from the minority classes.

smote = SMOTE()

y = df['TARGET_B']
X = df.drop(['TARGET_B'], axis = 1)

X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

0    90569
1    90569
Name: TARGET_B, dtype: int64

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.25, random_state=0)

In [12]:
# Building the model

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

0.8819697471568952


In [14]:
# For cross validation

clf = RandomForestClassifier(max_depth=2, random_state=0)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
print(cross_val_scores)

[0.8820845  0.88127484 0.8863536  0.88597718 0.88781745 0.88568274
 0.87868973 0.8812661  0.88833272 0.87655502]


In [15]:
# Building the model - max depth = 5

clf = RandomForestClassifier(max_depth=5, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

0.94218836259247


In [18]:
# For cross validation - max depth = 5

clf = RandomForestClassifier(max_depth=5, random_state=0)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=5)
print(cross_val_scores)

[0.94023039 0.94229141 0.94056163 0.94085388 0.94155318]
