# Lab | Random Forests

- For this lab, you will be using the CSV files provided in the files_for_lab folder.

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score

### 1. Apply the Random Forests algorithm but this time only by upscaling the data using SMOTE.

In [2]:
numerical = pd.read_csv('files_for_lab/numerical.csv')
numerical.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,0,60.0,5,9,0,0,39,34,18,10,...,14,5.0,12.0,10.0,4,7.741935,95515,0,4,39
1,1,46.0,6,9,16,0,15,55,11,6,...,1,10.0,25.0,25.0,18,15.666667,148535,0,2,1
2,1,61.611649,3,1,2,0,20,29,33,6,...,14,2.0,16.0,5.0,12,7.481481,15078,1,4,60
3,0,70.0,1,4,2,0,23,14,31,3,...,7,2.0,11.0,10.0,9,6.8125,172556,1,4,41
4,0,78.0,3,2,60,1,28,9,53,26,...,8,3.0,15.0,15.0,14,6.864865,7112,1,2,26


In [3]:
categorical =  pd.read_csv('files_for_lab/categorical.csv')
categorical.head()

Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,IL,36,H,F,3,L,E,C,T,2,...,37,12,92,8,94,2,95,12,89,11
1,CA,14,H,M,3,L,G,A,S,1,...,52,2,93,10,95,12,95,12,93,10
2,NC,43,U,M,3,L,E,C,R,2,...,0,2,91,11,92,7,95,12,90,1
3,CA,44,U,F,3,L,E,C,R,2,...,28,1,87,11,94,11,95,12,87,2
4,FL,16,H,F,3,L,F,A,S,2,...,20,1,93,10,96,1,96,1,79,3


In [4]:
targets = pd.read_csv('files_for_lab/target.csv')
targets.head()

Unnamed: 0,TARGET_B,TARGET_D
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0


### 2. Note that since SMOTE works on numerical data only, we will first encode the categorical variables in this case.

In [5]:
encoder = OneHotEncoder(drop='first').fit(categorical)
encoded_categorical = encoder.transform(categorical).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)

In [6]:
data = pd.concat([numerical, encoded_categorical, targets], axis = 1)
regression_target = data['TARGET_D']
data.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,311,312,313,314,315,316,317,318,TARGET_B,TARGET_D
0,0,60.0,5,9,0,0,39,34,18,10,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0
1,1,46.0,6,9,16,0,15,55,11,6,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0.0
2,1,61.611649,3,1,2,0,20,29,33,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
3,0,70.0,1,4,2,0,23,14,31,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
4,0,78.0,3,2,60,1,28,9,53,26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0


In [7]:
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis = 1)

In [8]:
smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

1    90569
0    90569
Name: TARGET_B, dtype: int64

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.25, random_state=42)

In [11]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [12]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

0.9819587059732804


In [13]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
print(np.mean(cross_val_scores))

0.9775050980460576
