# Lab | Random Forests

- For this lab, you will be using the CSV files provided in the files_for_lab folder.

### 1. Apply the Random Forests algorithm but this time only by upscaling the data using SMOTE.

In [1]:
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import tree
import warnings
warnings.filterwarnings('ignore')

In [2]:
categorical = pd.read_csv('/Users/elissadejong/lab-random-forests/files_for_lab/categorical.csv')
categorical.head()

Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,IL,36,H,F,3,L,E,C,T,2,...,37,12,92,8,94,2,95,12,89,11
1,CA,14,H,M,3,L,G,A,S,1,...,52,2,93,10,95,12,95,12,93,10
2,NC,43,U,M,3,L,E,C,R,2,...,0,2,91,11,92,7,95,12,90,1
3,CA,44,U,F,3,L,E,C,R,2,...,28,1,87,11,94,11,95,12,87,2
4,FL,16,H,F,3,L,F,A,S,2,...,20,1,93,10,96,1,96,1,79,3


In [3]:
numerical = pd.read_csv('/Users/elissadejong/lab-random-forests/files_for_lab/numerical.csv')
numerical.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,0,60.0,5,9,0,0,39,34,18,10,...,14,5.0,12.0,10.0,4,7.741935,95515,0,4,39
1,1,46.0,6,9,16,0,15,55,11,6,...,1,10.0,25.0,25.0,18,15.666667,148535,0,2,1
2,1,61.611649,3,1,2,0,20,29,33,6,...,14,2.0,16.0,5.0,12,7.481481,15078,1,4,60
3,0,70.0,1,4,2,0,23,14,31,3,...,7,2.0,11.0,10.0,9,6.8125,172556,1,4,41
4,0,78.0,3,2,60,1,28,9,53,26,...,8,3.0,15.0,15.0,14,6.864865,7112,1,2,26


In [4]:
targets = pd.read_csv('/Users/elissadejong/lab-random-forests/files_for_lab/target.csv')
targets.head()

Unnamed: 0,TARGET_B,TARGET_D
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0


In [5]:
targets['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

### 2. Note that since SMOTE works on numerical data only, we will first encode the categorical variables in this case.

In [6]:
encoder = OneHotEncoder(drop='first').fit(categorical)
enc_categorical = encoder.transform(categorical).toarray()
enc_categorical = pd.DataFrame(enc_categorical)

In [7]:
data = pd.concat([numerical, enc_categorical, targets], axis = 1)
target_regression = data['TARGET_D']

In [8]:
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis=1)

In [9]:
smote = SMOTE()

X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

0    90569
1    90569
Name: TARGET_B, dtype: int64

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.25, random_state=0)

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

y_train_regression = X_train['TARGET_D']
y_test_regression = X_test['TARGET_D']

In [11]:
# param_grid = {
#     'n_estimators': [50, 100, 150, 200, 500, 1000],
#     'min_samples_split': [2, 4],
#     'min_samples_leaf' : [1, 2],
#     'max_features': ['sqrt']
#     }
# clf = RandomForestClassifier(random_state=100)

# grid_search = GridSearchCV(clf, param_grid, cv=5,return_train_score=True,n_jobs=-1)
# grid_search.fit(X_train,y_train)
# grid_search.best_params_ 

In [12]:
clf = RandomForestClassifier(random_state=0, max_features='sqrt', min_samples_leaf=1, min_samples_split=4, n_estimators=10)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
print(np.mean(cross_val_scores))

0.989974479021873


In [13]:
clf.fit( X_train, y_train)
X_train.head()
feature_names = X_train.columns
feature_names = list(feature_names)

df = pd.DataFrame(list(zip(feature_names, clf.feature_importances_)))
df.columns = ['columns_name', 'score_feature_importance']
df.sort_values(by=['score_feature_importance'], ascending = False)

Unnamed: 0,columns_name,score_feature_importance
634,TARGET_D,0.263369
585,270,0.050649
379,64,0.045189
571,256,0.035314
378,63,0.031604
...,...,...
532,217,0.000000
531,216,0.000000
530,215,0.000000
529,214,0.000000


### Questions:
- I dont really understand yet how to interpret the result i obtained from the cross validation score. Also, i only worked with n_estimators=10 because it took a crazy long time to run larger numbers. 
- I dont understand how in the train-test-split we were able to generate the regression variables (y_train_regression = X_train['TARGET_D'] & y_test_regression = X_test['TARGET_D']) like this and how this really works, in particular if you would continue with regression models then. Would you make both regression and classifier models in such cases?  