# Lab | Random Forests
- For this lab, you will be using the CSV files provided in the files_for_lab folder.

### 1. Apply the Random Forests algorithm but this time only by upscaling the data using SMOTE

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, Normalizer, LabelEncoder
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import tree

import warnings
warnings.filterwarnings('ignore')

In [2]:
numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
targets = pd.read_csv('target.csv')

### 2. Note that since SMOTE works on numerical data only, we will first encode the categorical variables in this case.

In [3]:
encoder = OneHotEncoder(drop='first').fit(categorical)
encoded_categorical = encoder.transform(categorical).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)

data = pd.concat([numerical, encoded_categorical, targets], axis = 1)

In [4]:
data.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,311,312,313,314,315,316,317,318,TARGET_B,TARGET_D
0,0,60.0,5,9,0,0,39,34,18,10,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0
1,1,46.0,6,9,16,0,15,55,11,6,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0.0
2,1,61.611649,3,1,2,0,20,29,33,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
3,0,70.0,1,4,2,0,23,14,31,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
4,0,78.0,3,2,60,1,28,9,53,26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0


In [5]:
# regression_target = data['TARGET_D']
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis = 1)

In [6]:
data['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [7]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis=1)
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

0    90569
1    90569
Name: TARGET_B, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.25, random_state=0)

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

y_train_regression = X_train['TARGET_D']
y_test_regression = X_test['TARGET_D']

My computer run 30 minutes without a chance of output. So I decided to go with n_estimators 10, 20. How to fix this problem for the final project? 

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [10, 20],
    'min_samples_split': [2, 4],
    'min_samples_leaf' : [1, 2],
    'max_features': ['sqrt']
#    'max_samples' : ['None', 0.5]
    }
clf = RandomForestClassifier(random_state=100)

grid_search = GridSearchCV(clf, param_grid, cv=5,return_train_score=True,n_jobs=-1)
grid_search.fit(X_train,y_train)
grid_search.best_params_

{'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 20}

In [10]:
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(random_state=0, max_features='sqrt', min_samples_leaf=1, min_samples_split=4, n_estimators=20)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
print(np.mean(cross_val_scores))

0.9953332293776403


I think this would be a classic case of overfitting. I try to avoid this with max_depth = 5

In [11]:
clf = RandomForestClassifier(random_state=0, max_features='sqrt', max_depth=5, min_samples_leaf=1, min_samples_split=4, n_estimators=20)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
print(np.mean(cross_val_scores))

0.9792496513845116


The score is still very high. So I tried a max_depth of 3.

In [12]:
clf = RandomForestClassifier(random_state=0, max_features='sqrt', max_depth=3, min_samples_leaf=1, min_samples_split=4, n_estimators=20)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
print(np.mean(cross_val_scores))

0.9465891542967789


Is this the right way to avoid over fitting? Still get an score of 0.94 and it seems to be to high. Would be happy if you could give me a feedback here :-). 