# Lab | Random Forests

- For this lab, you will be using the CSV files provided in the files_for_lab folder.

### 1. Apply the Random Forests algorithm but this time only by upscaling the data using SMOTE.

### 2. Note that since SMOTE works on numerical data only, we will first encode the categorical variables in this case.

In [1]:
import pandas as pd
import numpy as np
import warnings
import statsmodels.api as sm
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, Normalizer, LabelEncoder, OrdinalEncoder
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

In [2]:
numerical = pd.read_csv('/Users/spicasumampouw/IronSpica/lab/Unit_7/lab-random-forests/files_for_lab/numerical.csv')
categorical = pd.read_csv('/Users/spicasumampouw/IronSpica/lab/Unit_7/lab-random-forests/files_for_lab/categorical.csv')
targets = pd.read_csv('/Users/spicasumampouw/IronSpica/lab/Unit_7/lab-random-forests/files_for_lab/target.csv')

In [3]:
#numerical.info()
#categorical.dtypes
categorical.shape

(95412, 22)

In [5]:
encoder = OneHotEncoder(drop='first').fit(categorical)
encoded_categorical = encoder.transform(categorical).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)

df = pd.concat([numerical, encoded_categorical, targets], axis = 1)
regression_target = df['TARGET_D']

In [6]:
smote = SMOTE()
y = df['TARGET_B']
X = df.drop(['TARGET_B'], axis=1)
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

0    90569
1    90569
Name: TARGET_B, dtype: int64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.25, random_state=0)

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

y_train_regression = X_train['TARGET_D']
y_test_regression = X_test['TARGET_D']

In [8]:
param_grid = {
    'n_estimators': [50, 100],
    'min_samples_split': [2, 4],
    'min_samples_leaf' : [1, 2],
    'max_features': ['sqrt'],
    'max_samples' : ['None', 0.5]
    }
clf = RandomForestClassifier(random_state=100)

grid_search = GridSearchCV(clf, param_grid, cv=5,return_train_score=True,n_jobs=-1)
grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=100),
             n_jobs=-1,
             param_grid={'max_features': ['sqrt'], 'max_samples': ['None', 0.5],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 4],
                         'n_estimators': [50, 100]},
             return_train_score=True)

In [9]:
grid_search.best_params_

{'max_features': 'sqrt',
 'max_samples': 0.5,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 100}

In [10]:
clf.fit( X_train, y_train)
X_train.head()
feature_names = X_train.columns
feature_names = list(feature_names)

df = pd.DataFrame(list(zip(feature_names, clf.feature_importances_)))
df.columns = ['columns_name', 'score_feature_importance']
df.sort_values(by=['score_feature_importance'], ascending = False)

Unnamed: 0,columns_name,score_feature_importance
634,TARGET_D,0.358654
384,69,0.030643
585,270,0.027202
571,256,0.022815
379,64,0.020335
...,...,...
396,81,0.000000
598,283,0.000000
599,284,0.000000
600,285,0.000000


In [11]:
clf = RandomForestClassifier(random_state=0, max_features='sqrt', min_samples_leaf=1, min_samples_split=4, n_estimators=100)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
print(np.mean(cross_val_scores))

0.9995509845512556
