# Load Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [2]:
from google.colab import files
up = files.upload()

Saving Salary_Random_Forest.txt to Salary_Random_Forest (1).txt


# Load Dataset 

In [3]:
data = pd.read_csv("Salary_Random_Forest.txt")

#Summarize Dataset



In [4]:
data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0


Check for Missing values

In [5]:
data.isnull().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

As UserID is unique, we can drop the feature

In [6]:
data.drop("User ID", axis=1, inplace=True)

Lets convert the Age and EstimatedSalary into Integers

In [7]:
data["Age"] = data["Age"].astype("int64")
data["EstimatedSalary"] = data["EstimatedSalary"].astype("int64")

Since Gender has only 2 categories, lets use Label Encoding the replace to object to Numberic values

In [8]:
data["Gender"] = data["Gender"].map({"Male": 1, "Female":0})

In [9]:
data

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,1,19,19000,0
1,1,35,20000,0
2,0,26,43000,0
3,0,27,57000,0
4,1,19,76000,0
...,...,...,...,...
395,0,46,41000,1
396,1,51,23000,1
397,0,50,20000,1
398,1,36,33000,0


#Split the dataset
Split the dataset into Independent and Dependent varibles

In [10]:
#Get Target data 
Y = data['Purchased']

#Load X Variables into a Pandas Dataframe with columns 
X = data.drop(['Purchased'], axis = 1)

In [11]:
X.shape

(400, 3)

In [12]:
from collections import Counter
Counter(Y)

Counter({0: 257, 1: 143})

Since the output class is skewed towards one class, we need to handle the Imbalanced data.
Lets use SMOTEtomek technique as it uses combination of Undersampling and Oversampling techinque. 

In [13]:
from imblearn.combine import SMOTETomek
smk = SMOTETomek(random_state=35)
X_res, Y_res = smk.fit_resample(X,Y)

In [14]:
Counter(Y_res)

Counter({0: 234, 1: 234})

# Divide Data into Train and test

In [15]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_res, Y_res, test_size=0.20, random_state=101)

In [16]:
print(f'X_train : {x_train.shape}')
print(f'y_train : {y_train.shape}')
print(f'X_test : {x_test.shape}')
print(f'y_test : {y_test.shape}')

X_train : (374, 3)
y_train : (374,)
X_test : (94, 3)
y_test : (94,)


# Build Random Forest Model with hyperparameters

In [17]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 80, num = 10)]
# Number of features to consider at every split
max_features = ['sqrt', "log2"]
# Maximum number of levels in tree
max_depth = [2,4]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [18]:
n_estimators

[10, 17, 25, 33, 41, 48, 56, 64, 72, 80]

In [19]:
# Create the param grid
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(param_grid)

{'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72, 80], 'max_features': ['sqrt', 'log2'], 'max_depth': [2, 4], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'bootstrap': [True, False]}


In [20]:
rf_Model = RandomForestClassifier()

In [21]:
from sklearn.model_selection import GridSearchCV
rf_Grid = GridSearchCV(estimator = rf_Model, param_grid = param_grid, cv = 3, verbose=1, n_jobs = 4)

In [22]:
rf_Grid.fit(x_train, y_train)

Fitting 3 folds for each of 320 candidates, totalling 960 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=4,
             param_grid={'bootstrap': [True, False], 'max_depth': [2, 4],
                         'max_features': ['sqrt', 'log2'],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5],
                         'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72,
                                          80]},
             verbose=1)

In [23]:
rf_Grid.best_params_

{'bootstrap': True,
 'max_depth': 2,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 72}

# Check Accuracy

In [24]:
print (f'Train Accuracy - : {rf_Grid.score(x_train,y_train):.3f}')
print (f'Test Accuracy - : {rf_Grid.score(x_test,y_test):.3f}')

Train Accuracy - : 0.960
Test Accuracy - : 0.979


# END