In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
df = pd.read_csv('Social_Network_Ads.csv')
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [3]:
df['Purchased'].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [5]:
df['Gender'].value_counts()

Female    204
Male      196
Name: Gender, dtype: int64

In [6]:
df['Gender'].replace({'Male':0,'Female':1}, inplace = True)

In [7]:
df.drop('User ID', axis = 1, inplace =  True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Gender           400 non-null    int64
 1   Age              400 non-null    int64
 2   EstimatedSalary  400 non-null    int64
 3   Purchased        400 non-null    int64
dtypes: int64(4)
memory usage: 12.6 KB


# Train Test Split

In [9]:
x = df.drop('Purchased',axis = 1)
y = df['Purchased']

In [10]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state = 1, stratify=y)

# Train Random Forest Classifier

In [12]:
rf_clf = RandomForestClassifier(random_state=1)
rf_clf.fit(x_train,y_train)

RandomForestClassifier(random_state=1)

# testing accuracy

In [13]:
y_pred = rf_clf.predict(x_test)
y_pred
test_accuracy = accuracy_score(y_test,y_pred)
print('Testing accuracy is:', test_accuracy)

Testing accuracy is: 0.8625


# Training accuracy

In [14]:
y_pred_train = rf_clf.predict(x_train)
train_accuracy = accuracy_score(y_train,y_pred_train)
print('Training accuracy is:', train_accuracy)

Training accuracy is: 0.996875


# Hyperparameters tuning

In [16]:
hyp = {'n_estimators': np.arange(10,100),
      'criterion' : ['gini','entropy'],
      'min_samples_split' : np.arange(2,10),
      'min_samples_leaf': np.arange(1,10),
      'max_depth' : np.arange(2,10)}

rf_clf = RandomForestClassifier(random_state= 1)
gscv_rf_clf = GridSearchCV(rf_clf,hyp,cv = 5)
gscv_rf_clf.fit(x_train,y_train)

KeyboardInterrupt: 

In [None]:
gscv_rf_clf.best_params_

In [None]:
hyp = {'n_estimators': np.arange(10,100),
      'criterion' : ['gini','entropy'],
      'min_samples_split' : np.arange(2,10),
      'min_samples_leaf': np.arange(1,10),
      'max_depth' : np.arange(2,10)}

rf_clf = RandomForestClassifier(random_state= 1)
rscv_rf_clf = RandomizedSearchCV(rf_clf, hyp, cv = 5)
rscv_rf_clf.fit(x_train,y_train)

In [None]:
rscv_rf_clf.best_params_

# Random Forest Regression

In [17]:
x = df.drop('EstimatedSalary',axis = 1)
y = df['EstimatedSalary']

In [19]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state = 1)

In [20]:
rf_reg = RandomForestRegressor(random_state = 10)
rf_reg.fit(x_train,y_train)

RandomForestRegressor(random_state=10)

# Testing Data Accuracy

In [23]:
y_pred = rf_reg.predict(x_test)

mse = mean_squared_error(y_test,y_pred)
print('Mean Squared error:', mse)

rmse = np.sqrt(mse)
rmse


r2_score(y_test,y_pred)

Mean Squared error: 1409170358.3029413


-0.19441603487920855