# Classification Codes Using RF

In [1]:
# Import libraries
import sklearn
import pandas as pd
import numpy as np
import pickle
import warnings

In [2]:
# Ignore warnings
warnings.filterwarnings("ignore")

In [3]:
# Read the raw table from the CSV file into a DataFrame and display it
csv_path = "Social_Network_Ads.csv"
dataset = pd.read_csv(csv_path)
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [5]:
# One-hot encode the text columns as integer 0/1 indicators.
# drop_first=True drops the first category of each encoded column, so a
# two-valued column such as Gender becomes the single column Gender_Male.
dataset = pd.get_dummies(data=dataset, drop_first=True, dtype=int)
dataset

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Male
0,15624510,19,19000,0,1
1,15810944,35,20000,0,1
2,15668575,26,43000,0,0
3,15603246,27,57000,0,0
4,15804002,19,76000,0,1
...,...,...,...,...,...
395,15691863,46,41000,1,0
396,15706071,51,23000,1,1
397,15654296,50,20000,1,0
398,15755018,36,33000,0,1


In [9]:
# Exclude the column named 'User ID': it is only a row identifier and does not contribute to the solution.
# errors='ignore' keeps this cell safe to re-run; without it a second execution raises
# KeyError because the column has already been removed from `dataset`.
dataset = dataset.drop(columns='User ID', errors='ignore')

In [11]:
# Count how many rows fall into each output class (0 and 1)
purchased_column = dataset["Purchased"]
purchased_column.value_counts()

Purchased
0    257
1    143
Name: count, dtype: int64

In [13]:
# Show the column labels currently in the dataset
# (these are the names used below to select the input and output columns)
dataset.columns

Index(['Age', 'EstimatedSalary', 'Purchased', 'Gender_Male'], dtype='object')

In [15]:
# Select the independent variables (model inputs) by column name
feature_columns = ['Age', 'EstimatedSalary', 'Gender_Male']
Input = dataset[feature_columns]
Input

Unnamed: 0,Age,EstimatedSalary,Gender_Male
0,19,19000,1
1,35,20000,1
2,26,43000,0
3,27,57000,0
4,19,76000,1
...,...,...,...
395,46,41000,0
396,51,23000,1
397,50,20000,0
398,36,33000,1


In [17]:
# Check the dimensions of the input table as (number of rows, number of columns)
Input.shape

(400, 3)

In [19]:
# Select the dependent variable (model output).
# Indexing with a list keeps the result a single-column DataFrame rather than a Series.
target_columns = ['Purchased']
Output = dataset[target_columns]
Output

Unnamed: 0,Purchased
0,0
1,0
2,0
3,0
4,0
...,...
395,1
396,1
397,1
398,0


In [21]:
# Create the random forest classifier (50 trees, fixed random_state) and fit it
from sklearn.ensemble import RandomForestClassifier as RFC
Classifier = RFC(n_estimators=50, random_state=0)
# Output is a single-column DataFrame, i.e. a column vector; scikit-learn expects a
# 1-D target and emits a DataConversionWarning for column vectors, so pass the
# 'Purchased' column itself (a 1-D Series) instead of the whole frame.
Classifier = Classifier.fit(Input, Output['Purchased'])

In [22]:
# Predict the class label for every row of Input.
# NOTE(review): Input is the same data the classifier was fitted on, so these are
# in-sample predictions; scores computed from Y_pred describe the fit to the training
# data only. Consider evaluating on a held-out split — TODO confirm intent.
Y_pred = Classifier.predict(Input)

In [25]:
# Generate the confusion matrix (rows: actual class, columns: predicted class).
# The function is imported under its own name so that assigning the result to `cm`
# does not overwrite the function with its output, which made `cm` uncallable afterwards.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Output, Y_pred)
print(cm)

[[257   0]
 [  1 142]]


In [27]:
# Generate the classification report (precision, recall, f1-score and support per class).
# The function is imported under its own name so that assigning the result to `cr`
# does not overwrite the function with its output, which made `cr` uncallable afterwards.
from sklearn.metrics import classification_report
cr = classification_report(Output, Y_pred)
print(cr)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       257
           1       1.00      0.99      1.00       143

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



In [29]:
# Persist the fitted classifier to disk with pickle.
filename = "Pythoncode_RFC.sav"
# The with-block closes the file handle (and flushes the data) even if dump() raises;
# a bare open() passed straight to pickle.dump leaves the handle open.
with open(filename, "wb") as model_file:
    pickle.dump(Classifier, model_file)

In [31]:
# Reload the saved classifier and predict the class of one new observation.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load files
# that you created yourself or otherwise trust.
with open("Pythoncode_RFC.sav", "rb") as model_file:
    load_model = pickle.load(model_file)
# The model was fitted on a DataFrame with named columns, so pass the sample with the
# same column names in the same order (Age, EstimatedSalary, Gender_Male) instead of a
# bare nested list, which carries no feature names.
sample = pd.DataFrame([[40, 30000, 1]], columns=['Age', 'EstimatedSalary', 'Gender_Male'])
Result = load_model.predict(sample)
Result

array([0], dtype=int64)