In [13]:
#imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn, sklearn.tree, sklearn.ensemble
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

# Data preparation: load the CSV, one-hot encode the categorical column,
# split into train/test sets and standardise the features.

# Read the CSV file.
dataset = pd.read_csv('https://storage.googleapis.com/neurals/data/Social_Network_Ads.csv')

# Drop the identifier column; it carries no predictive meaning.
dataset = dataset.drop(columns=['User ID'])

# Show the first 5 rows. A bare `dataset.head(5)` in the middle of a cell is
# evaluated and then discarded, so the frame has to be displayed explicitly.
display(dataset.head(5))

# Convert the categorical column at position 0 into one-hot columns.
# NOTE(review): position 0 is presumably the gender column - confirm against
# the CSV header.
enc = OneHotEncoder()
enc.fit(dataset.iloc[:, [0]])
onehotlabels = enc.transform(dataset.iloc[:, [0]]).toarray()

# Label the encoded columns with the categories the encoder actually learned
# instead of hard-coding names against column positions, and reuse the
# dataset's index so the column-wise concat below aligns row for row.
genders = pd.DataFrame(onehotlabels, columns=enc.categories_[0], index=dataset.index)

# Replace the original categorical column with its one-hot encoded columns.
result = pd.concat([genders, dataset.iloc[:, 1:]], axis=1, sort=False)

# Target label.
y = result['Purchased']
# Feature matrix: everything except the target label.
X = result.drop(columns=['Purchased'])
display(X.head(5))

# Train/test split (25% held out, fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Feature scaling: fit the scaler on the training split only and reuse its
# statistics for the test split, so no test information leaks into training.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [7]:
# Decision Tree (DT): build an entropy-based tree limited to depth 2 and fit
# it on the training split in one step (fit() hands back the fitted estimator).
classifier = sklearn.tree.DecisionTreeClassifier(
    criterion="entropy",
    max_depth=2,
    random_state=100,
).fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred = classifier.predict(X_test)

# Evaluation metrics, each comparing the true test labels with the predictions.
accuracy, recall, precision = (
    score(y_test, y_pred)
    for score in (metrics.accuracy_score, metrics.recall_score, metrics.precision_score)
)
cm = metrics.confusion_matrix(y_test, y_pred)  # confusion matrix (computed, not printed)
print(f"Accuracy of DT algorithm :{accuracy}\nRecall value of DT algorithm: {recall}\nPrecision value of DT algorithm: {precision}")

Accuracy of DT algorithm :0.94
Recall value of DT algorithm: 0.9375
Precision value of DT algorithm: 0.8823529411764706


In [10]:
# Gradient Boosting using XGBoost.
# XGBClassifier is imported together with the other dependencies in the
# notebook's import cell rather than in the middle of the notebook.

# Pin the seed explicitly, as the other model cells in this notebook do.
classifier = XGBClassifier(random_state=0)
classifier.fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred = classifier.predict(X_test)

# Evaluation metrics, each comparing the true test labels with the predictions.
cm = metrics.confusion_matrix(y_test, y_pred)  # cm means confusion matrix (computed, not printed)
accuracy = metrics.accuracy_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
print(f"Accuracy of XGB:{accuracy}\nRecall value of XGB: {recall}\nPrecision value of XGB: {precision}")

Accuracy of XGB:0.91
Recall value of XGB: 0.84375
Precision value of XGB: 0.8709677419354839


In [15]:
# Random Forest (RF): 10 entropy-based trees, each limited to depth 4,
# with a fixed seed.
classifier = sklearn.ensemble.RandomForestClassifier(
    n_estimators=10,
    criterion="entropy",
    max_depth=4,
    random_state=0,
)
classifier.fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred = classifier.predict(X_test)

# Evaluation metrics: true test labels versus predictions.
precision = metrics.precision_score(y_true=y_test, y_pred=y_pred)
recall = metrics.recall_score(y_true=y_test, y_pred=y_pred)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)  # confusion matrix (computed, not printed)
print(f"Accuracy of RF algorithm :{accuracy}\nRecall value of RF algorithm : {recall}\nPrecision value of RF algorithm : {precision}")

Accuracy of RF algorithm :0.93
Recall value of RF algorithm : 0.90625
Precision value of RF algorithm : 0.8787878787878788
