In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv("Customer-Behaviour.csv")

In [3]:
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [4]:
df.shape

(400, 5)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [6]:
df.corr()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
User ID,1.0,-0.000721,0.071097,0.00712
Age,-0.000721,1.0,0.155238,0.622454
EstimatedSalary,0.071097,0.155238,1.0,0.362083
Purchased,0.00712,0.622454,0.362083,1.0


In [7]:
df.Gender.value_counts()

Female    204
Male      196
Name: Gender, dtype: int64

In [8]:
df.Purchased.value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [9]:
df.isnull().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [10]:
lb=LabelEncoder()
df['Gender']=lb.fit_transform(df['Gender'])
df['Gender'].value_counts()

0    204
1    196
Name: Gender, dtype: int64

In [11]:
df=df.drop(['User ID'],axis=1)

In [12]:
x=df.iloc[:,:-1].values
y=df.iloc[:,-1].values

In [13]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.3,random_state=42)

In [14]:
lr=LogisticRegression()
lr=lr.fit(x_train,y_train)
y_pred_lr=lr.predict(x_test)
print('Logistic regression Precision:',precision_score(y_test,y_pred_lr))
print('Logistic regression Recall:',recall_score(y_test,y_pred_lr))
print('Logistic regression F1-score:',f1_score(y_test,y_pred_lr))
print('Logistic regression Accuracy:',accuracy_score(y_test,y_pred_lr))

Logistic regression Precision: 0.0
Logistic regression Recall: 0.0
Logistic regression F1-score: 0.0
Logistic regression Accuracy: 0.6083333333333333


In [15]:
rf=RandomForestClassifier()
rf=rf.fit(x_train,y_train)
y_pred_rf=rf.predict(x_test)
print('Random forest Precision:',precision_score(y_test,y_pred_rf))
print('Random forest Recall:',recall_score(y_test,y_pred_rf))
print('Random forest F1-score:',f1_score(y_test,y_pred_rf))
print('Random forest Accuracy:',accuracy_score(y_test,y_pred_rf))

Random forest Precision: 0.8888888888888888
Random forest Recall: 0.851063829787234
Random forest F1-score: 0.8695652173913044
Random forest Accuracy: 0.9


In [16]:
svm=SVC()
svm=svm.fit(x_train,y_train)
y_pred_svm=svm.predict(x_test)
print('Support vector machine Precision:',precision_score(y_test,y_pred_svm))
print('Support vector machine Recall:',recall_score(y_test,y_pred_svm))
print('Support vector machine F1-score:',f1_score(y_test,y_pred_svm))
print('Support vector machine Accuracy:',accuracy_score(y_test,y_pred_svm))

Support vector machine Precision: 0.8571428571428571
Support vector machine Recall: 0.3829787234042553
Support vector machine F1-score: 0.5294117647058824
Support vector machine Accuracy: 0.7333333333333333


In [17]:
y_pred_rf

array([1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 0], dtype=int64)

In [18]:
y_test

array([0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0], dtype=int64)

In [23]:
df2=df[df.Purchased==0].head()

In [24]:
df3=df[df.Purchased==1].head()

In [27]:
df4=pd.concat([df2.head(3),df3.head(3)])
df4

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,1,19,19000,0
1,1,35,20000,0
2,0,26,43000,0
7,0,32,150000,1
16,1,47,25000,1
17,1,45,26000,1


In [21]:
#Testing Model for prediction

In [40]:
#Customers will not buy test
print(rf.predict(np.array([[1,19,19000]])))
print(rf.predict(np.array([[1,35,20000]])))
print(rf.predict(np.array([[0,26,43000]])))

#Customers will buy Test
print(rf.predict(np.array([[0,32,150000]])))
print(rf.predict(np.array([[1,47,25000]])))
print(rf.predict(np.array([[0,45,26000]])))

[0]
[0]
[0]
[1]
[1]
[1]


In [41]:
#Customers will not buy test
s=[rf.predict(np.array([[1,19,19000]])),rf.predict(np.array([[1,35,20000]])),\
   rf.predict(np.array([[0,26,43000]])),rf.predict(np.array([[0,32,150000]])),
   rf.predict(np.array([[1,47,25000]])),rf.predict(np.array([[0,45,26000]]))]
df4['Prediction']=s
df4

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased,Prediction
0,1,19,19000,0,[0]
1,1,35,20000,0,[0]
2,0,26,43000,0,[0]
7,0,32,150000,1,[1]
16,1,47,25000,1,[1]
17,1,45,26000,1,[1]
