### import the required libraries

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### import dataset

In [33]:
df= pd.read_csv('social_network_ads.csv')
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [34]:
df.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


### Searching in database

In [36]:
df['Purchased'].value_counts()

Purchased
0    257
1    143
Name: count, dtype: int64

In [37]:
df.columns

Index(['User ID', 'Gender', 'Age', 'EstimatedSalary', 'Purchased'], dtype='object')

In [38]:
gender = pd.get_dummies( df['Gender'], drop_first=True )
df= pd.concat( [df, gender], axis= 1 )

df.drop( ['Gender'], axis= 1, inplace=True)

### Separate de dataset

In [39]:
from sklearn.preprocessing import StandardScaler

In [41]:
df.columns

Index(['User ID', 'Age', 'EstimatedSalary', 'Purchased', 'Male'], dtype='object')

In [42]:
X = df[['Age', 'EstimatedSalary', 'Male']]
y = df['Purchased']

In [44]:
scaler= StandardScaler()
scaler.fit(X)

In [45]:
scaled_features= scaler.transform(X)
scaled_features

array([[-1.78179743, -1.49004624,  1.02020406],
       [-0.25358736, -1.46068138,  1.02020406],
       [-1.11320552, -0.78528968, -0.98019606],
       ...,
       [ 1.17910958, -1.46068138, -0.98019606],
       [-0.15807423, -1.07893824,  1.02020406],
       [ 1.08359645, -0.99084367, -0.98019606]])

In [46]:
df_feat= pd.DataFrame( scaled_features, columns=X.columns )
df_feat.head()

Unnamed: 0,Age,EstimatedSalary,Male
0,-1.781797,-1.490046,1.020204
1,-0.253587,-1.460681,1.020204
2,-1.113206,-0.78529,-0.980196
3,-1.017692,-0.374182,-0.980196
4,-1.781797,0.183751,1.020204


### Training using KNN

In [48]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test= train_test_split( scaled_features, y, test_size=0.2 )

In [49]:
from sklearn.neighbors import KNeighborsClassifier

knn= KNeighborsClassifier( n_neighbors=1 )
knn.fit(X_train, y_train)

In [53]:
pred= knn.predict(X_test)

In [54]:
from sklearn.metrics import classification_report, confusion_matrix
confusion_matrix(y_test, pred)

array([[45,  6],
       [ 6, 23]])

In [55]:
print( classification_report(y_test, pred) )

              precision    recall  f1-score   support

           0       0.88      0.88      0.88        51
           1       0.79      0.79      0.79        29

    accuracy                           0.85        80
   macro avg       0.84      0.84      0.84        80
weighted avg       0.85      0.85      0.85        80

