# importating libraries

In [45]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix , accuracy_score
from sklearn.preprocessing import StandardScaler


In [2]:
data = pd.read_csv(r"https://raw.githubusercontent.com/omairaasim/machine_learning/master/project_11_k_nearest_neighbor/iphone_purchase_records.csv")

In [3]:
data.head()

Unnamed: 0,Gender,Age,Salary,Purchase Iphone
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


In [4]:
data.tail()

Unnamed: 0,Gender,Age,Salary,Purchase Iphone
395,Female,46,41000,1
396,Male,51,23000,1
397,Female,50,20000,1
398,Male,36,33000,0
399,Female,49,36000,1


In [5]:
data.describe()

Unnamed: 0,Age,Salary,Purchase Iphone
count,400.0,400.0,400.0
mean,37.655,69742.5,0.3575
std,10.482877,34096.960282,0.479864
min,18.0,15000.0,0.0
25%,29.75,43000.0,0.0
50%,37.0,70000.0,0.0
75%,46.0,88000.0,1.0
max,60.0,150000.0,1.0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Gender           400 non-null    object
 1   Age              400 non-null    int64 
 2   Salary           400 non-null    int64 
 3   Purchase Iphone  400 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 12.6+ KB


In [7]:
data['Gender'].value_counts()

Gender
Female    204
Male      196
Name: count, dtype: int64

In [8]:
data.loc[data['Purchase Iphone']==1,"Gender"].value_counts()

Gender
Female    77
Male      66
Name: count, dtype: int64

# Machine Learning

Spliting of data

In [9]:
data.head(10)

Unnamed: 0,Gender,Age,Salary,Purchase Iphone
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0
5,Male,27,58000,0
6,Female,27,84000,0
7,Female,32,150000,1
8,Male,25,33000,0
9,Female,35,65000,0


In [10]:
X = data.iloc[:,:3]
Y = data.iloc[:,3]

In [11]:
X

Unnamed: 0,Gender,Age,Salary
0,Male,19,19000
1,Male,35,20000
2,Female,26,43000
3,Female,27,57000
4,Male,19,76000
...,...,...,...
395,Female,46,41000
396,Male,51,23000
397,Female,50,20000
398,Male,36,33000


Label Encoding


In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
X

Unnamed: 0,Gender,Age,Salary
0,Male,19,19000
1,Male,35,20000
2,Female,26,43000
3,Female,27,57000
4,Male,19,76000
...,...,...,...
395,Female,46,41000
396,Male,51,23000
397,Female,50,20000
398,Male,36,33000


In [14]:
X['Gender'] = LabelEncoder().fit_transform(X['Gender'])

In [15]:
X

Unnamed: 0,Gender,Age,Salary
0,1,19,19000
1,1,35,20000
2,0,26,43000
3,0,27,57000
4,1,19,76000
...,...,...,...
395,0,46,41000
396,1,51,23000
397,0,50,20000
398,1,36,33000


In [16]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Gender  400 non-null    int32
 1   Age     400 non-null    int64
 2   Salary  400 non-null    int64
dtypes: int32(1), int64(2)
memory usage: 7.9 KB


Splitting the data into set

In [17]:
sp = StratifiedKFold(n_splits=5)

In [18]:
for train_index , test_index in sp.split(X,Y):
    x_train, x_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

In [19]:
len(x_train),len(x_test)

(320, 80)

# Feature Scaling

In [20]:
x_train, y_test

(     Gender  Age  Salary
 0         1   19   19000
 1         1   35   20000
 2         0   26   43000
 3         0   27   57000
 4         1   19   76000
 ..      ...  ...     ...
 354       1   36   99000
 355       1   60   34000
 356       1   54   70000
 358       1   40   71000
 360       1   43  129000
 
 [320 rows x 3 columns],
 266    0
 267    0
 269    0
 270    0
 276    0
       ..
 395    1
 396    1
 397    1
 398    0
 399    1
 Name: Purchase Iphone, Length: 80, dtype: int64)

In [21]:
scal = StandardScaler()

In [22]:
x_train = scal.fit_transform(x_train)
x_test = scal.fit_transform(x_test)

In [23]:
x_test

array([[ 1.        , -0.51982647,  0.49135807],
       [ 1.        , -0.94708111,  0.45312009],
       [ 1.        , -0.51982647, -0.04397368],
       [-1.        , -0.09257184,  2.70916104],
       [ 1.        , -0.8046629 ,  0.33840614],
       [ 1.        , -1.23191753, -0.04397368],
       [ 1.        ,  0.61951922,  3.0150649 ],
       [-1.        , -0.94708111, -0.0057357 ],
       [ 1.        , -0.37740826,  0.64431   ],
       [ 1.        , -0.94708111,  0.56783404],
       [-1.        , -1.23191753, -0.19692561],
       [-1.        , -1.08949932,  0.03250228],
       [ 1.        ,  0.19226459,  0.64431   ],
       [-1.        , -0.51982647, -0.08221166],
       [ 1.        , -0.23499005, -0.31163955],
       [-1.        ,  1.04677386,  2.74739902],
       [-1.        , -0.8046629 , -0.46459148],
       [-1.        , -0.23499005,  0.30016816],
       [-1.        , -0.8046629 , -0.46459148],
       [-1.        , -0.66224469,  0.64431   ],
       [ 1.        , -1.23191753, -0.273

Model Selection

1. log
2. KNN

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [25]:
log = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors=5)

# Training the model

In [26]:
log.fit(x_train,y_train)
knn.fit(x_train,y_train)

In [29]:
# using logistic regression
y_train_pred_log = log.predict(x_train)
# sing knn neighbor
y_train_pred_knn = knn.predict(x_train)

In [33]:
# using logistic regression
df1 = pd.DataFrame({"actual":y_train,"predicted":y_train_pred_log})
# sing knn neighbor
df2 = pd.DataFrame({"actual":y_train,"predicted":y_train_pred_knn})


In [34]:
df1

Unnamed: 0,actual,predicted
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
354,1,1
355,1,1
356,1,1
358,1,0


In [35]:
df2

Unnamed: 0,actual,predicted
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
354,1,1
355,1,1
356,1,1
358,1,0


In [36]:
df1.head()

Unnamed: 0,actual,predicted
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


confusion_matrix

In [38]:
confusion_matrix(y_train,y_train_pred_log)

array([[190,  16],
       [ 31,  83]], dtype=int64)

accuract of logistic regression ::: 
accuracy = (190+83)/320 = 85 %

In [39]:
confusion_matrix(y_train,y_train_pred_knn)

array([[191,  15],
       [ 14, 100]], dtype=int64)

accuracy of knn : : 
accuracy = (191+100)/320 = 97%

Hyper parameter 

Accuracy for range 3 to 100 odd numbers

In [69]:
list = [i for i in range(1,100) if i%2 != 0]

Using knn

In [82]:
acc = []
for i in list :
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train,y_train)
    y_test_pred_logi = knn.predict(x_test)
    acc.append(accuracy_score(y_test,y_test_pred_logi))
print(max(acc))

0.8875


In [87]:
dic = {}
for i in list:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train,y_train)
    y_test_pred_logi = knn.predict(x_test)
    dic[i] = accuracy_score(y_test,y_test_pred_logi)
print(max(dic))

99
