In [1]:
import pandas as pd
df = pd.read_csv('iphone_purchase_records.csv')
df.head()

Unnamed: 0,Gender,Age,Salary,Purchase Iphone
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


In [2]:
# Checking if we have NaN in data set
df.isnull().sum()

Gender             0
Age                0
Salary             0
Purchase Iphone    0
dtype: int64

In [3]:
# For a model we need to pass numeric data but not categorical data
#Converting Category to numeric
mapping={'Male': 0, 'Female': 1}
df['Gender'] = df['Gender'].map(mapping)
df.head()

# Use inbuilt Label encoder

Unnamed: 0,Gender,Age,Salary,Purchase Iphone
0,0,19,19000,0
1,0,35,20000,0
2,1,26,43000,0
3,1,27,57000,0
4,0,19,76000,0


In [4]:
# Splitting features and target
y=df['Purchase Iphone'].values
#y=df.iloc[:,-1].values
#df.drop('Purchase Iphone',inplace=True, axis='columns')
#x=df[['Gender', 'Age', 'Salary']].values
X=df.iloc[:,:-1].values

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Gender           400 non-null    int64
 1   Age              400 non-null    int64
 2   Salary           400 non-null    int64
 3   Purchase Iphone  400 non-null    int64
dtypes: int64(4)
memory usage: 12.6 KB


In [25]:
# Splitting Data



In [29]:
# Feature Scaling and only KNN with default value 5
# As we map a  plot in the space it is calculated using the feature parameters, so all parameters must be normalised 
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
 


steps = [('scaler', StandardScaler()), ('knn', KNeighborsClassifier())]

pipeline = Pipeline(steps)
X_train, X_test, y_train, y_test = train_test_split(X,  y, test_size = 0.3, random_state = 25)

#print(X.shape)
#print(y.shape)


#Hyper Parameter tuning

knn_scaled = pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("Accuracy {}".format(accuracy_score(y_test, y_pred)))

print("Classification Report {}".format(classification_report(y_test, y_pred)))


Accuracy 0.9333333333333333
Classification Report               precision    recall  f1-score   support

           0       0.97      0.92      0.95        77
           1       0.87      0.95      0.91        43

    accuracy                           0.93       120
   macro avg       0.92      0.94      0.93       120
weighted avg       0.94      0.93      0.93       120



In [28]:
####Cross fold validation and hyper tuning the correct k value
import numpy as np
steps = [('scaler', StandardScaler()), ('knn', KNeighborsClassifier())]

pipeline = Pipeline(steps)
parameters = {'knn__n_neighbors': np.arange(1,50)}
X_train, X_test, y_train, y_test = train_test_split(X,  y, test_size = 0.3, random_state = 25)

#print(X.shape)
#print(y.shape)


#Hyper Parameter tuning
cv = GridSearchCV(pipeline, param_grid = parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)
print("Best Params {}".format(cv.best_params_))
print("Best Score {}" .format(cv.best_score_))
print("Classification Report {}" .format(classification_report(y_test, y_pred)))



Best Params {'knn__n_neighbors': 11}
Accuracy 0.8964285714285716
Classification Report               precision    recall  f1-score   support

           0       0.96      0.92      0.94        77
           1       0.87      0.93      0.90        43

    accuracy                           0.93       120
   macro avg       0.91      0.93      0.92       120
weighted avg       0.93      0.93      0.93       120

