# KNN

### Import libraries and load data

In [3]:
import numpy as np
import pandas as pd 
dataset = pd.read_csv("iphone_purchase_records.csv")
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:, 3].values
dataset.Gender=dataset.Gender.map({'Female':1,'Male':0})
dataset.head(5)

Unnamed: 0,Gender,Age,Salary,Purchase Iphone
0,0,19,19000,0
1,0,35,20000,0
2,1,26,43000,0
3,1,27,57000,0
4,0,19,76000,0


### Data pre-processing and selection

In [4]:
from sklearn.preprocessing import LabelEncoder
labelEncoder_gender =  LabelEncoder()
X[:,0] = labelEncoder_gender.fit_transform(X[:,0])

X = np.vstack(X[:, :]).astype(np.float64)

### Training and Testing data

In [5]:
train_size = int(dataset.shape[0]*0.60)

In [6]:
train_df = dataset.iloc[:train_size,:] 
test_df = dataset.iloc[train_size:,:]
train = dataset.values
test = test_df.values
y_true = test[:,-1]
print('Train_Shape: ',train_df.shape)
print('Test_Shape: ',test_df.shape)

Train_Shape:  (240, 4)
Test_Shape:  (160, 4)


### KNN in 3 Steps:
1. Measure distance (Euclidean Distance or Manhattan Distance)
2. Get nearest neighbours
3. Predict Classifier

#### Step 1. Euclidian distance
- Measuring Distance using Euclidean Distance:
  <b>Mathematical formula √ (x2 − x1)2 + (y2 − y1)2</b>

In [7]:
from math import sqrt
def euclidean_distance(x_test, x_train):
    distance = 0
    for i in range(len(x_test)-1):
        distance += (x_test[i]-x_train[i])**2
    return sqrt(distance)

#### Step 2. Getting the nearest neighbours

In [8]:
def get_neighbors(x_test, x_train, num_neighbors):
    distances = []
    data = []
    for i in x_train:
        distances.append(euclidean_distance(x_test,i))
        data.append(i)
    distances = np.array(distances)
    data = np.array(data)
    sort_indexes = distances.argsort()             #argsort() function returns indices by sorting distances data in ascending order
    data = data[sort_indexes]                      #modifying our data based on sorted indices, so that we can get the nearest neightbours
    return data[:num_neighbors]     

#### Step 3. Predicting the classifier of which our new data point belongs to.

In [9]:
def prediction(x_test, x_train, num_neighbors):
    classes = []
    neighbors = get_neighbors(x_test, x_train, num_neighbors)
    for i in neighbors:
        classes.append(i[-1])
    predicted = max(classes, key=classes.count)              #taking the most repeated class
    return predicted

### Measuring the accuracy. So that we can know how accurate our model would predict new data samples

In [10]:
def accuracy(y_true, y_pred):
    num_correct = 0
    for i in range(len(y_true)):
        if y_true[i]==y_pred[i]:
            num_correct+=1
    accuracy = num_correct/len(y_true)
    return accuracy

### Predicting test data

In [11]:
y_pred = []
for i in test:
    y_pred.append(prediction(i, train, 4))

In [12]:
accuracy = accuracy(y_true, y_pred)

### Accuracy

In [13]:
accuracy*100

89.375

### Sample Output

In [14]:
test_df.sample(5)

Unnamed: 0,Gender,Age,Salary,Purchase Iphone
244,1,41,72000,0
385,0,56,60000,1
250,1,44,39000,0
317,0,35,55000,0
281,0,35,61000,0
