In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter

In [9]:
# Load data
dataset = pd.read_csv('train.csv')

In [10]:
dataset.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [11]:
# features and labels
X = dataset.iloc[:, 0:20].values
y = dataset.iloc[:, 20].values

# Normalization
for i, x in enumerate(X):
    minVal = np.min(x)
    norm_x = np.subtract(x, minVal)
    maxVal = np.max(norm_x)
    norm_x = np.divide(norm_x, maxVal)
    X[i] = norm_x

# Train and Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [12]:
K_VALUES = [(2 * i + 1) for i in range(1, 10)]
K_VALUES_MAX = max(K_VALUES)
K_values_train = Counter()
match = 0
for test_point_index, test_point in enumerate(X_train):
    distances_dict = Counter()
    class_count = Counter()
    
    for index, train_point in enumerate(X_train):
        distances_dict[index] = np.sum(np.square(test_point - train_point))
        
    matched_k_list = []
    for K in K_VALUES:
        for point_index, point in distances_dict.most_common()[-(K + 1):-1]:
            class_count[y_train[point_index]] += 1

        if class_count.most_common(1)[0][0] == y_train[test_point_index]:
            matched_k_list.append(K)
    K_values_train[test_point_index] = matched_k_list

In [13]:
K_values_train
df = [[value_list] for value_list in K_values_train.values()]
df = pd.DataFrame(df)
df.columns = ["K values for correct classification per training point"]
print(df.to_string())

     K values for correct classification per training point
0                                                        []
1                                                        []
2                          [3, 5, 7, 9, 11, 13, 15, 17, 19]
3                                                 [3, 5, 7]
4                                    [3, 5, 13, 15, 17, 19]
5                                                        []
6                          [3, 5, 7, 9, 11, 13, 15, 17, 19]
7                                                        []
8                                                        []
9                          [3, 5, 7, 9, 11, 13, 15, 17, 19]
10                                                [3, 5, 7]
11                                                       []
12                               [3, 7, 11, 13, 15, 17, 19]
13                         [3, 5, 7, 9, 11, 13, 15, 17, 19]
14                                                       []
15                         [3, 5, 7, 9, 

In [14]:
M = 15
match = 0
total = 0
for test_point_index, test_point in enumerate(X_test):
    total += 1
    distances_dict = Counter()
    k_count = Counter()
    class_count = Counter()
    
    for index, train_point in enumerate(X_train):
        distances_dict[index] = np.sum(np.square(test_point - train_point))
        
    for point_index, point in distances_dict.most_common()[-M:]:
        for k in K_values_train[point_index]:
            k_count[k] += 1
    try:        
        K = int(k_count.most_common(1)[0][0])
    except:
        K = int(M)
        
    for point_index, point in distances_dict.most_common()[-K:]:
        class_count[y_train[point_index]] += 1
    
    if class_count.most_common(1)[0][0] == y_test[test_point_index]:
        match += 1

print(match / total)

0.5925
