In [62]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter

In [63]:
# Load data
dataset = pd.read_csv('./datasets/train.csv')

In [64]:
dataset.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [65]:
# features and labels
X = dataset.iloc[:, 0:20].values
y = dataset.iloc[:, 20].values

# Row normalization
# for i, x in enumerate(X):
#     minVal = np.min(x)
#     norm_x = x - minVal
#     maxVal = np.max(norm_x)
#     norm_x /= maxVal
#     X[i] = norm_x
    
# Column normalization

def normalisation(X):
    for i in range(X.shape[1]):
        x = X[:, i]
        minVal = np.min(x)
        norm_x = x - minVal
        maxVal = np.max(norm_x)
        norm_x /= maxVal
        X[:, i] = norm_x

normalisation(X)
# Train and Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [66]:
K_VALUES = [(2 * i + 1) for i in range(1, 10)]

In [67]:
K_ACCURACY = []
for K in K_VALUES:
    match = 0
    total = 0
    for test_point_index, test_point in enumerate(X_test):
        distances_dict = Counter()
        class_count = Counter()

        for index, train_point in enumerate(X_train):
            distances_dict[index] = np.sum(np.square(test_point - train_point))

        for point_index, point in distances_dict.most_common()[-K:]:
            class_count[y_train[point_index]] += 1
            
        if class_count.most_common(1)[0][0] == y_test[test_point_index]:
            match += 1
        total += 1
    K_ACCURACY.append(match / total)

In [68]:
df = []
for i, value in enumerate(K_VALUES):
    df.append([value, K_ACCURACY[i]])
df = pd.DataFrame(df)
df.columns = ['K', 'Accuracy']
df

Unnamed: 0,K,Accuracy
0,3,0.3775
1,5,0.3975
2,7,0.405
3,9,0.4275
4,11,0.425
5,13,0.445
6,15,0.435
7,17,0.435
8,19,0.455


In [69]:
K_values_train = Counter()
for test_point_index, test_point in enumerate(X_train):
    distances_dict = Counter()
    class_count = Counter()
    
    for index, train_point in enumerate(X_train):
        distances_dict[index] = np.sum(np.square(test_point - train_point))
        
    matched_k_list = []
    for K in K_VALUES:
        for point_index, point in distances_dict.most_common()[-(K + 1):-1]:
            class_count[y_train[point_index]] += 1

        if class_count.most_common(1)[0][0] == y_train[test_point_index]:
            matched_k_list.append(K)
    K_values_train[test_point_index] = matched_k_list

In [70]:
K_count = Counter()
for test_point_index, test_point in enumerate(X_train):
    for k in K_values_train[test_point_index]:
        K_count[k] += 1

L = K_count.most_common(int(len(K_VALUES) * 0.75))[-1][1]
print(L)
k_excluded_set = set() 
for k in K_VALUES:
    if K_count[k] < L:
        k_excluded_set.add(k)
        
for test_point_index, test_point in enumerate(X_train):
    final_k_list = []
    K_values_train[test_point_index] = list(set(K_values_train[test_point_index]) - k_excluded_set)  

645


In [75]:
k_excluded_set

{3, 5, 7}

In [73]:
M = 17

In [74]:
match = 0
total = 0
for test_point_index, test_point in enumerate(X_test):
    total += 1
    distances_dict = Counter()
    k_count = Counter()
    class_count = Counter()
    
    for index, train_point in enumerate(X_train):
        distances_dict[index] = np.sum(np.square(test_point - train_point))
        
    for point_index, point in distances_dict.most_common()[-M:]:
        for k in K_values_train[point_index]:
            k_count[k] += 1
            
    try:        
        K = int(k_count.most_common(1)[0][0])
    except:
        K = int(M)
        
    for point_index, point in distances_dict.most_common()[-K:]:
        class_count[y_train[point_index]] += 1
    
    if class_count.most_common(1)[0][0] == y_test[test_point_index]:
        match += 1

print(match / total)

0.4675
