In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter

In [71]:
# Load data
dataset = pd.read_csv('./datasets/train.csv')

In [72]:
dataset.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [73]:
# features and labels
X = dataset.iloc[:, 0:20].values
y = dataset.iloc[:, 20].values

# Row normalization
# for i, x in enumerate(X):
#     minVal = np.min(x)
#     norm_x = x - minVal
#     maxVal = np.max(norm_x)
#     norm_x /= maxVal
#     X[i] = norm_x
    
# Column normalization

def normalisation(X):
    for i in range(X.shape[1]):
        x = X[:, i]
        minVal = np.min(x)
        norm_x = x - minVal
        maxVal = np.max(norm_x)
        norm_x /= maxVal
        X[:, i] = norm_x

normalisation(X)
# Train and Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [74]:
K_VALUES = [(2 * i + 1) for i in range(1, 10)]

In [75]:
def point_k_value(X_train,y_train,K_VALUES):
    K_values_train = Counter()
    for test_point_index, test_point in enumerate(X_train):
        distances_dict = Counter()
        class_count = Counter()

        for index, train_point in enumerate(X_train):
            distances_dict[index] = np.sum(np.square(test_point - train_point))

        matched_k_list = []
        for K in K_VALUES:
            for point_index, point in distances_dict.most_common()[-(K + 1):-1]:
                class_count[y_train[point_index]] += 1

            if class_count.most_common(1)[0][0] == y_train[test_point_index]:
                matched_k_list.append(K)
        K_values_train[test_point_index] = matched_k_list
    return K_values_train

K_values_train = point_k_value(X_train,y_train,K_VALUES)

In [76]:
K_values_train
df = [[value_list] for value_list in K_values_train.values()]
df = pd.DataFrame(df)
df.columns = ["K values for correct classification per training point"]
print(df.to_string())

     K values for correct classification per training point
0                          [3, 5, 7, 9, 11, 13, 15, 17, 19]
1                                                        []
2                          [3, 5, 7, 9, 11, 13, 15, 17, 19]
3                          [3, 5, 7, 9, 11, 13, 15, 17, 19]
4                          [3, 5, 7, 9, 11, 13, 15, 17, 19]
5                                                        []
6                                          [13, 15, 17, 19]
7                                                        []
8                                                       [3]
9                                                  [17, 19]
10                                                       []
11                                                      [3]
12                         [3, 5, 7, 9, 11, 13, 15, 17, 19]
13                            [5, 7, 9, 11, 13, 15, 17, 19]
14                                             [15, 17, 19]
15                               [7, 9, 

In [77]:
#M = 15
def knn_unrestricted(X_train,X_test,y_train,y_test,M,K_values_train):
    match = 0
    total = 0
    for test_point_index, test_point in enumerate(X_test):
        total += 1
        distances_dict = Counter()
        k_count = Counter()
        class_count = Counter()

        for index, train_point in enumerate(X_train):
            distances_dict[index] = np.sum(np.square(test_point - train_point))

        for point_index, point in distances_dict.most_common()[-M:]:
            for k in K_values_train[point_index]:
                k_count[k] += 1
        try:        
            K = int(k_count.most_common(1)[0][0])
        except:
            K = int(M)

        for point_index, point in distances_dict.most_common()[-K:]:
            class_count[y_train[point_index]] += 1

        if class_count.most_common(1)[0][0] == y_test[test_point_index]:
            match += 1

    print(match / total)
    return match / total
knn_unrestricted(X_train,X_test,y_train,y_test,5,K_values_train)

0.435


0.435

# Validation

In [78]:
K_VALUES = [(2 * i + 1) for i in range(1, 10)]

np.reshape(X_train,(10,int(X_train.shape[0]/10),X_train.shape[1])).shape

(10, 160, 20)

In [79]:
def K_fold_validation(X,y,k):
    BINSIZE = int(len(X)/k)
    X_bins = []
    Y_bins = []
    
    X_Rem = []
    Y_Rem = []
    for i in range(0,k):
        X_bins.append(X[i * BINSIZE : (i + 1) * BINSIZE])
        Y_bins.append(y[i * BINSIZE : (i + 1) * BINSIZE])
        
        X_Rem.append(np.concatenate((X[:i * BINSIZE],X[(i + 1) * BINSIZE:]),axis=0))
        Y_Rem.append(np.concatenate((y[:i * BINSIZE],y[(i + 1) * BINSIZE:]),axis=0))
        
    X_bins = np.array(X_bins)
    Y_bins = np.array(Y_bins)
    
    X_Rem = np.array(X_Rem)
    Y_Rem = np.array(Y_Rem)
    
    test_scores = []
    maxval = 0
    maxm = 0
    max_score = 0
    score = 0
    for m in K_VALUES:
        score = 0
        for i in range(0,k):
            X_data = X_Rem[i]
            Y_data = Y_Rem[i]
            #model = DecisionTreeClassifier()
            #model.fit(X_data,Y_data)
            K_values_train = point_k_value(X_data,Y_data,K_VALUES)
            score += knn_unrestricted(X_data,X_bins[i],Y_data,Y_bins[i],m,K_values_train)
        if score > max_score:
            max_score = score
            maxm = m
    print(maxm)
    return maxm

In [80]:
M = K_fold_validation(X_train,y_train,5)

0.365625
0.365625
0.4
0.36875
0.434375
0.35
0.346875
0.40625
0.3875
0.409375
0.35
0.3625
0.43125
0.38125
0.421875
0.35625
0.384375
0.425
0.390625
0.440625
0.371875
0.365625
0.409375
0.375
0.434375
0.375
0.3875
0.4
0.396875
0.43125
0.3875
0.403125
0.396875
0.403125
0.425
0.390625
0.396875
0.409375
0.409375
0.421875
0.3875
0.390625
0.41875
0.425
0.40625
17


In [81]:
K_values_train = point_k_value(X_train,y_train,K_VALUES)
knn_unrestricted(X_train,X_test,y_train,y_test,M,K_values_train)

0.4475


0.4475

In [83]:
M

17