In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter

In [15]:
# Load data
dataset = pd.read_csv("./datasets/mobiles-wine-combined.csv")
dataset.dtypes

battery_power      int64
blue               int64
clock_speed      float64
dual_sim           int64
fc                 int64
four_g             int64
int_memory         int64
m_dep            float64
mobile_wt          int64
n_cores            int64
pc                 int64
px_height          int64
px_width           int64
ram                int64
sc_h               int64
sc_w               int64
talk_time          int64
three_g            int64
touch_screen       int64
wifi               int64
A                float64
B                float64
C                float64
D                float64
E                  int64
F                float64
G                float64
H                float64
I                float64
J                float64
K                float64
L                float64
M                  int64
Class_ID           int64
dtype: object

In [16]:
dataset

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,E,F,G,H,I,J,K,L,M,Class_ID
0,842,0,2.2,0,1,0,7,0.6,188,2,...,0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2173,0,0,0.0,0,0,0,0,0.0,0,0,...,95,1.68,0.61,0.52,1.06,7.7,0.64,1.74,740,6
2174,0,0,0.0,0,0,0,0,0.0,0,0,...,102,1.80,0.75,0.43,1.41,7.3,0.70,1.56,750,6
2175,0,0,0.0,0,0,0,0,0.0,0,0,...,120,1.59,0.69,0.43,1.35,10.2,0.59,1.56,835,6
2176,0,0,0.0,0,0,0,0,0.0,0,0,...,120,1.65,0.68,0.53,1.46,9.3,0.60,1.62,840,6


In [17]:
# features and labels
X = dataset.iloc[:, 0:33].values
y = dataset.iloc[:, 33].values

# Row normalization
# for i, x in enumerate(X):
#     minVal = np.min(x)
#     norm_x = x - minVal
#     maxVal = np.max(norm_x)
#     norm_x /= maxVal
#     X[i] = norm_x
    
# Column normalization
def normalisation(X):
    for i in range(X.shape[1]):
        x = X[:, i]
        minVal = np.min(x)
        norm_x = x - minVal
        maxVal = np.max(norm_x)
        norm_x /= maxVal
        X[:, i] = norm_x

normalisation(X)
# Train and Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)

## Testing (regular knn)

In [18]:
K_VALUES = [(2 * i + 1) for i in range(1, 10)]

In [19]:
K_ACCURACY = []
for K in K_VALUES:
    match = 0
    total = 0
    for test_point_index, test_point in enumerate(X_test):
        distances_dict = Counter()
        class_count = Counter()

        for index, train_point in enumerate(X_train):
            distances_dict[index] = np.sum(np.square(test_point - train_point))

        for point_index, point in distances_dict.most_common()[-K:]:
            class_count[y_train[point_index]] += 1
            
        if class_count.most_common(1)[0][0] == y_test[test_point_index]:
            match += 1
        total += 1
    K_ACCURACY.append(match / total)

In [20]:
df = []
for i, value in enumerate(K_VALUES):
    df.append([value, K_ACCURACY[i]])
df = pd.DataFrame(df)
df.columns = ['K', 'Accuracy']
df

Unnamed: 0,K,Accuracy
0,3,0.408257
1,5,0.417431
2,7,0.405963
3,9,0.392202
4,11,0.422018
5,13,0.428899
6,15,0.428899
7,17,0.433486
8,19,0.481651


## Training

In [21]:
K_values_train = Counter()
for test_point_index, test_point in enumerate(X_train):
    distances_dict = Counter()
    class_count = Counter()
    
    for index, train_point in enumerate(X_train):
        distances_dict[index] = np.sum(np.square(test_point - train_point))
        
    matched_k_list = []
    for K in K_VALUES:
        for point_index, point in distances_dict.most_common()[-(K + 1):-1]:
            class_count[y_train[point_index]] += 1

        if class_count.most_common(1)[0][0] == y_train[test_point_index]:
            matched_k_list.append(K)
    K_values_train[test_point_index] = matched_k_list

In [22]:
K_count = Counter()
for test_point_index, test_point in enumerate(X_train):
    for k in K_values_train[test_point_index]:
        K_count[k] += 1

L = K_count.most_common(int(len(K_VALUES) * 0.75))[-1][1]
print('L:', L)

k_excluded_set = set() 
for k in K_VALUES:
    if K_count[k] < L:
        k_excluded_set.add(k)
        
for test_point_index, test_point in enumerate(X_train):
    final_k_list = []
    K_values_train[test_point_index] = list(set(K_values_train[test_point_index]) - k_excluded_set)  

L: 791


In [27]:
K_values_train
df = [[value_list] for value_list in K_values_train.values()]
df = pd.DataFrame(df)
df.columns = ["K values for correct classification per training point"]
# print(df.to_string())
df

Unnamed: 0,K values for correct classification per training point
0,"[5, 7, 9, 11, 15, 17]"
1,[19]
2,"[7, 9, 11, 15, 17, 19]"
3,[]
4,[]
...,...
1737,[]
1738,[]
1739,"[5, 7, 9, 11, 15, 17, 19]"
1740,[]


In [24]:
k_excluded_set

{3, 13}

## Testing

In [25]:
M = 9

In [26]:
match = 0
total = 0
for test_point_index, test_point in enumerate(X_test):
    total += 1
    distances_dict = Counter()
    k_count = Counter()
    class_count = Counter()
    
    for index, train_point in enumerate(X_train):
        distances_dict[index] = np.sum(np.square(test_point - train_point))
        
    for point_index, point in distances_dict.most_common()[-M:]:
        for k in K_values_train[point_index]:
            k_count[k] += 1
            
    try:        
        K = int(k_count.most_common(1)[0][0])
    except:
        K = int(M)
        
    for point_index, point in distances_dict.most_common()[-K:]:
        class_count[y_train[point_index]] += 1
    
    if class_count.most_common(1)[0][0] == y_test[test_point_index]:
        match += 1

print('Accuracy:')
print(match / total)

Accuracy:
0.4105504587155963
