In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter

In [14]:
# Load data
dataset = pd.read_csv("./datasets/mobiles-wine-combined.csv")

In [15]:
dataset.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,E,F,G,H,I,J,K,L,M,Class_ID
0,842,0,2.2,0,1,0,7,0.6,188,2,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1


In [16]:
# features and labels
X = dataset.iloc[:, 0:33].values
y = dataset.iloc[:, 33].values

# Row normalization
# for i, x in enumerate(X):
#     minVal = np.min(x)
#     norm_x = x - minVal
#     maxVal = np.max(norm_x)
#     norm_x /= maxVal
#     X[i] = norm_x
    
# Column normalization
def normalisation(X):
    for i in range(X.shape[1]):
        x = X[:, i]
        minVal = np.min(x)
        norm_x = x - minVal
        maxVal = np.max(norm_x)
        norm_x /= maxVal
        X[:, i] = norm_x

normalisation(X)
# Train and Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Training

In [17]:
K_VALUES = [(2 * i + 1) for i in range(1, 10)]

In [18]:
def point_k_value(X_train, y_train, K_VALUES):
    K_values_train = Counter()
    for test_point_index, test_point in enumerate(X_train):
        distances_dict = Counter()
        class_count = Counter()

        for index, train_point in enumerate(X_train):
            distances_dict[index] = np.sum(np.square(test_point - train_point))

        matched_k_list = []
        for K in K_VALUES:
            for point_index, point in distances_dict.most_common()[-(K + 1):-1]:
                class_count[y_train[point_index]] += 1

            if class_count.most_common(1)[0][0] == y_train[test_point_index]:
                matched_k_list.append(K)
        K_values_train[test_point_index] = matched_k_list
    return K_values_train

K_values_train = point_k_value(X_train, y_train, K_VALUES)

In [19]:
K_values_train
df = [[value_list] for value_list in K_values_train.values()]
df = pd.DataFrame(df)
df.columns = ["K values for correct classification per training point"]
print(df.to_string())

     K values for correct classification per training point
0                                      [3, 5, 7, 9, 11, 13]
1                                                  [17, 19]
2                          [3, 5, 7, 9, 11, 13, 15, 17, 19]
3                                                        []
4                                                        []
5                                                        []
6                                      [11, 13, 15, 17, 19]
7                                                        []
8                                                        []
9                                             [3, 5, 7, 11]
10                                                   [3, 5]
11                         [3, 5, 7, 9, 11, 13, 15, 17, 19]
12                                     [11, 13, 15, 17, 19]
13                                             [15, 17, 19]
14                                                       []
15                                 [3, 5

In [20]:
# M = 15
def knn_unrestricted(X_train, X_test, y_train, y_test, M, K_values_train):
    match = 0
    total = 0
    for test_point_index, test_point in enumerate(X_test):
        total += 1
        distances_dict = Counter()
        k_count = Counter()
        class_count = Counter()

        for index, train_point in enumerate(X_train):
            distances_dict[index] = np.sum(np.square(test_point - train_point))

        for point_index, point in distances_dict.most_common()[-M:]:
            for k in K_values_train[point_index]:
                k_count[k] += 1
        try:        
            K = int(k_count.most_common(1)[0][0])
        except:
            K = int(M)

        for point_index, point in distances_dict.most_common()[-K:]:
            class_count[y_train[point_index]] += 1

        if class_count.most_common(1)[0][0] == y_test[test_point_index]:
            match += 1

    return match / total

print('Accuracy:')
knn_unrestricted(X_train, X_test, y_train, y_test, 5, K_values_train)

Accuracy:


0.43119266055045874

# Validation

In [21]:
K_VALUES = [(2 * i + 1) for i in range(1, 10)]

# np.reshape(X_train, (10, int(X_train.shape[0]/10), X_train.shape[1])).shape

In [22]:
def K_fold_validation(X,y,k):
    BINSIZE = int(len(X)/k)
    X_bins = []
    Y_bins = []
    
    X_Rem = []
    Y_Rem = []
    for i in range(0,k):
        X_bins.append(X[i * BINSIZE : (i + 1) * BINSIZE])
        Y_bins.append(y[i * BINSIZE : (i + 1) * BINSIZE])
        
        X_Rem.append(np.concatenate((X[:i * BINSIZE], X[(i + 1) * BINSIZE:]), axis=0))
        Y_Rem.append(np.concatenate((y[:i * BINSIZE], y[(i + 1) * BINSIZE:]), axis=0))
        
    X_bins = np.array(X_bins)
    Y_bins = np.array(Y_bins)
    
    X_Rem = np.array(X_Rem)
    Y_Rem = np.array(Y_Rem)
    
    test_scores = []
    maxval = 0
    maxm = 0
    max_score = 0
    score = 0
    for m in K_VALUES:
        score = 0
        for i in range(0,k):
            X_data = X_Rem[i]
            Y_data = Y_Rem[i]
            K_values_train = point_k_value(X_data, Y_data, K_VALUES)
            score += knn_unrestricted(X_data, X_bins[i], Y_data, Y_bins[i], m, K_values_train)
        if score > max_score:
            max_score = score
            maxm = m
    # print(maxm)
    return maxm

In [23]:
# M = K_fold_validation(X_train, y_train, 5)

# Testing

In [24]:
K_values_train = point_k_value(X_train, y_train, K_VALUES)

print('Accuracy:')
print(knn_unrestricted(X_train, X_test, y_train, y_test, 15, K_values_train))

Accuracy:
0.43119266055045874
