In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
# Load data
dataset = pd.read_csv('train.csv')
dataset.dtypes

battery_power      int64
blue               int64
clock_speed      float64
dual_sim           int64
fc                 int64
four_g             int64
int_memory         int64
m_dep            float64
mobile_wt          int64
n_cores            int64
pc                 int64
px_height          int64
px_width           int64
ram                int64
sc_h               int64
sc_w               int64
talk_time          int64
three_g            int64
touch_screen       int64
wifi               int64
price_range        int64
dtype: object

In [3]:
# features and labels
X = dataset.iloc[:, 0:20].values
y = dataset.iloc[:, 20].values

# Row normalization
# for i, x in enumerate(X):
#     minVal = np.min(x)
#     norm_x = x - minVal
#     maxVal = np.max(norm_x)
#     norm_x /= maxVal
#     X[i] = norm_x
    
# Column normalization
for i in range(X.shape[1]):
    x = X[:, i]
    minVal = np.min(x)
    norm_x = x - minVal
    maxVal = np.max(norm_x)
    norm_x /= maxVal
    X[:, i] = norm_x

# Train and Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [4]:
classes = np.unique(y)
num_classes = len(classes)
classes

array([0, 1, 2, 3])

## Training

In [5]:
class_partitions = []
for c in classes:
    partition = X_train[y_train == c]
    # partition = dataset.loc[dataset['price_range'] == 0].iloc[:, :-1].values
    class_partitions.append(partition)

In [6]:
K_VALUES = [3,5,7,9,11,13,15]

# Finding best k per class
k_classes = [0 for _ in range(num_classes)]
for i in range(num_classes):
    data = class_partitions[i]
    label = classes[i]
    
    K_values_train = Counter()
    match = 0
    for test_point_index, test_point in enumerate(data):
        distances_dict = Counter()
        class_count = Counter()

        for index, train_point in enumerate(X_train):
            distances_dict[index] = np.sum(np.square(test_point - train_point))

        matched_k_list = []
        for K in K_VALUES:
            for point_index, point in distances_dict.most_common()[-(K + 1):-1]:
                class_count[y_train[point_index]] += 1

            if class_count.most_common(1)[0][0] == y_train[test_point_index]:
                matched_k_list.append(K)
        K_values_train[test_point_index] = matched_k_list
        
    # Taking the first best k value in case of ties
    k_classes[i] = K_values_train.most_common(1)[0][1][0]
    
k_classes

[15, 15, 15, 15]

## Testing

In [8]:
match = 0
total = 0
y_pred = [0 for i in range(len(y_test))]
for test_point_index, test_point in enumerate(X_test):
    distances_dict = Counter()
    for index, train_point in enumerate(X_train):
        distances_dict[index] = np.sum(np.square(test_point - train_point))
    
    fractions = [0 for _ in range(num_classes)]
    for i, c in enumerate(classes):
        K = k_classes[i]
        
        class_count = Counter()
        for point_index, point in distances_dict.most_common()[-K:]:
            class_count[y_train[point_index]] += 1

        num_c = class_count[c]
        t = sum(class_count.values())
        fractions[i] = num_c / t
        
    max_fraction = fractions[0]
    labelIdx = 0
    for i, fraction in enumerate(fractions):
        if fraction > max_fraction:
            max_fraction = fraction
            labelIdx = i
    
    label = classes[labelIdx]
    y_pred[test_point_index] = label
    
    if label == y_test[test_point_index]:
        match += 1
    total += 1
    
accuracy = match / total
print(accuracy)
print(y_pred == y_test)

0.57
[ True False False  True False  True  True False False False False  True
  True  True  True  True  True  True False False False  True False False
 False False  True False  True  True False  True  True False False  True
  True  True  True  True False False  True False False False  True  True
 False False  True  True False False  True  True  True False  True  True
  True False  True  True False  True  True  True False False  True False
 False  True  True  True False False False False  True False False  True
  True  True  True False  True  True  True False False False  True False
 False  True  True  True False False  True  True  True False False  True
  True  True False  True False False False  True False False  True  True
 False False  True  True  True False False  True  True False  True  True
  True False  True  True  True  True  True False False  True False  True
 False  True  True  True False  True  True  True  True False  True  True
  True  True  True  True  True False  True  Tr