In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split
from collections import Counter

In [17]:
# Load data
dataset = pd.read_csv('train.csv')
dataset.dtypes

battery_power      int64
blue               int64
clock_speed      float64
dual_sim           int64
fc                 int64
four_g             int64
int_memory         int64
m_dep            float64
mobile_wt          int64
n_cores            int64
pc                 int64
px_height          int64
px_width           int64
ram                int64
sc_h               int64
sc_w               int64
talk_time          int64
three_g            int64
touch_screen       int64
wifi               int64
price_range        int64
dtype: object

In [18]:
# features and labels
X = dataset.iloc[:, 0:20].values
y_actual = dataset.iloc[:, 20].values

# Row normalization
# for i, x in enumerate(X):
#     minVal = np.min(x)
#     norm_x = x - minVal
#     maxVal = np.max(norm_x)
#     norm_x /= maxVal
#     X[i] = norm_x
    
# Column normalization
for i in range(X.shape[1]):
    x = X[:, i]
    minVal = np.min(x)
    norm_x = x - minVal
    maxVal = np.max(norm_x)
    norm_x /= maxVal
    X[:, i] = norm_x

## Clustering

In [19]:
# Find clusters using K - means algorithm
k = 4
y = np.array([0 for _ in range(X.shape[0])])

rand_idxs = random.sample(range(0, X.shape[0]), k)
means = np.array(X[rand_idxs])
means

array([[2.20506845e-01, 2.91290417e-04, 2.62161375e-04, 2.91290417e-04,
        1.16516167e-03, 0.00000000e+00, 1.86425867e-02, 1.45645208e-04,
        4.57325954e-02, 2.03903292e-03, 1.45645208e-03, 3.32071075e-02,
        2.38566851e-01, 1.00000000e+00, 1.74774250e-03, 1.45645208e-03,
        2.91290417e-03, 2.91290417e-04, 2.91290417e-04, 0.00000000e+00],
       [6.46164978e-01, 0.00000000e+00, 1.80897250e-04, 0.00000000e+00,
        3.25615051e-03, 3.61794501e-04, 2.09840810e-02, 2.53256151e-04,
        5.82489146e-02, 2.53256151e-03, 3.61794501e-03, 8.35745297e-02,
        6.04558611e-01, 1.00000000e+00, 3.25615051e-03, 1.08538350e-03,
        2.53256151e-03, 3.61794501e-04, 3.61794501e-04, 3.61794501e-04],
       [9.29291330e-01, 5.27676640e-05, 9.49817952e-04, 5.27676640e-05,
        6.38488734e-03, 5.27676640e-05, 1.27170070e-02, 0.00000000e+00,
        9.92559759e-02, 5.27676640e-05, 6.91256398e-03, 1.57828083e-01,
        2.77610680e-01, 1.00000000e+00, 6.91256398e-03, 4.8018

In [20]:
# clusters = np.array([np.array([]) for _ in range(k)])
prev_means = np.array([])

while not np.array_equal(prev_means, means):
    clusters = [[mean] for mean in means]
    for point_idx, point in enumerate(X):
        distances = np.array([-1 for _ in range(k)])

        for index, mean in enumerate(means):
            distances[index] = np.sum(np.square(point - mean))
        
        min_idx = np.argmin(distances)
        # clusters[min_idx] = np.append(clusters[min_idx], [point], axis = 0)
        clusters[min_idx].append(point)
        y[point_idx] = min_idx
    
    prev_means = means[:]
    # clusters = np.array(clusters)
    for idx, cluster in enumerate(clusters):
        means[idx] = np.mean(cluster)

match = 0
for a, b in zip(y, y_actual):
    if a == b:
        match += 1
match / len(y)

0.154

In [21]:
# Train and Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Form classes or set distinct labels for clusters
# classes = np.array([i for i in range(k)])
# num_classes = k
# classes

# Get distinct classes
classes = np.unique(y)
num_classes = len(classes)
classes

array([0, 1, 2, 3])

## Training

In [22]:
# Partition dataset based on class
class_partitions = []
for c in classes:
    partition = X_train[y_train == c]
    # partition = dataset.loc[dataset['price_range'] == 0].iloc[:, :-1].values
    class_partitions.append(partition)

In [23]:
K_VALUES = [3,5,7,9,11,13,15]

# Finding best k per class
k_classes = [0 for _ in range(num_classes)]
for i in range(num_classes):
    data = class_partitions[i]
    label = classes[i]
    
    K_values_train = Counter()
    match = 0
    for test_point_index, test_point in enumerate(data):
        distances_dict = Counter()
        class_count = Counter()

        for index, train_point in enumerate(X_train):
            distances_dict[index] = np.sum(np.square(test_point - train_point))

        matched_k_list = []
        for K in K_VALUES:
            for point_index, point in distances_dict.most_common()[-(K + 1):-1]:
                class_count[y_train[point_index]] += 1

            if class_count.most_common(1)[0][0] == y_train[test_point_index]:
                matched_k_list.append(K)
        K_values_train[test_point_index] = matched_k_list
        
    # Taking the first best k value in case of ties
    k_classes[i] = K_values_train.most_common(1)[0][1][0]
    
k_classes

[15, 13, 3, 3]

## Testing

In [24]:
match = 0
total = 0
y_pred = [0 for i in range(len(y_test))]
for test_point_index, test_point in enumerate(X_test):
    distances_dict = Counter()
    for index, train_point in enumerate(X_train):
        distances_dict[index] = np.sum(np.square(test_point - train_point))
    
    fractions = [0 for _ in range(num_classes)]
    for i, c in enumerate(classes):
        K = k_classes[i]
        
        class_count = Counter()
        for point_index, point in distances_dict.most_common()[-K:]:
            class_count[y_train[point_index]] += 1

        num_c = class_count[c]
        t = sum(class_count.values())
        fractions[i] = num_c / t
        
    max_fraction = fractions[0]
    labelIdx = 0
    for i, fraction in enumerate(fractions):
        if fraction > max_fraction:
            max_fraction = fraction
            labelIdx = i
    
    label = classes[labelIdx]
    y_pred[test_point_index] = label
    
    if label == y_test[test_point_index]:
        match += 1
    total += 1
    
accuracy = match / total
print(accuracy)
print(y_pred == y_test)

0.9675
[ True  True  True  True  True  True False  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True False  True False  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
 False  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True False  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  