## KNN Algorithm for Yeast Data


In [21]:
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd  
from math import floor, ceil, sqrt

In [22]:
def the_train_test_split(X, test_ratio = 0.2):
    if(test_ratio >= 1 or test_ratio <0):
        test_ratio = 0.2
    row, _ = X.shape
    train_count = floor(row * (1-test_ratio)) 
    train = X[:train_count]
    test = X[train_count:]
    return train, test

In [32]:
def euclidean_distance(x,y):
    return sqrt(sum(np.square(x-y)))

def get_distance(x, y, algorithm ="euclidean"):
    if(algorithm == "euclidean"):
        return euclidean_distance(x,y)

In [95]:
class K_Neigbours_Classifier():
    
    def __init__(self, neigbour_count = 7, algorithm = "euclidean"):
        self.alg = algorithm
        self.n_count = neigbour_count

    def fit(self, train_input, train_output):
        self.train_in = train_input
        self.train_out = train_output
        #
        pd.unique(self.train_out) # since it is array of arrays sized 1
        self.categories = pd.unique(self.train_out.ravel())
    
    def predict(self, single):
        # calculate the distances
        distances = np.apply_along_axis(get_distance, 1, self.train_in, y=single, algorithm=self.alg)
        #print(distances)
        nearest_indices = np.argpartition(distances, self.n_count)[:self.n_count]
        #print(nearest_indices)
        category_dict = dict.fromkeys(self.categories, 0)
        nearest_keys = self.train_out[nearest_indices]
        for neigbour_key in nearest_keys:
            category_dict[neigbour_key] = 1 + category_dict[neigbour_key]
        the_key_with_max = max(category_dict, key=category_dict.get)
        #print("We predict this one to be: ", the_key_with_max)
        return the_key_with_max

In [24]:
file_name = "yeast.csv" 
md = pd.read_csv(file_name)

# md.dropna(inplace = True)
# md.replace('unknown', 0, inplace = True)
md.head()

Unnamed: 0,ADT1_YEAST,0.58,0.61,0.47,0.13,0.50,0.00,0.48,0.22,MIT
0,ADT2_YEAST,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
1,ADT3_YEAST,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
2,AAR2_YEAST,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
3,AATM_YEAST,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT
4,AATC_YEAST,0.51,0.4,0.56,0.17,0.5,0.5,0.49,0.22,CYT


In [98]:
test_ratio = 0.2
X = md.values[:,1:9]
Y = md.values[:,9:]

X_train, X_test = the_train_test_split(X, test_ratio = test_ratio)
Y_train, Y_test = the_train_test_split(Y, test_ratio = test_ratio)

In [99]:
knc = K_Neigbours_Classifier(neigbour_count=13)
knc.fit(X_train, Y_train[:,0]) # we know that y_train is 1 dimensional 

In [100]:
correct_pred = 0;
incorrect_pred = 0;
for i in range (Y_test.size):
    if( knc.predict(X_test[i]) == Y_test[i]):
        correct_pred = 1 + correct_pred
    else:
        incorrect_pred = 1 + incorrect_pred

print("Accuracy: ", correct_pred/(correct_pred + incorrect_pred) )
print("Number of correct predictions: ", correct_pred)
print("Number of incorrect predictions: ", incorrect_pred)

Accuracy:  0.5589225589225589
Number of correct predictions:  166
Number of incorrect predictions:  131
