# KNN Algorithm for Yeast Data


In [None]:
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd  
from math import floor, ceil, sqrt
from sklearn.metrics import classification_report


In [None]:
def the_train_test_split(X, test_ratio = 0.2):
    """Split a 2-D array into a leading training part and a trailing test part.

    The split is purely positional (no shuffling): the first
    ``floor(rows * (1 - test_ratio))`` rows form the training set and the
    remaining rows form the test set.

    Args:
        X: Two-dimensional array-like exposing ``.shape`` and row slicing.
        test_ratio: Fraction of rows reserved for testing. Values outside
            ``[0, 1)`` are silently replaced by the default ``0.2``.

    Returns:
        A ``(train, test)`` tuple of row slices of ``X``.
    """
    # Out-of-range ratios fall back to the default instead of raising.
    if test_ratio < 0 or test_ratio >= 1:
        test_ratio = 0.2

    n_rows, _n_cols = X.shape
    split_at = floor((1 - test_ratio) * n_rows)
    return X[:split_at], X[split_at:]

In [None]:
def euclidean_distance(x,y):
    """Return the Euclidean (L2) distance between two equally-shaped vectors.

    Args:
        x: Numeric array-like (list, tuple or ndarray, including
            object-dtype arrays holding numbers).
        y: Numeric array-like broadcastable against ``x``.

    Returns:
        The distance as a plain Python ``float``.
    """
    # Coerce to float arrays so the arithmetic runs in vectorized native code
    # even when the inputs are lists or object-dtype slices of a mixed table.
    diff = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
    # np.sum reduces in C; the builtin sum would iterate element by element.
    return sqrt(np.sum(np.square(diff)))

def minkowski_distance(x, y, p=2):
    """Return the Minkowski distance of order ``p`` between two vectors.

    The distance is ``(sum(|x_i - y_i| ** p)) ** (1 / p)``; ``p=1`` is the
    Manhattan distance, ``p=2`` the Euclidean distance and ``p=inf`` the
    Chebyshev (maximum) distance.

    Args:
        x: Numeric array-like.
        y: Numeric array-like broadcastable against ``x``.
        p: Order of the norm, must be ``>= 1``. Defaults to ``2``.

    Returns:
        The distance as a plain Python ``float``.

    Raises:
        ValueError: If ``p`` is smaller than 1 (the formula is then no
            longer a metric).
    """
    if p < 1:
        raise ValueError("p must be at least 1 for the Minkowski distance")
    abs_diff = np.abs(np.asarray(x, dtype=float) - np.asarray(y, dtype=float))
    if np.isinf(p):
        # Limit case: the largest coordinate difference (0.0 for empty input).
        return float(np.max(abs_diff, initial=0.0))
    return float(np.sum(abs_diff ** p) ** (1.0 / p))

def get_distance(x, y, algorithm ="euclidean"):
    """Compute the distance between ``x`` and ``y`` with the named metric.

    Only ``"euclidean"`` is recognised. Any other name prints a notice and
    the Euclidean distance is used as a fallback.

    Args:
        x: First vector.
        y: Second vector.
        algorithm: Name of the distance metric.

    Returns:
        The value returned by ``euclidean_distance(x, y)``.
    """
    # Unknown metric names are reported but never fatal.
    if algorithm != "euclidean":
        print("The algorithm ", algorithm, " couldn't be recognized.\n", "\"euclidean\" algorithm is used instead")
    return euclidean_distance(x, y)

In [None]:
class K_Neigbours_Classifier():
    """Minimal k-nearest-neighbours classifier with majority voting.

    Attributes are set by ``__init__`` (``alg``, ``n_count``) and ``fit``
    (``train_in``, ``train_out``, ``categories``).
    """

    def __init__(self, neigbour_count = 7, algorithm = "euclidean"):
        """Store the hyper-parameters.

        Args:
            neigbour_count: Number of nearest training samples that vote.
            algorithm: Name of the distance metric passed to ``get_distance``.

        Raises:
            ValueError: If ``neigbour_count`` is smaller than 1.
        """
        if neigbour_count < 1:
            raise ValueError("neigbour_count must be at least 1")
        self.alg = algorithm
        self.n_count = neigbour_count

    def fit(self, train_input, train_output):
        """Memorise the training samples and their labels.

        Args:
            train_input: 2-D array with one training sample per row.
            train_output: Labels, either 1-D or an ``(n, 1)`` column.
        """
        self.train_in = train_input
        # Flatten once so an (n, 1) label column yields hashable scalar labels
        # when indexed in predict().
        self.train_out = np.asarray(train_output).ravel()
        self.categories = pd.unique(self.train_out)

    def predict(self, single):
        """Predict the label of one sample by majority vote of its neighbours.

        Args:
            single: Feature vector with the same width as the training rows.

        Returns:
            The label with the most votes among the nearest neighbours; ties
            go to the label that appears first in ``self.categories``.
        """
        # Distance from `single` to every training row.
        distances = np.apply_along_axis(get_distance, 1, self.train_in, y=single, algorithm=self.alg)

        # Never ask for more neighbours than there are training samples;
        # argpartition raises when kth is out of bounds.
        sample_count = distances.shape[0]
        k = min(self.n_count, sample_count)
        if k == sample_count:
            nearest_indices = np.arange(sample_count)
        else:
            # kth = k - 1 places the k smallest distances in the first k slots.
            nearest_indices = np.argpartition(distances, k - 1)[:k]

        votes = dict.fromkeys(self.categories, 0)
        for label in self.train_out[nearest_indices]:
            votes[label] += 1
        return max(votes, key=votes.get)

In [None]:
def measure(X_train, Y_train, X_test, Y_test, neigbour_count=13):
    """Fit a k-NN classifier on the training data and print test-set metrics.

    Prints the overall accuracy, the numbers of correct and incorrect
    predictions, three per-category tallies and scikit-learn's
    classification report. Nothing is returned.

    Args:
        X_train: 2-D array of training samples, one per row.
        Y_train: 2-D array whose first column holds the training labels.
        X_test: 2-D array of test samples, one per row.
        Y_test: 2-D array whose first column holds the true test labels.
        neigbour_count: Number of neighbours used by the classifier.

    Raises:
        ValueError: If ``Y_test`` contains no rows.
    """
    train_labels = Y_train[:, 0]
    test_labels = Y_test[:, 0]
    total = len(test_labels)
    if total == 0:
        raise ValueError("measure() needs at least one test sample")

    # Derive the label set from the data passed in rather than from a
    # module-level variable, so the tallies cover every label that can occur.
    categories = pd.unique(np.concatenate([train_labels, test_labels]))

    knc = K_Neigbours_Classifier(neigbour_count=neigbour_count)
    knc.fit(X_train, train_labels)

    correct_pred_dict = dict.fromkeys(categories, 0)      # true label, predicted correctly
    failed_to_pred_dict = dict.fromkeys(categories, 0)    # true label that was missed
    assumed_to_pred_dict = dict.fromkeys(categories, 0)   # label wrongly predicted instead

    predictions = []
    for sample, correct_key in zip(X_test, test_labels):
        predicted_key = knc.predict(sample)
        predictions.append(predicted_key)
        if predicted_key == correct_key:
            correct_pred_dict[correct_key] += 1
        else:
            failed_to_pred_dict[correct_key] += 1
            assumed_to_pred_dict[predicted_key] += 1

    correct_pred = sum(correct_pred_dict.values())
    incorrect_pred = len(predictions) - correct_pred

    print("Accuracy: ", correct_pred/(correct_pred + incorrect_pred) )
    print("Number of correct predictions: ", correct_pred)
    print("Number of incorrect predictions: ", incorrect_pred)
    print("correct predict(ion) count:\n", correct_pred_dict)
    print("failed_to predict(ion) count:\n", failed_to_pred_dict)
    print("assumed_to predict(ion) count:\n", assumed_to_pred_dict)

    # zero_division=1 silences the warning for labels that are never predicted.
    print("\n                   Classification Report                  \n",
    classification_report(test_labels, predictions, zero_division=1))
    


## Read the data

In [None]:
# Load the dataset into a DataFrame; the path is relative to the working directory.
file_name = "yeast.csv" 
md = pd.read_csv(file_name)

# Optional cleaning steps, currently disabled:
# md.dropna(inplace = True)
# md.replace('unknown', 0, inplace = True)
# Preview the first rows of the loaded frame.
md.head()


## Prepare the data
* Separate the input and output variables
* Separate the data into training and test sets
* Normalize the data


In [None]:
test_ratio = 0.2
# Columns 1..8 are used as features and everything from column 9 on as the
# label column(s). The table mixes column types, so `md.values` is an
# object-dtype array; cast the features to float to get native vectorized math.
X = md.values[:,1:9].astype(float)
Y = md.values[:,9:]
cat = pd.unique(Y[:,0])  # distinct labels in order of first appearance

# normalize X: z-score every feature column (zero mean, unit variance)
feature_std = X.std(axis=0)
feature_std[feature_std == 0] = 1.0  # constant column: avoid division by zero
X = (X - X.mean(axis=0)) / feature_std


In [None]:
# Split features and labels with the same ratio; the split is positional, so
# both calls cut at the same row index and rows stay aligned.
X_train, X_test = the_train_test_split(X, test_ratio = test_ratio)
Y_train, Y_test = the_train_test_split(Y, test_ratio = test_ratio)

In [None]:
# Fit the k-NN classifier on the training split and print its test-set metrics.
measure(X_train, Y_train, X_test, Y_test)

### PCA pretest via scikit-learn


In [None]:
from sklearn.decomposition import PCA
# Project the features onto 7 principal components; X is overwritten with the result.
# NOTE(review): the PCA is fitted on all rows, including those that end up in
# the test split — confirm this is intended.
pca = PCA(n_components=7)
X = pca.fit_transform(X)

# Re-split the transformed features; the labels are split again with the same ratio.
X_train, X_test = the_train_test_split(X, test_ratio = test_ratio)
Y_train, Y_test = the_train_test_split(Y, test_ratio = test_ratio)

# Evaluate the classifier again, this time on the PCA-transformed features.
measure(X_train, Y_train, X_test, Y_test)

