In [16]:
import numpy as np
import pandas as pd
from collections import Counter




class KNN:
        def __init__(self, k=3):
            self.k = k
        
        def fit(self, X, y): #X=X_train   y=y_train
            self.X = X
            self.y = y
        
        def predict(self, X): # predict function predicts personality 
            y_pred = []     #this list is result  predict list of this func 
            for x in X:
                # calc distance between values of x 
                distances = self.calculate_distance(x)
                indices = np.argpartition(distances, self.k)[:self.k]
                nearest_neighbors = [self.y[i] for i in indices]
                nearest_neighbors = [int(i) for i in nearest_neighbors]
                
                most_common = Counter(nearest_neighbors).most_common(1)
                y_pred.append(most_common[0][0])
            return y_pred   
        
        def calculate_distance(self, x):
            #calc distance by using numpy linear norm
            distances = np.linalg.norm(self.X - x, axis=1)
            return distances


        def confmatrixmaker(self,prediction,y_test):
            # will create confusion matrix
            confusion_matrix = np.array([[0 for _ in range(16)] for _ in range(16)]) #empty matrix

            # Iterate through the true labels and predicted labels
            for true, pred in zip(y_test, prediction):
                confusion_matrix[true][pred] += 1  # Increment the value in the confusion matrix         
            return confusion_matrix

        def accur(self,matr,y_test):#calc accuracy
            accurlist=[]    #this list will contain accuracy of each k-fold
            for row_colrecall in range(16):    # calc macro
                accurlist.append(matr[row_colrecall,row_colrecall])
                
            return(sum(accurlist)/len(y_test))

        def precisioncalc(self,matr):#calc precision 
            precilis=[]       #this list will contain precision of each k-fold
            for row_col in range(16):# calc macro precision by using precision values of precilis 
                precilis.append(matr[row_col,row_col]/(matr[:,row_col].sum()))
            return(sum(precilis)/len(precilis))   


        def recallcalc(self,matr):#calc precision 
            recallist=[]   #this list will contain recall of each k-fold
            for row_colrecall in range(16):# calc macro recall by using recall values of recallist 
                recallist.append(matr[row_colrecall,row_colrecall]/(matr[row_colrecall,:].sum()))
            return(sum(recallist)/len(recallist)) 


In [13]:
pd.options.display.max_rows = 9999

dft = pd.read_csv('data_set.csv',sep=",", encoding='cp1252')
df=pd.DataFrame(dft)      # load data and turn into pandas dataframe


df.drop( columns='Response Id',inplace=True) # dropped Response Id 
# encoded personolity from str to int
personality_dict = {         
    "ESTJ": 0,
    "ENTJ": 1,
    "ESFJ": 2,
    "ENFJ": 3,
    "ISTJ": 4,
    "ISFJ": 5,
    "INTJ": 6,
    "INFJ": 7,
    "ESTP": 8,
    "ESFP": 9,
    "ENTP": 10,
    "ENFP": 11,
    "ISTP": 12,
    "ISFP": 13,
    "INTP": 14,
    "INFP": 15
}

df["Personality"] = df["Personality"].map(personality_dict)



In [14]:
def anacalisir(nparr): # main work func
    pr_independent= nparr[:, :-1]# set target and predictor
    tar_dependent= nparr[:, -1:] 
    for a in range(1,10,2):  #this for loop makes k=1,3,5,7,9 


        num_rows = df.shape[0]

        # calculate the size of each fold
        fold_size = num_rows // 5

        # set the starting index for the first fold
        start_index = 0

        accur_average_list=[]
        precision_average_list=[]
        recal_average_list=[]


        # Loop through the 5 folds
        for i in range(5):
            
            # calculate the ending index for the current fold
            end_index = start_index + fold_size
            
            # check if this is the last fold
            if end_index > num_rows:
                # If it is, set the ending index to the last row
                end_index = num_rows
            
            # Split the data into train and test sets
            X_test = pr_independent[start_index:end_index]
            y_test = tar_dependent[start_index:end_index]
            X_train = np.concatenate((pr_independent[:start_index], pr_independent[end_index:]))
            y_train = np.concatenate((tar_dependent[:start_index], tar_dependent[end_index:]))

            #set knn k=a ,a defined in the for loop 1,3,5,7,9
            knn = KNN(k=a)
            knn.fit(X_train, y_train) # fit trains arrays

            y_test=y_test.flatten() # flatten the arrays due to some future bugs
            y_test=np.array(list(map(int,y_test)))
            y1d_test=np.array(y_test).flatten()

            # will predict result
            prediction=knn.predict(X_test)

            confusion_matrix=knn.confmatrixmaker(prediction,y_test) # makes confusion matrix

            # average lists for recall precision and accuracy
            precision_average_list.append(knn.precisioncalc(confusion_matrix))
            accur_average_list.append(knn.accur(confusion_matrix,y_test))
            recal_average_list.append(knn.recallcalc(confusion_matrix))
            start_index += fold_size

        # print results
        print("k=",a)    
        print("precision:\t",(sum(precision_average_list)/5))    
        print("recall:\t",(sum(recal_average_list)/5))
        print("accuracy:\t",(sum(accur_average_list)/5)) 

In [8]:

def normalize(X):   # this func normalize values of data set (0-1)ranges
    min_vals = np.min(X, axis=0)
    max_vals = np.max(X, axis=0)
    return (X - min_vals) / (max_vals - min_vals)


for colname in df.columns[:-1]:
    flatcol=np.array(df[colname]).flatten()
    df[colname]=normalize(flatcol)

nparr = df.to_numpy()
print("normalize true\n")# with normalize
anacalisir(nparr) #work main anacalisir func



normalize true

k= 1
precision:	 0.9737752096074164
recall:	 0.9737077341549872
accuracy:	 0.9737144762063504
k= 3
precision:	 0.9876644922008755
recall:	 0.9876448325345372
accuracy:	 0.9876489707475622
k= 5
precision:	 0.9885223762197626
recall:	 0.9885108732363792
accuracy:	 0.9885157096424703
k= 7
precision:	 0.9887545241026778
recall:	 0.9887419473079742
accuracy:	 0.9887490624218686
k= 9
precision:	 0.9888729582597969
recall:	 0.9888653056563401
accuracy:	 0.9888657388115677


In [5]:
nparr = df.to_numpy()  #without normalize
print("normalize false\n")
anacalisir(nparr)

normalize false

k= 1
precision:	 0.9778342577301988
recall:	 0.9778015267996316
accuracy:	 0.9778148179014916
k= 3
precision:	 0.9884830003425475
recall:	 0.9884587293792176
accuracy:	 0.9884657054754562
k= 5
precision:	 0.9890504944176399
recall:	 0.9890236238913616
accuracy:	 0.9890324193682807
k= 7
precision:	 0.9892273733260109
recall:	 0.9892116075917116
accuracy:	 0.9892157679806651
k= 9
precision:	 0.9893756156332406
recall:	 0.9893627175543781
accuracy:	 0.9893657804817068


Error Analysis for Classification

In [15]:
# The above results show us that normalizing will calculate data faster and have higher accuracy,recall and precision.

In [19]:
#run this mini code for shown confussion matrix k=1

def showmatrix(nparr): # main work func
    pr_independent= nparr[:, :-1]# set target and predictor
    tar_dependent= nparr[:, -1:] 
    for a in range(1,2):   
        num_rows = df.shape[0]

        # calculate the size of each fold
        fold_size = num_rows // 5

        # set the starting index for the first fold
        start_index = 0

        accur_average_list=[]
        precision_average_list=[]
        recal_average_list=[]


        # Loop through the 5 folds
        for i in range(5):
            
            # calculate the ending index for the current fold
            end_index = start_index + fold_size
            
            # check if this is the last fold
            if end_index > num_rows:
                # If it is, set the ending index to the last row
                end_index = num_rows
            
            # Split the data into train and test sets
            X_test = pr_independent[start_index:end_index]
            y_test = tar_dependent[start_index:end_index]
            X_train = np.concatenate((pr_independent[:start_index], pr_independent[end_index:]))
            y_train = np.concatenate((tar_dependent[:start_index], tar_dependent[end_index:]))

            #set knn k=a ,a defined in the for loop 1,3,5,7,9
            knn = KNN(k=a)
            knn.fit(X_train, y_train) # fit trains arrays

            y_test=y_test.flatten() # flatten the arrays due to some future bugs
            y_test=np.array(list(map(int,y_test)))
            y1d_test=np.array(y_test).flatten()

            # will predict result
            prediction=knn.predict(X_test)

            confusion_matrix=knn.confmatrixmaker(prediction,y_test) # makes confusion matrix
            return confusion_matrix


In [20]:
print(showmatrix(nparr)) # this is the confusion matrix of prediction  k=1 and 5-fold validation

[[736   2   0   2   3   2   0   1   4   2   1   1   0   1   2   1]
 [  0 771   4   2   0   3   0   0   3   0   1   0   0   1   4   1]
 [  1   1 733   0   1   1   1   1   2   0   1   1   1   0   1   0]
 [  2   0   1 721   0   0   0   0   0   0   0   0   2   0   1   1]
 [  0   1   2   0 717   2   0   3   0   4   0   0   3   0   1   1]
 [  3   1   0   0   2 757   1   1   0   3   0   1   3   2   0   1]
 [  0   2   1   4   3   1 743   0   1   3   0   1   1   1   0   2]
 [  5   5   1   4   3   1   0 745   0   1   3   0   1   0   2   0]
 [  0   1   1   1   1   2   1   2 732   3   0   1   1   0   0   1]
 [  3   2   1   1   0   1   3   3   1 716   0   1   2   0   0   2]
 [  2   1   2   2   0   2   6   1   2   0 727   0   0   2   3   2]
 [  4   0   1   2   2   0   1   2   1   1   0 710   1   1   2   0]
 [  0   0   2   0   3   3   2   0   2   1   1   0 716   3   1   1]
 [  0   3   1   4   3   0   0   1   0   0   0   6   6 741   1   1]
 [  0   0   1   1   2   1   1   7   1   5   0   1   1   2 711 

In [None]:
# I think knn algorithm is slow. If it includes na values, we must get rid of na etc. 

In [None]:
# algorithm may not calculate outliers correctly because euclidean finds distance.