Jeongmin Chae | 6022220672 | hw1

In [18]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
#from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [19]:

class KNN():
    """k-nearest-neighbour experiments on a two-class, six-feature data file.

    The instance keeps the parsed data set and its train/test partitions as
    attributes so the experiment methods (`best_k`, `get_learning_curve`,
    `test_various_metric`) can share them.
    """

    # Names given, in file order, to the six feature fields of every row.
    FEATURE_COLUMNS = (
        "pelvic_incidence",
        "pelvic_tilt",
        "lumbar_lordosis_angle",
        "sacral_slope",
        "pelvic_radius",
        "degree_spondylolisthesis",
    )
    # Name of the label column; "AB" is stored as 1 and "NO" as 0.
    LABEL_COLUMN = "class"

    def __init__(self, address):
        """Remember the data file path and create empty placeholders.

        Args:
            address: Path of the data file read by `load_data`.
        """
        self.data_path = address
        self.data = pd.DataFrame()
        self.train_data = pd.DataFrame()
        self.test_data = pd.DataFrame()
        self.X_train = pd.DataFrame()
        self.y_train = pd.DataFrame()
        self.X_test = pd.DataFrame()
        self.y_test = pd.DataFrame()

    def load_data(self):
        """Parse the data file into a numeric DataFrame and store it.

        Each line of the file is read as one string, split on whitespace into
        six feature fields plus a label field, and converted to numbers.

        Returns:
            The parsed DataFrame (also stored in `self.data`).
        """
        raw = pd.read_csv(self.data_path, header=None)
        # Whitespace split tolerates repeated or trailing blanks in a row.
        data = raw.iloc[:, 0].str.split(expand=True)
        names = list(self.FEATURE_COLUMNS) + [self.LABEL_COLUMN]
        data = data.rename(columns=dict(enumerate(names)))
        # Assign the result back: an in-place replace on a selected column
        # does not update the parent frame under pandas Copy-on-Write.
        data[self.LABEL_COLUMN] = data[self.LABEL_COLUMN].replace(
            {"AB": "1", "NO": "0"}
        )
        data = data.apply(pd.to_numeric)
        self.data = data

        return data

    def plot_data(self, method):
        """Draw an overview plot of `self.data`.

        Args:
            method: 'scatter' draws a seaborn pair plot coloured by class;
                any other value draws per-feature box plots grouped by class.
        """
        if method == 'scatter':
            sns.pairplot(self.data, hue=self.LABEL_COLUMN)
        else:
            self.data.boxplot(
                column=list(self.FEATURE_COLUMNS),
                by=self.LABEL_COLUMN,
                figsize=(15, 10),
            )

    def train_test_split(self, n_train_class0=70, n_train_class1=140):
        """Split `self.data` into training and test sets by row order.

        The first `n_train_class0` rows of class 0 and the first
        `n_train_class1` rows of class 1 form the training set; all remaining
        rows form the test set.

        Args:
            n_train_class0: Number of leading class-0 rows used for training.
            n_train_class1: Number of leading class-1 rows used for training.

        Returns:
            Tuple `(train_data, test_data)`; both are also stored on the
            instance.
        """
        labels = self.data[self.LABEL_COLUMN]
        class0 = self.data.loc[labels == 0]
        class1 = self.data.loc[labels == 1]

        train_data = pd.concat(
            [class0.iloc[:n_train_class0, :], class1.iloc[:n_train_class1, :]],
            ignore_index=True,
        )
        test_data = pd.concat(
            [class0.iloc[n_train_class0:, :], class1.iloc[n_train_class1:, :]],
            ignore_index=True,
        )

        # Always refresh the stored partitions so they cannot go stale when
        # the data set is reloaded or the split sizes change.
        self.train_data = train_data
        self.test_data = test_data

        return train_data, test_data

    def get_train_test(self):
        """Separate the stored train/test partitions into features and labels.

        Returns:
            Tuple `(X_train, y_train, X_test, y_test)`; the same four objects
            are also stored on the instance.
        """
        label = self.LABEL_COLUMN
        X_train = self.train_data.loc[:, self.train_data.columns != label]
        y_train = self.train_data[label]
        X_test = self.test_data.loc[:, self.test_data.columns != label]
        y_test = self.test_data[label]

        # Keep the instance attributes in sync with the current partitions.
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

        print('Splited train and test data.')

        return X_train, y_train, X_test, y_test

    @staticmethod
    def _metric_kwargs(p, metric, X_train):
        """Build the distance-related keyword arguments for the classifier."""
        if metric == 'minkowski':
            return {'p': p}

        kwargs = {'metric': metric}
        if metric == 'mahalanobis':
            # `VI` is the *inverse* covariance matrix of the features; the
            # pseudo-inverse keeps this usable for near-singular covariances.
            covariance = np.cov(np.asarray(X_train, dtype=float), rowvar=False)
            kwargs['metric_params'] = {'VI': np.linalg.pinv(covariance)}
        return kwargs

    def best_k(self, k_list, X_train, y_train, X_test, y_test, p, metric='minkowski', plot=False, weights='uniform'):
        """Find the k with the lowest test error among the candidates.

        A classifier is fitted for every k in `k_list`; the first k reaching
        the minimum test error wins. The confusion matrix is computed from
        the test predictions of that same winning classifier.

        Args:
            k_list: Candidate neighbour counts.
            X_train, y_train: Training features and labels.
            X_test, y_test: Test features and labels.
            p: Minkowski exponent; used only when `metric` is 'minkowski'.
            metric: Distance metric name passed to the classifier.
            plot: When true, plot the error curves and a confusion-matrix
                heat map and print summary statistics.
            weights: Neighbour weighting passed to the classifier.

        Returns:
            Tuple `(best_k, best_test_error, cm, tn, fp, fn, tp)`.

        Raises:
            ValueError: If `k_list` is empty.
        """
        candidates = list(k_list)
        if not candidates:
            raise ValueError("k_list must contain at least one value of k")

        # The metric settings do not depend on k, so build them once.
        model_kwargs = self._metric_kwargs(p, metric, X_train)

        train_error = []
        test_error = []
        test_predictions = []

        for k in candidates:
            model = KNeighborsClassifier(n_neighbors=k, weights=weights, **model_kwargs)
            model.fit(X_train, y_train)
            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test)
            train_error.append(np.mean(y_train_pred != y_train))
            test_error.append(np.mean(y_test_pred != y_test))
            test_predictions.append(y_test_pred)

        if plot:
            plt.figure(figsize=(8, 4))
            plt.plot(candidates, train_error, candidates, test_error)
            # The x axis is drawn from the largest k down to the smallest.
            plt.xlim(max(candidates), min(candidates))
            plt.ylabel('Error')
            plt.xlabel('k')
            plt.legend(['train_error', 'test_error'])
            plt.show()

        # Find best k (first occurrence of the minimum test error).
        min_index = test_error.index(min(test_error))
        best_k = candidates[min_index]
        best_test_error = test_error[min_index]

        if plot:
            print("Best k* is :", best_k)
            print("Best test error is :", best_test_error)

        # Statistics for the best k, using the predictions of the classifier
        # that actually produced `best_test_error` (same p/metric/weights).
        y_test_pred_best = test_predictions[min_index]
        cm = confusion_matrix(y_test, y_test_pred_best)
        tn, fp, fn, tp = cm.ravel()
        y_num = len(y_test_pred_best)
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)

        if plot:
            fig, ax = plt.subplots(figsize=(8, 4))
            sns.heatmap(cm, annot=True, cmap='YlGnBu', ax=ax)
            # NOTE(review): the next two values are tp and tn as a percentage
            # of all test samples, not the true positive / negative *rates*
            # (tp/(tp+fn), tn/(tn+fp)) -- confirm which one is wanted.
            print('True positive: {}'.format(100*tp/y_num))
            print('True negative: {}'.format(100*tn/y_num))
            print('Precision: {}'.format(100*precision))
            print('F1 score: {}'.format(100*2*(precision*recall)/(precision+recall)))

        return best_k, best_test_error, cm, tn, fp, fn, tp

    def get_learning_curve(self, n_list):
        """Plot the best test error against the training-set size.

        For every N in `n_list` the training subset is the first `N // 3`
        class-0 rows plus the first `N - N // 3` class-1 rows of
        `self.train_data`. The best Euclidean k is searched over
        k = 1, 6, 11, ... below the subset size, and the resulting test error
        on `self.X_test` / `self.y_test` is recorded.

        Args:
            n_list: Training-set sizes to evaluate.
        """
        label = self.LABEL_COLUMN
        class0 = self.train_data.loc[self.train_data[label] == 0]
        class1 = self.train_data.loc[self.train_data[label] == 1]

        n_test_error = []

        for n in n_list:
            n_class0 = n // 3
            subset = pd.concat(
                [class0.head(n_class0), class1.head(n - n_class0)],
                ignore_index=True,
            )
            X_train_sub = subset.loc[:, subset.columns != label]
            y_train_sub = subset[label]

            # Bound k by the rows actually available; this equals range(1, n, 5)
            # whenever both classes have enough rows to supply n samples.
            k_list = range(1, len(subset), 5)
            _, best_test_error, *_ = self.best_k(
                k_list, X_train_sub, y_train_sub, self.X_test, self.y_test,
                p=2, metric='minkowski',
            )
            n_test_error.append(best_test_error)

        plt.figure(figsize=(8, 4))
        plt.plot(n_list, n_test_error, '.-', markersize=10, color='red')
        plt.ylabel('The best test error rate')
        plt.xlabel('N')
        plt.legend(['test_error'])
        plt.show()

    def test_various_metric(self, euclidean = False, manhattan = False, p_value = False, chebyshev = False, mahalanobis = False, weights='uniform'):
        """Compare distance metrics and print a table of the results.

        Every requested metric is searched over k = 1, 6, ..., 196 on the
        stored train/test split. With `weights='uniform'` all five options
        are honoured; with `weights='distance'` only `euclidean`, `manhattan`
        and `chebyshev` are. Any other `weights` value yields an empty table.

        Args:
            euclidean: Evaluate Minkowski distance with p=2.
            manhattan: Evaluate Minkowski distance with p=1.
            p_value: Evaluate p = 10**e for e = 0.1, ..., 1.0 at the best k
                of the Manhattan search (uniform weights only).
            chebyshev: Evaluate Minkowski distance with p=100, used here as a
                stand-in for the Chebyshev distance.
            mahalanobis: Evaluate the Mahalanobis distance (uniform weights
                only).
            weights: 'uniform' or 'distance'.
        """
        k_list = range(1, 200, 5)
        d_list = []

        uniform = weights == 'uniform'
        supported = uniform or weights == 'distance'

        def search(p, metric='minkowski'):
            """Return (best k, best test error) for one metric setting."""
            found_k, found_error, *_ = self.best_k(
                k_list, self.X_train, self.y_train, self.X_test, self.y_test,
                p, metric=metric, weights=weights,
            )
            return found_k, found_error

        manhattan_k = None

        if supported and euclidean:
            p = 2
            best_k, best_test_error = search(p)
            d_list.append([str(p), best_k, best_test_error])

        if supported and manhattan:
            p = 1
            manhattan_k, best_test_error = search(p)
            d_list.append([str(p), manhattan_k, best_test_error])

        if uniform and p_value:
            # The p sweep reuses the Manhattan best k; run that search here
            # when the caller did not request the Manhattan row itself.
            if manhattan_k is None:
                manhattan_k, _ = search(1)
            for pv in np.arange(0.1, 1.1, 0.1):
                p = 10**pv
                model = KNeighborsClassifier(n_neighbors=manhattan_k, p=p)
                model.fit(self.X_train, self.y_train)
                score = model.score(self.X_test, self.y_test)
                d_list.append(['10^({:.2f})'.format(pv), manhattan_k, 1 - score])

        if supported and chebyshev:
            # NOTE(review): p=100 approximates the Chebyshev (p -> infinity)
            # distance; metric='chebyshev' would compute it exactly.
            p = 100
            best_k, best_test_error = search(p)
            d_list.append([str(p), best_k, best_test_error])

        if uniform and mahalanobis:
            best_k, best_test_error = search(2, metric='mahalanobis')
            d_list.append(['mahalanobis', best_k, best_test_error])

        d_errors_df = pd.DataFrame(d_list, columns=['p / metric', 'best_k', 'test_error'])
        print('\n', '\t Distance Metric Results')
        print(d_errors_df)
    
# Path of the data file, relative to the notebook's working directory, and the
# shared KNN instance that every cell below operates on. Constructing the
# instance only stores the path; nothing is read until load_data() is called.
address = '../data/vertebral_column_data/column_2C.dat'
kNN_model=KNN(address)
        
    

In [20]:
# 1.(a) Load the data: parse the file at `address` into a numeric DataFrame
# (class labels "AB"/"NO" become 1/0) and keep it on kNN_model.data.
kNN_model.load_data()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,63.03,22.55,39.61,40.48,98.67,-0.25,1
1,39.06,10.06,25.02,29.00,114.41,4.56,1
2,68.83,22.22,50.09,46.61,105.99,-3.53,1
3,69.30,24.65,44.31,44.64,101.87,11.21,1
4,49.71,9.65,28.32,40.06,108.17,7.92,1
...,...,...,...,...,...,...,...
305,47.90,13.62,36.00,34.29,117.45,-4.25,0
306,53.94,20.72,29.22,33.22,114.37,-0.42,0
307,61.45,22.69,46.17,38.75,125.67,-2.71,0
308,45.25,8.69,41.58,36.56,118.55,0.21,0


In [None]:
# 1.(b).i Scatter plot: seaborn pair plot of the loaded data, coloured by class.
kNN_model.plot_data('scatter')

In [None]:
# 1.(b).ii Box plot of each of the six features, grouped by class.
# (plot_data draws box plots for any argument other than 'scatter'.)
kNN_model.plot_data('boxplot')

In [None]:
# 1.(b).iii Get train and test data: the first 70 class-0 rows and the first
# 140 class-1 rows form the training set, the remaining rows the test set;
# each set is then separated into features (X) and labels (y).
train_data, test_data=kNN_model.train_test_split()
X_train, y_train, X_test, y_test =kNN_model.get_train_test()

In [None]:
# 1.(c).ii Search k = 1, 4, ..., 208 with Euclidean distance (Minkowski p=2).
# plot=True draws the train/test error curves and the confusion-matrix heat
# map and prints the best k, its test error and the summary statistics.
k_list=range(1,211,3)
best_k, best_test_error, cm, tn, fp, fn, tp = kNN_model.best_k(k_list,X_train,y_train,X_test,y_test,p=2, metric  = 'minkowski',plot=True)

In [None]:
# 1.(c).iii Learning curve: best test error versus training-set size
# N = 10, 20, ..., 210.
n_list = range(10,211,10)
kNN_model.get_learning_curve(n_list)

In [None]:
# 1.(d) : Euclidean distance (Minkowski p=2), uniform weights
kNN_model.test_various_metric(euclidean = True)

In [None]:
# 1.(d).i.A : Manhattan distance (Minkowski p=1), uniform weights
kNN_model.test_various_metric(manhattan = True)

In [None]:
# 1.(d).i.B : Minkowski distance with p = 10^0.1, ..., 10^1.0, evaluated at
# the best k of the Manhattan search (manhattan=True runs that search first).
kNN_model.test_various_metric(manhattan = True,p_value=True)

In [None]:
# 1.(d).i.C : Chebyshev distance (evaluated by the class as Minkowski p=100)
kNN_model.test_various_metric(chebyshev= True)

In [None]:
# 1.(d).ii : Mahalanobis distance, uniform weights
kNN_model.test_various_metric(mahalanobis= True)

In [None]:
# 1.(e) : Euclidean distance (Minkowski p=2) with weights='distance'
kNN_model.test_various_metric(euclidean = True, weights='distance')

In [None]:
# 1.(e) : Manhattan distance (Minkowski p=1) with weights='distance'
kNN_model.test_various_metric(manhattan = True, weights='distance')

In [None]:
# 1.(e) : Chebyshev distance (evaluated as Minkowski p=100) with weights='distance'
kNN_model.test_various_metric(chebyshev= True, weights='distance')