In [33]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import time

In [34]:
data = pd.read_csv("/Users/julia/CSUEB/Fall2023/ML/data_banknote_authentication.txt")

In [35]:
data

Unnamed: 0,Column1,Column2,Column3,Column4,Column5
0,3.62160,8.66610,-2.8073,-0.44699,0
1,4.54590,8.16740,-2.4586,-1.46210,0
2,3.86600,-2.63830,1.9242,0.10645,0
3,3.45660,9.52280,-4.0112,-3.59440,0
4,0.32924,-4.45520,4.5718,-0.98880,0
...,...,...,...,...,...
1367,0.40614,1.34920,-1.4501,-0.55949,1
1368,-1.38870,-4.87730,6.4774,0.34179,1
1369,-3.75030,-13.45860,17.5932,-2.77710,1
1370,-3.56370,-8.38270,12.3930,-1.28230,1


In [36]:
data.corr()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5
Column1,1.0,0.264026,-0.38085,0.276817,-0.724843
Column2,0.264026,1.0,-0.786895,-0.526321,-0.444688
Column3,-0.38085,-0.786895,1.0,0.318841,0.155883
Column4,0.276817,-0.526321,0.318841,1.0,-0.023424
Column5,-0.724843,-0.444688,0.155883,-0.023424,1.0


In [37]:
data['Distance'] = None
data

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Distance
0,3.62160,8.66610,-2.8073,-0.44699,0,
1,4.54590,8.16740,-2.4586,-1.46210,0,
2,3.86600,-2.63830,1.9242,0.10645,0,
3,3.45660,9.52280,-4.0112,-3.59440,0,
4,0.32924,-4.45520,4.5718,-0.98880,0,
...,...,...,...,...,...,...
1367,0.40614,1.34920,-1.4501,-0.55949,1,
1368,-1.38870,-4.87730,6.4774,0.34179,1,
1369,-3.75030,-13.45860,17.5932,-2.77710,1,
1370,-3.56370,-8.38270,12.3930,-1.28230,1,


In [38]:
#Here we drop columns that have very low correlation with the Outcome column
new_data1 = data.drop('Column4',axis=1)
new_data2 = new_data1.drop('Column5',axis=1)

In [39]:
X = new_data2

In [40]:
y = data.iloc[:, -2]

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [42]:
X_train

Unnamed: 0,Column1,Column2,Column3,Distance
1226,-4.06790,2.4955,0.79571,
1085,-2.66490,-12.8130,12.66890,
148,3.62770,0.9829,0.68861,
1178,-2.07540,1.2767,-0.64206,
478,-1.77810,0.8546,7.13030,
...,...,...,...,...
715,3.49160,8.5709,-3.03260,
905,0.74521,3.6357,-4.40440,
1096,-4.36670,6.0692,0.57208,
235,2.04660,2.0300,2.17610,


In [43]:
X_test

Unnamed: 0,Column1,Column2,Column3,Distance
1240,-3.5510,1.89550,0.186500,
703,1.3114,4.54620,2.293500,
821,-4.0173,-8.31230,12.454700,
1081,-5.1190,6.64860,-0.049987,
37,3.6289,0.81322,1.627700,
...,...,...,...,...
654,3.5127,2.90730,1.057900,
1100,1.4378,0.66837,-2.026700,
90,1.4806,7.63770,-2.787600,
1107,1.2198,2.09820,-3.195400,


In [44]:
#This function calculates the euclidean distance between test row(passed as argument to the function) and each of the training rows. Returns the list of all distances for all the rows of training data.
def calculate_euclidean_distance(training_features, test_features):
    dist_list = []
    for i in range(0,len(training_features)):
        #Not including the Distance column
        tf = training_features.iloc[i,:-1]
        dist = np.sqrt(np.sum(np.square(test_features - tf)))
        dist_list.append(dist)   
    return dist_list

In [45]:
#This function calculates the number of occurences of each of the k-nearest neighbors class and 
#retruns the class value that has maximum occurence
def calculate_occurence_of_eachclass(knearest_neighbors_class_list):
    count_per_inner_list = [inner_list.count(0) for inner_list in knearest_neighbors_class_list]
    total_count0 = sum(count_per_inner_list)
    count_per_inner_list = [inner_list.count(1) for inner_list in knearest_neighbors_class_list]
    total_count1 = sum(count_per_inner_list)
    
    if total_count1 > total_count0:
        return 1
    elif total_count0 > total_count1:
        return 0
    else:
        return 1

In [55]:
#This function is the main algorithm which implements HL-KNN algorithm(Baseline). Takes the training data, 
#test data and predicts the class of the given test data.
#More description detailed in the report
def high_level_knn_modified (X_train,X_test,y_train):
    dist_arr_lowlevel = []
    predict_arr = []
    dist_arr_highlevel = []
    neighbors_class_list = []
    predicted_testdata = pd.DataFrame()
    for i in range(0,len(X_test)):
        test = X_test.iloc[i,:-1]
        dist_arr_lowlevel = calculate_euclidean_distance(X_train, test)
        X_train['Distance']= dist_arr_lowlevel
        X_train['Column5']= y_train
        df = pd.DataFrame(X_train)
        df_sorted = df.sort_values(by='Distance')
        df_top7 = df_sorted.head(7)
        neighbors_class_list.append(df_top7['Column5'].tolist())
        total1_count = calculate_occurence_of_eachclass(neighbors_class_list)
        
        X_train = X_train.drop('Column5',axis=1)
        for j in range(0,len(df_top7)):
            top7 = df_top7.iloc[j,:-1]
            dist_arr_highlevel = calculate_euclidean_distance(X_train, top7)
            X_train['Distance']= dist_arr_highlevel
            X_train['Column5']= y_train
            df1 = pd.DataFrame(X_train)
            df1_sorted = df1.sort_values(by='Distance')
            df1_top7 = df1_sorted.head(7)
            neighbors_class_list.append(df1_top7['Column5'].tolist())
            X_train = X_train.drop('Column5',axis=1)
        total_count = calculate_occurence_of_eachclass(neighbors_class_list)
        predict_arr.append(total_count)
        neighbors_class_list.clear()   
    predicted_testdata['Column1'] = X_test['Column1']
    predicted_testdata['Column2'] = X_test['Column2']
    predicted_testdata['Column3'] = X_test['Column3']
    predicted_testdata['Column5'] = predict_arr

    return predicted_testdata

In [56]:
#This function calculates the accuracy of the predicted data by dividing the correctly predicetd data over total
#number of test samples multiplied by 100
def efficiency_test (predicted_testdata, X_test):
    counter = 0
    for i in range (0,len(X_test)):
        if X_test.iloc[i,4] == predicted_test.iloc[i,3]:
            counter = counter +1
    percentage = (counter/len(X_test)) * 100
    return percentage

In [57]:
predicted_test = pd.DataFrame()
start_time = time.time()
predicted_test = high_level_knn_modified(X_train,X_test,y_train)
end_time = time.time()
predicted_test

Unnamed: 0,Column1,Column2,Column3,Column5
1240,-3.5510,1.89550,0.186500,1
703,1.3114,4.54620,2.293500,0
821,-4.0173,-8.31230,12.454700,1
1081,-5.1190,6.64860,-0.049987,1
37,3.6289,0.81322,1.627700,0
...,...,...,...,...
654,3.5127,2.90730,1.057900,0
1100,1.4378,0.66837,-2.026700,1
90,1.4806,7.63770,-2.787600,0
1107,1.2198,2.09820,-3.195400,1


In [58]:
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time} seconds")
X_test ['Outcome'] = y_test

Time taken: 653.2583322525024 seconds


In [59]:
match_percent = efficiency_test (predicted_test, X_test)

In [60]:
match_percent

99.27272727272727