In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from prettytable import PrettyTable

# Training Set

In [2]:
# Load the training data from Train.csv and preview its first five rows.
df_train = pd.read_csv(filepath_or_buffer='Train.csv')
df_train.head(n=5)

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F120,F121,F122,F123,F124,F125,F126,F127,F128,Label
0,82,47,41,3,0,3,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,0,0,0,30,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,4581,5175,1957,7073,470,2669,0,3553,132,0,...,0,0,0,0,0,0,0,0,0,0
3,6,38,0,18,1,1,1,2,4,1,...,0,0,0,0,0,0,0,0,0,1
4,88,48,44,4,0,4,0,4,0,0,...,0,0,0,0,0,0,0,0,0,0


## Check whether there are missing values

In [3]:
# Evaluates to True if at least one cell anywhere in the training frame is missing.
df_train.isna().to_numpy().any()

np.False_

**No missing values are found.**

## Check the number of samples for each label

In [4]:
# Count how many training samples carry each value of the 'Label' column.
label_counts = df_train['Label'].value_counts()
label_counts

Label
0    3000
1    1465
Name: count, dtype: int64

## Hyperparameter Tuning (k)

In [5]:
# Split the training frame into predictors and target by column *name* rather
# than by position, so the split does not depend on the magic numbers 128 / -1
# and stays correct if the column order of Train.csv ever changes.
TARGET_COL = 'Label'
features = df_train.drop(columns=TARGET_COL)
label = df_train[TARGET_COL]

# Guard the invariant the positional slice used to encode implicitly.
assert features.shape[1] == 128, "expected 128 feature columns (F1..F128)"

# Stratified 10-fold cross-validation splitter; shuffling with a fixed seed
# makes the fold assignment reproducible across runs.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2024)

In [6]:
# For every candidate neighbourhood size k in 2..20, cross-validate a fresh
# KNN classifier on the `skf` splitter and record [k, mean validation score].
hyperparameter_score_list = [
    [
        n_neighbors,
        cross_val_score(
            KNeighborsClassifier(n_neighbors=n_neighbors),
            features,
            label,
            cv=skf,
        ).mean(),
    ]
    for n_neighbors in range(2, 21)
]

In [7]:
# Render the (k, mean score) pairs as a text table, scores rounded to 3 decimals.
myTable = PrettyTable(field_names=["k", "Avg accuracy"])
for n_neighbors, avg_accuracy in hyperparameter_score_list:
    myTable.add_row([n_neighbors, round(avg_accuracy, 3)])
print(myTable)

+----+--------------+
| k  | Avg accuracy |
+----+--------------+
| 2  |    0.937     |
| 3  |    0.939     |
| 4  |    0.937     |
| 5  |    0.939     |
| 6  |    0.934     |
| 7  |    0.935     |
| 8  |     0.93     |
| 9  |     0.93     |
| 10 |    0.927     |
| 11 |    0.925     |
| 12 |    0.921     |
| 13 |    0.924     |
| 14 |     0.92     |
| 15 |    0.919     |
| 16 |    0.916     |
| 17 |    0.915     |
| 18 |    0.912     |
| 19 |    0.912     |
| 20 |    0.911     |
+----+--------------+


**Based on the results in the table above, k = 3 and k = 5 tie for the highest average accuracy (0.939); k = 5 is chosen.**

# Test Set

In [8]:
# Load the unlabeled data from Validation.csv and preview its first five rows.
df_test = pd.read_csv(filepath_or_buffer='Validation.csv')
df_test.head(n=5)

Unnamed: 0,ID,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F119,F120,F121,F122,F123,F124,F125,F126,F127,F128
0,1,9,399,93,101,361,2,1,3,136,...,0,0,0,0,0,0,0,0,0,0
1,2,74,1677,37,0,176,37,0,74,88,...,0,0,0,0,0,0,0,0,0,0
2,3,3,329,107,53,287,1,0,1,137,...,0,0,0,0,0,0,0,0,0,0
3,4,39,86,23,80,0,16,0,16,2,...,0,0,0,0,0,0,0,0,0,0
4,5,5286,1926,3786,2877,298,1594,0,1836,185,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Keep exactly the columns the model is trained on, selected by name; this
# drops the ID column. Unlike the positional slice `iloc[:, 1:129]`, the cell
# is safe to re-run: a second execution no longer shifts the window and
# silently drops F1. A KeyError is raised if any training feature is missing
# from the test file.
df_test = df_test[features.columns]
df_test.head()

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F119,F120,F121,F122,F123,F124,F125,F126,F127,F128
0,9,399,93,101,361,2,1,3,136,1,...,0,0,0,0,0,0,0,0,0,0
1,74,1677,37,0,176,37,0,74,88,0,...,0,0,0,0,0,0,0,0,0,0
2,3,329,107,53,287,1,0,1,137,0,...,0,0,0,0,0,0,0,0,0,0
3,39,86,23,80,0,16,0,16,2,0,...,0,0,0,0,0,0,0,0,0,0
4,5286,1926,3786,2877,298,1594,0,1836,185,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Train the final classifier (k = 5) on the full training set, then predict
# labels for the test features.
knn_final = KNeighborsClassifier(n_neighbors=5)
knn_final.fit(X=features, y=label)
knn_model = knn_final  # fit() returns the estimator itself, so both names refer to one object
y_pred = knn_model.predict(X=df_test)

In [11]:
# Assemble the submission frame: a 1-based running ID followed by the
# predicted label, one row per prediction.
id_pred = np.arange(1, len(y_pred) + 1)
df_pred = pd.DataFrame({'ID': id_pred, 'Label': y_pred})
df_pred.head()

Unnamed: 0,ID,Label
0,1,1
1,2,0
2,3,1
3,4,0
4,5,0


In [12]:
# Write the predictions to disk without the DataFrame index column.
df_pred.to_csv(path_or_buf='knn_base_output.csv', index=False)