# KNN Classifier Training on CTGAN-Balanced Dataset
**Author:** Puxin
 
**Latest version date:** 2024-11-19

In [1]:
import pandas as pd
import numpy as np
import sklearn
import prettytable
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from prettytable import PrettyTable

In [2]:
import sys

# Record the interpreter and library versions used for this run so the
# results below can be reproduced in a matching environment.
version_report = [
    ("Python version:", sys.version),
    ("Pandas version:", pd.__version__),
    ("NumPy version:", np.__version__),
    ("Scikit-learn version:", sklearn.__version__),
    ("PrettyTable version:", prettytable.__version__),
]
for caption, version in version_report:
    print(caption, version)

Python version: 3.12.5 (v3.12.5:ff3bc82f7c9, Aug  7 2024, 05:32:06) [Clang 13.0.0 (clang-1300.0.29.30)]
Pandas version: 2.2.3
NumPy version: 1.26.4
Scikit-learn version: 1.5.2
PrettyTable version: 3.11.0


## Training Set

### Original dataset

In [3]:
# Load the original training set from disk and preview its first five rows.
train_csv = 'Train.csv'
df_train = pd.read_csv(filepath_or_buffer=train_csv)
df_train.head(n=5)

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F120,F121,F122,F123,F124,F125,F126,F127,F128,Label
0,82,47,41,3,0,3,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,0,0,0,30,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,4581,5175,1957,7073,470,2669,0,3553,132,0,...,0,0,0,0,0,0,0,0,0,0
3,6,38,0,18,1,1,1,2,4,1,...,0,0,0,0,0,0,0,0,0,1
4,88,48,44,4,0,4,0,4,0,0,...,0,0,0,0,0,0,0,0,0,0


### Synthetic data generated by CTGAN

In [4]:
# Load the CTGAN-generated samples (feature columns only; no Label column is
# shown in the preview) and display the first five rows.
synthetic_csv = 'CTGANSynthData1.csv'
df_syn = pd.read_csv(filepath_or_buffer=synthetic_csv)
df_syn.head(n=5)

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F119,F120,F121,F122,F123,F124,F125,F126,F127,F128
0,0,465,41,8033,58,0,0,0,52,2,...,0,0,0,0,0,0,0,0,0,0
1,86,815,159,11,1307,44,0,123,62,0,...,0,0,0,0,0,0,0,0,0,0
2,12,159,128,209,29,0,0,238,188,0,...,0,0,0,0,0,0,0,0,0,0
3,60,278,104,153,138,0,1,134,211,7,...,0,0,0,0,0,0,0,0,0,0
4,0,390,0,11891,1372,281,0,85,55,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Every synthetic row is assigned class label 1 (presumably the class that is
# under-represented in Train.csv -- confirm against the CTGAN training setup).
# `assign` returns a new frame with the Label column appended/overwritten.
df_syn = df_syn.assign(Label=1)
df_syn.head()

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F120,F121,F122,F123,F124,F125,F126,F127,F128,Label
0,0,465,41,8033,58,0,0,0,52,2,...,0,0,0,0,0,0,0,0,0,1
1,86,815,159,11,1307,44,0,123,62,0,...,0,0,0,0,0,0,0,0,0,1
2,12,159,128,209,29,0,0,238,188,0,...,0,0,0,0,0,0,0,0,0,1
3,60,278,104,153,138,0,1,134,211,7,...,0,0,0,0,0,0,0,0,0,1
4,0,390,0,11891,1372,281,0,85,55,0,...,0,0,0,0,0,0,0,0,0,1


### Balanced dataset

In [6]:
# Stack the original rows on top of the labelled synthetic rows and renumber
# the index from 0 so that row labels are unique in the combined frame.
frames_to_merge = [df_train, df_syn]
df_balanced = pd.concat(frames_to_merge, ignore_index=True)
df_balanced.head()

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F120,F121,F122,F123,F124,F125,F126,F127,F128,Label
0,82,47,41,3,0,3,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,0,0,0,30,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,4581,5175,1957,7073,470,2669,0,3553,132,0,...,0,0,0,0,0,0,0,0,0,0
3,6,38,0,18,1,1,1,2,4,1,...,0,0,0,0,0,0,0,0,0,1
4,88,48,44,4,0,4,0,4,0,0,...,0,0,0,0,0,0,0,0,0,0


### Check whether there are missing values

In [7]:
# True if at least one cell anywhere in the combined frame is NaN/None.
df_balanced.isna().to_numpy().any()

False

**No missing values are found.**

### Check the number of samples for each label

In [8]:
# Count the rows per class to confirm the combined dataset is balanced.
label_counts = df_balanced['Label'].value_counts()
label_counts

Label
0    3000
1    3000
Name: count, dtype: int64

## Hyperparameter Tuning (k)

In [9]:
# Select the predictors and the target by column name instead of by position:
# a hard-coded slice (first 128 columns / last column) silently picks the wrong
# columns if the CSV layout ever changes (e.g. an extra index or ID column).
features = df_balanced.drop(columns=['Label'])
label = df_balanced['Label']

# Stratified 10-fold cross-validation: each fold keeps the class proportions
# of `label`; shuffling with a fixed seed makes the splits reproducible.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2024)

In [10]:
# For each candidate k in 2..20, run the stratified 10-fold splitter and keep
# [k, mean fold score] (cross_val_score uses the classifier's default score,
# i.e. accuracy). `skf` has a fixed seed, so every k is evaluated on the same
# folds.
hyperparameter_score_list = [
    [
        n_neighbors,
        cross_val_score(
            estimator=KNeighborsClassifier(n_neighbors=n_neighbors),
            X=features,
            y=label,
            cv=skf,
        ).mean(),
    ]
    for n_neighbors in range(2, 21)
]

In [11]:
# Render the tuning results as a text table, one row per k, with the mean
# accuracy rounded to three decimals.
k_table = PrettyTable(field_names=["k", "Avg accuracy"])
for n_neighbors, avg_accuracy in hyperparameter_score_list:
    k_table.add_row([n_neighbors, round(avg_accuracy, 3)])
print(k_table)

+----+--------------+
| k  | Avg accuracy |
+----+--------------+
| 2  |    0.932     |
| 3  |    0.943     |
| 4  |    0.935     |
| 5  |    0.941     |
| 6  |    0.933     |
| 7  |    0.936     |
| 8  |     0.93     |
| 9  |    0.932     |
| 10 |    0.928     |
| 11 |    0.929     |
| 12 |    0.923     |
| 13 |    0.927     |
| 14 |    0.922     |
| 15 |    0.922     |
| 16 |    0.918     |
| 17 |    0.919     |
| 18 |    0.917     |
| 19 |    0.919     |
| 20 |    0.914     |
+----+--------------+


**k=3 achieves the highest average cross-validation accuracy (0.943) in the table above, so k=3 is chosen.**

## Test set

In [12]:
# Load the unlabeled set to predict on (an ID column followed by the feature
# columns) and preview the first five rows.
test_csv = 'Validation.csv'
df_test = pd.read_csv(filepath_or_buffer=test_csv)
df_test.head(n=5)

Unnamed: 0,ID,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F119,F120,F121,F122,F123,F124,F125,F126,F127,F128
0,1,9,399,93,101,361,2,1,3,136,...,0,0,0,0,0,0,0,0,0,0
1,2,74,1677,37,0,176,37,0,74,88,...,0,0,0,0,0,0,0,0,0,0
2,3,3,329,107,53,287,1,0,1,137,...,0,0,0,0,0,0,0,0,0,0
3,4,39,86,23,80,0,16,0,16,2,...,0,0,0,0,0,0,0,0,0,0
4,5,5286,1926,3786,2877,298,1594,0,1836,185,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Remove the ID column so that only the feature columns remain.
# Dropping by name (rather than slicing columns 1..128 by position) keeps this
# cell idempotent: re-running it after ID is already gone is a no-op, whereas
# the positional slice would silently drop F1 on a second run.
df_test = df_test.drop(columns=['ID'], errors='ignore')
df_test.head()

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F119,F120,F121,F122,F123,F124,F125,F126,F127,F128
0,9,399,93,101,361,2,1,3,136,1,...,0,0,0,0,0,0,0,0,0,0
1,74,1677,37,0,176,37,0,74,88,0,...,0,0,0,0,0,0,0,0,0,0
2,3,329,107,53,287,1,0,1,137,0,...,0,0,0,0,0,0,0,0,0,0
3,39,86,23,80,0,16,0,16,2,0,...,0,0,0,0,0,0,0,0,0,0
4,5286,1926,3786,2877,298,1594,0,1836,185,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Train the final classifier with the selected k=3 on the full balanced
# dataset, then predict a label for each row of the test features.
knn_final = KNeighborsClassifier(n_neighbors=3)
knn_final.fit(features, label)
knn_model = knn_final  # fit() returns the fitted estimator itself
y_pred = knn_model.predict(df_test)

In [15]:
# Build the submission frame (ID, Label).
# The IDs are read from the validation file itself rather than fabricated as
# 1..n, so the output stays correctly aligned even if the file's IDs are not
# a gap-free sequence starting at 1. Only the ID column is read here because
# the ID column has already been removed from `df_test` at this point.
id_pred = pd.read_csv('Validation.csv', usecols=['ID'])['ID'].to_numpy()
assert len(id_pred) == len(y_pred), "number of IDs does not match number of predictions"
df_pred = pd.DataFrame({'ID': id_pred, 'Label': y_pred})
df_pred.head()

Unnamed: 0,ID,Label
0,1,1
1,2,1
2,3,1
3,4,0
4,5,0


In [16]:
# Save the ID/Label predictions as CSV; index=False omits the DataFrame index
# so the file contains only the two columns.
output_csv = 'knn_ctgan_output.csv'
df_pred.to_csv(output_csv, index=False)