# **k-NN**
For classification of Processed Energy Dataset.

In [None]:
# Import the standard tools for pythonic data analysis. 
import csv
import math
import random
import numpy as np 
import pandas as pd

# Mount Google Drive at /content/gdrive; the dataset CSV is read from this mount point below.
# This import is specific to Google Colab.
from google.colab import drive 
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Initial import and basic understanding of data

In [None]:
# Import the final dataset as a dataframe from the CSV file on the mounted Drive
df = pd.read_csv('/content/gdrive/MyDrive/CS3244 Team 05/Dataset Analysis/fixed_data_train.csv',sep=',',low_memory=False) # Separate on commas

In [None]:
# Preview the first rows to check that the CSV loaded as expected
df.head()

Unnamed: 0.1,Unnamed: 0,client_id,district,client_catg,region,region_group,creation_date_day,creation_date_month,creation_date_year,no_months_as_client,...,tally_check_false,sum_tally_value,min_tally_value,max_tally_value,mean_tally_value,counter_type,last_year,last_month,last_day,last_day_is_weekday
0,0,train_Client_0,60,11,101,2,31,12,1994,290.568595,...,0,0,0,0,0.0,1,2019,3,19,1
1,1,train_Client_1,69,11,107,2,29,5,2002,202.123247,...,0,0,0,0,0.0,1,2019,4,2,1
2,2,train_Client_10,62,11,301,3,13,3,1986,397.642662,...,0,0,0,0,0.0,1,2019,5,2,1
3,3,train_Client_100,69,11,105,2,7,11,1996,190.591183,...,0,0,0,0,0.0,1,2012,9,25,1
4,4,train_Client_1000,62,11,303,3,14,10,2014,56.083287,...,0,0,0,0,0.0,1,2019,6,17,1


In [None]:
# List the dtype of every column; non-numeric (object) columns stand out here
df.dtypes

Unnamed: 0                        int64
client_id                        object
district                          int64
client_catg                       int64
region                            int64
region_group                      int64
creation_date_day                 int64
creation_date_month               int64
creation_date_year                int64
no_months_as_client             float64
services_consumed                 int64
target                            int64
months_of_service               float64
number_of_counter                 int64
number_of_instances               int64
number_of_person_counting         int64
min_reading_remark                int64
max_reading_remark                int64
mean_reading_remark             float64
mean_difference_index           float64
sum_difference_index              int64
min_difference_index              int64
max_difference_index              int64
min_counter_statue                int64
max_counter_statue                int64


## Data Cleaning or Selection

In [None]:
# Remove the columns that must not be used as features:
#   'Unnamed: 0' - the row index that was saved into the CSV file
#   'client_id'  - a string identifier, not a usable feature
# drop(..., errors='ignore') keeps this cell idempotent: running it a second time
# is a no-op instead of raising KeyError as a repeated `del df[...]` would.
df = df.drop(columns=['Unnamed: 0', 'client_id'], errors='ignore')

In [None]:
# Display the first rows again to check the frame after the column removal above
df.head()

Unnamed: 0,district,client_catg,region,region_group,creation_date_day,creation_date_month,creation_date_year,no_months_as_client,services_consumed,target,...,tally_check_false,sum_tally_value,min_tally_value,max_tally_value,mean_tally_value,counter_type,last_year,last_month,last_day,last_day_is_weekday
0,60,11,101,2,31,12,1994,290.568595,1,0,...,0,0,0,0,0.0,1,2019,3,19,1
1,69,11,107,2,29,5,2002,202.123247,1,0,...,0,0,0,0,0.0,1,2019,4,2,1
2,62,11,301,3,13,3,1986,397.642662,1,0,...,0,0,0,0,0.0,1,2019,5,2,1
3,69,11,105,2,7,11,1996,190.591183,1,0,...,0,0,0,0,0.0,1,2012,9,25,1
4,62,11,303,3,14,10,2014,56.083287,1,0,...,0,0,0,0,0.0,1,2019,6,17,1


## Separating Training and Testing Data

Partition the dataset at random so that both the training and the testing data contain a good mix of fraud and non-fraud samples.

In [None]:
from sklearn.model_selection import train_test_split

# Partition the frame: 'target' is the label to predict, every other column is a feature.
df_target = df['target'].copy()
df_features = df.drop(columns='target')

# Hold out 30% of the rows for testing. The fixed random_state makes the split
# identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df_features,
    df_target,
    test_size=0.3,
    random_state=1,  ## RANDOM STATE DETERMINED HERE
)

print(
    "Number of training instances: ", len(X_train),
    "\nNumber of test instances: ", len(X_test),
    "\nTotal number of instances: ", len(df_target),
)

Number of training instances:  94840 
Number of test instances:  40647 
Total number of instances:  135487


In [None]:
# Preview the training features; the row labels shown are the original dataframe indices
X_train.head()

Unnamed: 0,district,client_catg,region,region_group,creation_date_day,creation_date_month,creation_date_year,no_months_as_client,services_consumed,months_of_service,...,tally_check_false,sum_tally_value,min_tally_value,max_tally_value,mean_tally_value,counter_type,last_year,last_month,last_day,last_day_is_weekday
16031,62,11,301,3,27,9,1999,225.943038,1,162.401692,...,0,0,0,0,0.0,1,2018,7,26,1
27947,69,11,104,2,18,10,1995,180.537588,1,57.036079,...,0,0,0,0,0.0,1,2010,11,3,1
64148,63,11,311,3,14,6,2007,149.752562,1,149.292593,...,0,0,0,0,0.0,1,2019,12,6,1
1476,63,11,379,3,28,12,2011,91.730836,1,77.208978,...,0,0,0,0,0.0,1,2019,8,20,1
27331,63,11,101,2,26,6,2002,204.16025,2,156.717797,...,0,0,0,0,0.0,2,2019,7,1,1


# Building the k-NN Classifier (default)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline classifier with every hyper-parameter written out explicitly
# (the values labelled as defaults), so later variations are easy to compare.
kNN_default = KNeighborsClassifier(
    n_neighbors=5,        # k: number of neighbours that vote
    weights='uniform',    # neighbour weighting: 'uniform' or 'distance'
    algorithm='auto',     # search algorithm: 'auto', 'ball_tree', 'kd_tree' or 'brute'
    leaf_size=30,         # only relevant for 'ball_tree' and 'kd_tree'
    p=2,                  # power parameter of the 'minkowski' metric
    metric='minkowski',   # distance metric
    metric_params=None,   # extra keyword arguments for the metric
    n_jobs=None,          # parallelisation setting
)



## Training

In [None]:
kNN_default.fit(X_train, y_train)

# Report what the fitted estimator recorded about the training data.
for caption, value in (
    ("Class labels learned:", kNN_default.classes_),
    ("Distance metric used:", kNN_default.effective_metric_),
    ("Number of input features:", kNN_default.n_features_in_),
    ("Number of training samples:", kNN_default.n_samples_fit_),
):
    print(caption, value)

Class labels learned: [0 1]
Distance metric used: euclidean
Number of input features: 43
Number of training samples: 94840


## Testing

In [None]:
# Predict a class label for every row of the held-out test features
predictions = kNN_default.predict(X_test)

In [None]:
# Automate calculations over confusion matrix.
def analysePredictions(tracker):
    """Compute precision, recall and F1 from a confusion-matrix tally.

    Parameters
    ----------
    tracker : dict
        Counts keyed by "TP", "FP" and "FN". A "TN" entry may be present but
        is not needed for these metrics.

    Returns
    -------
    tuple of float
        ``(precision, recall, f1)``. A metric whose denominator is zero is
        reported as 0.0 instead of raising ZeroDivisionError. This happens,
        for example, when the classifier predicts no positives at all
        (TP + FP == 0).
    """
    TP = tracker["TP"]
    FP = tracker["FP"]
    FN = tracker["FN"]

    predicted_positive = TP + FP
    actual_positive = TP + FN

    precision = TP / predicted_positive if predicted_positive else 0.0
    recall = TP / actual_positive if actual_positive else 0.0

    # F1 is the harmonic mean of precision and recall. It is only defined when
    # both are non-zero (i.e. TP > 0); otherwise 1/precision or 1/recall would
    # divide by zero.
    if precision > 0 and recall > 0:
        f1 = 2 / (1/precision + 1/recall)
    else:
        f1 = 0.0

    return precision, recall, f1


In [None]:
# Tally the confusion matrix by pairing each prediction with its true label.
tracker = {"TP": 0, "FP": 0, "TN": 0, "FN": 0}
for predicted, actual in zip(predictions, y_test):
    if predicted == 0:
        # Predicted negative: correct only when the true label is 0.
        outcome = "TN" if actual == 0 else "FN"
    else:
        # Predicted positive: correct only when the true label is 1.
        outcome = "TP" if actual == 1 else "FP"
    tracker[outcome] += 1

print(tracker)

{'TP': 26, 'FP': 150, 'TN': 38233, 'FN': 2238}


In [None]:
# Print the (precision, recall, f1) tuple computed from the tally built above
print(analysePredictions(tracker))

(0.14772727272727273, 0.011484098939929329, 0.021311475409836064)


# kNN variations

In [None]:
# Automates kNN training/testing cycle.
def analyseKNN(knn, X_train, X_test, y_train, y_test):
    """Fit ``knn`` on the training data, score it on the test data and report.

    Parameters
    ----------
    knn : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Training features and labels passed to ``knn.fit``.
    X_test, y_test
        Test features passed to ``knn.predict`` and the true labels to compare
        against. ``y_test`` may be any 1-D array-like (pandas Series, numpy
        array, list).

    Returns
    -------
    tuple
        ``(precision, recall, f1)`` as returned by ``analysePredictions``.
        The confusion matrix and the metrics are also printed.
    """
    knn.fit(X_train, y_train)
    predicted = np.asarray(knn.predict(X_test))
    actual = np.asarray(y_test)

    # Vectorised tally instead of a per-row Python loop over y_test.iloc[i].
    # The branch logic is the same: a prediction of 0 counts as negative and
    # anything else as positive; a negative is correct when the label is 0,
    # a positive is correct when the label is 1.
    predicted_negative = predicted == 0
    predicted_positive = ~predicted_negative
    tracker = {
        "TP": int(np.count_nonzero(predicted_positive & (actual == 1))),
        "FP": int(np.count_nonzero(predicted_positive & (actual != 1))),
        "TN": int(np.count_nonzero(predicted_negative & (actual == 0))),
        "FN": int(np.count_nonzero(predicted_negative & (actual != 0))),
    }

    precision, recall, f1 = analysePredictions(tracker)

    print("Confusion matrix:", tracker, "\nPrecision:", precision, "\nRecall:", recall, "\nf1:", f1)
    return precision, recall, f1


In [None]:
# NOTE(review): these four values are the same ones passed to kNN_default, and
# kNN_special is not referenced by any of the visible cells below - confirm
# whether the next cell was meant to evaluate it.
kNN_special = KNeighborsClassifier(n_neighbors = 5,   # Our k
                           algorithm = 'auto',        # {'auto', 'ball_tree', 'kd_tree', 'brute'}, algorithm used
                           leaf_size = 30,            # Applicable for 'ball_tree' and 'kd_tree'
                           p = 2,                     # power used for 'minkowski' metric
                          )

In [None]:
# Re-fit and evaluate kNN_default through the helper; the result is the cell's output.
# NOTE(review): this evaluates kNN_default, not the kNN_special built in the previous cell - confirm intent.
analyseKNN(kNN_default, X_train, X_test, y_train, y_test)

Precision: 0.14772727272727273 
Recall: 0.011484098939929329 
f1: 0.021311475409836064


(0.14772727272727273, 0.011484098939929329, 0.021311475409836064)

## Change k

In [None]:
# Sweep the number of neighbours k. range(1, max_n) excludes max_n itself,
# so k runs from 1 to max_n - 1.
max_n = 15

for n in range(1, max_n):
    print("\n", n)  # label the metrics that analyseKNN prints next
    kNN = KNeighborsClassifier(n_neighbors = n)
    # NOTE(review): the recorded run stopped with ZeroDivisionError right after
    # printing 12 - presumably analysePredictions dividing by TP + FP == 0 when
    # no test row is predicted positive; confirm.
    analyseKNN(kNN, X_train, X_test, y_train, y_test)


 1
Precision: 0.09534270650263621 
Recall: 0.09584805653710247 
f1: 0.09559471365638768

 2
Precision: 0.11560693641618497 
Recall: 0.0088339222614841 
f1: 0.016413623307345096

 3
Precision: 0.11195445920303605 
Recall: 0.026060070671378093 
f1: 0.042278753135077035

 4
Precision: 0.09090909090909091 
Recall: 0.00265017667844523 
f1: 0.005150214592274678

 5
Precision: 0.14772727272727273 
Recall: 0.011484098939929329 
f1: 0.021311475409836064

 6
Precision: 0.16666666666666666 
Recall: 0.002208480565371025 
f1: 0.004359197907585004

 7
Precision: 0.11428571428571428 
Recall: 0.0035335689045936395 
f1: 0.006855184233076264

 8
Precision: 0.2727272727272727 
Recall: 0.001325088339222615 
f1: 0.002637362637362638

 9
Precision: 0.1875 
Recall: 0.001325088339222615 
f1: 0.002631578947368421

 10
Precision: 0.6666666666666666 
Recall: 0.0008833922261484099 
f1: 0.00176444640494045

 11
Precision: 0.3333333333333333 
Recall: 0.0008833922261484099 
f1: 0.001762114537444934

 12


ZeroDivisionError: ignored

## Change distance metric

In [None]:
# Sweep the Minkowski power p. range(1, max_p) excludes max_p itself,
# so p runs from 1 to max_p - 1.
max_p = 6

# The loop variable is named `power` rather than `pow` so that the built-in
# pow() is not rebound for the rest of the notebook session.
for power in range(1, max_p):
    print("\n", power)
    kNN = KNeighborsClassifier(p = power)
    analyseKNN(kNN, X_train, X_test, y_train, y_test)


 1
Precision: 0.1411042944785276 
Recall: 0.010159010600706713 
f1: 0.018953440461475072

 2
Precision: 0.14772727272727273 
Recall: 0.011484098939929329 
f1: 0.021311475409836064

 3
Precision: 0.14465408805031446 
Recall: 0.010159010600706713 
f1: 0.018984729673957902

 4
Precision: 0.15517241379310345 
Recall: 0.011925795053003533 
f1: 0.022149302707136997

 5


# Resampling

In [None]:
from imblearn.over_sampling import SMOTE

# Balance the classes with SMOTE oversampling rather than undersampling:
# undersampling discards rows and risks no longer capturing the spread of the dataset.
# Only the training set is resampled; the test set is left untouched.
# A fixed random_state makes the synthetic samples reproducible, in line with
# the seeded train_test_split used for the partition.
X_train_oversampled, y_train_oversampled = SMOTE(random_state=1).fit_resample(X_train, y_train)

In [None]:
# Index labels of the resampled training rows, split by class
# (label 0 = not fraud, label 1 = fraud).
not_fraud = y_train_oversampled.loc[y_train_oversampled == 0].index
fraud = y_train_oversampled.loc[y_train_oversampled == 1].index

# Both counts should match once the classes have been balanced.
print(
    "Resampled number of fraud:", len(fraud),
    "\nResampled number of not fraud:", len(not_fraud),
)

Resampled number of fraud: 89538 
Resampled number of not fraud: 89538


In [None]:
# Use knn with k=3, p=2, fitted on the SMOTE-oversampled training data.
# X_test / y_test are passed unchanged, so the evaluation runs on data that was not resampled.
kNN_resampled = KNeighborsClassifier(n_neighbors = 3, p = 2)
analyseKNN(kNN_resampled, X_train_oversampled, X_test, y_train_oversampled, y_test)

Confusion matrix: {'TP': 714, 'FP': 7225, 'TN': 31158, 'FN': 1550} 
Precision: 0.08993576017130621 
Recall: 0.31537102473498235 
f1: 0.1399588356365775


(0.08993576017130621, 0.31537102473498235, 0.1399588356365775)

# HPC Code

Code below was used on NUS HPC servers to iterate through different combinations of k and p.

In [None]:
# # Import the standard tools for pythonic data analysis. 
# import csv
# import math
# import random
# import joblib
# import numpy as np 
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.neighbors import KNeighborsClassifier
# from imblearn.over_sampling import SMOTE
# print("Modules Loaded",flush=True)


# filename = '/home/svu/e0540423/CS3244/dataset/fixed_data_train.csv'
# # Import the final dataset as dataframe from csv
# df = pd.read_csv(filename, sep=',', low_memory=False) # Separate on commas
# print("Dataframe Loaded",flush=True)

# del df['Unnamed: 0'] # Delete Index of CSV file
# del df['client_id'] # Delete client_id as it is a string and can't be used as a feature

# # Partion the features from the target to predict
# df_features = df[df.columns[df.columns != 'target']].copy() # get columns that are not 'target'; this our features
# df_target = df['target'].copy() # get the column named 'target'; this is our label

# # (random_state): we use a fixed random seed so we get the same results every time.
# X_train, X_test, y_train, y_test = train_test_split(df_features, df_target, test_size=0.3, random_state=1) ## RANDOM STATE DETERMINED HERE

# # Do oversampling 
# X_train_oversampled, y_train_oversampled = SMOTE().fit_resample(X_train, y_train)

# print ("Number of oversampled training instances: ", len(X_train_oversampled), "\nNumber of test instances: ", len(X_test),flush=True)


# def analysePredictions(tracker):
#     TP = tracker["TP"]
#     FP = tracker["FP"]
#     TN = tracker["TN"]
#     FN = tracker["FN"]

#     if (TP + FP) == 0.0 or (TP + FN) == 0:
#         return "Null", "Null", "Null"
#     else:
#         precision = TP / (TP + FP)
#         recall = TP / (TP + FN)
#         f1 = 2 / (1/precision + 1/recall)

#         return precision, recall, f1

# def analyseKNN(knn, X_train, X_test, y_train, y_test):
#     knn.fit(X_train, y_train)
#     predictions = knn.predict(X_test)

#     tracker = {"TP": 0, "FP": 0, "TN": 0, "FN": 0}
#     for i in range(len(predictions)):
#         if (predictions[i] == 0):
#             if (y_test.iloc[i] == 0):
#                 tracker["TN"] += 1
#             else:
#                 tracker["FN"] += 1
#         else:
#             if (y_test.iloc[i] == 1):
#                 tracker["TP"] += 1
#             else:
#                 tracker["FP"] += 1

#     precision, recall, f1 = analysePredictions(tracker)

#     print("Confusion matrix:", tracker, "\nPrecision:", precision, "\nRecall:", recall, "\nf1:", f1, flush=True)
#     return tracker, precision, recall, f1

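# # NOTE(review): the header row is written to kNN_out.csv, but the result rows in the
# # loop below are appended to kNN_out16.csv - confirm which file is intended.
# with open('/home/svu/e0540423/CS3244/kNN_out.csv','a') as file:
#     csv.writer(file).writerow(['k','power', 'TN', 'FN', 'TP', 'FP','precision','recall','f1'])
#     file.close()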


# max_n = 31
# max_p = 16

# for n in range(1, max_n):
#     for power in range(1, max_p):
#         print("\n", n,flush=True)
#         print("\n", power,flush=True)
#         kNN = KNeighborsClassifier(n_neighbors = n, p = power,n_jobs = 16)
#         tr, prec, rec, f1 = analyseKNN(kNN, X_train_oversampled, X_test, y_train_oversampled, y_test)
#         with open('/home/svu/e0540423/CS3244/kNN_out16.csv','a') as file:
#             csv.writer(file).writerow([n, power, tr["TN"], tr["FN"], tr["TP"], tr["FP"], prec, rec, f1])
#             file.close()



# References

How the curse of dimensionality affects distance metrics: https://towardsdatascience.com/the-surprising-behaviour-of-distance-metrics-in-high-dimensions-c2cb72779ea6