# K-Nearest Neighbour Model

## Importing libraries

In [1]:
import pickle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
import pandas as pd

## Reading in CSV data
Reading the dataset from a CSV into a DataFrame for use with the model.
Remove unnecessary data and split into data and labels.

In [2]:
music_df = pd.read_csv("extracted-features.csv")

## This will contain the data
data = music_df.iloc[:, 1:].values
## This will contain the labels corresponding to the data
labels = music_df.iloc[:, 0].values

## Cross-validate a KNN to find the best hyper-parameters

Use GridSearchCV to evaluate the optimal hyperparameters for the algorithm.

In [3]:
data_train, data_test, labels_train, labels_test = train_test_split(data, labels, test_size = 0.30)

knn_model = KNeighborsClassifier()

param_grid = {"n_neighbors": range(1, 25),
              "weights": ["uniform", "distance"],
              "metric": ["euclidean", "manhattan", "chebyshev", "minkowski"]}

scoring_metrics = {"Accuracy": "accuracy",
                   "AUC": "roc_auc_ovr",
                   "F1": "f1_macro"}

## roc_auc_ovr sets the score to use roc_auc in one vs rest mode
## StratifiedKFold is used by default for the cross validation
knn_grid_search_cv = GridSearchCV(knn_model,
                                  param_grid,
                                  scoring = scoring_metrics,
                                  refit = "AUC",
                                  cv = 5)

knn_grid_search_cv.fit(data_train, labels_train)

best_params = knn_grid_search_cv.best_params_

print(best_params)

knn_grid_predict = knn_grid_search_cv.predict(data_test)

print(classification_report(labels_test, knn_grid_predict))

## Best score is using the metric defined in refit
print("ROC AUC: " + str(knn_grid_search_cv.best_score_))

best_k_value = best_params["n_neighbors"]
best_weighting = best_params["weights"]
best_distance_metric = best_params["metric"]

{'metric': 'manhattan', 'n_neighbors': 24, 'weights': 'distance'}
              precision    recall  f1-score   support

       Blues       0.27      0.27      0.27        26
   Classical       0.73      0.51      0.60        37
     Country       0.14      0.12      0.13        32
       Disco       0.18      0.21      0.19        29
      HipHop       0.11      0.10      0.11        30
        Jazz       0.24      0.17      0.20        35
       Metal       0.45      0.59      0.51        32
         Pop       0.38      0.48      0.42        25
      Reggae       0.26      0.30      0.28        27
        Rock       0.24      0.26      0.25        27

    accuracy                           0.30       300
   macro avg       0.30      0.30      0.30       300
weighted avg       0.31      0.30      0.30       300

ROC AUC: 0.7404926783412891


## Produce KNN model

Produce a KNN model using the optimal hyperparameters and save to a Pickle file for use with Azure.

In [4]:
knn_model = KNeighborsClassifier(n_neighbors = best_k_value,
                                 weights = best_weighting,
                                 metric = best_distance_metric)

knn_model.fit(data_train, labels_train)

with open("models/knn.pkl", mode = "wb") as model_file:
    pickle.dump(knn_model, model_file)