# Creating and Tuning Models

## Importing libraries

In [None]:
import csv
import pickle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, plot_confusion_matrix
import pandas as pd
from matplotlib import pyplot as plt

## Reading in CSV data
Read the dataset from a CSV file into a DataFrame for use with the models.
Remove unnecessary data, then split what remains into feature data and labels.

In [None]:
## Load the extracted audio features into a DataFrame
music_df = pd.read_csv("extracted-features.csv")

## This will contain the data (every column after the first)
data = music_df.iloc[:, 1:].values
## This will contain the labels corresponding to the data (the first column)
labels = music_df.iloc[:, 0].values

## Split the data into training and testing sets (30% is held out for testing)
data_train, data_test, labels_train, labels_test = train_test_split(data, labels, test_size = 0.30)

## Export the held-out split so the exact test samples can be inspected or reused.
## newline="" is what the csv module expects; without it blank lines appear
## between rows on platforms that translate line endings.
with open("test_data_split.csv", "w", newline="") as test_file:
    test_writer = csv.writer(test_file, delimiter = ',')
    ## NOTE(review): the header names four feature columns - confirm this matches
    ## the number of feature columns in extracted-features.csv.
    test_writer.writerow(["Genre", "MFCC", "ZCR", "Spectral Centroid", "Spectral Rolloff"])

    ## One row per test sample: its label followed by that sample's own feature
    ## values (previously the first five test rows were written on every line).
    for label, features in zip(labels_test, data_test):
        test_writer.writerow([label, *features])

## Cross-validate KNN to find the best hyper-parameters

Use GridSearchCV to search for the optimal hyperparameters for the KNN classifier.

In [None]:
## Base estimator; every hyper-parameter under test is supplied by the grid below
knn_model = KNeighborsClassifier()

## Candidate values: k from 1 to 24, both vote weightings and four distance metrics
param_grid = {
    "n_neighbors": range(1, 25),
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan", "chebyshev", "minkowski"],
}

## Metrics recorded for every candidate.
## roc_auc_ovr sets the score to use roc_auc in one vs rest mode
scoring_metrics = {
    "Accuracy": "accuracy",
    "AUC": "roc_auc_ovr",
    "F1": "f1_macro",
}

## 5-fold cross validation (StratifiedKFold is used by default); the winning
## candidate is chosen by, and refitted on, the "AUC" metric
knn_grid_search_cv = GridSearchCV(
    knn_model,
    param_grid,
    scoring=scoring_metrics,
    refit="AUC",
    cv=5,
)
knn_grid_search_cv.fit(data_train, labels_train)

best_params = knn_grid_search_cv.best_params_
print(best_params)

## Evaluate the refitted best model on the held-out test set
knn_grid_predict = knn_grid_search_cv.predict(data_test)
print(classification_report(labels_test, knn_grid_predict))

## NOTE(review): plot_confusion_matrix is not available in recent scikit-learn
## releases - confirm the installed version still provides it.
plot_confusion_matrix(
    estimator=knn_grid_search_cv,
    X=data_test,
    y_true=labels_test,
    xticks_rotation="vertical",
)
plt.show()

## Best score is using the metric defined in refit
print(f"ROC AUC: {knn_grid_search_cv.best_score_!s}")

## Keep the winning hyper-parameters under individual names
best_k_value, best_weighting, best_distance_metric = (
    best_params[key] for key in ("n_neighbors", "weights", "metric")
)

## Cross-validate SVM to find the best hyper-parameters

Use GridSearchCV to search for the optimal hyperparameters for the SVM classifier.

In [None]:
## RBF-kernel SVM; probability estimates are enabled on the estimator
svm_model = SVC(kernel="rbf", probability=True)

## Candidate values: five settings of C and both built-in gamma heuristics
param_grid = {
    "C": [0.1, 1, 10, 100, 1000],
    "gamma": ["scale", "auto"],
}

## Metrics recorded for every candidate.
## roc_auc_ovr sets the score to use roc_auc in one vs rest mode
scoring_metrics = {
    "Accuracy": "accuracy",
    "AUC": "roc_auc_ovr",
    "F1": "f1_macro",
}

## 5-fold cross validation (StratifiedKFold is used by default); the winning
## candidate is chosen by, and refitted on, the "AUC" metric
svm_grid_search_cv = GridSearchCV(
    svm_model,
    param_grid,
    scoring=scoring_metrics,
    refit="AUC",
    cv=5,
)
svm_grid_search_cv.fit(data_train, labels_train)

best_params = svm_grid_search_cv.best_params_
print(best_params)

## Evaluate the refitted best model on the held-out test set
svm_grid_predict = svm_grid_search_cv.predict(data_test)
print(classification_report(labels_test, svm_grid_predict))

## NOTE(review): plot_confusion_matrix is not available in recent scikit-learn
## releases - confirm the installed version still provides it.
plot_confusion_matrix(
    estimator=svm_grid_search_cv,
    X=data_test,
    y_true=labels_test,
    xticks_rotation="vertical",
)
plt.show()

## Best score is using the metric defined in refit
print(f"ROC AUC: {svm_grid_search_cv.best_score_!s}")

## Keep the winning hyper-parameters under individual names
best_c_value, best_gamma = (best_params[key] for key in ("C", "gamma"))

## Save models

Save the tuned KNN and SVM models, fitted with the optimal hyperparameters, to Pickle files for use with Azure.

In [None]:
## Serialise each fitted grid-search object to its own pickle file, KNN first.
## The target paths are relative, so the "models" directory must already exist.
for model_path, fitted_search in (
    ("models/knn.pkl", knn_grid_search_cv),
    ("models/svm.pkl", svm_grid_search_cv),
):
    with open(model_path, "wb") as model_file:
        pickle.dump(fitted_search, model_file)