In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
import librosa
import librosa.display
import IPython.display as ipd
from glob import glob
import warnings
warnings.filterwarnings('ignore')
import csv

from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier

In [20]:
# Use %pwd to check the working directory and %cd to change it if the CSV path does not resolve.

def load_features(path):
    """Read the feature table at ``path`` and give its columns uniform names.

    The file is parsed with ``pd.read_csv`` and its columns are renamed, in
    order, to ``feature_0`` ... ``feature_65``.  The table must therefore hold
    exactly 66 columns; any other width makes the column assignment fail.

    Returns the renamed ``DataFrame``.
    """
    table = pd.read_csv(path)
    table.columns = ['feature_{}'.format(index) for index in range(66)]
    # Tip: call ``.head()`` on the returned frame to check that the data loaded.
    return table

In [21]:
def create_dictionary_for_instruments(instruments):
    """Map every instrument name to its position in ``instruments``.

    The first entry gets 0, the second 1, and so on.  If a name occurs more
    than once, the index of its last occurrence is kept.
    """
    return {name: position for position, name in enumerate(instruments)}

In [22]:
def select_instrument_subset_and_replace_labels(instruments, features):
    """Keep only the rows of the requested instruments and encode their labels.

    Parameters
    ----------
    instruments : iterable
        Label values to keep.  Their order defines the integer code of each
        label (first entry -> 0, second -> 1, ...).
    features : pandas.DataFrame
        Table whose ``feature_65`` column holds the instrument label.

    Returns
    -------
    tuple
        ``(features, y)``: the filtered table (first column removed, label
        column still present) and a Series with the integer-encoded labels,
        aligned with the rows of that table.
    """
    # Drop the column at position 0 and keep every row.
    # NOTE(review): presumably an index/identifier column -- confirm.
    features = features.iloc[:, 1:]

    # Keep only the rows labelled with one of the requested instruments.
    features = features.loc[features['feature_65'].isin(instruments)]

    label_to_index = create_dictionary_for_instruments(instruments)

    # Encode the labels in 'feature_65' as integers.  Every remaining label is
    # a key of the mapping (guaranteed by the filter above), so ``map`` yields
    # a plain integer Series without relying on ``replace`` to downcast an
    # object column.
    y = features['feature_65'].map(label_to_index)

    return features, y

In [23]:
def remove_unwanted_features(X):
    """Return ``X`` without the label column and two excluded feature columns.

    Columns named ``feature_65`` (the labels), ``feature_1`` and ``feature_18``
    are left out; names that are not present are simply ignored.  The input
    frame itself is not modified.
    """
    # NOTE(review): why feature_1 and feature_18 are excluded is not visible here.
    excluded = ['feature_65', 'feature_1', 'feature_18']
    keep_mask = ~X.columns.isin(excluded)
    return X.loc[:, keep_mask]

In [24]:
def remove_unwanted_features_and_normalize(X):
    """Drop the label/excluded columns and standardise the remaining ones.

    Columns ``feature_65`` (the labels), ``feature_1`` and ``feature_18`` are
    removed.  Every remaining column is shifted to zero mean and scaled by its
    sample standard deviation (pandas default, ``ddof=1``).

    Returns the standardised ``DataFrame``; the input frame is not modified.
    """
    X = X.loc[:, ~X.columns.isin(['feature_65', 'feature_1', 'feature_18'])]

    column_std = X.std()
    # A constant column has zero spread; dividing by 1 instead turns it into
    # an all-zero column rather than a column of NaN.
    column_std = column_std.mask(column_std == 0, 1.0)

    X_normalized = (X - X.mean()) / column_std

    # Return the standardised frame (the previous code computed it but handed
    # back the raw, un-normalised ``X``).
    return X_normalized

In [25]:
def model_assess(model, title, X_train, X_test, y_train, y_test):
    """Train ``model`` on the training split and print its test accuracy.

    ``model`` is fitted in place on ``(X_train, y_train)``; its predictions
    for ``X_test`` are compared with ``y_test`` and the accuracy, rounded to
    five decimals, is printed together with ``title``.  Nothing is returned.
    """
    model.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, model.predict(X_test))
    print('Accuracy', title, ':', round(test_accuracy, 5), '\n')

In [26]:
def plot_confusion_matrix(model_type, preds, y_test, accuracy, instruments):
    """Draw a row-normalised confusion matrix as an annotated heat map.

    Parameters
    ----------
    model_type : str
        Model name shown in the figure title.
    preds : array-like
        Predicted class ids for the test samples.
    y_test : array-like
        True class ids for the same samples.
    accuracy : float
        Accuracy as a fraction; shown in the title as a percentage.
    instruments : sequence of str
        Tick labels.  Entry ``i`` names class id ``i``, so the class ids are
        expected to be ``0 .. len(instruments) - 1``.
    """
    # NOTE: this changes the global matplotlib font for later figures as well.
    plt.rcParams["font.family"] = "serif"

    # Fix the label set so the matrix always has one row/column per tick
    # label, even if a class is missing from both y_test and preds.
    class_ids = list(range(len(instruments)))
    confusion_matr = confusion_matrix(y_test, preds, labels=class_ids)

    # Normalise each row (= true class) to fractions.  A class without test
    # samples keeps an all-zero row instead of producing NaN from 0 / 0.
    row_totals = confusion_matr.sum(axis=1)[:, np.newaxis]
    row_totals[row_totals == 0] = 1
    confusion_matr_normalized = confusion_matr.astype('float') / row_totals

    figure, ax = plt.subplots(figsize=(16, 9))
    sns.heatmap(confusion_matr_normalized, cmap="Blues", annot=True,
                xticklabels=instruments,
                yticklabels=instruments,
                ax=ax)

    font_label = {'weight': 'bold', 'size': 22}
    font_title = {'weight': 'bold', 'size': 25}

    ax.tick_params(labelsize=15)
    # confusion_matrix puts the true classes on the rows (heat-map y-axis)
    # and the predicted classes on the columns (heat-map x-axis).
    ax.set_xlabel('Output Class', fontdict=font_label)
    ax.set_ylabel('Target Class', fontdict=font_label)
    ax.set_title('Accuracy of ' + model_type + ': ' + str(round(accuracy*100, 3)) + '%',
                 fontdict=font_title)
    figure.tight_layout()
    plt.show()

In [27]:
# Registry of the supported classifiers: short name -> estimator class
# (the class itself, not an instance).
model_dict = dict(
    naive_bayes=GaussianNB,
    decision_trees=DecisionTreeClassifier,
    logistic_regression=LogisticRegression,
    support_vector_machine=SVC,
    stochastic_gradient_descent=SGDClassifier,
    xgboost=XGBClassifier,
)


def select_model(model_type):
    """Look up the estimator class registered under ``model_type``.

    Raises ``KeyError`` when the name is not a key of ``model_dict``.
    """
    estimator_class = model_dict[model_type]
    return estimator_class

In [28]:
def fit_model(model_type, X_train, y_train, X_test, y_test, instruments):
    """Train a default-configured model and return its test accuracy.

    The estimator class registered under ``model_type`` is instantiated with
    no arguments, fitted on the training split and scored on the test split.
    ``instruments`` is accepted but not used here.

    Returns the accuracy rounded to five decimals.
    """
    estimator_class = select_model(model_type)
    classifier = estimator_class()
    classifier.fit(X_train, y_train)

    test_predictions = classifier.predict(X_test)
    return round(accuracy_score(y_test, test_predictions), 5)

In [29]:
def fit_and_plot_model(model_type, X_train, y_train, X_test, y_test, instruments):
    """Train a default-configured model and plot its confusion matrix.

    The estimator class registered under ``model_type`` is instantiated with
    no arguments and fitted on the training split.  Its test predictions and
    accuracy (rounded to five decimals) are handed to
    ``plot_confusion_matrix`` with ``instruments`` as tick labels.
    Nothing is returned.
    """
    estimator_class = select_model(model_type)
    classifier = estimator_class()
    classifier.fit(X_train, y_train)

    test_predictions = classifier.predict(X_test)
    test_accuracy = round(accuracy_score(y_test, test_predictions), 5)

    plot_confusion_matrix(model_type, test_predictions, y_test, test_accuracy, instruments)

In [30]:
def test_and_plot_confusion_matrix(instruments, features_path, model_type):
    """Train on a subset of instruments and plot the confusion matrix.

    The features are loaded from ``features_path``, restricted to
    ``instruments``, stripped of the unwanted columns and split 80/20 with a
    fixed seed.  When ``model_type`` is ``None`` every model in ``model_dict``
    is trained and plotted in turn; otherwise only the named model is.
    """
    raw_table = load_features(features_path)
    subset, y = select_instrument_subset_and_replace_labels(instruments, raw_table)
    X = remove_unwanted_features(subset)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    names_to_run = list(model_dict.keys()) if model_type is None else [model_type]
    for name in names_to_run:
        fit_and_plot_model(name, X_train, y_train, X_test, y_test, instruments)

In [31]:
def test_and_return_results(instruments, features_path, model_type):
    """Train on a subset of instruments and report the accuracy.

    The features are loaded from ``features_path``, restricted to
    ``instruments``, stripped of the unwanted columns and split 80/20 with a
    fixed seed.

    Returns ``(accuracy, model_name)``.  With a concrete ``model_type`` that
    model is evaluated.  With ``model_type=None`` every model in
    ``model_dict`` is evaluated and the best one is reported (the first one
    wins ties); if no model scores above 0 the result is ``(0, "")``.
    """
    raw_table = load_features(features_path)
    subset, y = select_instrument_subset_and_replace_labels(instruments, raw_table)
    X = remove_unwanted_features(subset)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # A single, explicitly requested model.
    if model_type is not None:
        single_accuracy = fit_model(model_type, X_train, y_train, X_test, y_test, instruments)
        return single_accuracy, model_type

    # No model requested: try all of them and remember the best.
    best_accuracy, best_name = 0, ""
    for candidate in list(model_dict.keys()):
        candidate_accuracy = fit_model(candidate, X_train, y_train, X_test, y_test, instruments)
        if candidate_accuracy > best_accuracy:
            best_accuracy, best_name = candidate_accuracy, candidate
    return best_accuracy, best_name
    

In [15]:
# Instrument label abbreviations used in the feature table; the position of a
# label in this list becomes its integer class id when all of them are used.
all_instruments = [
    'cel', 'cla', 'flu', 'gac', 'gel', 'org',
    'pia', 'sax', 'tru', 'vio', 'voi',
]

In [1]:
# Generate the best model accuracy for every unordered pair of instruments.
features_path = 'test.csv'
# Pairs whose best accuracy is below this value are collected in bad_pairs.
ACCURACY_THRESHOLD = 0.9
bad_pairs = []
# Iterate over the list itself rather than hard-coded index bounds, so the
# scan stays correct if all_instruments grows or shrinks.
for i, first_instrument in enumerate(all_instruments):
    for second_instrument in all_instruments[i + 1:]:
        print(first_instrument + ' vs. ' + second_instrument)
        pair_of_instruments = [first_instrument, second_instrument]
        acc_max, model_max = test_and_return_results(pair_of_instruments, features_path, None)
        print(model_max + ": " + str(acc_max))
        if acc_max < ACCURACY_THRESHOLD:
            bad_pairs.append((first_instrument, second_instrument, acc_max))

In [2]:
# Instrument pairs that are hardest to tell apart: those whose best pairwise
# accuracy was below 0.9, stored as (instrument_a, instrument_b, best_accuracy).
bad_pairs

## Tuning parameters of models

In [84]:
# Multi-class experiment: one XGBoost classifier with hand-set
# hyper-parameters, trained on all instruments at once.
features = load_features(features_path)
X, y = select_instrument_subset_and_replace_labels(all_instruments, features)
X = remove_unwanted_features(X)

# 80/20 split with a fixed seed; the later tuning cells reuse these splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgboost_class = select_model('xgboost')
model = xgboost_class(learning_rate=0.05, n_estimators=500, max_depth=10, min_child_weight=1)
model.fit(X_train, y_train)

preds = model.predict(X_test)
accuracy = round(accuracy_score(y_test, preds), 5)
print(accuracy)

0.67084


In [83]:
# Print the classifier held in `model` to show its hyper-parameter settings.
print(model)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=10, max_leaves=None,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=None, num_parallel_tree=None,
              objective='multi:softprob', predictor=None, ...)


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler


# Fit the scaler here so the cell works on a fresh kernel (`scaler` was used
# below without ever being defined).  It is fitted on the training split only,
# so no test-set statistics leak into the features the classifier sees.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# declare parameter ranges to try
params = {
    'C': [8,8.1,8.2,8.3,8.9,9],  # Regularization parameter
    'kernel': ['poly', 'rbf'],  # Kernel function
    'degree': [2, 3, 4],  # Degree of the polynomial kernel (ignored by 'rbf')
    'gamma': [0.39]  # Kernel coefficient
}

# initialise estimator
svm_classifier = SVC(class_weight='balanced')

# initialise grid search model
model = GridSearchCV(estimator=svm_classifier,
                     param_grid=params,
                     scoring='accuracy',
                     n_jobs=-1)

model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print(model.best_params_)
print(classification_report(y_test, y_pred))

{'C': 10, 'degree': 2, 'gamma': 0.1, 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       0.80      0.58      0.67        71
           1       0.80      0.68      0.73       139
           2       0.70      0.59      0.64        76
           3       0.77      0.78      0.77       122
           4       0.79      0.78      0.79       153
           5       0.81      0.77      0.79       142
           6       0.78      0.67      0.72       143
           7       0.69      0.55      0.61       134
           8       0.62      0.71      0.66       107
           9       0.65      0.62      0.64       106
          10       0.58      0.92      0.71       165

    accuracy                           0.71      1358
   macro avg       0.73      0.70      0.70      1358
weighted avg       0.73      0.71      0.71      1358



In [None]:
import optuna
def objective(trial):
    """Optuna objective: accuracy of an ``SVC`` for the sampled ``C``/``gamma``.

    ``C`` is sampled log-uniformly from [13, 16] and ``gamma`` from
    [0.025, 0.04].  The classifier is fitted on the scaled training split and
    its accuracy on the scaled test split is returned.  The function reads
    ``scaler``, ``X_train``, ``X_test``, ``y_train`` and ``y_test`` from the
    notebook namespace.

    NOTE(review): the score is measured on the test split, so the selected
    hyper-parameters are tuned to that split and any accuracy later reported
    on it is optimistic.  Consider scoring on a validation split or with
    cross-validation on the training data instead.
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    # and samples from the same log-uniform range.
    C = trial.suggest_float('C', 13, 16, log=True)
    gamma = trial.suggest_float('gamma', 0.025, 0.04, log=True)
    clf = SVC(C=C, gamma=gamma)
    clf.fit(scaler.transform(X_train), y_train)
    return clf.score(scaler.transform(X_test), y_test)


# Seed the (default) TPE sampler so that the sequence of sampled
# hyper-parameters, and with it the search result, is reproducible.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=100)

[32m[I 2023-04-19 03:07:38,079][0m A new study created in memory with name: no-name-240e17aa-fa12-429c-99ee-5b07d7ab6d41[0m
[32m[I 2023-04-19 03:07:40,467][0m Trial 0 finished with value: 0.7422680412371134 and parameters: {'C': 14.999575909342413, 'gamma': 0.029543664629568422}. Best is trial 0 with value: 0.7422680412371134.[0m
[32m[I 2023-04-19 03:07:42,758][0m Trial 1 finished with value: 0.7422680412371134 and parameters: {'C': 15.878031115167877, 'gamma': 0.028217460653687215}. Best is trial 0 with value: 0.7422680412371134.[0m
[32m[I 2023-04-19 03:07:45,205][0m Trial 2 finished with value: 0.7415316642120766 and parameters: {'C': 13.177374841198757, 'gamma': 0.034541999906854845}. Best is trial 0 with value: 0.7422680412371134.[0m
[32m[I 2023-04-19 03:07:47,625][0m Trial 3 finished with value: 0.7415316642120766 and parameters: {'C': 15.76101094916819, 'gamma': 0.03323426665598193}. Best is trial 0 with value: 0.7422680412371134.[0m
[32m[I 2023-04-19 03:07:50,148

[32m[I 2023-04-19 03:09:14,045][0m Trial 39 finished with value: 0.7430044182621502 and parameters: {'C': 15.217439474289149, 'gamma': 0.03128180094297924}. Best is trial 14 with value: 0.7452135493372607.[0m
[32m[I 2023-04-19 03:09:16,292][0m Trial 40 finished with value: 0.7430044182621502 and parameters: {'C': 14.8983230659044, 'gamma': 0.027588694283165648}. Best is trial 14 with value: 0.7452135493372607.[0m
[32m[I 2023-04-19 03:09:18,654][0m Trial 41 finished with value: 0.7452135493372607 and parameters: {'C': 14.685715376416743, 'gamma': 0.03034970815080575}. Best is trial 14 with value: 0.7452135493372607.[0m
[32m[I 2023-04-19 03:09:21,045][0m Trial 42 finished with value: 0.7452135493372607 and parameters: {'C': 14.811290682930254, 'gamma': 0.030631462437800207}. Best is trial 14 with value: 0.7452135493372607.[0m
[32m[I 2023-04-19 03:09:23,415][0m Trial 43 finished with value: 0.7437407952871871 and parameters: {'C': 14.47671481304924, 'gamma': 0.02939126042450

[32m[I 2023-04-19 03:10:47,346][0m Trial 78 finished with value: 0.7452135493372607 and parameters: {'C': 15.054235917894207, 'gamma': 0.030140567675517708}. Best is trial 76 with value: 0.7459499263622975.[0m
[32m[I 2023-04-19 03:10:49,625][0m Trial 79 finished with value: 0.7407952871870398 and parameters: {'C': 15.406867899491468, 'gamma': 0.028823490012518976}. Best is trial 76 with value: 0.7459499263622975.[0m
[32m[I 2023-04-19 03:10:51,982][0m Trial 80 finished with value: 0.7437407952871871 and parameters: {'C': 15.312661689262905, 'gamma': 0.029710639800279236}. Best is trial 76 with value: 0.7459499263622975.[0m
[32m[I 2023-04-19 03:10:54,345][0m Trial 81 finished with value: 0.7444771723122239 and parameters: {'C': 15.141327592724469, 'gamma': 0.031227081613163232}. Best is trial 76 with value: 0.7459499263622975.[0m
[32m[I 2023-04-19 03:10:56,729][0m Trial 82 finished with value: 0.7452135493372607 and parameters: {'C': 14.92369938617409, 'gamma': 0.0301823542

In [None]:
# Assess the final SVM model.
# NOTE(review): C and gamma are presumably taken from the Optuna search above -- confirm.

svclassifier = SVC(C=15.37, kernel='rbf', gamma=0.0301, class_weight=None, decision_function_shape='ovr', shrinking=True, tol=1e-4, max_iter=-1)

# model_assess requires the data splits as well; calling it with only the
# model and the title raises a TypeError.  The splits are standardised with a
# scaler fitted on the training data, as in the tuning cells.
final_scaler = StandardScaler().fit(X_train)
model_assess(svclassifier, "SVM",
             final_scaler.transform(X_train), final_scaler.transform(X_test),
             y_train, y_test)

Accuracy SVM : 0.74595 

