In [86]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score
import joblib

In [90]:
DATASET_DIR = "./datasets.csv"
LABEL_CHROMA_FEATURE = ['C', 'Db', 'D', 'Eb', 'E', 'F', 'Gb', 'G', 'Ab', 'A', 'Bb', 'B']
LABEL_CHORD = 'chord'

In [88]:
df = pd.read_csv(DATASET_DIR)
df

Unnamed: 0,C,Db,D,Eb,E,F,Gb,G,Ab,A,Bb,B,chord_label
0,0.206002,0.062756,0.082554,0.156872,0.209984,1.000000,0.187169,0.123724,0.052789,0.039549,0.018230,0.056326,FMin7
1,0.206002,0.062756,0.082554,0.156872,0.209984,1.000000,0.187169,0.127993,0.052789,0.039549,0.018230,0.056326,FMin7
2,0.247106,0.055797,0.071903,0.123175,0.187465,1.000000,0.142260,0.127993,0.052789,0.034122,0.018230,0.056326,FMin7
3,0.265622,0.055797,0.071903,0.123175,0.187465,1.000000,0.142260,0.127993,0.052789,0.034122,0.018230,0.056326,FMin7
4,0.265622,0.051397,0.069050,0.118164,0.178773,1.000000,0.123252,0.127993,0.052789,0.034122,0.018130,0.056326,FMin7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20651,0.051663,0.202682,0.157194,0.103382,0.677675,0.057180,0.206020,0.028019,0.200380,1.000000,0.393915,0.588159,BMin7
20652,0.052551,0.195660,0.149962,0.103382,0.624835,0.057180,0.167295,0.030389,0.201167,1.000000,0.393915,0.572762,BMin7
20653,0.053566,0.195660,0.149962,0.103382,0.379648,0.039133,0.167295,0.030389,0.201167,1.000000,0.333167,0.527127,BMin7
20654,0.052551,0.170932,0.142730,0.083575,0.379648,0.039133,0.153147,0.026181,0.200380,1.000000,0.333167,0.527127,BMin7


In [91]:
df_copy = df.copy()
y = df_copy["chord"]
X = df_copy.drop(columns="chord")
y_encoded = LabelEncoder().fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, random_state=42, shuffle=True, test_size=0.2, stratify=y_encoded)

# Models

## SVM

In [92]:
svc = SVC()
params_svm = [{
    "C": [1, 10, 1000],
    # "kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
    "kernel": ['poly', 'rbf', 'sigmoid'],
    # "gamma": ['scale', 'auto', 1, 0.1, 0.01, 0.001, 0.0001]
    "gamma": [1, 0.1, 0.01, 0.001, 0.0001]
}]
grid_search_svm = GridSearchCV(svc, params_svm)

In [93]:
grid_search_svm.fit(X_train, y_train)

In [94]:
model_svm = grid_search_svm.best_estimator_
y_pred = model_svm.predict(X_test)
score = accuracy_score(y_pred, y_test)
print(f"Model accuracy: {score * 100}%")

Model accuracy: 90.68247821878025%


## KNN

In [95]:
knn = KNeighborsClassifier()
params_knn = [{
    "n_neighbors": [2, 3, 4, 5, 6],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "leaf_size": [10, 20, 30, 40, 50]
}]
grid_search_knn = GridSearchCV(knn, params_knn)

In [96]:
grid_search_knn.fit(X_train, y_train)

In [97]:
model_knn = grid_search_knn.best_estimator_
y_pred = model_knn.predict(X_test)
score = accuracy_score(y_pred, y_test)
print(f"Model accuracy: {score * 100}%")

Model accuracy: 90.77928363988383%


## Random Forest

In [98]:
rf = RandomForestClassifier()
params_rf = [{
    "n_estimators": [100, 125, 150, 200],
    "max_depth": [None, 1, 2, 3, 4, 5],
    "warm_start": [True, False]
}]
grid_search_rf = GridSearchCV(rf, params_rf)

In [99]:
grid_search_rf.fit(X_train, y_train)

In [100]:
model_rf = grid_search_rf.best_estimator_
y_pred = model_rf.predict(X_test)
score = accuracy_score(y_pred, y_test)
print(f"Model accuracy: {score * 100}%")

Model accuracy: 94.74830590513069%


# Gradient Boost

In [104]:
gb = GradientBoostingClassifier()
params_gb = [{
    "learning_rate": [0.1, 0.01, 0.001],
    "n_estimators": [100, 150, 200, 250, 300],
    
}]
grid_search_gb = GridSearchCV(gb, params_gb)

In [105]:
grid_search_gb.fit(X_train, y_train)

In [106]:
model_gb = grid_search_gb.best_estimator_
y_pred = model_gb.predict(X_test)
score = accuracy_score(y_pred, y_test)
print(f"Model accuracy: {score * 100}%")

Model accuracy: 90.73088092933205%


# Ada Boost

In [107]:
ab = AdaBoostClassifier()
params_ab = [{
    "n_estimators": [50, 100, 150, 200, 250, 300],
    "learning_rate": [1.0, 0.1, 0.01, 0.001]
}]
grid_search_ab = GridSearchCV(ab, params_ab)

In [108]:
grid_search_ab.fit(X_train, y_train)

In [109]:
model_ab = grid_search_ab.best_estimator_
y_pred = model_ab.predict(X_test)
score = accuracy_score(y_pred, y_test)
print(f"Model accuracy: {score * 100}%")

Model accuracy: 35.47918683446273%


# Ensemble Voters

In [112]:
vc = VotingClassifier([
    ("KNN", model_knn),
    ("RandomForest", model_rf),
    ("GradientBoost", model_gb),
])
params_vc = [{
    "voting": ["hard", "soft"],
    "weights": [(1,1,1), (2,1,1), (1,2,1), (1,1,2)]
}]
grid_search_vc = GridSearchCV(vc, params_vc)

In [113]:
grid_search_vc.fit(X_train, y_train)

In [114]:
model_vc = grid_search_vc.best_estimator_
y_pred = model_vc.predict(X_test)
score = accuracy_score(y_pred, y_test)
print(f"Model accuracy: {score * 100}%")

Model accuracy: 94.86931268151017%


# Saving model

In [115]:
joblib.dump(model_vc, "./model.joblib")

['./model.joblib']