In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score
import joblib

In [2]:
DATASET_DIR = "./dataset.csv"
LABEL_CHROMA_FEATURE = ['C', 'Db', 'D', 'Eb', 'E', 'F', 'Gb', 'G', 'Ab', 'A', 'Bb', 'B']
LABEL_CHORD = 'chord'

In [3]:
df = pd.read_csv(DATASET_DIR)
df

Unnamed: 0,C,Db,D,Eb,E,F,Gb,G,Ab,A,Bb,B,chord
0,0.206002,0.062756,0.082554,0.156872,0.209984,1.000000,0.187169,0.123724,0.052789,0.039549,0.018230,0.056326,FMin7
1,0.206002,0.062756,0.082554,0.156872,0.209984,1.000000,0.187169,0.127993,0.052789,0.039549,0.018230,0.056326,FMin7
2,0.247106,0.055797,0.071903,0.123175,0.187465,1.000000,0.142260,0.127993,0.052789,0.034122,0.018230,0.056326,FMin7
3,0.265622,0.055797,0.071903,0.123175,0.187465,1.000000,0.142260,0.127993,0.052789,0.034122,0.018230,0.056326,FMin7
4,0.265622,0.051397,0.069050,0.118164,0.178773,1.000000,0.123252,0.127993,0.052789,0.034122,0.018130,0.056326,FMin7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26641,1.000000,0.090777,0.344614,0.043224,0.058901,0.016370,0.062591,0.018879,0.014510,0.043015,0.014165,0.136580,D7
26642,1.000000,0.090777,0.337424,0.043224,0.058901,0.025307,0.062591,0.021771,0.017041,0.043015,0.026106,0.136580,D7
26643,1.000000,0.090777,0.324222,0.043858,0.058901,0.026028,0.049029,0.022619,0.018375,0.041911,0.044637,0.136580,D7
26644,1.000000,0.090777,0.295212,0.043858,0.058901,0.028721,0.039280,0.024203,0.018484,0.041911,0.045103,0.136580,D7


In [4]:
df_copy = df.copy()
y = df_copy["chord"]
X = df_copy.drop(columns="chord")
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, random_state=42, shuffle=True, test_size=0.2, stratify=y_encoded)

In [5]:
print(f"X_train: {len(X_train)}")
print(f"X_test: {len(X_test)}")

X_train: 21316
X_test: 5330


In [6]:
joblib.dump(encoder, "./encoder.xz")

['./encoder.xz']

# Models

## SVM

In [7]:
svc = SVC()
params_svm = [{
    "C": [1, 10, 1000],
    # "kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
    "kernel": ['poly', 'rbf', 'sigmoid'],
    # "gamma": ['scale', 'auto', 1, 0.1, 0.01, 0.001, 0.0001]
    "gamma": [1, 0.1, 0.01, 0.001, 0.0001]
}]
grid_search_svm = GridSearchCV(svc, params_svm, cv=5)

In [8]:
grid_search_svm.fit(X_train, y_train)

In [9]:
print(f"Best params: {grid_search_svm.best_params_}")
print(f"Best CV score: {grid_search_svm.best_score_ * 100}%")
model_svm = grid_search_svm.best_estimator_

Best params: {'C': 1000, 'gamma': 1, 'kernel': 'rbf'}
Best CV score: 89.08798778617353%


In [10]:
y_pred = model_svm.predict(X_test)
score = accuracy_score(y_pred, y_test)
print(f"Model accuracy: {score * 100}%")

Model accuracy: 90.56285178236398%


In [11]:
joblib.dump(model_svm, "./model_svm.xz")
joblib.dump(grid_search_svm, "./grid_search_svm.xz")

['./grid_search_svm.xz']

In [None]:
knn = KNeighborsClassifier()
params_knn = [{
    "n_neighbors": [2, 3, 4, 5, 6],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "leaf_size": [10, 20, 30, 40, 50]
}]
grid_search_knn = GridSearchCV(knn, params_knn)

## KNN

In [None]:
grid_search_knn.fit(X_train, y_train)

In [None]:
print(f"Best params: {grid_search_knn.best_params_}")
print(f"Best CV score: {grid_search_knn.best_score_ * 100}%")
model_knn = grid_search_knn.best_estimator_
# score = accuracy_score(y_pred, y_test)
# print(f"Model accuracy: {score * 100}%")

## Random Forest

In [12]:
rf = RandomForestClassifier()
params_rf = [{
    "n_estimators": [100, 150, 200, 250, 300, 350],
    "max_depth": [None, 1, 2, 3, 4, 5],
    "warm_start": [True, False]
}]
grid_search_rf = GridSearchCV(rf, params_rf)

In [13]:
grid_search_rf.fit(X_train, y_train)

In [14]:
print(f"Best params: {grid_search_rf.best_params_}")
print(f"Best CV score: {grid_search_rf.best_score_ * 100}%")
model_rf = grid_search_rf.best_estimator_

Best params: {'max_depth': None, 'n_estimators': 300, 'warm_start': False}
Best CV score: 94.30004744344525%


In [15]:
y_pred = model_rf.predict(X_test)
score = accuracy_score(y_pred, y_test)
print(f"Model accuracy: {score * 100}%")

Model accuracy: 94.89681050656661%


In [16]:
joblib.dump(model_rf, "./model_rf.xz")
joblib.dump(grid_search_rf, "./grid_search_rf.xz")

['./grid_search_rf.xz']

In [None]:
gb = GradientBoostingClassifier()
params_gb = [{
    "learning_rate": [0.1, 0.01, 0.001],
    "n_estimators": [100, 150, 200, 250, 300, 350],
    
}]
grid_search_gb = GridSearchCV(gb, params_gb)

In [None]:
grid_search_gb.fit(X_train, y_train)

In [None]:
print(f"Best params: {grid_search_gb.best_params_}")
print(f"Best CV score {grid_search_gb.best_score_ * 100}%")
model_gb = grid_search_gb.best_estimator_

In [None]:
joblib.dump(model_vc, "./model.joblib")

In [None]:
vc = VotingClassifier([
    ("SVM", model_svm),
    ("RandomForest", model_rf),
    ("GradientBoost", model_gb),
])
params_vc = [{
    "voting": ["hard", "soft"],
    "weights": [(1,1,1), (2,1,1), (1,2,1), (1,1,2)]
}]
grid_search_vc = GridSearchCV(vc, params_vc)

In [None]:
grid_search_vc.fit(X_train, y_train)

In [None]:
print(f"Best params {grid_search_vc.best_params_}")
print(f"Best CV Score {grid_search_vc.best_score_ * 100}%")
model_vc = grid_search_vc.best_estimator_

In [None]:
y_pred = model_vc.predict(X_test)
score = accuracy_score(y_pred, y_test)
print(f"Model accuracy: {score * 100}%")