In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score
import joblib

In [2]:
DATASET_DIR = "./datasets.csv"
LABEL_CHROMA_FEATURE = ['C', 'Db', 'D', 'Eb', 'E', 'F', 'Gb', 'G', 'Ab', 'A', 'Bb', 'B']
LABEL_CHORD = 'chord'

In [3]:
df = pd.read_csv(DATASET_DIR)
df

Unnamed: 0,C,Db,D,Eb,E,F,Gb,G,Ab,A,Bb,B,chord
0,0.206002,0.062756,0.082554,0.156872,0.209984,1.000000,0.187169,0.123724,0.052789,0.039549,0.018230,0.056326,FMin7
1,0.206002,0.062756,0.082554,0.156872,0.209984,1.000000,0.187169,0.127993,0.052789,0.039549,0.018230,0.056326,FMin7
2,0.247106,0.055797,0.071903,0.123175,0.187465,1.000000,0.142260,0.127993,0.052789,0.034122,0.018230,0.056326,FMin7
3,0.265622,0.055797,0.071903,0.123175,0.187465,1.000000,0.142260,0.127993,0.052789,0.034122,0.018230,0.056326,FMin7
4,0.265622,0.051397,0.069050,0.118164,0.178773,1.000000,0.123252,0.127993,0.052789,0.034122,0.018130,0.056326,FMin7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20651,0.051663,0.202682,0.157194,0.103382,0.677675,0.057180,0.206020,0.028019,0.200380,1.000000,0.393915,0.588159,BMin7
20652,0.052551,0.195660,0.149962,0.103382,0.624835,0.057180,0.167295,0.030389,0.201167,1.000000,0.393915,0.572762,BMin7
20653,0.053566,0.195660,0.149962,0.103382,0.379648,0.039133,0.167295,0.030389,0.201167,1.000000,0.333167,0.527127,BMin7
20654,0.052551,0.170932,0.142730,0.083575,0.379648,0.039133,0.153147,0.026181,0.200380,1.000000,0.333167,0.527127,BMin7


In [4]:
df_copy = df.copy()
y = df_copy["chord"]
X = df_copy.drop(columns="chord")
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, random_state=42, shuffle=True, test_size=0.25, stratify=y_encoded)

In [5]:
print(f"X_train: {len(X_train)}")
print(f"X_test: {len(X_test)}")

X_train: 15492
X_test: 5164


# Models

## SVM

In [6]:
svc = SVC()
params_svm = [{
    "C": [1, 10, 1000],
    # "kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
    "kernel": ['poly', 'rbf', 'sigmoid'],
    # "gamma": ['scale', 'auto', 1, 0.1, 0.01, 0.001, 0.0001]
    "gamma": [1, 0.1, 0.01, 0.001, 0.0001]
}]
grid_search_svm = GridSearchCV(svc, params_svm, cv=5)

In [7]:
grid_search_svm.fit(X_train, y_train)

In [8]:
print(f"Best params: {grid_search_svm.best_params_}")
print(f"Best CV score: {grid_search_svm.best_score_ * 100}%")
model_svm = grid_search_svm.best_estimator_
# score = accuracy_score(y_pred, y_test)
# print(f"Model accuracy: {score * 100}%")

Best params: {'C': 1000, 'gamma': 1, 'kernel': 'rbf'}
Best CV score: 89.05889590157052%


## KNN

In [9]:
knn = KNeighborsClassifier()
params_knn = [{
    "n_neighbors": [2, 3, 4, 5, 6],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "leaf_size": [10, 20, 30, 40, 50]
}]
grid_search_knn = GridSearchCV(knn, params_knn)

In [10]:
grid_search_knn.fit(X_train, y_train)

In [11]:
print(f"Best params: {grid_search_knn.best_params_}")
print(f"Best CV score: {grid_search_knn.best_score_ * 100}%")
model_knn = grid_search_knn.best_estimator_
# score = accuracy_score(y_pred, y_test)
# print(f"Model accuracy: {score * 100}%")

Best params: {'algorithm': 'auto', 'leaf_size': 50, 'n_neighbors': 2, 'weights': 'distance'}
Best CV score: 88.70389061133238%


## Random Forest

In [12]:
rf = RandomForestClassifier()
params_rf = [{
    "n_estimators": [100, 150, 200, 250, 300, 350],
    "max_depth": [None, 1, 2, 3, 4, 5],
    "warm_start": [True, False]
}]
grid_search_rf = GridSearchCV(rf, params_rf)

In [13]:
grid_search_rf.fit(X_train, y_train)

In [14]:
print(f"Best params: {grid_search_rf.best_params_}")
print(f"Best CV score: {grid_search_rf.best_score_ * 100}%")
model_rf = grid_search_rf.best_estimator_
# score = accuracy_score(y_pred, y_test)
# print(f"Model accuracy: {score * 100}%")

Best params: {'max_depth': None, 'n_estimators': 300, 'warm_start': True}
Best CV score: 93.59025204615247%


# Gradient Boost

In [15]:
gb = GradientBoostingClassifier()
params_gb = [{
    "learning_rate": [0.1, 0.01, 0.001],
    "n_estimators": [100, 150, 200, 250, 300, 350],
    
}]
grid_search_gb = GridSearchCV(gb, params_gb)

In [16]:
grid_search_gb.fit(X_train, y_train)

In [17]:
print(f"Best params: {grid_search_gb.best_params_}")
print(f"Best CV score {grid_search_gb.best_score_ * 100}%")
model_gb = grid_search_gb.best_estimator_

Best params: {'learning_rate': 0.1, 'n_estimators': 350}
Best CV score 89.85924154296218%


# Ensemble Voters

In [None]:
vc = VotingClassifier([
    ("SVM", model_svc),
    ("RandomForest", model_rf),
    ("GradientBoost", model_gb),
])
params_vc = [{
    "voting": ["hard", "soft"],
    "weights": [(1,1,1), (2,1,1), (1,2,1), (1,1,2)]
}]
grid_search_vc = GridSearchCV(vc, params_vc)

In [None]:
grid_search_vc.fit(X_train, y_train)

In [None]:
print(f"Best params {grid_search_vc.best_params_}")
print(f"Best CV Score {grid_search_vc.best_score_ * 100}%")
model_vc = grid_search_vc.best_estimator_

In [None]:
y_pred = model_vc.predict(X_test)
score = accuracy_score(y_pred, y_test)
print(f"Model accuracy: {score * 100}%")

# Saving model

In [None]:
joblib.dump(model_vc, "./model.joblib")
joblib.dump(encoder, "./encoder.joblib")