In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import optuna

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    make_scorer
)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
df_train = pd.read_csv("../data/processed/numeric_train_data.csv")
df_train.head(5)
df_val = pd.read_csv("../data/processed/numeric_val_data.csv")
df_val.head(5)

Unnamed: 0,chroma_1,chroma_2,chroma_3,chroma_4,chroma_5,chroma_6,chroma_7,chroma_8,chroma_9,chroma_10,...,mfcc_12,mfcc_13,mfcc_14,mfcc_15,mfcc_16,mfcc_17,mfcc_18,mfcc_19,mfcc_20,label
0,0.539667,0.705159,0.649134,0.583651,0.531421,0.561278,0.453441,0.474912,0.456864,0.567574,...,0.316943,0.564993,0.092125,0.599576,0.388226,0.685851,0.370379,0.824795,0.299102,0
1,0.700929,0.910526,0.769736,0.831898,0.789822,0.678607,0.666799,0.743293,0.607547,0.60518,...,0.373482,0.603531,0.473667,0.846925,0.503283,0.623375,0.314519,0.700598,0.456883,0
2,0.709038,0.79124,0.661102,0.667024,0.68387,0.700922,0.633543,0.628824,0.643679,0.689421,...,0.415091,0.61866,0.464444,0.71947,0.520133,0.598141,0.388956,0.734982,0.370067,0
3,0.530169,0.716288,0.642201,0.794942,0.810297,0.778762,0.672078,0.716846,0.748976,0.81374,...,0.515809,0.680099,0.411544,0.792083,0.590471,0.626973,0.399913,0.87733,0.594645,0
4,0.606761,0.698771,0.700411,0.806784,0.815829,0.724553,0.583545,0.693354,0.571397,0.739391,...,0.498413,0.716406,0.41934,0.653179,0.441496,0.515069,0.322138,0.734589,0.426383,0


In [3]:
df_train.groupby(by="label").count()

Unnamed: 0_level_0,chroma_1,chroma_2,chroma_3,chroma_4,chroma_5,chroma_6,chroma_7,chroma_8,chroma_9,chroma_10,...,mfcc_11,mfcc_12,mfcc_13,mfcc_14,mfcc_15,mfcc_16,mfcc_17,mfcc_18,mfcc_19,mfcc_20
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2580,2580,2580,2580,2580,2580,2580,2580,2580,2580,...,2580,2580,2580,2580,2580,2580,2580,2580,2580,2580
1,5160,5160,5160,5160,5160,5160,5160,5160,5160,5160,...,5160,5160,5160,5160,5160,5160,5160,5160,5160,5160


In [None]:
X_train = df_train.iloc[:, :-1]
y_train = df_train.iloc[:, -1]

X_val = df_val.iloc[:, :-1]
y_val = df_val.iloc[:, -1]

In [5]:
X_train.head()

Unnamed: 0,chroma_1,chroma_2,chroma_3,chroma_4,chroma_5,chroma_6,chroma_7,chroma_8,chroma_9,chroma_10,...,mfcc_11,mfcc_12,mfcc_13,mfcc_14,mfcc_15,mfcc_16,mfcc_17,mfcc_18,mfcc_19,mfcc_20
0,0.504315,0.548461,0.501602,0.542981,0.519444,0.689757,0.674948,0.571628,0.550304,0.521429,...,0.669963,0.58084,0.591705,0.399666,0.599349,0.472456,0.64729,0.41677,0.721205,0.24497
1,0.433211,0.484077,0.449849,0.491398,0.503308,0.68286,0.695454,0.762369,0.770101,0.70359,...,0.777054,0.477089,0.576566,0.36474,0.567235,0.376243,0.694686,0.574954,0.745847,0.375365
2,0.650005,0.730896,0.709415,0.698076,0.638344,0.693965,0.629554,0.687421,0.722584,0.71923,...,0.708333,0.557636,0.580289,0.398246,0.78259,0.566742,0.696631,0.526107,0.836288,0.485992
3,0.515072,0.483673,0.467737,0.631756,0.627723,0.595867,0.515102,0.535442,0.546995,0.577641,...,0.488018,0.404663,0.62731,0.343542,0.660205,0.411806,0.616249,0.412144,0.656624,0.200678
4,0.302506,0.493074,0.50483,0.38306,0.388507,0.704446,0.840085,0.752893,0.431209,0.273619,...,0.487332,0.507439,0.545278,0.397998,0.664384,0.438337,0.768158,0.265497,0.659639,0.276219


### RANDOM FOREST

In [6]:
feature_rf = ['chroma_1', 'chroma_2', 'chroma_3', 'chroma_4', 'chroma_8', 'chroma_9',
       'chroma_11', 'chroma_12', 'spectral_bandwidth_mean',
       'spectral_rolloff_mean', 'mfcc_1', 'mfcc_3', 'mfcc_4', 'mfcc_5',
       'mfcc_6', 'mfcc_7', 'mfcc_8', 'mfcc_9', 'mfcc_10', 'mfcc_11', 'mfcc_12',
       'mfcc_13', 'mfcc_14', 'mfcc_15', 'mfcc_16', 'mfcc_17', 'mfcc_18',
       'mfcc_19', 'mfcc_20']


X_train_rf = X_train[feature_rf]

Optuna

In [7]:
# 1. Subset fitur
X_train_rf = X_train[feature_rf]
X_val_rf = X_val[feature_rf]

# 2. Objective tanpa cross-validation
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', 0.5]),
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced']),
        'n_jobs': -1,
        'random_state': 42
    }

    # Train model
    model_rf = RandomForestClassifier(**params)
    model_rf.fit(X_train_rf, y_train)

    # Evaluate on validation set
    y_pred = model_rf.predict(X_val_rf)
    val_accuracy = accuracy_score(y_val, y_pred)

    return val_accuracy


In [8]:
# 3. Jalankan Optimasi
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # bisa diubah sesuai kebutuhan

# 4. Tampilkan hasil terbaik
print("Best trial:")
print(study.best_trial)


[32m[I 2025-06-21 22:48:35,582][0m A new study created in memory with name: no-name-c8a366ef-c636-4053-bad9-ceaebf4bb585[0m
[32m[I 2025-06-21 22:48:36,492][0m Trial 0 finished with value: 0.9487045276105731 and parameters: {'n_estimators': 151, 'max_depth': 15, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'log2', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.9487045276105731.[0m
[32m[I 2025-06-21 22:48:38,039][0m Trial 1 finished with value: 0.9340486783564512 and parameters: {'n_estimators': 280, 'max_depth': 8, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'class_weight': None}. Best is trial 0 with value: 0.9487045276105731.[0m
[32m[I 2025-06-21 22:48:38,956][0m Trial 2 finished with value: 0.9165140015702695 and parameters: {'n_estimators': 170, 'max_depth': 7, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.9487045276105731.[0m
[32

Best trial:
FrozenTrial(number=41, values=[0.9497513739858676], datetime_start=datetime.datetime(2025, 6, 21, 22, 49, 43, 221699), datetime_complete=datetime.datetime(2025, 6, 21, 22, 49, 44, 168632), params={'n_estimators': 111, 'max_depth': 15, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'log2', 'class_weight': None}, distributions={'n_estimators': IntUniformDistribution(high=300, low=100, step=1), 'max_depth': IntUniformDistribution(high=20, low=5, step=1), 'min_samples_split': IntUniformDistribution(high=10, low=2, step=1), 'min_samples_leaf': IntUniformDistribution(high=5, low=1, step=1), 'max_features': CategoricalDistribution(choices=('sqrt', 'log2', 0.5)), 'class_weight': CategoricalDistribution(choices=(None, 'balanced'))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=41, state=TrialState.COMPLETE, value=None)


### SVM

In [9]:
feature_svm = ['chroma_1', 'chroma_4', 'chroma_7', 'chroma_8', 'chroma_11', 'rms_mean',
       'spectral_centroid_mean', 'spectral_bandwidth_mean',
       'spectral_rolloff_mean', 'zcr_mean', 'mfcc_1', 'mfcc_2', 'mfcc_3',
       'mfcc_4', 'mfcc_5', 'mfcc_6', 'mfcc_7', 'mfcc_9', 'mfcc_10', 'mfcc_14',
       'mfcc_17', 'mfcc_18', 'mfcc_19', 'mfcc_20']

X_train_svm = X_train[feature_svm]

In [10]:
# 1. Subset fitur untuk SVM
X_train_svm = X_train[feature_svm]
X_val_svm = X_val[feature_svm]

# 2. Objective function untuk tuning SVM tanpa CV
def objective(trial):
    params = {
        'C': trial.suggest_float('C', 1e-2, 1e2, log=True),
        'gamma': trial.suggest_float('gamma', 1e-4, 1e0, log=True),
        'kernel': 'rbf',
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced']),
        'random_state': 42
    }

    model_svm = SVC(**params)
    model_svm.fit(X_train_svm, y_train)

    y_pred = model_svm.predict(X_val_svm)
    val_accuracy = accuracy_score(y_val, y_pred)

    return val_accuracy


In [11]:
# 3. Jalankan optimasi
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# 4. Lihat hasil terbaik
print("Best trial:")
print(study.best_trial)


[32m[I 2025-06-21 22:49:54,166][0m A new study created in memory with name: no-name-a92882fe-f5b1-43a8-af89-c382912ec253[0m
[32m[I 2025-06-21 22:50:22,786][0m Trial 0 finished with value: 0.6665794294687255 and parameters: {'C': 0.026408953688955624, 'gamma': 0.0037882276618062555, 'class_weight': None}. Best is trial 0 with value: 0.6665794294687255.[0m
[32m[I 2025-06-21 22:50:47,433][0m Trial 1 finished with value: 0.6665794294687255 and parameters: {'C': 1.4113326531403814, 'gamma': 0.00014019090520971125, 'class_weight': None}. Best is trial 0 with value: 0.6665794294687255.[0m
[32m[I 2025-06-21 22:50:51,337][0m Trial 2 finished with value: 0.9468725464538079 and parameters: {'C': 54.3329476193131, 'gamma': 0.0065361953775529644, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.9468725464538079.[0m
[32m[I 2025-06-21 22:50:58,123][0m Trial 3 finished with value: 0.9436011515310129 and parameters: {'C': 38.123867823097136, 'gamma': 0.001415719027318199, 'class

Best trial:
FrozenTrial(number=26, values=[0.9763151007589637], datetime_start=datetime.datetime(2025, 6, 21, 22, 54, 42, 25704), datetime_complete=datetime.datetime(2025, 6, 21, 22, 54, 43, 224314), params={'C': 19.519063724517864, 'gamma': 0.4791572427747828, 'class_weight': None}, distributions={'C': LogUniformDistribution(high=100.0, low=0.01), 'gamma': LogUniformDistribution(high=1.0, low=0.0001), 'class_weight': CategoricalDistribution(choices=(None, 'balanced'))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=26, state=TrialState.COMPLETE, value=None)
