In [1]:
import os

os.chdir("../../")


In [3]:
import numpy as np
import pandas as pd
import joblib


In [4]:
X_train_transformed, X_test_transformed, y_train, y_test = joblib.load(
    "./data/processed/train_test_transformed_data.joblib"
)

In [5]:
X_train_transformed

Unnamed: 0,release_date__month_season_Fall,release_date__month_season_Spring,release_date__month_season_Summer,release_date__month_season_Winter,release_date__day_category_First 10,release_date__day_category_Last 10,release_date__day_category_Middle 10,release_date__day_category_nan,release_day__is_weekend,top_artist__is_top_artist,...,numerical__loudness,numerical__speechiness,numerical__acousticness,numerical__instrumentalness,numerical__liveness,numerical__valence,numerical__tempo,numerical__duration_ms,key__key,mode__mode
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.655841,-0.504629,-0.754311,-0.383001,-0.555452,1.056443,-0.794418,0.388206,7.0,1.0
1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,-3.415077,-0.528176,1.190121,-0.382946,0.830956,-0.271435,-1.677819,0.168817,9.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.416945,-0.495798,-0.363348,-0.370209,-0.643323,0.906037,0.157502,-0.932907,0.0,1.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.737217,-0.647877,-0.793091,-0.364294,1.208476,-0.451923,0.269861,2.195151,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.472386,-0.596857,-0.800333,-0.382719,-0.779360,1.193958,0.044435,-0.475267,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22264,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.363521,-0.633159,0.946333,2.692479,-0.477344,-0.090947,-0.177823,-0.832483,1.0,0.0
22265,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.856161,3.404277,1.605463,-0.383001,-0.216986,0.506384,1.685383,0.183389,7.0,1.0
22266,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.502352,-0.694972,-0.119109,-0.383001,-0.687584,-0.847279,-0.701481,-0.681712,1.0,0.0
22267,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.442880,-0.480100,0.120164,-0.383001,-0.662850,1.017767,-0.999490,-1.108428,4.0,0.0


In [13]:
import xgboost as xgb
import optuna
from sklearn.metrics import accuracy_score
import mlflow
import mlflow.xgboost


def objective(trial):
    with mlflow.start_run(nested=True):
        param = {
            "verbosity": 0,
            "objective": "multi:softmax",
            "num_class": len(np.unique(y_train)),
            "eval_metric": "mlogloss",
            "booster": trial.suggest_categorical(
                "booster", ["gbtree", "gblinear", "dart"]
            ),
            "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        }

        if param["booster"] == "gbtree" or param["booster"] == "dart":
            param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
            param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
            param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
            param["grow_policy"] = trial.suggest_categorical(
                "grow_policy", ["depthwise", "lossguide"]
            )

        if param["booster"] == "dart":
            param["sample_type"] = trial.suggest_categorical(
                "sample_type", ["uniform", "weighted"]
            )
            param["normalize_type"] = trial.suggest_categorical(
                "normalize_type", ["tree", "forest"]
            )
            param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
            param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

        mlflow.log_params(param)

        dtrain = xgb.DMatrix(X_train_transformed, label=y_train)
        dvalid = xgb.DMatrix(X_test_transformed, label=y_test)

        bst = xgb.train(
            param, dtrain, evals=[(dvalid, "validation")], early_stopping_rounds=10
        )
        mlflow.xgboost.log_model(bst, "model")

        preds = bst.predict(dvalid, output_margin=True)
        pred_labels = np.argmax(preds, axis=1)
        accuracy = accuracy_score(y_test, pred_labels)
        mlflow.log_metric("accuracy", accuracy)
        return accuracy


In [14]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

best_params = study.best_trial.params
print("Best trial:", study.best_trial.params)

# Train final model
final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X_train_transformed, y_train)

# Save the model
joblib.dump(final_model, "./models/xgb_optuna_model.joblib")

[I 2024-04-24 01:35:39,502] A new study created in memory with name: no-name-5fbac476-f109-4da8-8f1b-9eb250eae605
[W 2024-04-24 01:35:39,706] Trial 0 failed with parameters: {'booster': 'gbtree', 'lambda': 1.5482170832094488e-05, 'alpha': 7.203910773378253e-08, 'max_depth': 4, 'eta': 0.15309798066423583, 'gamma': 1.1910757819391747e-07, 'grow_policy': 'lossguide'} because of the following error: XGBoostError('[01:35:39] C:\\buildkite-agent\\builds\\buildkite-windows-cpu-autoscaling-group-i-0b3782d1791676daf-1\\xgboost\\xgboost-ci-windows\\src\\objective\\multiclass_obj.cu:123: SoftmaxMultiClassObj: label must be in [0, num_class).').
Traceback (most recent call last):
  File "c:\Users\omsan\anaconda3\envs\eds_ml_2\lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\omsan\AppData\Local\Temp\ipykernel_19652\3711538104.py", line 45, in objective
    bst = xgb.train(
  File "c:\Users\omsan\anaconda3\envs\eds_ml_2\lib\site

XGBoostError: [01:35:39] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0b3782d1791676daf-1\xgboost\xgboost-ci-windows\src\objective\multiclass_obj.cu:123: SoftmaxMultiClassObj: label must be in [0, num_class).