In [None]:
import pandas as pd

import xgboost as xgb

import seaborn as sns

import optuna
import optuna_integration

from sklearn.metrics import roc_auc_score

from math import sqrt

from datetime import datetime

#### Load data

In [None]:
# Feature matrices, one per split, read from the embeddings CSVs.
# NOTE(review): rows are presumably aligned one-to-one with the label files
# read below -- confirm against whatever produced the embeddings.
X_train, X_test = (
    pd.read_csv("train_embeddings.csv"),
    pd.read_csv("test_embeddings.csv"),
)

# Targets: only the "Class" column of each LD50 split is kept.
Y_train = pd.read_csv("../../data/ld50/train.csv")["Class"]
Y_test = pd.read_csv("../../data/ld50/test.csv")["Class"]

# Display names ordered by class index (position 0 labels class 0, and so on);
# the confusion-matrix cell uses this list for both axes' tick labels.
class_labels = list(reversed(["Alto", "Moderado", "Leve", "Desprezível"]))

#### Train

In [None]:
from sklearn.utils.class_weight import compute_sample_weight

# Per-sample weights with class_weight='balanced', so that under-represented
# classes are not drowned out by the frequent ones.
train_sample_weight = compute_sample_weight(class_weight='balanced', y=Y_train)
test_sample_weight = compute_sample_weight(class_weight='balanced', y=Y_test)

# This notebook loads no separate validation split (`Y_valid` was never
# defined, so the previous version of this cell raised NameError on a fresh
# kernel). The training cell evaluates on (X_test, Y_test), so the
# "validation" weights are the test-split weights.
# The misspelled name is kept because other cells reference it.
valid_sample_weigth = test_sample_weight

In [None]:
# Best fitted estimator seen so far across all trials, and the score it
# obtained. The score is an mlogloss (lower is better), hence the +inf start.
best_model = None
best_eval_metric = float("inf")

def objective(trial):
    """Optuna objective: fit one XGBoost classifier and return its eval mlogloss.

    Samples `max_depth`, `learning_rate` and the early-stopping patience from
    `trial`, fits on (X_train, Y_train) with balanced sample weights, and
    evaluates on (X_test, Y_test). As a side effect, the module-level
    `best_model` / `best_eval_metric` are updated whenever this trial beats
    the best score recorded so far.

    Args:
        trial: the `optuna` trial used to sample hyper-parameters; it is also
            handed to the pruning callback.

    Returns:
        The weighted mlogloss on the evaluation set at the last boosting
        round that was run.
    """
    global best_model
    global best_eval_metric

    model = xgb.XGBClassifier(
        verbosity=0,

        objective='multi:softprob',
        eval_metric='mlogloss',
        n_estimators=5000,  # upper bound; early stopping normally ends sooner
        num_class=len(Y_train.unique()),

        max_depth=trial.suggest_int('max_depth', 2, 6),
        learning_rate=trial.suggest_float('learning_rate', 1e-5, 1e-1),
        subsample=0.9,
        colsample_bytree=0.5,
        early_stopping_rounds=trial.suggest_int('early_stop', 10, 100),
        # 'validation_0' is the name XGBoost gives to the first eval_set entry.
        callbacks=[optuna_integration.XGBoostPruningCallback(trial, 'validation_0-mlogloss')],

        n_jobs=4
    )

    # NOTE(review): the test split is used both for early stopping / model
    # selection here and for the final report further down, so the reported
    # scores are optimistic -- consider a dedicated validation split.
    model.fit(X_train, Y_train,
              sample_weight=train_sample_weight,
              eval_set=[(X_test, Y_test)],
              # Must hold exactly one weight array per eval_set entry;
              # passing two arrays for one eval set makes XGBoost raise.
              sample_weight_eval_set=[test_sample_weight],
              verbose=5000)

    # NOTE(review): this is the score of the LAST boosting round, which with
    # early stopping is not necessarily the best one -- `model.best_score`
    # may be the intended value; confirm before changing.
    eval_metric = model.evals_result()['validation_0']['mlogloss'][-1]

    if eval_metric < best_eval_metric:
        best_eval_metric = eval_metric
        best_model = model

    return eval_metric


# Each run registers a new, timestamp-named study in the SQLite file and
# minimizes the value returned by `objective` over 100 trials.
run_stamp = datetime.now().isoformat()
study = optuna.create_study(
    study_name=f"Classification{run_stamp}",
    storage="sqlite:///XGB_Tox_Pred.sqlite3",
    direction='minimize',
)
study.optimize(objective, n_trials=100)

In [None]:
# Expose the estimator tracked by `objective` (the one with the lowest
# recorded eval mlogloss) under the short name used by the cells below.
# It is still None if no trial ever finished and updated `best_model`.
bst = best_model

In [None]:
from datetime import datetime

# Stamp the artifact name with the current time so each run writes its own file.
saved_at = datetime.now().isoformat()
bst.save_model(f"xgboost_classfifcation_model_{saved_at}.json")

In [None]:
# Replace the in-memory estimator with one restored from a saved artifact.
# The file name is pinned to one specific earlier run, so everything below
# evaluates that model rather than the one trained above.
saved_model_path = "xgboost_classfifcation_model_2024-06-19T10:57:14.894608.json"
bst = xgb.XGBClassifier()
bst.load_model(saved_model_path)

In [None]:
import numpy as np
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score, classification_report

pred = bst.predict(X_test)

# Confusion matrix of raw counts: rows are true classes, columns are
# predicted classes. Its size follows the label list, not a hard-coded 4.
# Assumes labels and predictions are integer class indices in
# [0, n_classes) -- the same assumption the index-based tick labels make.
n_classes = len(class_labels)
annot = np.zeros((n_classes, n_classes), dtype=np.float64)
np.add.at(annot, (Y_test.to_numpy(), np.asarray(pred)), 1)

# Row-normalize (each row sums to 1, or stays 0 if the class has no samples)
# to color the heatmap, while the annotations keep the raw counts.
# Dividing by row totals keeps every row aligned with its own class even
# when a class is absent from Y_test.
row_totals = annot.sum(axis=1, keepdims=True)
matrix = np.divide(annot, row_totals, out=np.zeros_like(annot), where=row_totals > 0)

ax = sns.heatmap(matrix, cmap='coolwarm', robust=True, annot=annot, fmt='g',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set_title("Matriz de confusão")
ax.set(xlabel="Classe predita", ylabel="Classe real")

print(classification_report(Y_test.values, pred))