In [None]:
import pandas as pd

import xgboost as xgb

import seaborn as sns

import optuna
import optuna_integration

from sklearn.metrics import mean_squared_error

from math import sqrt

from datetime import datetime

#### Load data

In [None]:
# Regression targets: keep only the `ld50` column from each split, as
# single-column DataFrames (not Series).
Y_train = pd.read_csv("../../data/ld50/train.csv").loc[:, ["ld50"]]
Y_test = pd.read_csv("../../data/ld50/test.csv").loc[:, ["ld50"]]

# Feature tables, read from CSV files in the notebook's working directory.
# NOTE(review): presumably rows are aligned one-to-one with the target files
# above — confirm against whatever produced the embedding CSVs.
X_train = pd.read_csv("train_embeddings.csv")
X_test = pd.read_csv("test_embeddings.csv")

# Cell output: summary statistics of the targets (test first, then train),
# including the 5th / 50th / 95th percentiles.
(
    Y_test.describe(percentiles=[0.05, 0.5, 0.95]),
    Y_train.describe(percentiles=[0.05, 0.5, 0.95]),
)

In [None]:
import numpy as np

# Disabled experiment, kept for reference: drop training rows whose target is
# at or above the 95th percentile before fitting.
# train_mask = Y_train < np.percentile(Y_train, 95)
# filtered_y_train = Y_train[train_mask]
# filtered_x_train = X_train[train_mask]

# Wrap features and targets in XGBoost DMatrix objects for the native
# `xgb.train` API; the second argument of DMatrix is the label.
dtest = xgb.DMatrix(X_test, label=Y_test)
dtrain = xgb.DMatrix(X_train, label=Y_train)

In [None]:
# Put the train and test targets side by side (columns `ld50_train` and
# `ld50_test`) so their distributions can be compared in one box plot.
# `DataFrame.join` defaults to a left join on the index, which would silently
# drop any test rows whose index label does not exist in the train frame
# (e.g. if the test split were longer). An outer join keeps every row from
# both splits; the resulting NaN padding is ignored by the box plot.
plot_df = Y_train.join(Y_test, how="outer", lsuffix="_train", rsuffix="_test")
sns.boxplot(data=plot_df)

#### Train

In [None]:
# Module-level trackers updated by `objective`: the booster with the lowest
# RMSE seen so far across trials, and that RMSE.
best_model = None
best_eval_metric = float("inf")

def objective(trial):
    """Optuna objective: train one XGBoost regressor and return its RMSE.

    Hyperparameters sampled from ``trial``:
      * ``max_depth``      - integer in [2, 6]
      * ``learning_rate``  - float in [1e-5, 1e-1] (sampled uniformly)
      * ``early_stop``     - early-stopping patience, integer in [10, 100]

    The model is trained on the module-level ``dtrain`` for up to 5000
    boosting rounds, with ``dtest`` as the evaluation set for early stopping
    and for Optuna pruning (on ``test-rmse``).

    Side effects: when this trial's RMSE beats ``best_eval_metric``, both
    ``best_eval_metric`` and ``best_model`` (module globals) are updated.
    ``best_model`` holds the full booster, including the rounds trained after
    the best iteration; pass the same ``iteration_range`` used below when
    predicting with it through the native API.

    NOTE(review): the same ``dtest`` / ``Y_test`` data drives early stopping,
    pruning and the value being minimised, so the reported score is optimistic
    as an estimate of generalisation - consider a separate validation split.

    Args:
        trial: the ``optuna`` trial used to sample hyperparameters.

    Returns:
        RMSE of the early-stopped model's predictions on ``dtest`` against
        ``Y_test``.
    """
    global best_model
    global best_eval_metric

    num_rounds = 5000
    param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'max_depth': trial.suggest_int('max_depth', 2, 6),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1),
        'subsample': 0.9,
        'colsample_bytree': 0.5,
    }

    progress = dict()
    evallist = [(dtest, 'test')]
    bst = xgb.train(param, dtrain, num_rounds, evals=evallist,
                    early_stopping_rounds=trial.suggest_int('early_stop', 10, 100),
                    verbose_eval=num_rounds,
                    evals_result=progress,
                    callbacks=[optuna_integration.XGBoostPruningCallback(trial, 'test-rmse')])

    # With the native API, Booster.predict uses *every* tree by default, i.e.
    # it includes the rounds trained after the best iteration while waiting
    # for early stopping to trigger. Restrict prediction to the best iteration
    # so the reported metric matches the early-stopped model.
    y = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))
    eval_metric = sqrt(mean_squared_error(Y_test, y))

    if eval_metric < best_eval_metric:
        best_eval_metric = eval_metric
        best_model = bst

    return eval_metric


# Create a fresh study for every run (the name carries the current timestamp)
# and persist its trials to a local SQLite file.
# The sampler is given a fixed seed so that the sequence of suggested
# hyperparameters is reproducible across runs; without it the default sampler
# is seeded randomly and results cannot be re-created.
study = optuna.create_study(direction='minimize',
                            storage="sqlite:///XGB_Tox_Pred.sqlite3",
                            study_name=f"Regression{datetime.now().isoformat()}",
                            sampler=optuna.samplers.TPESampler(seed=42))
# Run 100 trials of `objective`, minimising the value it returns.
study.optimize(objective, n_trials=100)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error

# Evaluate a previously saved model rather than the one tuned in this session.
# Alternative (disabled): use the best booster found by the study above.
# bst = best_model
bst = xgb.XGBRegressor()
bst.load_model('xgboost_model_2024-06-17T18:41:09.854815.json')

# Predictions for the test features.
outputs = bst.predict(X_test.values)

# Common axis range covering both the true targets and the predictions.
min_val = min(Y_test.values.min(), outputs.min())
max_val = max(Y_test.values.max(), outputs.max())

# Identity line (perfect prediction) plus predicted-vs-true scatter.
plt.plot([min_val, max_val], [min_val, max_val])
plt.scatter(Y_test, outputs, color='r')

# Cell output: R^2 and RMSE of the predictions on the test set.
f"{r2_score(Y_test, outputs)=} {sqrt(mean_squared_error(Y_test, outputs))=}"

In [None]:
from datetime import datetime

# Persist the current `bst` as JSON, stamping the file name with the current
# local time in ISO-8601 format so earlier saves are never overwritten.
bst.save_model("xgboost_model_{}.json".format(datetime.now().isoformat()))