In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import datasets
import matplotlib.pyplot as plt
from trb import objectives
from trb.learner import TRBLearner
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
import warnings
# Install a global "ignore" filter: no warning category or module is given, so
# every warning raised after this point is suppressed.
# NOTE(review): presumably done to keep cell output uncluttered, but it also
# hides numerical and deprecation warnings from numpy/sklearn/xgboost —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')


In [2]:
# Synthetic 5-feature linear regression problem; the first `n_outliers` rows
# are overwritten with outliers below (50 of 500 rows, i.e. 10% of the data).
n_samples = 500
n_outliers = 50

X, y, coef = datasets.make_regression(
    n_samples=n_samples, n_features=5, noise=10, coef=True, random_state=42
)

# Corrupt the leading rows in place. The global NumPy seed is fixed so the
# injected outliers are identical on every run.
np.random.seed(0)
# The draw has shape (n_outliers, 1) and is broadcast across the feature axis,
# so every feature of a given outlier row receives the same value.
# NOTE(review): confirm this is intended rather than size=(n_outliers, 5).
X[:n_outliers] = 100 * np.random.normal(size=(n_outliers, 1)) + 100
y[:n_outliers] = 100 * np.random.normal(size=n_outliers) - 100

# Split the arrays at the outlier boundary: leading rows are the outliers,
# everything after them is clean data.
X_outliers, X_inliers = X[:n_outliers], X[n_outliers:]
y_outliers, y_inliers = y[:n_outliers], y[n_outliers:]




In [3]:
# Experiment: tune and evaluate TRBLearner (update_strategy='gradient') with
# the MAE objective over 5 random shuffles. For each shuffle the cell
#   1. holds out 20% of the inliers as the test set,
#   2. grid-searches alpha/eta on an 80/20 train/eval split of the remaining
#      inliers plus all outliers,
#   3. refits on the full training data with the selected setting,
#   4. records test R^2 / loss and saves them to disk.
folder_path = "./reg_gradient_result"
# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and the creation, and the cell can be re-run safely.
os.makedirs(folder_path, exist_ok=True)

dataset = 'mae'
objective = objectives.MAE()
n_estimators = 100

trb_best_loss_list = []
trb_best_r2_list = []
trb_best_nestimator_list = []


for i in range(5):
    # Derive the split seed for this repetition from the loop index.
    np.random.seed(i)
    seed = np.random.randint(10000, size=1).item()

    # The test set is drawn from inliers only; every outlier row is appended
    # to the training portion.
    X_train_in, X_test, y_train_in, y_test = train_test_split(
        X_inliers, y_inliers, test_size=0.2, random_state=seed)
    X_train = np.vstack((X_train_in, X_outliers))
    y_train = np.append(y_train_in, y_outliers)

    # Inner split used only for hyper-parameter selection.
    train_x, eval_x, train_y, eval_y = train_test_split(
        X_train, y_train, test_size=0.2, random_state=seed)

    # Trb grid search
    trb_best_nestimator = n_estimators
    # Start from +inf so the first finite evaluation loss is always accepted,
    # whatever its magnitude.
    trb_best_eval_loss = np.inf
    trb_best_alpha = 0.1
    trb_best_eta = 0
    alpha_list = [0.1, 0.5, 1, 5]
    eta_list = [0, 0.01, 0.1]
    # NOTE(review): beta is declared but never searched over or passed to
    # TRBLearner in this cell. Kept because later cells may read these
    # notebook-level names — confirm whether the 'gradient' strategy should
    # tune it.
    trb_best_beta = 10
    beta_list = [0, 1, 10]

    # Loop-invariant: the initial prediction is the same for every grid point.
    eval_base_score = np.mean(train_y)
    for _alpha in alpha_list:
        for _eta in eta_list:
            trb_reg = TRBLearner(objective,
                                base_score=eval_base_score,
                                n_estimators=n_estimators,
                                alpha=_alpha,
                                eta=_eta,
                                update_strategy='gradient')
            trb_reg.fit(train_set=(train_x, train_y), eval_set=(eval_x, eval_y))
            trb_eval_loss = trb_reg.eval_loss_list
            # NaN entries are skipped; a candidate whose losses are all NaN
            # yields NaN here and fails the comparison below.
            candidate_loss = np.nanmin(trb_eval_loss)
            if candidate_loss < trb_best_eval_loss:
                # Position of the smallest eval loss, +1, is used as the
                # estimator count for the refit (presumably one loss entry per
                # boosting round — verify against TRBLearner).
                trb_best_nestimator = np.nanargmin(trb_eval_loss)+1
                trb_best_alpha = _alpha
                trb_best_eta = _eta
                trb_best_eval_loss = candidate_loss

    # Refit on the full training data (train + eval) with the selected setting.
    trb_reg = TRBLearner(objective,
                        base_score=np.mean(y_train),
                        n_estimators=trb_best_nestimator,
                        alpha=trb_best_alpha,
                        eta=trb_best_eta,
                        update_strategy='gradient')
    trb_reg.fit(train_set=(X_train, y_train))
    y_trb_pred = trb_reg.predict(X_test)
    trb_r2 = r2_score(y_test, y_trb_pred)
    trb_loss = objective.loss(y_test, y_trb_pred)
    trb_best_r2_list.append(trb_r2)
    trb_best_nestimator_list.append(trb_best_nestimator)
    trb_best_loss_list.append(trb_loss)


    # Saving...
    current_path = os.path.join(
        folder_path, dataset, f'shuffle_{seed}', 'trb', 'evaluate')
    os.makedirs(current_path, exist_ok=True)

    np.savez(os.path.join(current_path, "data.npz"),
        loss = trb_loss,
        r2 = trb_r2,
        alpha = trb_best_alpha,
        eta = trb_best_eta,
        n_estimators = trb_best_nestimator,
        # Legacy misspelled key, kept so existing readers of data.npz that
        # look up 'n_estiamtors' keep working.
        n_estiamtors = trb_best_nestimator,
    )


print('TRB standard metric: mean={}, std={}'.format(np.mean(trb_best_r2_list),
                                    np.std(trb_best_r2_list)))
print('TRB standard loss: mean={}, std={}'.format(np.mean(trb_best_loss_list),
                                    np.std(trb_best_loss_list)))


TRB standard metric: mean=0.8604092812717195, std=0.0304494847363904
TRB standard loss: mean=29.037562177385354, std=3.8523506095399593
