In [3]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from simtree import SIMTreeRegressor, SIMTreeClassifier
from simtree import GLMTreeRegressor, GLMTreeClassifier
from rpy2.robjects import numpy2ri, pandas2ri
import optuna

numpy2ri.activate()
pandas2ri.activate()

In [16]:
# boston = pd.read_csv("boston_housing.csv")
# X = boston.drop("medv", axis=1)
# Y = boston["medv"]
# train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.2, random_state=0)

boston_train = pd.read_csv("boston_train.csv")
boston_test = pd.read_csv("boston_test.csv")
train_x = boston_train.drop("medv", axis=1)
train_y = boston_train["medv"]
test_x = boston_test.drop("medv", axis=1)
test_y = boston_test["medv"]

In [19]:
model = SIMTreeRegressor(
    max_depth=4,   
    min_samples_leaf=34, 
    knot_num=31,
    n_split_grid=18, 
    n_screen_grid=3, 
    n_feature_search=7, #6
    reg_lambda=np.logspace(-5, 5, 100).tolist(),
    reg_gamma=[1e-3, 1e-5, 1e-7]
)
model.fit(train_x, train_y)
pred_test = model.predict(test_x).reshape([-1, 1])
print((np.mean((test_y - pred_test.ravel()) ** 2)) ** 0.5)

3.721114707526175


In [13]:
def objective(trial):
    model = SIMTreeRegressor(
        max_depth=trial.suggest_int("max_depth", 1, 5), 
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 25, 35),  
        knot_num=trial.suggest_int("knot_num", 25, 35),
        n_split_grid=trial.suggest_int("n_split_grid", 10, 40), 
        n_screen_grid=trial.suggest_int("n_screen_grid", 1, 10), 
        n_feature_search=trial.suggest_int("n_feature_search", step=1, low=1, high=10), 
        reg_lambda=np.logspace(-5, 5, 100).tolist(),
        reg_gamma=[1e-3, 1e-5, 1e-7]
    )
    model.fit(train_x, train_y)
    #pred_train = model.predict(train_x).reshape([-1, 1])
    pred_test = model.predict(test_x).reshape([-1, 1])
    return (np.mean((test_y - pred_test.ravel()) ** 2)) ** 0.5

study = optuna.create_study()
study.optimize(objective, n_trials = 100)

[32m[I 2022-12-06 15:08:50,259][0m A new study created in memory with name: no-name-eb07fbb0-875e-4838-8052-3f2c75c18438[0m
[32m[I 2022-12-06 15:09:03,962][0m Trial 0 finished with value: 5.732097158422835 and parameters: {'max_depth': 2, 'min_samples_leaf': 27, 'knot_num': 31, 'n_split_grid': 11, 'n_screen_grid': 6, 'n_feature_search': 1}. Best is trial 0 with value: 5.732097158422835.[0m
[32m[I 2022-12-06 15:09:34,687][0m Trial 1 finished with value: 3.8190569077587817 and parameters: {'max_depth': 3, 'min_samples_leaf': 33, 'knot_num': 30, 'n_split_grid': 23, 'n_screen_grid': 9, 'n_feature_search': 7}. Best is trial 1 with value: 3.8190569077587817.[0m
[32m[I 2022-12-06 15:09:45,878][0m Trial 2 finished with value: 5.574195718666305 and parameters: {'max_depth': 1, 'min_samples_leaf': 34, 'knot_num': 27, 'n_split_grid': 31, 'n_screen_grid': 10, 'n_feature_search': 3}. Best is trial 1 with value: 3.8190569077587817.[0m
[32m[I 2022-12-06 15:10:02,343][0m Trial 3 finished