In [1]:
!pip install -q optuna catboost

In [2]:
import numpy as np
import scipy.stats as stats
import pandas as pd
import os
import pickle

import optuna
import sklearn
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
import statsmodels.formula.api as smf

In [3]:
def mtest(self, x_test, y_test):
    if len(y_test.shape) == 1:
        y_test = y_test[:, None]
    y_pred = self.predict(x_test)
    diff = y_pred - y_test
    se = diff**2
    ss_res = se.sum()
    ss_total = ((y_test - self.y_train_mean)**2).sum()
    rsq = 1 - ss_res / ss_total
    rmse = np.sqrt(se.mean())
    mae = np.abs(diff).mean()
    return dict(mae=mae, rmse=rmse, rsq=rsq)

def train_with_hyp_search(x_trial, y_trial, esttype='catb', n_trials=30,
                          optuna_sql_path='/dev/shm/optuna.sqlite3',
                          optuna_storage_dir='/tmp/optuna_pkls'):

    x_trial_train, x_trial_test, y_trial_train, y_trial_test = train_test_split(x_trial, y_trial, test_size=0.1, random_state=0)
    if esttype != 'lasso':
        x_trial_train, x_trial_val, y_trial_train, y_trial_val = train_test_split(x_trial_train, y_trial_train, test_size=0.1, random_state=1)

    def objective(trial):
        successful_trials_so_far = len([t for t in study.trials if t.state.name == 'COMPLETE'])
        if successful_trials_so_far >= n_trials:
            study.stop()
            print('Maximum number of trials reached, prunning')
            raise optuna.TrialPruned()
        print(f"Running trial {trial.number}")

        if esttype == 'catb':
            param = {
                'iterations': 100_000,
                'early_stopping_rounds': trial.suggest_int("early_stopping_rounds", 50, 100),
                'verbose': 1000,
                'random_seed': successful_trials_so_far,
                #'task_type': 'GPU',
            }

            creg = CatBoostRegressor(**param)
            creg.fit(x_trial_train, y_trial_train,
                eval_set=(x_trial_val, y_trial_val)
            )

        elif esttype == 'lasso':
            alpha = trial.suggest_float("alpha", 0, 10)

            creg = sklearn.linear_model.Lasso(alpha=alpha)
            creg.fit(x_trial_train, y_trial_train)

        else:
            raise ValueError('Invalid estimator type')
        
        y_trial_pred = creg.predict(x_trial_test)
        diff = y_trial_pred - y_trial_test
        err = np.abs(diff).mean()

        trial.set_user_attr("err", err)
        print('error on hyp search validation:', err)

        best_estimator_performance = np.inf
        try:
            best_estimator_performance = study.best_trial.values[0]
        except Exception:
            pass

        if best_estimator_performance >= err:
            print('Best estimator so far. Saving estimator, started.')
            with open(f"{os.path.join(optuna_storage_dir, str(trial.number))}.pkl", "wb") as f:
                pickle.dump(creg, f)
            print('Saving estimator, done.')
        else:
            print('Not the best estimator so far.')

        print(f"Finished trial {trial.number}")
        return err

    optuna_storage_dir = os.path.join(optuna_storage_dir, esttype)
    os.makedirs(optuna_storage_dir, exist_ok=True)

    optuna_storage = optuna.storages.RDBStorage(url="sqlite:///"+optuna_sql_path, engine_kwargs={"connect_args": {"timeout": 600}})
    try:
        optuna.delete_study(storage=optuna_storage, study_name=esttype)
    except KeyError:
        pass
    study = optuna.create_study(storage=optuna_storage,
                                study_name=esttype, direction="minimize",
                                sampler=optuna.samplers.TPESampler(seed=1))
    study.optimize(objective, n_trials=n_trials)

    print("Loading best trial:", study.best_params, 'started')
    with open(f"{os.path.join(optuna_storage_dir, str(study.best_trial.number))}.pkl", "rb") as f:
        best_creg = pickle.load(f)
    print("Loading best trial:", study.best_params, 'done')

    return best_creg#, study.best_params, study

class BiasCorrectEstimatorCole:
    def __init__(self, est, x_sval_bc, y_sval_bc):
        self.est = est
        
        x_sval_bc = np.array(x_sval_bc)
        y_sval_bc = np.array(y_sval_bc).reshape(-1)
        
        reg = sklearn.linear_model.LinearRegression()
        reg.fit(y_sval_bc.reshape((-1, 1)), est.predict(x_sval_bc).reshape(-1))
        self.intercept = reg.intercept_
        self.coef = reg.coef_.item()
 
    def predict(self, x_pred, y_pred=None):
        x_pred = np.array(x_pred)
        pred = self.est.predict(x_pred)
        pred = (pred - self.intercept) / self.coef
        return pred

In [4]:
df = pd.read_csv('data/synthetic_db_healthy.csv')
df.isna().any(1).sum()

0

In [5]:
x_all, y_all = df.iloc[:, df.columns!='age_at_MRI'], df['age_at_MRI'], 
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=0)

In [6]:
best_creg  = train_with_hyp_search(x_train, y_train, esttype='catb', n_trials=30)

[32m[I 2022-08-13 14:01:43,472][0m A new study created in RDB with name: catb[0m


Running trial 0
Learning rate set to 0.003698
0:	learn: 7.2927696	test: 6.7203045	best: 6.7203045 (0)	total: 73.7ms	remaining: 2h 2m 52s
1000:	learn: 5.3641452	test: 5.4879250	best: 5.4879250 (1000)	total: 10s	remaining: 16m 28s
2000:	learn: 4.8505237	test: 5.3400868	best: 5.3400868 (2000)	total: 19.9s	remaining: 16m 12s
3000:	learn: 4.4107673	test: 5.2818156	best: 5.2815372 (2996)	total: 29.7s	remaining: 15m 58s


[32m[I 2022-08-13 14:02:21,100][0m Trial 0 finished with value: 4.56010275406753 and parameters: {'early_stopping_rounds': 71}. Best is trial 0 with value: 4.56010275406753.[0m


Stopped by overfitting detector  (71 iterations wait)

bestTest = 5.257545066
bestIteration = 3671

Shrink model to first 3672 iterations.
error on hyp search validation: 4.56010275406753
Best estimator so far. Saving estimator, started.
Saving estimator, done.
Finished trial 0
Running trial 1
Learning rate set to 0.003698
0:	learn: 7.2932588	test: 6.7211142	best: 6.7211142 (0)	total: 10.6ms	remaining: 17m 36s
1000:	learn: 5.3651469	test: 5.4913579	best: 5.4913579 (1000)	total: 10.1s	remaining: 16m 40s
2000:	learn: 4.8561194	test: 5.3463634	best: 5.3463634 (2000)	total: 20.3s	remaining: 16m 32s
3000:	learn: 4.4123265	test: 5.2897985	best: 5.2897985 (3000)	total: 30.3s	remaining: 16m 19s


[32m[I 2022-08-13 14:02:53,648][0m Trial 1 finished with value: 4.5817887455261195 and parameters: {'early_stopping_rounds': 86}. Best is trial 0 with value: 4.56010275406753.[0m


Stopped by overfitting detector  (86 iterations wait)

bestTest = 5.285393081
bestIteration = 3081

Shrink model to first 3082 iterations.
error on hyp search validation: 4.5817887455261195
Not the best estimator so far.
Finished trial 1
Running trial 2
Learning rate set to 0.003698
0:	learn: 7.2930809	test: 6.7204777	best: 6.7204777 (0)	total: 9.99ms	remaining: 16m 38s
1000:	learn: 5.3601430	test: 5.4885409	best: 5.4885409 (1000)	total: 10.9s	remaining: 17m 56s
2000:	learn: 4.8531767	test: 5.3416976	best: 5.3416976 (2000)	total: 21.1s	remaining: 17m 13s
3000:	learn: 4.4076704	test: 5.2876921	best: 5.2875377 (2999)	total: 31.5s	remaining: 16m 59s


[32m[I 2022-08-13 14:03:29,766][0m Trial 2 finished with value: 4.549314282818119 and parameters: {'early_stopping_rounds': 50}. Best is trial 2 with value: 4.549314282818119.[0m


Stopped by overfitting detector  (50 iterations wait)

bestTest = 5.274474714
bestIteration = 3353

Shrink model to first 3354 iterations.
error on hyp search validation: 4.549314282818119
Best estimator so far. Saving estimator, started.
Saving estimator, done.
Finished trial 2
Running trial 3
Learning rate set to 0.003698
0:	learn: 7.2927446	test: 6.7205410	best: 6.7205410 (0)	total: 10.4ms	remaining: 17m 16s
1000:	learn: 5.3643608	test: 5.4874499	best: 5.4874499 (1000)	total: 10.5s	remaining: 17m 17s
2000:	learn: 4.8543980	test: 5.3421377	best: 5.3421377 (2000)	total: 20.9s	remaining: 17m 5s
3000:	learn: 4.4100894	test: 5.2833266	best: 5.2831455 (2997)	total: 31.9s	remaining: 17m 11s
4000:	learn: 3.9865812	test: 5.2565712	best: 5.2565364 (3997)	total: 42.8s	remaining: 17m 5s


[32m[I 2022-08-13 14:04:15,140][0m Trial 3 finished with value: 4.550983246226302 and parameters: {'early_stopping_rounds': 65}. Best is trial 2 with value: 4.549314282818119.[0m


Stopped by overfitting detector  (65 iterations wait)

bestTest = 5.252403158
bestIteration = 4135

Shrink model to first 4136 iterations.
error on hyp search validation: 4.550983246226302
Not the best estimator so far.
Finished trial 3
Running trial 4
Learning rate set to 0.003698
0:	learn: 7.2928292	test: 6.7202306	best: 6.7202306 (0)	total: 10.3ms	remaining: 17m 11s
1000:	learn: 5.3654251	test: 5.4846227	best: 5.4846227 (1000)	total: 10.5s	remaining: 17m 22s
2000:	learn: 4.8532868	test: 5.3393044	best: 5.3393044 (2000)	total: 21.6s	remaining: 17m 36s
3000:	learn: 4.4087904	test: 5.2851643	best: 5.2842928 (2947)	total: 32.3s	remaining: 17m 24s
Stopped by overfitting detector  (57 iterations wait)

bestTest = 5.284292839
bestIteration = 2947

Shrink model to first 2948 iterations.
error on hyp search validation: 4.582943099389445


[32m[I 2022-08-13 14:04:48,011][0m Trial 4 finished with value: 4.582943099389445 and parameters: {'early_stopping_rounds': 57}. Best is trial 2 with value: 4.549314282818119.[0m


Not the best estimator so far.
Finished trial 4
Running trial 5
Learning rate set to 0.003698
0:	learn: 7.2924275	test: 6.7201954	best: 6.7201954 (0)	total: 23.9ms	remaining: 39m 45s
1000:	learn: 5.3673657	test: 5.4917904	best: 5.4917904 (1000)	total: 10.7s	remaining: 17m 38s
2000:	learn: 4.8543454	test: 5.3427856	best: 5.3425933 (1997)	total: 21.6s	remaining: 17m 35s
3000:	learn: 4.4144923	test: 5.2874553	best: 5.2874553 (3000)	total: 32.2s	remaining: 17m 20s


[32m[I 2022-08-13 14:05:26,316][0m Trial 5 finished with value: 4.5720452842001045 and parameters: {'early_stopping_rounds': 54}. Best is trial 2 with value: 4.549314282818119.[0m


Stopped by overfitting detector  (54 iterations wait)

bestTest = 5.27353507
bestIteration = 3456

Shrink model to first 3457 iterations.
error on hyp search validation: 4.5720452842001045
Not the best estimator so far.
Finished trial 5
Running trial 6
Learning rate set to 0.003698
0:	learn: 7.2933139	test: 6.7208962	best: 6.7208962 (0)	total: 11.2ms	remaining: 18m 42s
1000:	learn: 5.3673404	test: 5.4950134	best: 5.4950134 (1000)	total: 10.7s	remaining: 17m 39s
2000:	learn: 4.8604755	test: 5.3452860	best: 5.3450211 (1987)	total: 22.1s	remaining: 18m 1s
3000:	learn: 4.4178753	test: 5.2838536	best: 5.2838536 (3000)	total: 32.8s	remaining: 17m 40s


[32m[I 2022-08-13 14:06:07,648][0m Trial 6 finished with value: 4.567986278531375 and parameters: {'early_stopping_rounds': 59}. Best is trial 2 with value: 4.549314282818119.[0m


Stopped by overfitting detector  (59 iterations wait)

bestTest = 5.263728816
bestIteration = 3678

Shrink model to first 3679 iterations.
error on hyp search validation: 4.567986278531375
Not the best estimator so far.
Finished trial 6
Running trial 7
Learning rate set to 0.003698
0:	learn: 7.2930370	test: 6.7195342	best: 6.7195342 (0)	total: 11.5ms	remaining: 19m 9s
1000:	learn: 5.3616943	test: 5.4872965	best: 5.4872965 (1000)	total: 10.8s	remaining: 17m 43s
2000:	learn: 4.8526374	test: 5.3441206	best: 5.3441087 (1999)	total: 21.7s	remaining: 17m 40s
3000:	learn: 4.4123525	test: 5.2840425	best: 5.2836441 (2985)	total: 33s	remaining: 17m 45s


[32m[I 2022-08-13 14:06:45,278][0m Trial 7 finished with value: 4.56567944919343 and parameters: {'early_stopping_rounds': 67}. Best is trial 2 with value: 4.549314282818119.[0m


Stopped by overfitting detector  (67 iterations wait)

bestTest = 5.271962439
bestIteration = 3325

Shrink model to first 3326 iterations.
error on hyp search validation: 4.56567944919343
Not the best estimator so far.
Finished trial 7
Running trial 8
Learning rate set to 0.003698
0:	learn: 7.2931854	test: 6.7205060	best: 6.7205060 (0)	total: 10ms	remaining: 16m 42s
1000:	learn: 5.3670705	test: 5.4849102	best: 5.4849102 (1000)	total: 10.9s	remaining: 17m 54s
2000:	learn: 4.8548573	test: 5.3339259	best: 5.3339259 (2000)	total: 22s	remaining: 17m 55s
3000:	learn: 4.4145853	test: 5.2714471	best: 5.2714471 (3000)	total: 32.8s	remaining: 17m 41s


[32m[I 2022-08-13 14:07:22,828][0m Trial 8 finished with value: 4.567616204096844 and parameters: {'early_stopping_rounds': 70}. Best is trial 2 with value: 4.549314282818119.[0m


Stopped by overfitting detector  (70 iterations wait)

bestTest = 5.259961753
bestIteration = 3291

Shrink model to first 3292 iterations.
error on hyp search validation: 4.567616204096844
Not the best estimator so far.
Finished trial 8
Running trial 9
Learning rate set to 0.003698
0:	learn: 7.2937393	test: 6.7208452	best: 6.7208452 (0)	total: 10.3ms	remaining: 17m 7s
1000:	learn: 5.3691654	test: 5.5059919	best: 5.5059919 (1000)	total: 10.9s	remaining: 17m 55s
2000:	learn: 4.8595825	test: 5.3615206	best: 5.3615206 (2000)	total: 22.1s	remaining: 18m 3s
3000:	learn: 4.4135992	test: 5.3082829	best: 5.3082531 (2997)	total: 33.1s	remaining: 17m 49s
4000:	learn: 3.9898630	test: 5.2847838	best: 5.2844691 (3995)	total: 44.4s	remaining: 17m 45s


[32m[I 2022-08-13 14:08:12,262][0m Trial 9 finished with value: 4.5547830971819 and parameters: {'early_stopping_rounds': 77}. Best is trial 2 with value: 4.549314282818119.[0m


Stopped by overfitting detector  (77 iterations wait)

bestTest = 5.280909997
bestIteration = 4334

Shrink model to first 4335 iterations.
error on hyp search validation: 4.5547830971819
Not the best estimator so far.
Finished trial 9
Running trial 10
Learning rate set to 0.003698
0:	learn: 7.2917993	test: 6.7195543	best: 6.7195543 (0)	total: 9.58ms	remaining: 15m 57s
1000:	learn: 5.3685815	test: 5.4984499	best: 5.4984499 (1000)	total: 11.2s	remaining: 18m 31s
2000:	learn: 4.8577574	test: 5.3458816	best: 5.3458816 (2000)	total: 22.2s	remaining: 18m 7s
3000:	learn: 4.4155904	test: 5.2927410	best: 5.2926873 (2989)	total: 33.4s	remaining: 17m 59s
4000:	learn: 3.9928310	test: 5.2725528	best: 5.2725528 (4000)	total: 44.8s	remaining: 17m 53s


[32m[I 2022-08-13 14:09:06,762][0m Trial 10 finished with value: 4.5552734337527 and parameters: {'early_stopping_rounds': 98}. Best is trial 2 with value: 4.549314282818119.[0m


Stopped by overfitting detector  (98 iterations wait)

bestTest = 5.260569584
bestIteration = 4740

Shrink model to first 4741 iterations.
error on hyp search validation: 4.5552734337527
Not the best estimator so far.
Finished trial 10
Running trial 11
Learning rate set to 0.003698
0:	learn: 7.2929469	test: 6.7203637	best: 6.7203637 (0)	total: 10.4ms	remaining: 17m 23s
1000:	learn: 5.3665995	test: 5.4915984	best: 5.4915984 (1000)	total: 11.4s	remaining: 18m 44s
2000:	learn: 4.8561222	test: 5.3464181	best: 5.3464181 (2000)	total: 22.4s	remaining: 18m 14s


[32m[I 2022-08-13 14:09:40,503][0m Trial 11 finished with value: 4.566784523341784 and parameters: {'early_stopping_rounds': 52}. Best is trial 2 with value: 4.549314282818119.[0m


Stopped by overfitting detector  (52 iterations wait)

bestTest = 5.287561881
bestIteration = 2924

Shrink model to first 2925 iterations.
error on hyp search validation: 4.566784523341784
Not the best estimator so far.
Finished trial 11
Running trial 12
Learning rate set to 0.003698
0:	learn: 7.2936811	test: 6.7205847	best: 6.7205847 (0)	total: 11.7ms	remaining: 19m 25s
1000:	learn: 5.3689816	test: 5.5011392	best: 5.5011392 (1000)	total: 11s	remaining: 18m 9s
2000:	learn: 4.8551266	test: 5.3544199	best: 5.3544199 (2000)	total: 22.4s	remaining: 18m 16s


[32m[I 2022-08-13 14:10:10,726][0m Trial 12 finished with value: 4.581772369063538 and parameters: {'early_stopping_rounds': 62}. Best is trial 2 with value: 4.549314282818119.[0m


Stopped by overfitting detector  (62 iterations wait)

bestTest = 5.309994532
bestIteration = 2623

Shrink model to first 2624 iterations.
error on hyp search validation: 4.581772369063538
Not the best estimator so far.
Finished trial 12
Running trial 13
Learning rate set to 0.003698
0:	learn: 7.2923396	test: 6.7198165	best: 6.7198165 (0)	total: 9.97ms	remaining: 16m 37s
1000:	learn: 5.3644056	test: 5.4921392	best: 5.4921392 (1000)	total: 11.3s	remaining: 18m 37s
2000:	learn: 4.8590981	test: 5.3397598	best: 5.3394774 (1998)	total: 22.5s	remaining: 18m 22s
3000:	learn: 4.4178737	test: 5.2825535	best: 5.2823014 (2998)	total: 33.5s	remaining: 18m 3s
4000:	learn: 3.9940464	test: 5.2557638	best: 5.2557359 (3994)	total: 44.7s	remaining: 17m 53s
5000:	learn: 3.6330959	test: 5.2398714	best: 5.2393866 (4988)	total: 55.7s	remaining: 17m 38s


[32m[I 2022-08-13 14:11:12,788][0m Trial 13 finished with value: 4.54609931774709 and parameters: {'early_stopping_rounds': 79}. Best is trial 13 with value: 4.54609931774709.[0m


Stopped by overfitting detector  (79 iterations wait)

bestTest = 5.234567319
bestIteration = 5406

Shrink model to first 5407 iterations.
error on hyp search validation: 4.54609931774709
Best estimator so far. Saving estimator, started.
Saving estimator, done.
Finished trial 13
Running trial 14
Learning rate set to 0.003698
0:	learn: 7.2923465	test: 6.7201904	best: 6.7201904 (0)	total: 22.3ms	remaining: 37m 13s
1000:	learn: 5.3627268	test: 5.4865608	best: 5.4864537 (998)	total: 11.2s	remaining: 18m 23s
2000:	learn: 4.8553329	test: 5.3426199	best: 5.3426056 (1999)	total: 22.9s	remaining: 18m 42s
3000:	learn: 4.4147472	test: 5.2838819	best: 5.2838819 (3000)	total: 33.9s	remaining: 18m 15s
4000:	learn: 3.9922833	test: 5.2616383	best: 5.2608756 (3975)	total: 45.3s	remaining: 18m 5s


[32m[I 2022-08-13 14:11:59,192][0m Trial 14 finished with value: 4.559309631247695 and parameters: {'early_stopping_rounds': 80}. Best is trial 13 with value: 4.54609931774709.[0m


Stopped by overfitting detector  (80 iterations wait)

bestTest = 5.260875636
bestIteration = 3975

Shrink model to first 3976 iterations.
error on hyp search validation: 4.559309631247695
Not the best estimator so far.
Finished trial 14
Running trial 15
Learning rate set to 0.003698
0:	learn: 7.2924466	test: 6.7200154	best: 6.7200154 (0)	total: 10.8ms	remaining: 18m 4s
1000:	learn: 5.3671256	test: 5.4833284	best: 5.4832933 (999)	total: 11.8s	remaining: 19m 27s
2000:	learn: 4.8567263	test: 5.3367604	best: 5.3366523 (1996)	total: 22.8s	remaining: 18m 38s
3000:	learn: 4.4118505	test: 5.2753584	best: 5.2752419 (2995)	total: 34s	remaining: 18m 19s


[32m[I 2022-08-13 14:12:35,928][0m Trial 15 finished with value: 4.580988300808124 and parameters: {'early_stopping_rounds': 90}. Best is trial 13 with value: 4.54609931774709.[0m


Stopped by overfitting detector  (90 iterations wait)

bestTest = 5.272511549
bestIteration = 3107

Shrink model to first 3108 iterations.
error on hyp search validation: 4.580988300808124
Not the best estimator so far.
Finished trial 15
Running trial 16
Learning rate set to 0.003698
0:	learn: 7.2930230	test: 6.7204395	best: 6.7204395 (0)	total: 10.5ms	remaining: 17m 32s
1000:	learn: 5.3692078	test: 5.4877089	best: 5.4877089 (1000)	total: 11.1s	remaining: 18m 15s
2000:	learn: 4.8592114	test: 5.3275859	best: 5.3275859 (2000)	total: 22.7s	remaining: 18m 33s
3000:	learn: 4.4112747	test: 5.2753963	best: 5.2738298 (2936)	total: 34s	remaining: 18m 19s


[32m[I 2022-08-13 14:13:10,575][0m Trial 16 finished with value: 4.569570213656224 and parameters: {'early_stopping_rounds': 83}. Best is trial 13 with value: 4.54609931774709.[0m


Stopped by overfitting detector  (83 iterations wait)

bestTest = 5.273829797
bestIteration = 2936

Shrink model to first 2937 iterations.
error on hyp search validation: 4.569570213656224
Not the best estimator so far.
Finished trial 16
Running trial 17
Learning rate set to 0.003698
0:	learn: 7.2926023	test: 6.7204663	best: 6.7204663 (0)	total: 12.3ms	remaining: 20m 30s
1000:	learn: 5.3622164	test: 5.4986476	best: 5.4986476 (1000)	total: 11.2s	remaining: 18m 26s
2000:	learn: 4.8519767	test: 5.3538398	best: 5.3538398 (2000)	total: 23s	remaining: 18m 46s
3000:	learn: 4.4093569	test: 5.2992958	best: 5.2992958 (3000)	total: 34.1s	remaining: 18m 20s
4000:	learn: 3.9877999	test: 5.2741693	best: 5.2741693 (4000)	total: 45.4s	remaining: 18m 9s


[32m[I 2022-08-13 14:14:02,800][0m Trial 17 finished with value: 4.550519096931488 and parameters: {'early_stopping_rounds': 95}. Best is trial 13 with value: 4.54609931774709.[0m


Stopped by overfitting detector  (95 iterations wait)

bestTest = 5.263332724
bestIteration = 4467

Shrink model to first 4468 iterations.
error on hyp search validation: 4.550519096931488
Not the best estimator so far.
Finished trial 17
Running trial 18
Learning rate set to 0.003698
0:	learn: 7.2936018	test: 6.7205574	best: 6.7205574 (0)	total: 11.4ms	remaining: 19m
1000:	learn: 5.3657450	test: 5.4976514	best: 5.4976492 (999)	total: 11.8s	remaining: 19m 30s
2000:	learn: 4.8542295	test: 5.3463696	best: 5.3463696 (2000)	total: 23.7s	remaining: 19m 23s
3000:	learn: 4.4101220	test: 5.2938548	best: 5.2938548 (3000)	total: 34.9s	remaining: 18m 47s


[32m[I 2022-08-13 14:14:43,782][0m Trial 18 finished with value: 4.571986974726781 and parameters: {'early_stopping_rounds': 75}. Best is trial 13 with value: 4.54609931774709.[0m


Stopped by overfitting detector  (75 iterations wait)

bestTest = 5.284875154
bestIteration = 3429

Shrink model to first 3430 iterations.
error on hyp search validation: 4.571986974726781
Not the best estimator so far.
Finished trial 18
Running trial 19
Learning rate set to 0.003698
0:	learn: 7.2927995	test: 6.7199639	best: 6.7199639 (0)	total: 12.5ms	remaining: 20m 50s
1000:	learn: 5.3647435	test: 5.4910098	best: 5.4910098 (1000)	total: 11.9s	remaining: 19m 37s
2000:	learn: 4.8526751	test: 5.3416213	best: 5.3416213 (2000)	total: 23.7s	remaining: 19m 19s
3000:	learn: 4.4103665	test: 5.2827961	best: 5.2827961 (3000)	total: 35.2s	remaining: 18m 58s


[32m[I 2022-08-13 14:15:25,497][0m Trial 19 finished with value: 4.576520775864733 and parameters: {'early_stopping_rounds': 89}. Best is trial 13 with value: 4.54609931774709.[0m


Stopped by overfitting detector  (89 iterations wait)

bestTest = 5.271343806
bestIteration = 3430

Shrink model to first 3431 iterations.
error on hyp search validation: 4.576520775864733
Not the best estimator so far.
Finished trial 19
Running trial 20
Learning rate set to 0.003698
0:	learn: 7.2926696	test: 6.7199031	best: 6.7199031 (0)	total: 11.8ms	remaining: 19m 37s
1000:	learn: 5.3661389	test: 5.4857097	best: 5.4857097 (1000)	total: 11.6s	remaining: 19m 6s
2000:	learn: 4.8565295	test: 5.3376439	best: 5.3375840 (1997)	total: 23.3s	remaining: 19m
3000:	learn: 4.4125967	test: 5.2750086	best: 5.2750086 (3000)	total: 34.9s	remaining: 18m 47s


[32m[I 2022-08-13 14:16:01,691][0m Trial 20 finished with value: 4.582405947506712 and parameters: {'early_stopping_rounds': 50}. Best is trial 13 with value: 4.54609931774709.[0m


Stopped by overfitting detector  (50 iterations wait)

bestTest = 5.274075618
bestIteration = 3026

Shrink model to first 3027 iterations.
error on hyp search validation: 4.582405947506712
Not the best estimator so far.
Finished trial 20
Running trial 21
Learning rate set to 0.003698
0:	learn: 7.2930072	test: 6.7205235	best: 6.7205235 (0)	total: 12.2ms	remaining: 20m 16s
1000:	learn: 5.3648959	test: 5.4908937	best: 5.4908937 (1000)	total: 11.6s	remaining: 19m 8s
2000:	learn: 4.8552669	test: 5.3395050	best: 5.3394800 (1998)	total: 23.1s	remaining: 18m 49s
3000:	learn: 4.4101579	test: 5.2828006	best: 5.2823432 (2990)	total: 34.6s	remaining: 18m 38s


[32m[I 2022-08-13 14:16:40,237][0m Trial 21 finished with value: 4.585442421874324 and parameters: {'early_stopping_rounds': 95}. Best is trial 13 with value: 4.54609931774709.[0m


Stopped by overfitting detector  (95 iterations wait)

bestTest = 5.274028352
bestIteration = 3209

Shrink model to first 3210 iterations.
error on hyp search validation: 4.585442421874324
Not the best estimator so far.
Finished trial 21
Running trial 22
Learning rate set to 0.003698
0:	learn: 7.2925690	test: 6.7200759	best: 6.7200759 (0)	total: 12.9ms	remaining: 21m 30s
1000:	learn: 5.3608915	test: 5.4891090	best: 5.4891090 (1000)	total: 11.4s	remaining: 18m 50s
2000:	learn: 4.8482165	test: 5.3478494	best: 5.3477367 (1999)	total: 22.9s	remaining: 18m 41s
3000:	learn: 4.4116826	test: 5.2910055	best: 5.2907488 (2998)	total: 34.4s	remaining: 18m 32s


[32m[I 2022-08-13 14:17:18,160][0m Trial 22 finished with value: 4.568358194954313 and parameters: {'early_stopping_rounds': 100}. Best is trial 13 with value: 4.54609931774709.[0m


Stopped by overfitting detector  (100 iterations wait)

bestTest = 5.284229371
bestIteration = 3170

Shrink model to first 3171 iterations.
error on hyp search validation: 4.568358194954313
Not the best estimator so far.
Finished trial 22
Running trial 23
Learning rate set to 0.003698
0:	learn: 7.2928398	test: 6.7207751	best: 6.7207751 (0)	total: 22.1ms	remaining: 36m 46s
1000:	learn: 5.3627063	test: 5.5035879	best: 5.5035181 (999)	total: 11.6s	remaining: 19m 8s
2000:	learn: 4.8558204	test: 5.3560176	best: 5.3560061 (1999)	total: 23s	remaining: 18m 46s
3000:	learn: 4.4138506	test: 5.2950003	best: 5.2949492 (2998)	total: 34.5s	remaining: 18m 34s
4000:	learn: 3.9865363	test: 5.2706492	best: 5.2706492 (4000)	total: 46.1s	remaining: 18m 26s
5000:	learn: 3.6257404	test: 5.2525172	best: 5.2524652 (4998)	total: 57.5s	remaining: 18m 13s


[32m[I 2022-08-13 14:18:21,141][0m Trial 23 finished with value: 4.549950182135785 and parameters: {'early_stopping_rounds': 92}. Best is trial 13 with value: 4.54609931774709.[0m


Stopped by overfitting detector  (92 iterations wait)

bestTest = 5.246202004
bestIteration = 5312

Shrink model to first 5313 iterations.
error on hyp search validation: 4.549950182135785
Not the best estimator so far.
Finished trial 23
Running trial 24
Learning rate set to 0.003698
0:	learn: 7.2930247	test: 6.7205770	best: 6.7205770 (0)	total: 12.4ms	remaining: 20m 41s
1000:	learn: 5.3656452	test: 5.4927581	best: 5.4927581 (1000)	total: 11.5s	remaining: 18m 53s
2000:	learn: 4.8549453	test: 5.3387745	best: 5.3387745 (2000)	total: 22.8s	remaining: 18m 37s
3000:	learn: 4.4085148	test: 5.2868568	best: 5.2868568 (3000)	total: 34.1s	remaining: 18m 21s
4000:	learn: 3.9882934	test: 5.2651924	best: 5.2651924 (4000)	total: 45.3s	remaining: 18m 7s


[32m[I 2022-08-13 14:19:11,268][0m Trial 24 finished with value: 4.5569104266087646 and parameters: {'early_stopping_rounds': 74}. Best is trial 13 with value: 4.54609931774709.[0m


Stopped by overfitting detector  (74 iterations wait)

bestTest = 5.257482882
bestIteration = 4293

Shrink model to first 4294 iterations.
error on hyp search validation: 4.5569104266087646
Not the best estimator so far.
Finished trial 24
Running trial 25
Learning rate set to 0.003698
0:	learn: 7.2928886	test: 6.7205729	best: 6.7205729 (0)	total: 11ms	remaining: 18m 16s
1000:	learn: 5.3652591	test: 5.4932896	best: 5.4932896 (1000)	total: 11.6s	remaining: 19m 10s
2000:	learn: 4.8547847	test: 5.3449051	best: 5.3449051 (2000)	total: 22.8s	remaining: 18m 38s
3000:	learn: 4.4123259	test: 5.2828225	best: 5.2828225 (3000)	total: 34s	remaining: 18m 18s


[32m[I 2022-08-13 14:19:52,338][0m Trial 25 finished with value: 4.565488995935334 and parameters: {'early_stopping_rounds': 81}. Best is trial 13 with value: 4.54609931774709.[0m


Stopped by overfitting detector  (81 iterations wait)

bestTest = 5.268088538
bestIteration = 3514

Shrink model to first 3515 iterations.
error on hyp search validation: 4.565488995935334
Not the best estimator so far.
Finished trial 25
Running trial 26
Learning rate set to 0.003698
0:	learn: 7.2928109	test: 6.7203454	best: 6.7203454 (0)	total: 19.7ms	remaining: 32m 45s
1000:	learn: 5.3690329	test: 5.5017423	best: 5.5017423 (1000)	total: 11.4s	remaining: 18m 48s
2000:	learn: 4.8537517	test: 5.3551780	best: 5.3551780 (2000)	total: 22.4s	remaining: 18m 15s
3000:	learn: 4.4101832	test: 5.2955167	best: 5.2946253 (2993)	total: 33.6s	remaining: 18m 5s
4000:	learn: 3.9879484	test: 5.2692012	best: 5.2690712 (3998)	total: 44.6s	remaining: 17m 49s


[32m[I 2022-08-13 14:20:43,382][0m Trial 26 finished with value: 4.5436211714457775 and parameters: {'early_stopping_rounds': 92}. Best is trial 26 with value: 4.5436211714457775.[0m


Stopped by overfitting detector  (92 iterations wait)

bestTest = 5.260228438
bestIteration = 4419

Shrink model to first 4420 iterations.
error on hyp search validation: 4.5436211714457775
Best estimator so far. Saving estimator, started.
Saving estimator, done.
Finished trial 26
Running trial 27
Learning rate set to 0.003698
0:	learn: 7.2935180	test: 6.7209235	best: 6.7209235 (0)	total: 12.5ms	remaining: 20m 50s
1000:	learn: 5.3628996	test: 5.5026692	best: 5.5026692 (1000)	total: 11.1s	remaining: 18m 21s
2000:	learn: 4.8550850	test: 5.3552310	best: 5.3552310 (2000)	total: 22.9s	remaining: 18m 42s
3000:	learn: 4.4097411	test: 5.2973468	best: 5.2972346 (2996)	total: 34s	remaining: 18m 17s
4000:	learn: 3.9831789	test: 5.2666258	best: 5.2666258 (4000)	total: 45.7s	remaining: 18m 17s


[32m[I 2022-08-13 14:21:37,618][0m Trial 27 finished with value: 4.55663114482655 and parameters: {'early_stopping_rounds': 87}. Best is trial 26 with value: 4.5436211714457775.[0m


Stopped by overfitting detector  (87 iterations wait)

bestTest = 5.25537226
bestIteration = 4631

Shrink model to first 4632 iterations.
error on hyp search validation: 4.55663114482655
Not the best estimator so far.
Finished trial 27
Running trial 28
Learning rate set to 0.003698
0:	learn: 7.2921312	test: 6.7198612	best: 6.7198612 (0)	total: 9.79ms	remaining: 16m 18s
1000:	learn: 5.3666315	test: 5.4984025	best: 5.4984025 (1000)	total: 11.9s	remaining: 19m 32s
2000:	learn: 4.8579503	test: 5.3481235	best: 5.3481235 (2000)	total: 23.8s	remaining: 19m 27s
3000:	learn: 4.4163049	test: 5.2898636	best: 5.2891463 (2981)	total: 34.8s	remaining: 18m 45s
4000:	learn: 3.9941345	test: 5.2680504	best: 5.2678140 (3977)	total: 46.5s	remaining: 18m 35s


[32m[I 2022-08-13 14:22:30,416][0m Trial 28 finished with value: 4.555813177882406 and parameters: {'early_stopping_rounds': 82}. Best is trial 26 with value: 4.5436211714457775.[0m


Stopped by overfitting detector  (82 iterations wait)

bestTest = 5.26024738
bestIteration = 4432

Shrink model to first 4433 iterations.
error on hyp search validation: 4.555813177882406
Not the best estimator so far.
Finished trial 28
Running trial 29
Learning rate set to 0.003698
0:	learn: 7.2930631	test: 6.7207003	best: 6.7207003 (0)	total: 9.98ms	remaining: 16m 38s
1000:	learn: 5.3654081	test: 5.5015575	best: 5.5015575 (1000)	total: 12s	remaining: 19m 42s
2000:	learn: 4.8546452	test: 5.3430158	best: 5.3430049 (1999)	total: 23.1s	remaining: 18m 49s
3000:	learn: 4.4119669	test: 5.2866115	best: 5.2866115 (3000)	total: 35s	remaining: 18m 50s


[32m[I 2022-08-13 14:23:12,248][0m Trial 29 finished with value: 4.5592282047466925 and parameters: {'early_stopping_rounds': 72}. Best is trial 26 with value: 4.5436211714457775.[0m


Stopped by overfitting detector  (72 iterations wait)

bestTest = 5.274953141
bestIteration = 3494

Shrink model to first 3495 iterations.
error on hyp search validation: 4.5592282047466925
Not the best estimator so far.
Finished trial 29
Loading best trial: {'early_stopping_rounds': 92} started
Loading best trial: {'early_stopping_rounds': 92} done


In [7]:
y_pred = best_creg.predict(x_test)
err = np.abs(y_pred - y_test).mean()
print("MAE on test set", err)

MAE on test set 4.5195189067305455


In [8]:
df_rf = pd.read_csv('data/synthetic_db_with_risk_factors.csv')
x_rf = df_rf[x_all.columns]
y_rf = df_rf['age_at_MRI']
df_rf.isna().any(1).sum()

0

In [9]:
best_creg_bc = BiasCorrectEstimatorCole(best_creg, x_rf, y_rf)
y_rf_pred = best_creg_bc.predict(x_rf)

df_rf['ca_delta'] = y_rf_pred - y_rf

In [10]:
reg = smf.ols(formula='ca_delta ~ rf_diabetes*sex + age_at_MRI + age_at_MRI^2', data=df_rf).fit()

reg.summary()

0,1,2,3
Dep. Variable:,ca_delta,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,6.464
Date:,"Sat, 13 Aug 2022",Prob (F-statistic):,5.16e-06
Time:,14:23:13,Log-Likelihood:,-126380.0
No. Observations:,34137,AIC:,252800.0
Df Residuals:,34131,BIC:,252800.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1218,0.454,0.268,0.789,-0.768,1.012
rf_diabetes,0.2275,0.254,0.896,0.370,-0.270,0.725
sex,-0.5656,0.113,-5.021,0.000,-0.786,-0.345
rf_diabetes:sex,-0.0285,0.344,-0.083,0.934,-0.703,0.646
age_at_MRI,-0.0427,0.028,-1.550,0.121,-0.097,0.011
age_at_MRI ^ 2,0.0448,0.027,1.687,0.092,-0.007,0.097

0,1,2,3
Omnibus:,91.586,Durbin-Watson:,1.98
Prob(Omnibus):,0.0,Jarque-Bera (JB):,77.127
Skew:,0.055,Prob(JB):,1.79e-17
Kurtosis:,2.795,Cond. No.,782.0
