In [59]:
!pip install -q optuna catboost

In [60]:
import os
import pickle

import numpy as np
import optuna
import pandas as pd
import scipy.stats as stats
import sklearn
import sklearn.linear_model  # explicit: a bare `import sklearn` does not guarantee this submodule is loaded
import statsmodels.formula.api as smf
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

In [61]:
def mtest(self, x_test, y_test):
    """Score ``self.predict`` on a test set.

    Parameters
    ----------
    self : object
        Must provide ``predict(x_test)`` and a ``y_train_mean`` attribute; the
        latter is the baseline used for the total sum of squares.
    x_test : array-like
        Inputs forwarded unchanged to ``self.predict``.
    y_test : array-like
        True targets, either 1-D ``(n,)`` or 2-D ``(n, k)``.

    Returns
    -------
    dict
        ``mae`` (mean absolute error), ``rmse`` (root mean squared error) and
        ``rsq`` (``1 - ss_res / ss_total``, with ``ss_total`` taken around
        ``self.y_train_mean``).
    """
    # np.asarray also accepts pandas objects, for which `[:, None]` indexing
    # is not supported.
    y_test = np.asarray(y_test)
    y_pred = np.asarray(self.predict(x_test))

    # Lift BOTH operands to column form. Lifting only y_test would broadcast a
    # 1-D prediction (n,) against (n, 1) into an (n, n) matrix and silently
    # corrupt every metric below.
    if y_test.ndim == 1:
        y_test = y_test[:, None]
    if y_pred.ndim == 1:
        y_pred = y_pred[:, None]

    diff = y_pred - y_test
    se = diff**2
    ss_res = se.sum()
    ss_total = ((y_test - self.y_train_mean)**2).sum()
    rsq = 1 - ss_res / ss_total
    rmse = np.sqrt(se.mean())
    mae = np.abs(diff).mean()
    return dict(mae=mae, rmse=rmse, rsq=rsq)

def train_with_hyp_search(x_trial, y_trial, esttype='catb', n_trials=30,
                          optuna_sql_path='/dev/shm/optuna.sqlite3',
                          optuna_storage_dir='/tmp/optuna_pkls'):
    """Run an Optuna search and return the fitted estimator of the best trial.

    The data are split 90/10 into a fitting part and a held-out part; every
    trial is scored by the mean absolute error on the held-out part. For any
    estimator other than ``'lasso'`` a further 10% of the fitting part is set
    aside and handed to the estimator as ``eval_set``.

    Side effects: an existing study named ``esttype`` in the SQLite database at
    ``optuna_sql_path`` is deleted and recreated, and every trial that is at
    least as good as the best completed one is pickled to
    ``<optuna_storage_dir>/<esttype>/<trial number>.pkl``.

    Parameters
    ----------
    x_trial, y_trial : array-like
        Features and targets used for the search.
    esttype : {'catb', 'lasso'}
        ``'catb'`` tunes ``early_stopping_rounds`` of a ``CatBoostRegressor``;
        ``'lasso'`` tunes ``alpha`` of ``sklearn.linear_model.Lasso``.
    n_trials : int
        Number of trials passed to ``study.optimize``.
    optuna_sql_path : str
        Path of the SQLite file backing the Optuna storage.
    optuna_storage_dir : str
        Root directory for the pickled estimators.

    Returns
    -------
    The fitted estimator of the best trial, loaded back from its pickle.

    Raises
    ------
    ValueError
        If ``esttype`` is not one of the supported estimator types.
    """
    # Reject unknown estimators up front, before any data is split and before
    # the study database or the pickle directory is touched.
    if esttype not in ('catb', 'lasso'):
        raise ValueError('Invalid estimator type')

    x_trial_train, x_trial_test, y_trial_train, y_trial_test = train_test_split(
        x_trial, y_trial, test_size=0.1, random_state=0)
    if esttype != 'lasso':
        x_trial_train, x_trial_val, y_trial_train, y_trial_val = train_test_split(
            x_trial_train, y_trial_train, test_size=0.1, random_state=1)

    estimator_dir = os.path.join(optuna_storage_dir, esttype)

    def pickle_path(trial_number):
        # Location of the pickled estimator belonging to one trial.
        return f"{os.path.join(estimator_dir, str(trial_number))}.pkl"

    def completed_values():
        # Objective values of all trials that finished successfully so far.
        return [t.value for t in study.trials
                if t.state == optuna.trial.TrialState.COMPLETE]

    def objective(trial):
        successful_trials_so_far = len(completed_values())
        if successful_trials_so_far >= n_trials:
            study.stop()
            print('Maximum number of trials reached, prunning')
            raise optuna.TrialPruned()
        print(f"Running trial {trial.number}")

        if esttype == 'catb':
            param = {
                'iterations': 100_000,
                'early_stopping_rounds': trial.suggest_int("early_stopping_rounds", 50, 100),
                'verbose': 1000,
                'random_seed': successful_trials_so_far,
                #'task_type': 'GPU',
            }

            creg = CatBoostRegressor(**param)
            creg.fit(x_trial_train, y_trial_train,
                     eval_set=(x_trial_val, y_trial_val))
        else:  # 'lasso' -- esttype was validated on entry
            alpha = trial.suggest_float("alpha", 0, 10)

            creg = sklearn.linear_model.Lasso(alpha=alpha)
            creg.fit(x_trial_train, y_trial_train)

        y_trial_pred = creg.predict(x_trial_test)
        diff = y_trial_pred - y_trial_test
        err = np.abs(diff).mean()

        trial.set_user_attr("err", err)
        print('error on hyp search validation:', err)

        # Best objective among completed trials; +inf while none has completed.
        # Computed directly instead of wrapping `study.best_trial` in a
        # catch-all `except`, so genuine storage errors are no longer hidden.
        best_estimator_performance = min(completed_values(), default=np.inf)

        if best_estimator_performance >= err:
            print('Best estimator so far. Saving estimator, started.')
            with open(pickle_path(trial.number), "wb") as f:
                pickle.dump(creg, f)
            print('Saving estimator, done.')
        else:
            print('Not the best estimator so far.')

        print(f"Finished trial {trial.number}")
        return err

    os.makedirs(estimator_dir, exist_ok=True)

    optuna_storage = optuna.storages.RDBStorage(
        url="sqlite:///" + optuna_sql_path,
        engine_kwargs={"connect_args": {"timeout": 600}})
    # Start from a clean study; a KeyError here means there was none to delete.
    try:
        optuna.delete_study(storage=optuna_storage, study_name=esttype)
    except KeyError:
        pass
    study = optuna.create_study(storage=optuna_storage,
                                study_name=esttype, direction="minimize",
                                sampler=optuna.samplers.TPESampler(seed=1))
    study.optimize(objective, n_trials=n_trials)

    print("Loading best trial:", study.best_params, 'started')
    # NOTE(review): pickle.load executes whatever the file contains. This reads
    # back a file written by `objective` above, but the directory must not be
    # writable by untrusted users.
    with open(pickle_path(study.best_trial.number), "rb") as f:
        best_creg = pickle.load(f)
    print("Loading best trial:", study.best_params, 'done')

    return best_creg#, study.best_params, study

class BiasCorrectEstimatorCole:
    """Wrap a fitted estimator and apply a linear correction to its output.

    On construction the wrapped estimator's predictions are regressed on the
    true targets (``prediction ~ intercept + coef * target``). ``predict``
    then inverts that line: ``(prediction - intercept) / coef``.
    """

    def __init__(self, est, x_sval_bc, y_sval_bc):
        """Fit the correction line.

        Parameters
        ----------
        est : object
            Already-fitted estimator exposing ``predict``.
        x_sval_bc : array-like
            Features of the samples used to fit the correction.
        y_sval_bc : array-like
            True targets for ``x_sval_bc``; flattened to 1-D.
        """
        self.est = est

        features = np.array(x_sval_bc)
        targets = np.array(y_sval_bc).reshape(-1)
        raw_pred = est.predict(features).reshape(-1)

        # Independent variable: true target; dependent variable: raw prediction.
        line = sklearn.linear_model.LinearRegression()
        line.fit(targets[:, None], raw_pred)

        self.intercept = line.intercept_
        self.coef = line.coef_.item()

    def predict(self, x_pred, y_pred=None):
        """Return corrected predictions for ``x_pred`` (``y_pred`` is unused)."""
        raw_pred = self.est.predict(np.array(x_pred))
        return (raw_pred - self.intercept) / self.coef

In [62]:
df = pd.read_csv('synthetic_db_healthy.csv')
# Number of rows with at least one missing value. `axis` is passed by keyword
# because pandas 2.x no longer accepts it positionally in DataFrame.any.
df.isna().any(axis=1).sum()

0

In [63]:
# Target is `age_at_MRI`; every other column is used as a feature.
x_all = df.drop(columns='age_at_MRI')
y_all = df['age_at_MRI']
# Hold out 20% of the rows as the test set (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

In [None]:
# Hyper-parameter search on the training split; keeps the best fitted estimator.
best_creg = train_with_hyp_search(
    x_train,
    y_train,
    esttype='catb',
    n_trials=30,
)

[32m[I 2022-06-09 07:27:18,061][0m A new study created in RDB with name: catb[0m


Running trial 0
Learning rate set to 0.003698
0:	learn: 7.2927696	test: 6.7203045	best: 6.7203045 (0)	total: 42.8ms	remaining: 1h 11m 19s
1000:	learn: 5.3641452	test: 5.4879250	best: 5.4879250 (1000)	total: 47s	remaining: 1h 17m 25s


In [None]:
# Mean absolute error of the selected estimator on the held-out test split.
y_pred = best_creg.predict(x_test)
err = np.mean(np.abs(y_pred - y_test))
print("MAE on test set", err)

In [None]:
df_rf = pd.read_csv('synthetic_db_with_risk_factors.csv')
# Use exactly the feature columns (and order) of the training frame.
x_rf = df_rf[x_all.columns]
y_rf = df_rf['age_at_MRI']
# Number of rows with at least one missing value. `axis` is passed by keyword
# because pandas 2.x no longer accepts it positionally in DataFrame.any.
df_rf.isna().any(axis=1).sum()

In [None]:
# Fit the linear correction on the risk-factor cohort, then predict on it.
best_creg_bc = BiasCorrectEstimatorCole(
    est=best_creg,
    x_sval_bc=x_rf,
    y_sval_bc=y_rf,
)
y_rf_pred = best_creg_bc.predict(x_rf)

# ca_delta = corrected predicted age minus recorded age_at_MRI.
df_rf['ca_delta'] = y_rf_pred - y_rf

In [None]:
# `^` is not arithmetic exponentiation inside a patsy formula, so
# `age_at_MRI^2` never produced a squared-age regressor. Wrapping the
# expression in I(...) makes patsy evaluate it as plain Python arithmetic.
reg = smf.ols(
    formula='ca_delta ~ rf_diabetes*sex + age_at_MRI + I(age_at_MRI**2)',
    data=df_rf,
).fit()

reg.summary()