# Predicting cardiovascular age

### Loading the dependencies

In [1]:
#!pip install --user -q scipy optuna catboost scikit-learn statsmodels

In [2]:
import numpy as np
import scipy.stats as stats
import pandas as pd
import os
import pickle

import optuna
from sklearn import linear_model, ensemble
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
import statsmodels.formula.api as smf

np.random.seed(1) # Deterministic seed

We start by defining our hyperparameter searching function, you basically feed a x, y dataset and it will return the best catboost model after n_trial hyperpameter searches.

In [4]:
def train_with_hyp_search(x_trial, y_trial, esttype='catb', n_trials=30,
                          optuna_sql_path='/dev/shm/optuna.sqlite3',
                          optuna_storage_dir='/tmp/optuna_pkls'):
    """
    Parameters
    ----------
    x_trial : array_like
        Features to train
    y_trial : array_like
        Target (age) to train
    esttype : str
        Estimator name, can be 'catb' (for catboost) or 'lasso' or
        'ridge' or 'en' (for elastic net) or 'rf' (for random forests)
    optuna_sql_path : str
        Path to save optuna sql database
    optuna_storage_dir : str
        Path to save optuna model pickles
    """
    
    x_trial_train, x_trial_test, y_trial_train, y_trial_test = train_test_split(x_trial, y_trial, test_size=0.1, random_state=0)
    if esttype == 'catb':
        x_trial_train, x_trial_val, y_trial_train, y_trial_val = train_test_split(x_trial_train, y_trial_train, test_size=0.1, random_state=1)

    def objective(trial):
        successful_trials_so_far = len([t for t in study.trials if t.state.name == 'COMPLETE'])
        if successful_trials_so_far >= n_trials:
            study.stop()
            print('Maximum number of trials reached, prunning')
            raise optuna.TrialPruned()
        print(f"Running trial {trial.number}")

        if esttype == 'catb':
            param = {
                'iterations': 100_000,
                'early_stopping_rounds': trial.suggest_int("early_stopping_rounds", 50, 100),
                'verbose': 1000,
                'random_seed': successful_trials_so_far,
                #'task_type': 'GPU',
            }

            creg = CatBoostRegressor(**param)
            creg.fit(x_trial_train, y_trial_train,
                eval_set=(x_trial_val, y_trial_val)
            )

        elif esttype == 'lasso':
            alpha = trial.suggest_float("alpha", 0, 10)

            creg = linear_model.Lasso(alpha=alpha)
            creg.fit(x_trial_train, y_trial_train)

        elif esttype == 'ridge':
            alpha = trial.suggest_float("alpha", 0, 10)

            creg = linear_model.Ridge(alpha=alpha)
            creg.fit(x_trial_train, y_trial_train)

        elif esttype == 'en':
            alpha = trial.suggest_float("alpha", 0, 10)
            l1_ratio = trial.suggest_float("l1_ratio", 0, 1)

            creg = linear_model.ElasticNet(alpha=alpha,
                l1_ratio=l1_ratio)
            creg.fit(x_trial_train, y_trial_train)

        elif esttype == 'rf':
            n_estimators = trial.suggest_int("alpha", 100, 1000)

            creg = ensemble.RandomForestRegressor(
                n_estimators=n_estimators)
            creg.fit(x_trial_train, y_trial_train)

        else:
            raise ValueError('Invalid estimator type')
        
        y_trial_pred = creg.predict(x_trial_test)
        diff = y_trial_pred - y_trial_test
        err = np.abs(diff).mean()

        trial.set_user_attr("err", err)
        print('error on hyp search validation:', err)

        best_estimator_performance = np.inf
        try:
            best_estimator_performance = study.best_trial.values[0]
        except Exception:
            pass

        if best_estimator_performance >= err:
            print('Best estimator so far. Saving estimator, started.')
            with open(f"{os.path.join(optuna_storage_dir, str(trial.number))}.pkl", "wb") as f:
                pickle.dump(creg, f)
            print('Saving estimator, done.')
        else:
            print('Not the best estimator so far.')

        print(f"Finished trial {trial.number}")
        return err

    optuna_storage_dir = os.path.join(optuna_storage_dir, esttype)
    os.makedirs(optuna_storage_dir, exist_ok=True)

    optuna_storage = optuna.storages.RDBStorage(url="sqlite:///"+optuna_sql_path, engine_kwargs={"connect_args": {"timeout": 600}})
    try:
        optuna.delete_study(storage=optuna_storage, study_name=esttype)
    except KeyError:
        pass
    study = optuna.create_study(storage=optuna_storage,
                                study_name=esttype, direction="minimize",
                                sampler=optuna.samplers.TPESampler(seed=1))
    study.optimize(objective, n_trials=n_trials)

    print("Loading best trial:", study.best_params, 'started')
    with open(f"{os.path.join(optuna_storage_dir, str(study.best_trial.number))}.pkl", "rb") as f:
        best_creg = pickle.load(f)
    print("Loading best trial:", study.best_params, 'done')

    return best_creg

### Loading the data

Load the dataset for healthy patients and check for NAs.

In [5]:
df = pd.read_csv('data/synthetic_db_healthy.csv')
print('Number of NAs', df.isna().any(1).sum())
x_all, y_all = df.iloc[:, df.columns!='age_at_MRI'], df['age_at_MRI'], 
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=0)

Number of NAs 0


Obtain the features and targets (ages)

In [6]:
x_all, y_all = df.iloc[:, df.columns!='age_at_MRI'], df['age_at_MRI']

Split into train and test datasets:

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=0)

### Model training

Now fit catboost with hyperparameter searching doing n_trials paramater searches:

In [8]:
n_trials = 30

best_creg  = train_with_hyp_search(x_train, y_train, esttype='catb', n_trials=n_trials)

[32m[I 2022-08-15 16:54:48,940][0m A new study created in RDB with name: catb[0m


Running trial 0
Learning rate set to 0.003698
0:	learn: 7.2927696	test: 6.7203045	best: 6.7203045 (0)	total: 65.2ms	remaining: 1h 48m 41s
1000:	learn: 5.3641452	test: 5.4879250	best: 5.4879250 (1000)	total: 4.68s	remaining: 7m 43s
2000:	learn: 4.8505237	test: 5.3400868	best: 5.3400868 (2000)	total: 9.16s	remaining: 7m 28s
3000:	learn: 4.4107673	test: 5.2818156	best: 5.2815372 (2996)	total: 13.6s	remaining: 7m 19s


[32m[I 2022-08-15 16:55:06,376][0m Trial 0 finished with value: 4.56010275406753 and parameters: {'early_stopping_rounds': 71}. Best is trial 0 with value: 4.56010275406753.[0m


Stopped by overfitting detector  (71 iterations wait)

bestTest = 5.257545066
bestIteration = 3671

Shrink model to first 3672 iterations.
error on hyp search validation: 4.56010275406753
Best estimator so far. Saving estimator, started.
Saving estimator, done.
Finished trial 0
Loading best trial: {'early_stopping_rounds': 71} started
Loading best trial: {'early_stopping_rounds': 71} done


### Analysis of results

Now, we evaluate the MAE and R² on the test set:

In [9]:
y_pred = best_creg.predict(x_test)

mae = np.abs(y_pred - y_test).mean()
print("MAE on test set", mae)

se = (y_pred - y_test)**2
ss_res = se.sum()
ss_total = ((y_pred - y_train.mean())**2).sum()
rsq = 1 - ss_res / ss_total
    
print("R² on test set", rsq)

MAE on test set 4.527648784528185
R² on test set -0.4998542892003246


### Calculating the delta for patients with risk factors

We start by loading the dataset of patients with risk factors and checking for NAs.

In [10]:
df_rf = pd.read_csv('data/synthetic_db_with_risk_factors.csv')
x_rf = df_rf[x_all.columns]
y_rf = df_rf['age_at_MRI']
df_rf.isna().any(1).sum()

0

Before calculating the deltas, it's necessary to define the function that does the bias correction

In [11]:
# Apply the bias correction to a previosly trainer estimator
class BiasCorrectEstimator:
    def __init__(self, est, x_sval_bc, y_sval_bc):
        self.est = est
        
        x_sval_bc = np.array(x_sval_bc)
        y_sval_bc = np.array(y_sval_bc).reshape(-1)
        
        reg = linear_model.LinearRegression()
        reg.fit(y_sval_bc.reshape((-1, 1)), est.predict(x_sval_bc).reshape(-1))
        self.intercept = reg.intercept_
        self.coef = reg.coef_.item()
 
    def predict(self, x_pred, y_pred=None):
        x_pred = np.array(x_pred)
        pred = self.est.predict(x_pred)
        pred = (pred - self.intercept) / self.coef
        return pred

Then predict and calculate the deltas

In [12]:
best_creg_bc = BiasCorrectEstimator(best_creg, x_rf, y_rf)
y_rf_pred = best_creg_bc.predict(x_rf)

And save the new dataset for analysis in the next notebook (phenotype analysis)

In [13]:
df_rf_with_deltas = df_rf.copy()
df_rf_with_deltas['ca_delta'] = y_rf_pred - y_rf
df_rf_with_deltas.to_csv('data/synthetic_db_with_risk_factors_and_deltas.csv', index=False)

### Optional: fit the regression

This will be covered in more detail (and with propensity matching procudures) in the next notebook using R. However, if desired, it's already possible to run the regression using Python `statsmodels` package:

In [14]:
reg = smf.ols(formula='ca_delta ~ rf_diabetes*sex + age_at_MRI + age_at_MRI^2', data=df_rf_with_deltas).fit()

reg.summary()

0,1,2,3
Dep. Variable:,ca_delta,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,7.788
Date:,"Mon, 15 Aug 2022",Prob (F-statistic):,2.46e-07
Time:,16:55:12,Log-Likelihood:,-126480.0
No. Observations:,34137,AIC:,253000.0
Df Residuals:,34131,BIC:,253000.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1341,0.455,0.294,0.768,-0.759,1.027
rf_diabetes,0.2542,0.255,0.998,0.318,-0.245,0.753
sex,-0.6261,0.113,-5.541,0.000,-0.848,-0.405
rf_diabetes:sex,-0.0368,0.345,-0.107,0.915,-0.714,0.640
age_at_MRI,-0.0436,0.028,-1.579,0.114,-0.098,0.011
age_at_MRI ^ 2,0.0459,0.027,1.725,0.084,-0.006,0.098

0,1,2,3
Omnibus:,99.631,Durbin-Watson:,1.981
Prob(Omnibus):,0.0,Jarque-Bera (JB):,81.736
Skew:,0.049,Prob(JB):,1.78e-18
Kurtosis:,2.781,Cond. No.,782.0
