In [1]:
# load some libraries
import sys
import os
import pickle
import gzip
sys.path.insert(1, '..')
os.chdir('..')

import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import darts
from darts import metrics

from lib.gluformer.model import *
from lib.latent_ode.trainer_glunet import *
from utils.darts_processing import *
from utils.darts_dataset import *

In [2]:
# Hall dataset
# load data
formatter, series, scalers = load_data(dataset='hall',
                                        use_covs=True,
                                        cov_type='mixed',
                                        use_static_covs=True)

# define best params
def set_lags(in_len, use_covs):
    lags_past_covariates = None
    lags_future_covariates = None
    if use_covs:
        if series['train']['dynamic'] is not None:
            lags_past_covariates = in_len
        if series['train']['future'] is not None:
            lags_future_covariates = (in_len, formatter.params['length_pred'])
    return lags_past_covariates, lags_future_covariates
best_params = formatter.params['xgboost_covariates']
in_len = best_params['in_len']
out_len = formatter.params['length_pred']
stride = out_len // 2
lr = best_params['lr']
subsample = best_params['subsample']
min_child_weight = best_params['min_child_weight']
colsample_bytree = best_params['colsample_bytree']
max_depth = best_params['max_depth']
gamma = best_params['gamma']
alpha = best_params['alpha']
lambda_ = best_params['lambda_']
n_estimators = best_params['n_estimators']
lags_past_covariates, lags_future_covariates = set_lags(in_len, use_covs=True)

# load an train model
model = models.XGBModel(lags=in_len, 
                        lags_past_covariates = lags_past_covariates,
                        lags_future_covariates = lags_future_covariates,
                        n_estimators=n_estimators)
model.fit(series['train']['target'],
          past_covariates=series['train']['dynamic'],
          future_covariates=series['train']['future'])

# the input features to XGB are ordered as: 1) target series (glucose), 2) past covariates, 3) future covariates, 4) static covariates
# for hall no past covariates
# extract static feat importances
num_static_features = len(series['train']['target'][0].static_covariates.columns)
feat_importances_static = model.model.feature_importances_[-num_static_features:]
feat_importances_static = pd.DataFrame({'feature': list(series['train']['target'][0].static_covariates.columns), 
                                        'importance': feat_importances_static})

# extract future feat importances
feat_importances_future = model.model.feature_importances_[in_len:-num_static_features]
ind = [name for _ in range(132) for name in ['year', 'month', 'day', 'hour', 'minute']]
feat_importances_future = pd.DataFrame({'feature': ind, 'importance': feat_importances_future})
# for each future feature select max importance
feat_importances = feat_importances_future.groupby('feature').max().reset_index()

# merge future and static feat importances
feat_importances = pd.concat([feat_importances, feat_importances_static], axis=0)
# select top-10 features
top10features = feat_importances.sort_values(by='importance', ascending=False).head(10)
print(top10features.to_latex())


--------------------------------
Loading column definition...
Checking column definition...
Loading data...
Dropping columns / rows...
Checking for NA values...
Setting data types...
Dropping columns / rows...
Encoding data...
	Updated column definition:
		id: REAL_VALUED (ID)
		time: DATE (TIME)
		gl: REAL_VALUED (TARGET)
		Age: REAL_VALUED (STATIC_INPUT)
		BMI: REAL_VALUED (STATIC_INPUT)
		A1C: REAL_VALUED (STATIC_INPUT)
		FBG: REAL_VALUED (STATIC_INPUT)
		ogtt.2hr: REAL_VALUED (STATIC_INPUT)
		insulin: REAL_VALUED (STATIC_INPUT)
		hs.CRP: REAL_VALUED (STATIC_INPUT)
		Tchol: REAL_VALUED (STATIC_INPUT)
		Trg: REAL_VALUED (STATIC_INPUT)
		HDL: REAL_VALUED (STATIC_INPUT)
		LDL: REAL_VALUED (STATIC_INPUT)
		mean_glucose: REAL_VALUED (STATIC_INPUT)
		sd_glucose: REAL_VALUED (STATIC_INPUT)
		range_glucose: REAL_VALUED (STATIC_INPUT)
		min_glucose: REAL_VALUED (STATIC_INPUT)
		max_glucose: REAL_VALUED (STATIC_INPUT)
		quartile.25_glucose: REAL_VALUED (STATIC_INPUT)
		median_glucose: REAL_VA

  print(top10features.to_latex())


In [3]:
# Colas dataset
# load data
formatter, series, scalers = load_data(dataset='colas',
                                        use_covs=True,
                                        cov_type='mixed',
                                        use_static_covs=True)

# define best params
def set_lags(in_len, use_covs):
    lags_past_covariates = None
    lags_future_covariates = None
    if use_covs:
        if series['train']['dynamic'] is not None:
            lags_past_covariates = in_len
        if series['train']['future'] is not None:
            lags_future_covariates = (in_len, formatter.params['length_pred'])
    return lags_past_covariates, lags_future_covariates
best_params = formatter.params['xgboost_covariates']
in_len = best_params['in_len']
out_len = formatter.params['length_pred']
stride = out_len // 2
lr = best_params['lr']
subsample = best_params['subsample']
min_child_weight = best_params['min_child_weight']
colsample_bytree = best_params['colsample_bytree']
max_depth = best_params['max_depth']
gamma = best_params['gamma']
alpha = best_params['alpha']
lambda_ = best_params['lambda_']
n_estimators = best_params['n_estimators']
lags_past_covariates, lags_future_covariates = set_lags(in_len, use_covs=True)

# load an train model
model = models.XGBModel(lags=in_len, 
                        lags_past_covariates = lags_past_covariates,
                        lags_future_covariates = lags_future_covariates,
                        n_estimators=n_estimators)
model.fit(series['train']['target'],
          past_covariates=series['train']['dynamic'],
          future_covariates=series['train']['future'])

# the input features to XGB are ordered as: 1) target series (glucose), 2) past covariates, 3) future covariates, 4) static covariates
# for hall no past covariates
# extract static feat importances
num_static_features = len(series['train']['target'][0].static_covariates.columns)
feat_importances_static = model.model.feature_importances_[-num_static_features:]
feat_importances_static = pd.DataFrame({'feature': list(series['train']['target'][0].static_covariates.columns), 
                                        'importance': feat_importances_static})

# # extract future feat importances
feat_importances_future = model.model.feature_importances_[in_len:-num_static_features]
ind = [name for _ in range(in_len + out_len) for name in ['year', 'month', 'day', 'hour', 'minute']]
feat_importances_future = pd.DataFrame({'feature': ind, 'importance': feat_importances_future})
# for each future feature select max importance
feat_importances = feat_importances_future.groupby('feature').max().reset_index()

# merge future and static feat importances
feat_importances = pd.concat([feat_importances, feat_importances_static], axis=0)
# select top-10 features
top10features = feat_importances.sort_values(by='importance', ascending=False).head(10)
print(top10features.to_latex())


--------------------------------
Loading column definition...
Checking column definition...
Loading data...
Dropping columns / rows...
Checking for NA values...
Setting data types...
Dropping columns / rows...
Encoding data...
	Updated column definition:
		id: REAL_VALUED (ID)
		time: DATE (TIME)
		gl: REAL_VALUED (TARGET)
		gender: REAL_VALUED (STATIC_INPUT)
		age: REAL_VALUED (STATIC_INPUT)
		BMI: REAL_VALUED (STATIC_INPUT)
		glycaemia: REAL_VALUED (STATIC_INPUT)
		HbA1c: REAL_VALUED (STATIC_INPUT)
		follow.up: REAL_VALUED (STATIC_INPUT)
		T2DM: REAL_VALUED (STATIC_INPUT)
		time_year: REAL_VALUED (KNOWN_INPUT)
		time_month: REAL_VALUED (KNOWN_INPUT)
		time_day: REAL_VALUED (KNOWN_INPUT)
		time_hour: REAL_VALUED (KNOWN_INPUT)
		time_minute: REAL_VALUED (KNOWN_INPUT)
Interpolating data...
	Dropped segments: 63
	Extracted segments: 205
	Interpolated values: 241
	Percent of values interpolated: 0.22%
Splitting data...
	Train: 72275 (45.89%)
	Val: 35713 (22.68%)
	Test: 38253 (24.29%)
	Tes

  print(top10features.to_latex())


In [4]:
# weinstock dataset
# load data
formatter, series, scalers = load_data(dataset='weinstock',
                                        use_covs=True,
                                        cov_type='mixed',
                                        use_static_covs=True)

# define best params
def set_lags(in_len, use_covs):
    lags_past_covariates = None
    lags_future_covariates = None
    if use_covs:
        if series['train']['dynamic'] is not None:
            lags_past_covariates = in_len
        if series['train']['future'] is not None:
            lags_future_covariates = (in_len, formatter.params['length_pred'])
    return lags_past_covariates, lags_future_covariates
best_params = formatter.params['xgboost_covariates']
in_len = best_params['in_len']
out_len = formatter.params['length_pred']
stride = out_len // 2
lr = best_params['lr']
subsample = best_params['subsample']
min_child_weight = best_params['min_child_weight']
colsample_bytree = best_params['colsample_bytree']
max_depth = best_params['max_depth']
gamma = best_params['gamma']
alpha = best_params['alpha']
lambda_ = best_params['lambda_']
n_estimators = best_params['n_estimators']
lags_past_covariates, lags_future_covariates = set_lags(in_len, use_covs=True)

# load an train model
model = models.XGBModel(lags=in_len, 
                        lags_past_covariates = lags_past_covariates,
                        lags_future_covariates = lags_future_covariates,
                        n_estimators=n_estimators)
model.fit(series['train']['target'],
          past_covariates=series['train']['dynamic'],
          future_covariates=series['train']['future'])

# the input features to XGB are ordered as: 1) target series (glucose), 2) past covariates, 3) future covariates, 4) static covariates
# for hall no past covariates
# extract static feat importances
num_static_features = len(series['train']['target'][0].static_covariates.columns)
feat_importances_static = model.model.feature_importances_[-num_static_features:]
feat_importances_static = pd.DataFrame({'feature': list(series['train']['target'][0].static_covariates.columns), 
                                        'importance': feat_importances_static})

# extract future feat importances
feat_importances_future = model.model.feature_importances_[in_len:-num_static_features]
ind = [name for _ in range(in_len + out_len) for name in ['year', 'month', 'day', 'hour', 'minute']]
feat_importances_future = pd.DataFrame({'feature': ind, 'importance': feat_importances_future})
# for each future feature select max importance
feat_importances = feat_importances_future.groupby('feature').max().reset_index()

# merge future and static feat importances
feat_importances = pd.concat([feat_importances, feat_importances_static], axis=0)
# select top-10 features
top10features = feat_importances.sort_values(by='importance', ascending=False).head(10)
print(top10features.to_latex())

--------------------------------
Loading column definition...
Checking column definition...
Loading data...
Dropping columns / rows...
Checking for NA values...
Setting data types...
Dropping columns / rows...
Encoding data...
	Updated column definition:
		id: REAL_VALUED (ID)
		time: DATE (TIME)
		gl: REAL_VALUED (TARGET)
		Height: REAL_VALUED (STATIC_INPUT)
		Weight: REAL_VALUED (STATIC_INPUT)
		Gender: REAL_VALUED (STATIC_INPUT)
		Race: REAL_VALUED (STATIC_INPUT)
		EduLevel: REAL_VALUED (STATIC_INPUT)
		AnnualInc: REAL_VALUED (STATIC_INPUT)
		MaritalStatus: REAL_VALUED (STATIC_INPUT)
		DaysWkEx: REAL_VALUED (STATIC_INPUT)
		DaysWkDrinkAlc: REAL_VALUED (STATIC_INPUT)
		DaysMonBingeAlc: REAL_VALUED (STATIC_INPUT)
		T1DDiagAge: REAL_VALUED (STATIC_INPUT)
		NumHospDKA: REAL_VALUED (STATIC_INPUT)
		NumSHSinceT1DDiag: REAL_VALUED (STATIC_INPUT)
		InsDeliveryMethod: REAL_VALUED (STATIC_INPUT)
		UnitsInsTotal: REAL_VALUED (STATIC_INPUT)
		NumMeterCheckDay: REAL_VALUED (STATIC_INPUT)
		Aspir

  print(top10features.to_latex())


In [5]:
# dubosson dataset
# load data
formatter, series, scalers = load_data(dataset='dubosson',
                                        use_covs=True,
                                        cov_type='mixed',
                                        use_static_covs=True)

# define best params
def set_lags(in_len, use_covs):
    lags_past_covariates = None
    lags_future_covariates = None
    if use_covs:
        if series['train']['dynamic'] is not None:
            lags_past_covariates = in_len
        if series['train']['future'] is not None:
            lags_future_covariates = (in_len, formatter.params['length_pred'])
    return lags_past_covariates, lags_future_covariates
best_params = formatter.params['xgboost_covariates']
in_len = best_params['in_len']
out_len = formatter.params['length_pred']
stride = out_len // 2
lr = best_params['lr']
subsample = best_params['subsample']
min_child_weight = best_params['min_child_weight']
colsample_bytree = best_params['colsample_bytree']
max_depth = best_params['max_depth']
gamma = best_params['gamma']
alpha = best_params['alpha']
lambda_ = best_params['lambda_']
n_estimators = best_params['n_estimators']
lags_past_covariates, lags_future_covariates = set_lags(in_len, use_covs=True)

# load an train model
model = models.XGBModel(lags=in_len, 
                        lags_past_covariates = lags_past_covariates,
                        lags_future_covariates = lags_future_covariates,
                        n_estimators=n_estimators)
model.fit(series['train']['target'],
          past_covariates=series['train']['dynamic'],
          future_covariates=series['train']['future'])

# the input features to XGB are ordered as: 1) target series (glucose), 2) past covariates, 3) future covariates, 4) static covariates
# extract static feat importances
num_static_features = len(series['train']['target'][0].static_covariates.columns)
feat_importances_static = model.model.feature_importances_[-num_static_features:]
feat_importances_static = pd.DataFrame({'feature': list(series['train']['target'][0].static_covariates.columns), 
                                        'importance': feat_importances_static})

# extract past covariates
feat_importances_past = model.model.feature_importances_[in_len:in_len+in_len*series['train']['dynamic'][0].n_components]
ind = [name for _ in range(in_len) for name in list(series['train']['dynamic'][0].columns)]
feat_importances_past = pd.DataFrame({'feature': ind, 'importance': feat_importances_past})
# for each past feature select max importance
feat_importances_past = feat_importances_past.groupby('feature').max().reset_index()

# extract future feat importances
feat_importances_future = model.model.feature_importances_[in_len+in_len*series['train']['dynamic'][0].n_components : -num_static_features]
ind = [name for _ in range(in_len + out_len) for name in ['year', 'month', 'day', 'hour', 'minute']]
feat_importances_future = pd.DataFrame({'feature': ind, 'importance': feat_importances_future})
# for each future feature select max importance
feat_importances_future = feat_importances_future.groupby('feature').max().reset_index()

# merge future and static feat importances
feat_importances = pd.concat([feat_importances_future, feat_importances_static, feat_importances_past], axis=0)
# select top-10 features
top10features = feat_importances.sort_values(by='importance', ascending=False).head(10)
print(top10features.to_latex())

--------------------------------
Loading column definition...
Checking column definition...
Loading data...
Dropping columns / rows...
Checking for NA values...
Setting data types...
Dropping columns / rows...
Encoding data...
	Updated column definition:
		id: REAL_VALUED (ID)
		time: DATE (TIME)
		gl: REAL_VALUED (TARGET)
		fast_insulin: REAL_VALUED (OBSERVED_INPUT)
		slow_insulin: REAL_VALUED (OBSERVED_INPUT)
		calories: REAL_VALUED (OBSERVED_INPUT)
		balance: REAL_VALUED (OBSERVED_INPUT)
		quality: REAL_VALUED (OBSERVED_INPUT)
		HR: REAL_VALUED (OBSERVED_INPUT)
		BR: REAL_VALUED (OBSERVED_INPUT)
		Posture: REAL_VALUED (OBSERVED_INPUT)
		Activity: REAL_VALUED (OBSERVED_INPUT)
		HRV: REAL_VALUED (OBSERVED_INPUT)
		CoreTemp: REAL_VALUED (OBSERVED_INPUT)
		time_year: REAL_VALUED (KNOWN_INPUT)
		time_month: REAL_VALUED (KNOWN_INPUT)
		time_day: REAL_VALUED (KNOWN_INPUT)
		time_hour: REAL_VALUED (KNOWN_INPUT)
		time_minute: REAL_VALUED (KNOWN_INPUT)
Interpolating data...
	Dropped segments

  print(top10features.to_latex())


In [6]:
# iglu (Broll) dataset
# load data
formatter, series, scalers = load_data(dataset='iglu',
                                        use_covs=True,
                                        cov_type='mixed',
                                        use_static_covs=True)

# define best params
def set_lags(in_len, use_covs):
    lags_past_covariates = None
    lags_future_covariates = None
    if use_covs:
        if series['train']['dynamic'] is not None:
            lags_past_covariates = in_len
        if series['train']['future'] is not None:
            lags_future_covariates = (in_len, formatter.params['length_pred'])
    return lags_past_covariates, lags_future_covariates
best_params = formatter.params['xgboost_covariates']
in_len = best_params['in_len']
out_len = formatter.params['length_pred']
stride = out_len // 2
lr = best_params['lr']
subsample = best_params['subsample']
min_child_weight = best_params['min_child_weight']
colsample_bytree = best_params['colsample_bytree']
max_depth = best_params['max_depth']
gamma = best_params['gamma']
alpha = best_params['alpha']
lambda_ = best_params['lambda_']
n_estimators = best_params['n_estimators']
lags_past_covariates, lags_future_covariates = set_lags(in_len, use_covs=True)

# load an train model
model = models.XGBModel(lags=in_len, 
                        lags_past_covariates = lags_past_covariates,
                        lags_future_covariates = lags_future_covariates,
                        n_estimators=n_estimators)
model.fit(series['train']['target'],
          past_covariates=series['train']['dynamic'],
          future_covariates=series['train']['future'])

# the input features to XGB are ordered as: 1) target series (glucose), 2) past covariates, 3) future covariates, 4) static covariates
# for hall no past covariates
# extract static feat importances
num_static_features = len(series['train']['target'][0].static_covariates.columns)
feat_importances_static = model.model.feature_importances_[-num_static_features:]
feat_importances_static = pd.DataFrame({'feature': list(series['train']['target'][0].static_covariates.columns), 
                                        'importance': feat_importances_static})

# extract future feat importances
feat_importances_future = model.model.feature_importances_[in_len:-num_static_features]
ind = [name for _ in range(in_len + out_len) for name in ['year', 'month', 'day', 'hour', 'minute', 'second']]
feat_importances_future = pd.DataFrame({'feature': ind, 'importance': feat_importances_future})
# for each future feature select max importance
feat_importances = feat_importances_future.groupby('feature').max().reset_index()

# merge future and static feat importances
feat_importances = pd.concat([feat_importances, feat_importances_static], axis=0)
# select top-10 features
top10features = feat_importances.sort_values(by='importance', ascending=False).head(10)
print(top10features.to_latex())

--------------------------------
Loading column definition...
Checking column definition...
Loading data...
Dropping columns / rows...
Checking for NA values...
Setting data types...
Dropping columns / rows...
Encoding data...
	Updated column definition:
		id: REAL_VALUED (ID)
		time: DATE (TIME)
		gl: REAL_VALUED (TARGET)
		time_year: REAL_VALUED (KNOWN_INPUT)
		time_month: REAL_VALUED (KNOWN_INPUT)
		time_day: REAL_VALUED (KNOWN_INPUT)
		time_hour: REAL_VALUED (KNOWN_INPUT)
		time_minute: REAL_VALUED (KNOWN_INPUT)
		time_second: REAL_VALUED (KNOWN_INPUT)
Interpolating data...
	Dropped segments: 17
	Extracted segments: 15
	Interpolated values: 561
	Percent of values interpolated: 4.37%
Splitting data...
	Train: 9056 (64.79%)
	Val: 1774 (12.69%)
	Test: 1848 (13.22%)
	Test OOD: 1300 (9.30%)
Scaling data...
	No scaling applied
Data formatting complete.
--------------------------------
\begin{tabular}{llr}
\toprule
{} & feature &  importance \\
\midrule
3 &   month &    0.001428 \\
0 &   

  print(top10features.to_latex())
