In [None]:
# Load libraries and functions
%load_ext autoreload
%autoreload 2
%matplotlib inline
RANDOM_STATE = 42  # Pseudo-random state

from utils import *
sns.set_palette("tab10") # Default seaborn theme

In [None]:
# Load the most recent dataset snapshot matching ./Updated*.pkl
fn_vae_data = glob.glob('./Updated*.pkl')
if not fn_vae_data:
    # Fail with an actionable message instead of max()'s "empty sequence" error
    raise FileNotFoundError("No dataset file matching './Updated*.pkl' in the working directory")
# Several snapshots may match; take the one with the latest ctime
latest_fn_vae_data = max(fn_vae_data, key=os.path.getctime)

print("Loading... ",latest_fn_vae_data)
# NOTE(review): unpickling can execute arbitrary code - only load snapshots from a trusted source
with open(latest_fn_vae_data, "rb") as f:
    vae_data_main = pickle.load(f)
print("Done")


## Calculate how many patients had how many cases

In [None]:
# Headline counts: one groupby row per distinct ID (patient) / ID_subid (ICU admission)
n_patients = len(vae_data_main[['ID']].groupby('ID').max())
print('Total patients: ', n_patients)
n_admissions = len(vae_data_main[['ID_subid']].groupby('ID_subid').max())
print('Total ICU admissions: ', n_admissions)

# ICU admissions per group (each admission carries the max() of its `group` labels)
group_per_admission = vae_data_main[['ID_subid', 'group']].groupby('ID_subid').max()
group_per_admission['group'].value_counts()

In [None]:
# Patients per group (each patient carries the max() of its `group` labels)
group_per_patient = vae_data_main[['ID', 'group']].groupby('ID').max()
group_per_patient['group'].value_counts()

### Number of cases per one ICU admission

In [None]:
# Define function

class AggFuncWFlag():
    """Stateful two-element window aggregator that flags 0 -> 1 transitions.

    The very first window an instance sees is special-cased so that a
    sequence which already starts at 1 is flagged as well; every later
    window is flagged only when it goes from 0 to 1. In this notebook a
    fresh instance is passed to ``rolling(2).agg(...)`` for each admission.
    """

    def __init__(self):
        # Becomes True once the first window has been processed
        self.flag = False

    def __call__(self, x):
        """Return a truthy value when window ``x`` should be counted.

        ``x`` must expose positional access through ``x.iloc``.

        Raises:
            ValueError: on the first call, if ``x.iloc[0]`` is neither 0 nor 1.
        """
        def rises():
            # Evaluated lazily: the second element is read only if the first is 0
            return x.iloc[0] == 0. and x.iloc[1] == 1

        if self.flag:
            return rises()

        # First window seen by this instance
        self.flag = True
        if rises():
            return 1
        if x.iloc[0] == 0:
            return 0
        if x.iloc[0] == 1.:
            # The sequence already starts at 1: flag it
            return 1
        raise ValueError()

In [None]:
# Calculation: number of episodes per ICU admission for each infection column

result = []
for COL in ['non_vap_resp_hai', 'vap', 'infection_respiratory']:

    # Admissions with at least one positive row in COL (order of first appearance)
    positive_ids = vae_data_main.loc[vae_data_main[COL] > 0, 'ID_subid'].unique()

    res = []
    for uid in positive_ids:
        # Fresh aggregator per admission: its first-window handling is stateful
        agg_func = AggFuncWFlag()
        res.append(
            vae_data_main.loc[vae_data_main.ID_subid == uid, COL].rolling(2).agg(agg_func).sum())

    # Check guys w/ one line only: admissions whose rolling count is 0
    one_liners_ids = positive_ids[[i for (i, j) in enumerate(res) if j == 0]]

    # Each of them must contribute exactly one positive row.
    # The comparison is parenthesised because `&` binds tighter than `>`;
    # without it the mask was evaluated as `(isin(...) & column) > 0`.
    assert vae_data_main.loc[vae_data_main.ID_subid.isin(one_liners_ids) &
                             (vae_data_main[COL] > 0)].shape[0] == len(one_liners_ids)

    # OK: a zero count is reported as a single case
    result += [pd.Series(res).replace(0, 1).value_counts()]

pd.concat(result, axis=1)

In [None]:
# `res` holds the per-admission counts left by the last column of the calculation
# loop ('infection_respiratory'); zero counts are treated as one case
total = pd.Series(res).replace(0, 1).value_counts()
# Sum of (cases per admission) x (number of admissions with that many cases)
n_cases = sum(cases * n_adm for cases, n_adm in zip(total.index, total.values))
print('Total HARTI cases: ', n_cases)

## The onset of HARTI

In [None]:
# The onset of HARTI depending on days in the ICU
nva = vae_data_main.loc[vae_data_main['group'] == 'NVA-HARTI']
nva = nva.loc[:, ['day_in_icu_bid', 'ID_subid']].groupby('ID_subid').max()
# Select the column by label: positional `median()[0]` on a label-indexed Series is deprecated
print('Median day in ICU when NVA-HARTI begins: ', nva['day_in_icu_bid'].median())

vap = vae_data_main.loc[vae_data_main['group'] == 'VA-HARTI']
vap = vap.loc[:, ['day_in_icu_bid', 'ID_subid']].groupby('ID_subid').max()
print('Median day in ICU when VA-HARTI begins: ', vap['day_in_icu_bid'].median())

fig, (ax, ax1) = plt.subplots(2, figsize=(8,4))
fig.suptitle('The onset of HARTI depending on days in the ICU', fontsize=15)

# Both panels share the same styling; only data, colour and label differ
panels = [(ax, vap.day_in_icu_bid, 'medium blue', 'VA-HARTI'),
          (ax1, nva.day_in_icu_bid, 'orange', 'NVA-HARTI')]
for axis, days, color_name, label in panels:
    sns.set_palette(sns.xkcd_palette([color_name]))
    sns.boxplot(x=days, orient='h', ax=axis)
    axis.set_xlim(0,60)
    axis.set_ylabel(label)
    axis.minorticks_on()
    axis.grid(linestyle='dotted', which='both')

# The top panel has no x label and no tick labels; hide the labels through
# tick_params rather than set_xticklabels(['']), which expects fixed ticks
ax.set_xlabel('')
ax.tick_params(axis='x', labelbottom=False)
ax1.set_xlabel('Days in the ICU before the onset of infection')

plt.subplots_adjust(wspace=0, hspace=0)
# Make sure the output directory exists so the cell runs on a fresh checkout
os.makedirs('./pictures', exist_ok=True)
plt.savefig('./pictures/onset_all.pdf', dpi=600)

# Restore default colors
sns.set_palette("tab10")

In [None]:
# The onset of VA-HARTI depending on days on mechanical ventilation
vap = vae_data_main.loc[vae_data_main['group'] == 'VA-HARTI']
vap = vap.loc[:, ['mech_vent_bid', 'ID_subid']].groupby('ID_subid').max()
# Print median days on ventilation before infection
# (column selected by label: positional `median()[0]` on a label-indexed Series is deprecated)
print('Median days on ventilation when VA-HARTI begins: ', vap['mech_vent_bid'].median())

fig, ax = plt.subplots(1, figsize=(8, 3))
sns.boxplot(x=vap.mech_vent_bid, orient='h', ax=ax)
ax.set_xlabel('Days on mechanical ventilation before the onset of VA-HARTI')
ax.set_xlim(0,60)
ax.set_ylabel('VA-HARTI')
ax.minorticks_on()
# which='both' already covers the major grid, so one grid call is enough
ax.grid(linestyle='dotted', which='both')
ax.set_title('The onset of VA-HARTI depending on days on mechanical ventilation')
plt.tight_layout()
# Make sure the output directory exists so the cell runs on a fresh checkout
os.makedirs('./pictures', exist_ok=True)
plt.savefig('./pictures/onset_va.pdf', dpi=600)

## The duration of HARTI

In [None]:
# Define data: per admission, the sum of the infection column over its rows
nva = vae_data_main.loc[vae_data_main['group'] == 'NVA-HARTI']
nva = nva.loc[:, ['non_vap_resp_hai', 'ID_subid']].groupby('ID_subid').sum()
vap = vae_data_main.loc[vae_data_main['group'] == 'VA-HARTI']
vap = vap.loc[:, ['vap', 'ID_subid']].groupby('ID_subid').sum()

# Print median duration of infections
# (columns selected by label: positional `median()[0]` on a label-indexed Series is deprecated)
print('Median duration of VA-HARTI: ', vap['vap'].median())
print('Median duration of NVA-HARTI: ', nva['non_vap_resp_hai'].median())

# Plot in boxplot format
fig, (ax, ax1) = plt.subplots(2, figsize=(8,4))
fig.suptitle('Duration of infection in patients with VA- and NVA-HARTI', fontsize=15)

# Both panels share the same styling; only data, colour and label differ
panels = [(ax, vap.vap, 'medium blue', 'VA-HARTI'),
          (ax1, nva.non_vap_resp_hai, 'orange', 'NVA-HARTI')]
for axis, duration, color_name, label in panels:
    sns.set_palette(sns.xkcd_palette([color_name]))
    sns.boxplot(x=duration, orient='h', ax=axis)
    axis.set_xlim(0,60)
    axis.set_ylabel(label)
    axis.minorticks_on()
    axis.grid(linestyle='dotted', which='both')

# The top panel has no x label and no tick labels; hide the labels through
# tick_params rather than set_xticklabels(['']), which expects fixed ticks
ax.set_xlabel('')
ax.tick_params(axis='x', labelbottom=False)
ax1.set_xlabel('Duration of HARTI symptoms, days')

plt.subplots_adjust(wspace=0, hspace=0)
# Make sure the output directory exists so the cell runs on a fresh checkout
os.makedirs('./pictures', exist_ok=True)
plt.savefig('./pictures/duration_boxplot.pdf', dpi=600)

# Restore default colors
sns.set_palette("tab10")

# Compare groups: VA-HARTI, NVA-HARTI, No HAI, Other HAI, Dual HARTI

In [None]:
### Select group data
# One row per ICU admission (ID_subid): column-wise max over the admission's rows,
# computed separately for the rows labelled with each study group
data_vap, data_nva, data_no_hai, data_other_hai, data_both = (
    vae_data_main.loc[vae_data_main['group'] == label].groupby('ID_subid').max()
    for label in ('VA-HARTI', 'NVA-HARTI', 'No HAI', 'Other HAI', 'Dual HARTI')
)

### Select factors to compare
# The order of each list is the row order of the comparison tables built from it.

# Numeric factors: summarised per group as median with 25th/75th percentiles and
# compared between groups with the Kruskal test in the cells that follow
FACTORS_numeric = ['age', 'charlson',  # condition on admission

                   'st_all_sum', 'st_craniotomy_len_sum', # surgeries
                   'st_device_len_sum', 'st_endonasal_len_sum', 'st_endovascular_len_sum',
                   'st_other_len_sum', 'st_spinal_len_sum', 'st_all_len_sum',

                   'gcs', 'rass', 'pbss',  # severity of patient's condition

                   'mech_vent_days',   'antibiotics_total_binary_days',  # ICU care
                   'central_line_days', 'feeding_tube_days', 'arterial_line_days', 'evd_days', 'icpm_days',
                   'urinary_catheter_days', 'hypothermia_days', 'hemodialysis_days',
                   'total_parenteral_feeding_days', 'sedation_days', 'anxiolytics_days',
                   'vasopressors_days', 'days_mech_vent_before_tracheostomy', 'days_before_tracheostomy',
                   'endotracheal_tube_1_days', 'endotracheal_tube_2_days', 'endotracheal_tube_3_days',

                   'intestinal_dysfunction_days', 'infection_bloodstream_days', # complications
                   'infection_other_days', 'infection_urinary_days', 'infection_cns_days', 'infection_ssi_days'
                  ]

# Binary factors: the cells that follow threshold every column with `> 0`
# (so the `*_count` columns are treated as present/absent), report count and
# percentage with an interval from `ci`, and compare groups with `chi`
FACTORS_binary = ['gender_M', 'disease_type_trauma', 'disease_type_tumor', # condition on admission
                  'disease_type_vascular', 'disease_type_other',

                  'mutism', 'convulsions', 'aphasia', 'vegetative_state',  # severity of patient's condition

                  'st_device_count', 'st_other_count', 'st_craniotomy_count', 'st_endovascular_count', # surgeries
                  'st_endonasal_count', 'st_spinal_count',

                  'mech_vent',   'antibiotics_total_binary',  # ICU care
                  'central_line', 'feeding_tube', 'arterial_line', 'evd', 'icpm',
                  'urinary_catheter', 'hypothermia', 'hemodialysis', 'total_parenteral_feeding',
                  'sedation', 'anxiolytics', 'vasopressors',

                  'intestinal_dysfunction', 'infection_bloodstream', 'infection_other',  # complications
                  'infection_urinary', 'infection_cns', 'infection_ssi', 'csfl_ne', 'csfl_ss'
                 ]


### All groups

In [None]:
# Descriptive table for all five groups: one row per factor, and per group a
# "Median / n" column plus a "Q/CI" column ([Q25; Q75] for numeric factors,
# [lower; upper] bounds from `ci`, in percent, for binary factors)
dsets = {'vap': data_vap,
         'nva': data_nva,
         'no_hai': data_no_hai,
         'other_hai' : data_other_hai,
         'both' : data_both
        }

# Numeric factors: median and quartiles, missing values ignored
group_dict = {}
for key, d in dsets.items():
    rows = []
    for col in FACTORS_numeric:
        observed = d[col].dropna()
        rows.append((d[col].median(), np.percentile(observed, 25), np.percentile(observed, 75)))
    group_dict[key] = pd.DataFrame(rows,
                                   columns=['Median / n_' + key, 'Q25 / lower_' + key, 'Q75 / upper_' + key],
                                   index=FACTORS_numeric)

numeric = pd.concat([group_dict[key] for key in dsets], axis=1)

# Binary factors: a factor counts as present when its value is > 0
group_dict = {}
for key, d in dsets.items():
    rows = []
    for col in FACTORS_binary:
        nobs = len(d[col])
        count = (d[col] > 0).sum()
        left, right = ci(count, nobs)
        rows.append((count, round(count / nobs * 100, 1), round(left * 100, 1), round(right * 100, 1)))
    group_dict[key] = pd.DataFrame(rows,
                                   columns=['Median / n_' + key, '%_' + key,
                                            'Q25 / lower_' + key, 'Q75 / upper_' + key],
                                   index=FACTORS_binary)

binary = pd.concat([group_dict[key] for key in dsets], axis=1)
# Render the count as "n (p%)" and drop the separate percentage columns
for key in dsets:
    binary['Median / n_' + key] = (binary['Median / n_' + key].astype(int).map(str)
                                   + " (" + binary['%_' + key].map(str) + "%)")
binary = binary.drop(columns=['%_' + key for key in dsets])

#Combine numeric and binary tables
compare_all_groups = pd.concat([numeric, binary], axis=0)

# Format columns: merge the two bounds into one "[lower; upper]" string.
# Rows are label-indexed, so the bounds are read with .iloc (plain x[0] is
# deprecated positional access on a label index)
for key in dsets:
    bounds = compare_all_groups[['Q25 / lower_' + key, 'Q75 / upper_' + key]]
    compare_all_groups['Q/CI_' + key] = bounds.apply(
        lambda x: '[{}; {}]'.format(x.iloc[0], x.iloc[1]), axis=1)

# Keep only the presentation columns, in the order used by the report
column_order = ['nva', 'no_hai', 'vap', 'other_hai', 'both']
compare_all_groups = compare_all_groups[[name
                                         for key in column_order
                                         for name in ('Median / n_' + key, 'Q/CI_' + key)]]



In [None]:
# Test hypothesis
# Kruskal test for numeric (skip NaN)
group_frames = [data_vap, data_nva, data_no_hai, data_other_hai, data_both]
res = {}
for factor in FACTORS_numeric:
    res[factor] = kruskal(*(frame[factor].values for frame in group_frames),
                          nan_policy='omit').pvalue

# Join the p-values only once, so that a second 'pvalue' column is never added
if not compare_all_groups.columns.str.contains('pvalue').any():
    compare_all_groups = compare_all_groups.join(pd.DataFrame({'pvalue': res}))

# Chi-square for binary
# The group of each admission does not depend on the factor: compute it once
admission_group = vae_data_main[['ID_subid', 'group']].groupby('ID_subid').max()['group']
for factor in FACTORS_binary:
    factor_present = vae_data_main[['ID_subid', factor]].groupby('ID_subid').max()[factor] > 0
    contingency = pd.crosstab(admission_group, factor_present)
    compare_all_groups.loc[factor, 'pvalue'] = chi(contingency)[1]

# Adjust p-value for multiple comparison (multipletests is called with its default method)
raw_pvalues = compare_all_groups.pvalue.dropna()
compare_all_groups['adjusted_pvalue'] = pd.Series(multipletests(raw_pvalues.values)[1],
                                                  index=raw_pvalues.index)
compare_all_groups['adjusted_pvalue'] = compare_all_groups['adjusted_pvalue'].round(5)
compare_all_groups['pvalue'] = compare_all_groups['pvalue'].round(5)

# Match column names from json dict (context manager so the file handle is closed)
with open("./columns_dict.json") as f:
    columns_dict = json.load(f)
# NOTE(review): this replaces the factor-name index with display names, so the
# cell is not idempotent - rebuild compare_all_groups before running it again
compare_all_groups.index = compare_all_groups.index.to_series().map(columns_dict.get)

# Save table
os.makedirs('./output_data', exist_ok=True)
compare_all_groups.to_csv('./output_data/compare_all_groups.csv', sep='\t', encoding='utf-8')

compare_all_groups

### Compare VA-HARTI and NVA-HARTI

In [None]:
# Descriptive table for VA-HARTI vs NVA-HARTI: one row per factor, and per group
# a "Median / n" column plus a "Q/CI" column ([Q25; Q75] for numeric factors,
# [lower; upper] bounds from `ci`, in percent, for binary factors)
dsets = {'vap': data_vap,
         'nva': data_nva}

# Numeric factors: median and quartiles, missing values ignored
group_dict = {}
for key, d in dsets.items():
    rows = []
    for col in FACTORS_numeric:
        observed = d[col].dropna()
        rows.append((d[col].median(), np.percentile(observed, 25), np.percentile(observed, 75)))
    group_dict[key] = pd.DataFrame(rows,
                                   columns=['Median / n_' + key, 'Q25 / lower_' + key, 'Q75 / upper_' + key],
                                   index=FACTORS_numeric)

numeric = pd.concat([group_dict[key] for key in dsets], axis=1)

# Binary factors: a factor counts as present when its value is > 0
group_dict = {}
for key, d in dsets.items():
    rows = []
    for col in FACTORS_binary:
        nobs = len(d[col])
        count = (d[col] > 0).sum()
        left, right = ci(count, nobs)
        rows.append((count, round(count / nobs * 100, 1), round(left * 100, 1), round(right * 100, 1)))
    group_dict[key] = pd.DataFrame(rows,
                                   columns=['Median / n_' + key, '%_' + key,
                                            'Q25 / lower_' + key, 'Q75 / upper_' + key],
                                   index=FACTORS_binary)

binary = pd.concat([group_dict[key] for key in dsets], axis=1)
# Render the count as "n (p%)" and drop the separate percentage columns
for key in dsets:
    binary['Median / n_' + key] = (binary['Median / n_' + key].astype(int).map(str)
                                   + " (" + binary['%_' + key].map(str) + "%)")
binary = binary.drop(columns=['%_' + key for key in dsets])

#Combine numeric and binary tables
compare_harti_groups = pd.concat([numeric, binary], axis=0)

# Format columns: merge the two bounds into one "[lower; upper]" string.
# Rows are label-indexed, so the bounds are read with .iloc (plain x[0] is
# deprecated positional access on a label index)
for key in dsets:
    bounds = compare_harti_groups[['Q25 / lower_' + key, 'Q75 / upper_' + key]]
    compare_harti_groups['Q/CI_' + key] = bounds.apply(
        lambda x: '[{}; {}]'.format(x.iloc[0], x.iloc[1]), axis=1)

# Keep only the presentation columns
compare_harti_groups = compare_harti_groups[[name
                                             for key in ('vap', 'nva')
                                             for name in ('Median / n_' + key, 'Q/CI_' + key)]]



In [None]:
# Test hypothesis
# Kruskal test for numeric (skip NaN)
res = {}
for factor in FACTORS_numeric:
    res[factor] = kruskal(data_vap[factor].values, data_nva[factor].values, nan_policy='omit').pvalue

# Join the p-values only once, so that a second 'pvalue' column is never added
if not compare_harti_groups.columns.str.contains('pvalue').any():
    compare_harti_groups = compare_harti_groups.join(pd.DataFrame({'pvalue': res}))

# Chi-square for binary
# The group of each admission does not depend on the factor: compute it once
admission_group = vae_data_main[['ID_subid', 'group']].groupby('ID_subid').max()['group']
for factor in FACTORS_binary:
    factor_present = vae_data_main[['ID_subid', factor]].groupby('ID_subid').max()[factor] > 0
    contingency = pd.crosstab(admission_group, factor_present)
    # Only the two HARTI groups take part in this comparison
    contingency = contingency.loc[["NVA-HARTI", "VA-HARTI"]]
    compare_harti_groups.loc[factor, 'pvalue'] = chi(contingency)[1]

# Adjust p-value for multiple comparison (multipletests is called with its default method)
raw_pvalues = compare_harti_groups.pvalue.dropna()
compare_harti_groups['adjusted_pvalue'] = pd.Series(multipletests(raw_pvalues.values)[1],
                                                    index=raw_pvalues.index)
compare_harti_groups['adjusted_pvalue'] = compare_harti_groups['adjusted_pvalue'].round(5)
compare_harti_groups['pvalue'] = compare_harti_groups['pvalue'].round(5)

# Match column names from json dict (context manager so the file handle is closed)
with open("./columns_dict.json") as f:
    columns_dict = json.load(f)
# NOTE(review): this replaces the factor-name index with display names, so the
# cell is not idempotent - rebuild compare_harti_groups before running it again
compare_harti_groups.index = compare_harti_groups.index.to_series().map(columns_dict.get)

# Save table
os.makedirs('./output_data', exist_ok=True)
compare_harti_groups.to_csv('./output_data/compare_harti_groups.csv', sep='\t', encoding='utf-8')

compare_harti_groups

# Population dynamics in time
- [Stationarity](https://www.analyticsvidhya.com/blog/2018/09/non-stationary-time-series-python/)
- [ADF Test](https://www.statsmodels.org/stable/generated/statsmodels.tsa.stattools.adfuller.html)

#### Columns used to diagnose HARTI are excluded from the analysis below:
`cols_dx = ['tbd_sanation', 'fio2', 'purulent_sputum', 'xray_inf', 'pleural_drain', 'temperature']`

In [None]:
# ICU admissions per year: each admission is assigned the max() of its `year` values
year_per_admission = vae_data_main[['ID_subid', 'year']].groupby('ID_subid').max()
year_per_admission.reset_index().groupby('year').count()

In [None]:
# Select patient characteristics and interventions with possible changes over time
# Every entry is analysed as one time series by the dynamics loop further down;
# the list order is the row order of the resulting table.

FACTORS = ['age', 'charlson', # condition on admission
           'gender_M', 'disease_type_trauma', 'disease_type_tumor',
           'disease_type_vascular', 'disease_type_other',

           'st_all_sum', 'st_craniotomy_len_sum', # surgeries
           'st_device_len_sum', 'st_endonasal_len_sum', 'st_endovascular_len_sum',
           'st_other_len_sum', 'st_spinal_len_sum', 'st_all_len_sum',
           'st_device_count', 'st_other_count', 'st_craniotomy_count', 'st_endovascular_count',
           'st_endonasal_count', 'st_spinal_count',

           'gcs', 'rass', 'pbss',  # severity of patient's condition
           'mutism', 'convulsions', 'aphasia', 'vegetative_state',

           'mech_vent_days',   'antibiotics_total_binary_days',  # ICU care
           'central_line_days', 'feeding_tube_days', 'arterial_line_days', 'evd_days', 'icpm_days',
           'urinary_catheter_days', 'hypothermia_days', 'hemodialysis_days', 'total_parenteral_feeding_days',
           'sedation_days', 'anxiolytics_days', 'vasopressors_days', 'days_mech_vent_before_tracheostomy',
           'days_before_tracheostomy', 'endotracheal_tube_1_days', 'endotracheal_tube_2_days',
           'endotracheal_tube_3_days', 'mech_vent',   'antibiotics_total_binary',
           'central_line', 'feeding_tube', 'arterial_line', 'evd', 'icpm',
           'urinary_catheter', 'hypothermia', 'hemodialysis', 'total_parenteral_feeding',
           'sedation', 'anxiolytics', 'vasopressors',

           'intestinal_dysfunction', 'infection_bloodstream', 'infection_other',  # complications
           'infection_urinary', 'infection_cns', 'infection_ssi', 'csfl_ne', 'csfl_ss'
           ]

In [None]:
##### ADF (augmented Dickey-Fuller) test for stationarity: the null hypothesis is that the series
##### has a unit root, i.e. is non-stationary. If we fail to reject it (p-value > 0.05),
##### we cannot conclude that the series is stationary.

# Define function to calculate timeseries statistics
def get_yy_by_col(dataframe, yy_col='gender_M', date_col='yearmonth', abs_values='ratio',
                  inner_agg_func='max', uid='ID_subid', return_by=None):
    """Aggregate one column per unit and summarise it per period.

    Rows are first collapsed to one value per ``uid`` with ``inner_agg_func``;
    each unit is assigned the smallest ``date_col`` value among its rows, and
    the per-unit values are then summarised within each ``date_col`` group.

    Args:
        dataframe: frame containing ``uid``, ``yy_col`` and ``date_col`` columns.
        yy_col: column to summarise.
        date_col: grouping column; one of 'yearmonth', 'year', 'date', 'halfyear'.
            With 'date' a single 'overall' value is returned instead of one
            value per period.
        abs_values: summary per period - 'ratio' (sum / count), 'count', 'sum',
            'median', or 'meanwithoutzeros' (zeros are replaced by NaN and,
            despite the name, ``np.nanmedian`` is applied).
        inner_agg_func: aggregation applied per unit. With 'sum' together with
            ``abs_values='median'`` only units with a positive sum are kept.
        uid: unit identifier column.
        return_by: only validated ('year' requires ``date_col='date'``);
            not used otherwise.

    Returns:
        dict with ``yy_col`` (the summary), ``yy_col + '_ci'`` (strings
        "lower; upper" indexed by "<period>_ci") and, for 'ratio',
        ``yy_col + '_count'`` (per-period sums indexed by "<period>_count").
        The bounds come from the external ``ci`` helper, or from the 25th/75th
        percentiles when the sum/median combination is used or ``ci`` yields
        an all-null bound.

    Raises:
        ValueError: if ``abs_values`` is not one of the supported options.
    """
    assert date_col in ['yearmonth', 'year', 'date', 'halfyear']
    assert return_by != 'year' or date_col == 'date', 'Return by year only works with date'

    # Equality, not the substring test `in 'sum'`, so that e.g. 's' or '' do not qualify
    sum_median = inner_agg_func == 'sum' and abs_values == 'median'

    ID_subid_first_date = dataframe[[uid, date_col]].groupby(uid).min()
    per_unit = dataframe[[uid, yy_col]].groupby(uid).agg(inner_agg_func)

    if sum_median:
        # Keep only units whose aggregated value is positive
        mask = (per_unit > 0)
        a = per_unit.join(ID_subid_first_date)[mask.values].groupby(date_col)
    else:  # default behaviour
        a = per_unit.join(ID_subid_first_date).groupby(date_col)

    res_sum = None
    if abs_values == 'ratio':
        res_sum = a.sum()
        res = res_sum / a.count()
    elif abs_values in ['count', 'sum', 'median']:
        res = a.agg(abs_values)
    elif abs_values == 'meanwithoutzeros':
        res = a.agg(lambda x: x.replace(0., np.nan).agg(np.nanmedian))
    else:
        raise ValueError(f"Unsupported abs_values: {abs_values!r}")

    def _quartile_bounds(overall):
        # Positions 4 and 6 of describe() are the 25% and 75% percentiles
        if date_col == 'date':
            percentiles_25_75 = overall().describe().T.iloc[:, [4, 6]]
            percentiles_25_75.index = [0]
        else:
            percentiles_25_75 = a.describe().iloc[:, [4, 6]]
        return percentiles_25_75.iloc[:, [0]], percentiles_25_75.iloc[:, [1]]

    if sum_median:
        cil, cir = _quartile_bounds(a.sum)
    elif date_col == 'date':
        cil, cir = ci(a.sum().sum(), a.count().sum())
        cil, cir = pd.DataFrame(cil).T, pd.DataFrame(cir).T
    else:
        cil, cir = ci(a.sum(), a.count())

    # Fall back to percentiles when ci() produced an all-null bound
    if cir.isnull().all().any() or cil.isnull().all().any():
        cil, cir = _quartile_bounds(a.median)

    cil.columns = ['cil']
    cir.columns = ['cir']

    vals = [f"{l:.3f}; {r:.3f}" for l, r in zip(cil['cil'], cir['cir'])]
    cols = cil.index.astype('str') + '_ci'
    res_ci = pd.DataFrame(vals, index=cols, columns=[yy_col])

    if date_col != 'date':
        summary = res
    elif sum_median:
        summary = pd.DataFrame(np.median(res.values), columns=[yy_col], index=['overall'])
    else:
        # Covers inner_agg_func='max' with 'median' (day_in_icu_max, los) and every other case
        summary = pd.DataFrame(res.median().values, columns=[yy_col], index=['overall'])
    result_dict = {yy_col: summary, yy_col + '_ci': res_ci}

    if abs_values == 'ratio':
        res_sum.index = res_sum.index.astype('str') + '_count'
        result_dict[yy_col + '_count'] = res_sum

    return result_dict


In [None]:
# Assess dynamics in factors using both linreg and Adfuller tests; monthly data;
# Date for each patient is attributed to the month of admission

table = {}
for col in FACTORS:
    # NOTE(review): FACTORS as defined in this notebook contains neither 'outcome_death',
    # 'ID_subid' nor any '*_len_mean' column, so only the 'ratio' branch appears to run
    UID = 'ID_subid' if col != 'outcome_death' else 'ID'

    if col == 'ID_subid':
        abs_value_flag = 'count'
    elif 'st_' in col and '_len_mean' in col:
        abs_value_flag = 'meanwithoutzeros'
    else:
        abs_value_flag = 'ratio'

    # Monthly series used by both tests
    ts = get_yy_by_col(vae_data_main, yy_col=col, date_col='yearmonth', abs_values=abs_value_flag, uid=UID)[col]

    # Fill gaps forward, then backward. `.iloc[0]` and `.ffill()/.bfill()` replace the
    # deprecated positional `[0]` and `fillna(method=...)` forms
    if ts.isna().max().iloc[0]:  # if Nan
        ts = ts.ffill().bfill()

    monthly = ts.values.reshape(-1)
    pval = adfuller(monthly)[1]
    # NOTE(review): the series is passed as x and the time index as y
    a = linregress(monthly, np.arange(len(monthly))).pvalue

    # Per-year values ('2011, 2012, ...') and the single 'overall' value
    yearly = get_yy_by_col(vae_data_main, yy_col=col, date_col='year', abs_values=abs_value_flag, uid=UID)
    overall = get_yy_by_col(vae_data_main, yy_col=col, date_col='date', abs_values=abs_value_flag, uid=UID)

    to_concat = [yearly[col], yearly[col + '_ci'], overall[col], overall[col + '_ci']]
    if abs_value_flag == 'ratio':
        # The per-year counts are only returned for ratios
        to_concat += [yearly[col + '_count']]

    table[col] = pd.concat(to_concat).to_dict()[col]
    table[col]['adfuller_pvalue'] = np.round(pval, 5)
    table[col]['linreg'] = np.round(a, 5)
    

In [None]:
# Final table: one row per factor, one column per key collected in `table`
table_years = pd.DataFrame(table).transpose()

# Holm adjustment of the linear-regression p-values for multiple comparisons
holm_outcome = multitest.multipletests(table_years.linreg, alpha=0.05, method='holm')
table_years['adj_linreg'] = holm_outcome[1]

table_years.to_csv('./table_years.csv', sep='\t', encoding='utf-8')


________