In [13]:
import os
import numpy as np
import pandas as pd
from biomarker_modeling.biomarker_model import BiomarkerModel


# Names of the event and duration columns; both are forwarded to BiomarkerModel
# and used to select columns from the NHANES table below.
event_col = 'MORTSTAT'
duration_col = 'PERMTH_EXM'

gender = 'male'

output_dir = f'data/output_{gender}'
df = pd.read_csv(f'data/NHANES_data_{gender}.csv')

# Biomarker metadata indexed by feature name, restricted to the features whose
# source for this gender is NHANES.
biomarker_info = pd.read_excel('data/biomarker_info.xlsx', sheet_name='biomarker_info', index_col='feature')
biomarker_info = biomarker_info.loc[biomarker_info[f'source_{gender}'].isin(['NHANES'])]
features = biomarker_info.index


for feature in features:

    print(f'Fitting model for {feature}...')

    # Per-feature output folder; the prepared input table is written inside it.
    model_dir = f'{output_dir}/{feature}/'
    os.makedirs(model_dir, exist_ok=True)
    input_path = f'{model_dir}/input_data.csv'

    info = biomarker_info.loc[feature]
    marker_columns = [duration_col, event_col, feature, 'age']

    # Default: all rows. For cardiovascular biomarkers, keep only rows whose
    # cause is 'cardiovascular' or 'alive'.
    df_marker = df[marker_columns]
    if info.domain == 'cardiovascular':
        keep_cause = df['cause'].isin(['cardiovascular', 'alive'])
        df_marker = df.loc[keep_cause, marker_columns]

    # Keep values inside the inclusive 2.5th-97.5th percentile window (so the
    # model is not fitted to extreme outliers) and rows with a positive
    # duration. Missing values fail both comparisons and are dropped.
    values = df_marker[feature]
    in_range = values.between(values.quantile(0.025), values.quantile(0.975))
    positive_duration = df_marker[duration_col] > 0
    df_marker = df_marker[in_range & positive_duration]

    df_marker.to_csv(input_path, index=False)

    # NOTE(review): the model is only constructed here; presumably the
    # constructor reads data_path and performs the fit — confirm in BiomarkerModel.
    model = BiomarkerModel(
        feature=feature,
        duration_col=duration_col,
        event_col=event_col,
        data_path=input_path,
        model_dir=model_dir,
        n_bootstraps=0,
        feature_info={'units': info.units, 'label': feature},
    )


Fitting model for a1c...
Fitting model for albumin...
Fitting model for alp...
Fitting model for alt...
Fitting model for apoa1...
Fitting model for apob...
Fitting model for apob_apoa1_ratio...
Fitting model for ast...
Fitting model for bmi...
Fitting model for calcium...
Fitting model for creatinine...
Fitting model for ferritin...
Fitting model for folate...
Fitting model for ggt...
Fitting model for glucose_fasting...
Fitting model for glucose_random...
Fitting model for hct...
Fitting model for hdlc...
Fitting model for hemoglobin...
Fitting model for insulin_fasting...
Fitting model for iron...
Fitting model for ldlc...
Fitting model for lymphocytes...
Fitting model for mch...
Fitting model for mchc...
Fitting model for mcv...
Fitting model for monocytes...
Fitting model for mpv...
Fitting model for neutrophils...
Fitting model for nonhdlc...
Fitting model for phosphate...
Fitting model for platelets...
Fitting model for rbc...
Fitting model for rdw...
Fitting model for saturatio

In [17]:
from custom_modeling import delta_age_creation

# Load one configuration row per feature from the 'external_references' sheet.
configs = pd.read_excel('data/biomarker_info.xlsx', sheet_name='external_references')

for _, config in configs.iterrows():

    # Make sure the feature's model/ and plots/ folders exist before writing.
    feature_dir = f'{output_dir}/{config.feature}'
    model_dir, plots_dir = f'{feature_dir}/model', f'{feature_dir}/plots'
    for directory in (model_dir, plots_dir):
        os.makedirs(directory, exist_ok=True)

    delta_age_creation.write_delta_age_array(config, output_dir)


In [21]:
import json
from model_aggregating import aggregation
from glob import glob

# Collect every per-feature delta-age array under the output directory, sorted by path.
delta_age_paths = sorted(glob(f'{output_dir}/*/model/delta_age-array.csv'))

# Each file lives at <output_dir>/<feature>/model/delta_age-array.csv, so the
# feature name is the grandparent directory of the file. Deriving it from the
# path structure (instead of a fixed index into a '/'-split) keeps this correct
# when output_dir has a different depth and on platforms whose glob results use
# a different path separator.
features = [
    os.path.basename(os.path.dirname(os.path.dirname(path)))
    for path in delta_age_paths
]
delta_age_paths = dict(zip(features, delta_age_paths))

# Persist the feature -> path mapping; the aggregation step below is given this file's path.
with open(f'{output_dir}/delta_age_paths.json','w') as f:
    json.dump(delta_age_paths, f, indent=4)


delta_age_df = aggregation.get_delta_age_dataframe(
    f'data/NHANES_data_{gender}.csv',
    f'{output_dir}/delta_age_paths.json',
    output_dir
)

delta_age_df.to_csv(f'{output_dir}/biomarker_delta_ages.csv', index=False)
weights = aggregation.get_feature_weights(delta_age_df, output_dir)


# Need to resave the weighted delta-age arrays.