# Creating a data table from Arivale for prediction

In [1]:
!ls ../copied_csvs

bp.csv		       mets_t2d.csv	       prots_t2d_full.csv
chems_all.csv	       mets_t2d_full.csv       saliva.csv
chems_subset.csv       mets_t2d_pvals.csv      selected_patient_table.csv
clients.csv	       not_t2d_client_ids.csv  snps.csv
meds.csv	       not_t2d_samples.csv     t2d_client_ids.csv
mets_imputed.csv       prots_not_t2d.csv       t2d_samples.csv
mets_not_t2d.csv       prots_not_t2d_full.csv  weight.csv
mets_not_t2d_full.csv  prots_subset.csv
mets_subset.csv        prots_t2d.csv


## 1. Load data, preprocessing

In [2]:
import numpy as np
import pandas as pd

In [3]:
chems = pd.read_csv('../copied_csvs/chems_all.csv', index_col=0, dtype={'public_client_id': np.str_})
clients = pd.read_csv('../copied_csvs/clients.csv', index_col=0, dtype={'public_client_id': np.str_})
bp = pd.read_csv('../copied_csvs/bp.csv', index_col=0, dtype={'public_client_id': np.str_})
weight = pd.read_csv('../copied_csvs/weight.csv', index_col=0, dtype={'public_client_id': np.str_})
meds = pd.read_csv('../copied_csvs/meds.csv', index_col=0, dtype={'public_client_id': np.str_})

In [4]:
prots_t2d = pd.read_csv('../copied_csvs/prots_t2d.csv', index_col=0,
                       dtype={'public_client_id':np.str_})
prots_not_t2d = pd.read_csv('../copied_csvs/prots_not_t2d.csv', index_col=0,
                            dtype={'public_client_id':np.str_})
prots_all = pd.concat([prots_t2d, prots_not_t2d])
# we want to keep at least one is_t2d column.

In [5]:
mets_t2d = pd.read_csv('../copied_csvs/mets_t2d.csv', index_col=0,
                       dtype={'public_client_id':np.str_})
mets_not_t2d = pd.read_csv('../copied_csvs/mets_not_t2d.csv', index_col=0,
                            dtype={'public_client_id':np.str_})
mets_all = pd.concat([mets_t2d, mets_not_t2d])
if 'is_t2d' in mets_all.columns:
    del mets_all['is_t2d']

In [6]:
samples_t2d = pd.read_csv('../copied_csvs/t2d_samples.csv', index_col=0,
                       dtype={'public_client_id':np.str_})
samples_t2d = pd.read_csv('../copied_csvs/not_t2d_samples.csv', index_col=0,
                            dtype={'public_client_id':np.str_})
samples_all = pd.concat([samples_t2d, samples_t2d])

  samples_t2d = pd.read_csv('../copied_csvs/not_t2d_samples.csv', index_col=0,


### Analysis of questionnaires and medications

In [7]:
diabetes_type2_current = 'assessment:health-history:cardiometabolic_type2_diabetes:self_current'
samples_all[diabetes_type2_current]
samples_all['is_t2d'] = samples_all[diabetes_type2_current]

  samples_all['is_t2d'] = samples_all[diabetes_type2_current]


In [8]:
samples_all['is_t1d'] = samples_all['assessment:health-history:autoimmune_type1_diabetes:self_current']

  samples_all['is_t1d'] = samples_all['assessment:health-history:autoimmune_type1_diabetes:self_current']


In [9]:
samples_all['assessment:health-history:cancer_basal_cell_carcinoma:self_current']
samples_all['assessment:health-history:cardiometabolic_metabolic_syndrome:self_current']
samples_all['assessment:health-history:cardiovascular_hypertension:self_current']
samples_all['assessment:health-history:bladder_kidney_kidney_disease:self_current']

0        False
1          NaN
2        False
3          NaN
4        False
         ...  
11880    False
11881      NaN
11882    False
11883    False
11884    False
Name: assessment:health-history:bladder_kidney_kidney_disease:self_current, Length: 23344, dtype: object

In [10]:
samples_all.columns[['med' in x for x in samples_all.columns]]

Index(['assessment:digestion:medications:enum',
       'assessment:health-history:over_the_counter_medication_use:int',
       'assessment:health-history:prescription_medication_use_yes_no:int',
       'assessment:journey:daily_medical_support:int',
       'assessments:journey:medical_treatment',
       'assessment_health_history_prescription_medication_use_yes_no_int',
       'assessment_health_history_over_the_counter_medication_use_int',
       'meds_cholesterol', 'meds_blood_sugar', 'meds_blood_pressure',
       'meds_antibiotics_last_3_months'],
      dtype='object')

In [11]:
samples_all['meds_blood_sugar'].unique()

array(['No', nan, 'Yes'], dtype=object)

In [12]:
samples_all['meds_blood_pressure'].unique()

array(['Yes', nan, 'No'], dtype=object)

In [13]:
# TODO: identify clients that are on diabetes medication
# unfortunately, the data does not seem to show medication start times.
# diabetes drugs:
# - insulin
# - metformin
# - pioglitazone (thiazolidinediones )
# - dulaglutide, liraglutide, exenatide  (GLP-1 agonists)
# - canagliflozin (SGLT2 inhibitors)
# - glyburide, glipizide (Sulfonylureas)
# - pioglitazone

### Removing duplicates

In [14]:
chems = chems.sort_values(['public_client_id', 'days_in_program'])
print(chems.shape)

(11167, 140)


In [15]:
bp.index = pd.MultiIndex.from_frame(bp[['public_client_id', 'days_in_program']])
bp = bp[~bp.index.duplicated(keep='first')]

chems.index = pd.MultiIndex.from_frame(chems[['public_client_id', 'days_in_program']])
chems = chems[~chems.index.duplicated(keep='first')]

weight.index = pd.MultiIndex.from_frame(weight[['public_client_id', 'days_in_program']])
weight = weight[~weight.index.duplicated(keep='first')]

prots_all.index = pd.MultiIndex.from_frame(prots_all[['public_client_id', 'days_in_program']])
prots_all = prots_all[~prots_all.index.duplicated(keep='first')]

mets_all.index = pd.MultiIndex.from_frame(mets_all[['public_client_id', 'days_in_program']])
mets_all = mets_all[~mets_all.index.duplicated(keep='first')]

In [16]:
samples_all.index = pd.MultiIndex.from_frame(samples_all[['public_client_id', 'days_in_program']])
samples_all = samples_all[~samples_all.index.duplicated(keep='first')]

In [17]:
print(chems.shape)

(11167, 140)


### Generating a prev/next field for chems

In [18]:
chems['days_since_previous'] = [0]*len(chems)
chems['days_til_next'] = [0]*len(chems)

chems_selected = ['HbA1C', 'GFR', 'GLUCOSE', 'INSULIN', 'HOMA-IR']
chems_to_column = {
    'HbA1C': 'GLYCOHEMOGLOBIN A1C',
    'GFR': 'GFR, MDRD',
    'GLUCOSE': 'GLUCOSE',
    'INSULIN': 'INSULIN',
    'HOMA-IR': 'HOMA-IR'
}

for c in chems_selected:
    chems['d_'+c] = [0]*len(chems)
    chems['prev_'+c] = [0]*len(chems)
    chems['next_'+c] = [0]*len(chems)

In [19]:
current_id = None
prev_time = 0
prev_chems = {c: 0 for c in chems_selected}
prev_glucose = 0
prev_hba1c = 0
prev_gfr = 0
for i, (index, row) in enumerate(chems.iterrows()):
    if row.public_client_id != current_id:
        current_id = row.public_client_id
    else:
        chems.loc[index, 'days_since_previous'] = row.days_in_program - prev_time
        for c in chems_selected:
            chems.loc[index, 'prev_' + c] = prev_chems[c]
    prev_time = row.days_in_program
    for c in chems_selected:
        prev_chems[c] = row[chems_to_column[c]]
    if i < len(chems) - 1:
        next_row = chems.iloc[i+1]
        if next_row.public_client_id == current_id:
            chems.loc[index, 'days_til_next'] = next_row.days_in_program - prev_time
            for c in chems_selected:
                chems.loc[index, 'next_' + c] = next_row[chems_to_column[c]]
                chems.loc[index, 'd_' + c] = next_row[chems_to_column[c]] - row[chems_to_column[c]]

#### Generating longer-term differences (1 year, 2 years, etc)

In [20]:
chems['days_til_next_1y'] = [0]*len(chems)
chems['days_til_next_2y'] = [0]*len(chems)

for c in chems_selected:
    chems['next_1y_'+c] = [np.nan]*len(chems)
    chems['d_1y_'+c] = [np.nan]*len(chems)
    chems['next_2y_'+c] = [np.nan]*len(chems)
    chems['d_2y_'+c] = [np.nan]*len(chems)

In [21]:
current_id = None
prev_time = 0
for i, (index, row) in enumerate(chems.iterrows()):
    if row.public_client_id != current_id:
        current_id = row.public_client_id
    prev_time = row.days_in_program
    j = i + 1
    while j < len(chems) - 1:
        next_row = chems.iloc[j]
        if next_row.public_client_id == current_id:
            days_til_next = next_row.days_in_program - prev_time
            if days_til_next < 420 and days_til_next > 300:
                chems.loc[index, 'days_til_next_1y'] = days_til_next
                for c in chems_selected:
                    chems.loc[index, 'next_1y_' + c] = next_row[chems_to_column[c]]
                    chems.loc[index, 'd_1y_' + c] = next_row[chems_to_column[c]] - row[chems_to_column[c]]
            if days_til_next > 660 and days_til_next < 780:
                chems.loc[index, 'days_til_next_2y'] = days_til_next
                for c in chems_selected:
                    chems.loc[index, 'next_2y_' + c] = next_row[chems_to_column[c]]
                    chems.loc[index, 'd_2y_' + c] = next_row[chems_to_column[c]] - row[chems_to_column[c]]
            j += 1
        else:
            break

In [22]:
print((~chems['next_1y_GFR'].isna()).sum(), (~chems['next_2y_GFR'].isna()).sum())

2407 695


## 2. Creating a combined clinical-metabolomic-proteomic table

In [23]:
combined_table_mets = pd.concat([mets_all, prots_all, chems], axis=1)

In [24]:
combined_table_mets_filtered = combined_table_mets[~combined_table_mets['public_client_id'].isna().any(1)]

  combined_table_mets_filtered = combined_table_mets[~combined_table_mets['public_client_id'].isna().any(1)]


In [25]:
combined_table_mets_filtered = combined_table_mets_filtered.loc[:, ~combined_table_mets_filtered.columns.duplicated()]

In [26]:
combined_table_mets_filtered.shape

(3182, 1170)

In [27]:
# TODO: this messes up the dtypes
combined_table_mets_filtered = combined_table_mets_filtered.T.drop_duplicates(keep='first').T
combined_table_mets_filtered = combined_table_mets_filtered.infer_objects()

In [28]:
combined_table_mets_filtered.shape

(3182, 1170)

In [29]:
combined_table_mets_filtered = combined_table_mets_filtered.sort_index()

### Adding additional fields to the table - age, weight, meds, etc

In [30]:
# TODO: add meds_blood_sugar as a feature
samples_all.index = samples_all.public_client_id
weight.index = weight.public_client_id

In [31]:
def get_nearest_index(df, col, val):
    "Gets the nearest index to the val, given a df sorted by col."
    min_distance = np.inf
    best_index = None
    best_row = None
    for index, row in df.iterrows():
        distance = np.abs(row[col] - val)
        if distance < min_distance:
            min_distance = distance
            best_index = index
            best_row = row
        else:
            if best_index is not None:
                return best_index, best_row
    return best_index, best_row

In [32]:
clients.index = clients.public_client_id
combined_table_mets_filtered = combined_table_mets_filtered.copy()
combined_table_mets_filtered['age'] = np.zeros(combined_table_mets_filtered.shape[0]) + np.nan
combined_table_mets_filtered['is_m'] = np.zeros(combined_table_mets_filtered.shape[0]) + np.nan
combined_table_mets_filtered['weight'] = np.zeros(combined_table_mets_filtered.shape[0]) + np.nan
combined_table_mets_filtered['height'] = np.zeros(combined_table_mets_filtered.shape[0]) + np.nan
combined_table_mets_filtered['bmi'] = np.zeros(combined_table_mets_filtered.shape[0]) + np.nan
combined_table_mets_filtered['meds_blood_sugar'] = np.zeros(combined_table_mets_filtered.shape[0]) + np.nan


for i, row in combined_table_mets_filtered.iterrows():
    # ci is client id
    ci = i[0]
    days = i[1]
    try:
        combined_table_mets_filtered.loc[i,'age'] = clients.loc[ci].age
        combined_table_mets_filtered.loc[i,'is_m'] = int(clients.loc[ci].sex == 'M')
    except:
        print('age not available for', ci)
    try:
        weights_subset = weight.loc[ci]
        if isinstance(weights_subset, pd.DataFrame):
            weight_index, weight_row = get_nearest_index(weights_subset, 'days_in_program', days)
        else:
            weight_row = weights_subset
        combined_table_mets_filtered.loc[i,'weight'] = weight_row.WEIGHT_CALC
        combined_table_mets_filtered.loc[i,'height'] = weight_row.HEIGHT_CALC
        combined_table_mets_filtered.loc[i,'bmi'] = weight_row.BMI_CALC
    except:
        print('weights not available for', ci)
    try:
        samples_subset = samples_all.loc[ci]
        if isinstance(samples_subset, pd.DataFrame):
            sample_index, sample_row = get_nearest_index(samples_subset, 'days_in_program', days)
            combined_table_mets_filtered.loc[i,'meds_blood_sugar'] = sample_row.meds_blood_sugar
        else:
            combined_table_mets_filtered.loc[i,'meds_blood_sugar'] = samples_subset.meds_blood_sugar
    except:
        print('sample questionnaires not available for', ci)

age not available for 01000261
sample questionnaires not available for 01021265
weights not available for 01029483
weights not available for 01029483
weights not available for 01029483
sample questionnaires not available for 01036226
age not available for 01037794
age not available for 01039674
age not available for 01049319
age not available for 01051690
age not available for 01053293
age not available for 01059410
sample questionnaires not available for 01066842
sample questionnaires not available for 01067542
sample questionnaires not available for 01067542
sample questionnaires not available for 01067542
age not available for 01071744
age not available for 01090838
age not available for 01097466
age not available for 01111951
age not available for 01112253
age not available for 01112759
age not available for 01116375
age not available for 01131110
age not available for 01136823
age not available for 01140627
age not available for 01148961
age not available for 01156139
age not avai

In [33]:
combined_table_mets_filtered

Unnamed: 0_level_0,Unnamed: 1_level_0,public_client_id,sample_id,days_in_program,days_since_first_call,days_since_first_draw,month,weekday,season,55,93,...,next_1y_HOMA-IR,d_1y_HOMA-IR,next_2y_HOMA-IR,d_2y_HOMA-IR,age,is_m,weight,height,bmi,meds_blood_sugar
public_client_id,days_in_program,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
01000261,65,01000261,A477AV558-002,65.0,-9.0,0.0,Jan,Tue,winter,1.155771,0.947589,...,,,,,,,142.0,66.0,22.916896,No
01001621,11,01001621,A776BI445-003,11.0,-31.0,0.0,Jul,Mon,summer,1.328050,1.366908,...,,,,,54.0,0.0,140.0,65.0,23.294675,No
01001621,265,01001621,A391BM948-002,265.0,223.0,254.0,Apr,Wed,spring,0.619724,1.334809,...,,,,,54.0,0.0,140.0,65.0,23.294675,No
01002183,13,01002183,A595AV320-002,13.0,-1.0,0.0,Jan,Wed,winter,0.425073,0.850577,...,2.040741,-0.751852,1.622222,-1.17037,68.0,0.0,128.0,65.0,21.297988,No
01002412,13,01002412,A294AU415-002,13.0,-6.0,0.0,Nov,Thu,fall,1.008430,0.877139,...,,,,,52.0,0.0,135.0,63.0,23.911565,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HX344502,25,HX344502,A392BL829-002,25.0,21.0,0.0,Feb,Fri,winter,1.378031,1.017652,...,,,,,44.0,0.0,175.4,60.5,33.687917,No
HX409129,5,HX409129,A581BK409-002,5.0,2.0,0.0,Dec,Fri,winter,2.116840,,...,1.044938,0.184938,,,47.0,0.0,179.4,64.0,30.790576,No
HX460562,28,HX460562,A641BO324-003,28.0,7.0,0.0,Aug,Thu,summer,1.731506,0.804149,...,,,,,48.0,0.0,330.0,68.0,50.170848,No
HX794171,56,HX794171,A229BM682-002,56.0,-17.0,0.0,Mar,Fri,spring,0.660610,,...,,,,,37.0,1.0,245.7,72.0,33.319271,No


In [34]:
combined_table_mets_filtered[['GLYCOHEMOGLOBIN A1C', 'next_HbA1C', 'd_HbA1C']]

Unnamed: 0_level_0,Unnamed: 1_level_0,GLYCOHEMOGLOBIN A1C,next_HbA1C,d_HbA1C
public_client_id,days_in_program,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
01000261,65,5.1,5.5,0.4
01001621,11,5.6,5.5,-0.1
01001621,265,5.5,0.0,0.0
01002183,13,5.3,5.7,0.4
01002412,13,4.8,5.2,0.4
...,...,...,...,...
HX344502,25,5.2,0.0,0.0
HX409129,5,5.3,5.3,0.0
HX460562,28,6.1,5.8,-0.3
HX794171,56,5.2,0.0,0.0


In [35]:
(combined_table_mets_filtered.next_HbA1C == 0).sum()

629

In [36]:
(combined_table_mets_filtered.next_HbA1C.isna()).sum()

19

In [37]:
(combined_table_mets_filtered.next_GLUCOSE.isna()).sum()

17

### Averaging duplicated protein columns - if protein columns are duplicated across sets, average them.

In [38]:
prot_names = pd.read_csv('../arivale_data/arivale_prots.tsv', sep='\t')
gene_names = prot_names.groupby('gene_name')['index'].unique()
duplicate_gene_names = gene_names[gene_names.map(len) > 1]
combined_table_mets_prots = combined_table_mets_filtered.copy()
for index, ids in duplicate_gene_names.items():
    table_ids = [i for i in combined_table_mets_filtered.columns if i in ids]
    if len(table_ids) > 1:
        print(index, table_ids)
        print(combined_table_mets_filtered[table_ids].corr())
        table_mean = combined_table_mets_filtered[table_ids].mean(axis=1, skipna=True)
        combined_table_mets_prots[table_ids[0]] = table_mean
        combined_table_mets_prots = combined_table_mets_prots.drop(table_ids[1:], axis=1)

CCL2 ['CVD3_P13500', 'INF_P13500']
             CVD3_P13500  INF_P13500
CVD3_P13500     1.000000    0.714038
INF_P13500      0.714038    1.000000
CCL3 ['CVD2_P10147', 'INF_P10147']
             CVD2_P10147  INF_P10147
CVD2_P10147     1.000000    0.905243
INF_P10147      0.905243    1.000000
CXCL1 ['CVD2_P09341', 'INF_P09341']
             CVD2_P09341  INF_P09341
CVD2_P09341     1.000000    0.949313
INF_P09341      0.949313    1.000000
FGF21 ['CVD2_Q9NSA1', 'INF_Q9NSA1']
             CVD2_Q9NSA1  INF_Q9NSA1
CVD2_Q9NSA1     1.000000    0.974045
INF_Q9NSA1      0.974045    1.000000
FGF23 ['CVD2_Q9GZV9', 'INF_Q9GZV9']
             CVD2_Q9GZV9  INF_Q9GZV9
CVD2_Q9GZV9     1.000000    0.853413
INF_Q9GZV9      0.853413    1.000000
IL18 ['CVD2_Q14116', 'INF_Q14116']
             CVD2_Q14116  INF_Q14116
CVD2_Q14116     1.000000    0.705567
INF_Q14116      0.705567    1.000000
IL6 ['CVD2_P05231', 'INF_P05231']
             CVD2_P05231  INF_P05231
CVD2_P05231     1.000000    0.956517
INF_P05231   

In [39]:
print(combined_table_mets_prots.shape)

(3182, 1166)


### Taking only the first time point

In [40]:
#  try taking only the first patient?
first_items = []
prev_client = None
for c in combined_table_mets_prots.public_client_id:
    if c == prev_client:
        first_items.append(False)
    else:
        prev_client = c
        first_items.append(True)
first_items = np.array(first_items)

combined_table_mets_filtered_first_items = combined_table_mets_prots[first_items]

In [41]:
print(combined_table_mets_filtered_first_items.shape)

(2008, 1166)


### Filtering columns by NaNs, selecting columns to use

1. remove rows that have missing next-time-point values
2. identify -omics/clinical columns that have less than 10% missingness, select these columns to use for further analysis

In [42]:
combined_table_mets_filtered_1 = combined_table_mets_filtered_first_items[(combined_table_mets_filtered_first_items.next_HbA1C != 0) &\
                                                (combined_table_mets_filtered_first_items.next_GFR != 0) &\
                                                (combined_table_mets_filtered_first_items.next_GLUCOSE != 0) &\
                                                (combined_table_mets_filtered_first_items.next_INSULIN != 0) &\
                                                (~combined_table_mets_filtered_first_items.next_HbA1C.isna()) &\
                                                (~combined_table_mets_filtered_first_items.next_GFR.isna()) &\
                                                (~combined_table_mets_filtered_first_items.next_GLUCOSE.isna()) &\
                                                (~combined_table_mets_filtered_first_items.next_INSULIN.isna())]

In [43]:
for chem, col_name in chems_to_column.items():
    combined_table_mets_filtered_1 = combined_table_mets_filtered_1[~combined_table_mets_filtered_1[col_name].isna()]

In [44]:
print(combined_table_mets_filtered_1.shape)

(1705, 1166)


In [45]:
na_counts = combined_table_mets_filtered_1.isna().sum(0)

Cutofff of 10% missingness for columns

In [46]:
less_than_10 = (na_counts < combined_table_mets_filtered_1.shape[0]*0.1) & (combined_table_mets_filtered_1.dtypes == np.float64)

In [47]:
is_next = combined_table_mets_filtered_1.columns.str.contains('d_') |\
          combined_table_mets_filtered_1.columns.str.contains('next_')| \
          combined_table_mets_filtered_1.columns.str.contains('prev_')

In [48]:
selected_columns_full = combined_table_mets_filtered_1.columns[less_than_10 & ~is_next].to_numpy()
columns_to_exclude = set(['client_id', 'days_til_next', 'days_since_previous', 'days_til_next_1y', 'days_til_next_2y',
                          'observation_id', 'days_in_program',
                          'days_since_first_call', 'days_since_first_draw'])
selected_columns_full = np.array([x for x in selected_columns_full if x not in columns_to_exclude])

In [49]:
selected_chem_bp_cols = [x for x in selected_columns_full if x in chems.columns or x in bp.columns or x in ['age', 'is_m', 'bmi', 'meds_blood_sugar'] or x in weight.columns]

print('number of total clinical columns:', len(selected_chem_bp_cols))

number of total clinical columns: 70


In [50]:
print(selected_chem_bp_cols)

['A/G RATIO', 'ADIPONECTIN, SERUM', 'ALAT (SGPT)', 'ALBUMIN', 'ALKALINE PHOSPHATASE', 'ARACHIDONIC ACID', 'ASAT (SGOT)', 'BASOPHILS', 'BASOPHILS ABSOLUTE', 'BILIRUBIN, TOTAL', 'BUN/CREAT RATIO', 'CALCIUM', 'CARBON DIOXIDE (CO2)', 'CHLORIDE', 'CHOLESTEROL, TOTAL', 'CREATININE ENZ, SER', 'CRP HIGH SENSITIVITY', 'DHA', 'EOSINOPHILS', 'EOSINOPHILS ABSOLUTE', 'EPA', 'FERRITIN', 'FOLIC ACID, SERUM', 'GFR, MDRD', 'GGT', 'GLOBULIN', 'GLUCOSE', 'GLYCOHEMOGLOBIN A1C', 'HDL CHOL DIRECT', 'HEMATOCRIT', 'HEMOGLOBIN', 'HOMA-IR', 'HOMOCYSTEINE, SERUM', 'IL-6', 'IL-8', 'INSULIN', 'LDL PARTICLE NUMBER', 'LDL SMALL', 'LDL-CHOL CALCULATION', 'LEAD, BLOOD', 'LYMPHOCYTES', 'LYMPHOCYTES ABSOLUTE', 'MCH', 'MCHC', 'MCV', 'MERCURY, BLOOD', 'METHYLMALONIC ACID', 'MONOCYTES', 'MONOCYTES ABSOLUTE', 'OMEGA-3 INDEX', 'OMEGA-6/OMEGA-3 RATIO', 'PLATELET COUNT THOUSAND', 'POTASSIUM', 'PROTEIN, TOTAL SERUM', 'RDW', 'RED CELL COUNT', 'SODIUM', 'TNF-ALPHA', 'TOTAL NEUTROPHILS', 'TOTAL NEUTROPHILS AB', 'TRIGLYCERIDES', 'T

In [51]:
selected_chems_only = [x for x in selected_columns_full if x in chems.columns]


In [52]:
chem_subset_cols = ['INSULIN', 'GLUCOSE', 'GLYCOHEMOGLOBIN A1C', 'GFR, MDRD',
                  'ALBUMIN', 'CREATININE ENZ, SER',
                  'HDL CHOL DIRECT', 'HEMATOCRIT', 'HEMOGLOBIN', 'LDL-CHOL CALCULATION', 'TRIGLYCERIDES',
                  'MEAN_ARTERIAL_BLOOD_PRESSURE', 'PULSE_PRESSURE', 'diastolic', 'systolic',
                  'age', 'is_m', 'meds_blood_sugar', 'bmi']
chem_subset_cols = [x for x in chem_subset_cols if x in selected_columns_full]
print('number of selected clinical columns:', len(chem_subset_cols))
print(chem_subset_cols)

number of selected clinical columns: 14
['INSULIN', 'GLUCOSE', 'GLYCOHEMOGLOBIN A1C', 'GFR, MDRD', 'ALBUMIN', 'CREATININE ENZ, SER', 'HDL CHOL DIRECT', 'HEMATOCRIT', 'HEMOGLOBIN', 'LDL-CHOL CALCULATION', 'TRIGLYCERIDES', 'age', 'is_m', 'bmi']


Somewhat arbitrarily, we're only including proteins with less than 5 missing values. This is different from clinical variables because of the different distributions of missing-ness; having a more stringent missingness threshold will remove a lot of clinical variables that we want to include, and will also remove too many metabolites.

In [53]:
less_than_5 = (na_counts < 5)

In [54]:
selected_prot_cols = [x for x in selected_columns_full if x in prots_all.columns and less_than_5[x]]
print('number of protein columns:', len(selected_prot_cols))

number of protein columns: 262


In [55]:
selected_met_cols = [x for x in selected_columns_full if x in mets_all.columns]
print('number of metabolite columns:', len(selected_met_cols))

number of metabolite columns: 710


In [56]:
selected_columns_full = selected_chem_bp_cols + selected_prot_cols + selected_met_cols

In [57]:
print('number of total columns:', len(selected_columns_full))

number of total columns: 1042


### Remove rows that have NaN values for chems, set metabolites with NaNs to 0

try: setting missing metabolites to the median, remove rows with missing chems.

In [58]:
combined_table_mets_filtered_1['bmi'].isna().sum()

103

In [59]:
chem_has_na = combined_table_mets_filtered_1[selected_chem_bp_cols].isna().sum(1)
met_has_na = combined_table_mets_filtered_1[selected_met_cols].isna().sum(1)
prot_has_na = combined_table_mets_filtered_1[selected_prot_cols].isna().sum(1)
has_na = chem_has_na + met_has_na + prot_has_na

nans_above_threshold = (has_na > combined_table_mets_filtered_1.shape[0]*0.1)
print('# of samples with nans > 10%:', nans_above_threshold.sum())

print('# of samples with missing clinical var:', sum(chem_has_na > 0))

print('# of samples with missing protein:', sum(prot_has_na > 0))

# only remove the chem and prot NaN rows, set the met NaN rows to 0
combined_table_mets_filtered_2 = combined_table_mets_filtered_1[(~nans_above_threshold) & (chem_has_na == 0) & (prot_has_na == 0)]

combined_table_mets_set_medians = combined_table_mets_filtered_2.copy()


print('shape after removing values with missing chems/prots:', combined_table_mets_filtered_2.shape)

# of samples with nans > 10%: 14
# of samples with missing clinical var: 449
# of samples with missing protein: 9
shape after removing values with missing chems/prots: (1241, 1166)


### Filter by time interval between samples

In [60]:
combined_table_mets_filtered_times = combined_table_mets_filtered_2[\
                                                                (combined_table_mets_filtered_2.days_til_next < 270) \
                                                                & (combined_table_mets_filtered_2.days_til_next > 90)]
print(combined_table_mets_filtered_times.shape)

(1149, 1166)


In [61]:
combined_table_mets_set_zeros = combined_table_mets_filtered_times.copy()
combined_table_mets_set_zeros[selected_met_cols] = combined_table_mets_set_zeros[selected_met_cols].fillna(0)
combined_table_mets_set_zeros[selected_prot_cols] = combined_table_mets_set_zeros[selected_prot_cols].fillna(0)


### Imputation for metabolites using missforest

In [62]:
import os
from missingpy import MissForest
import warnings
warnings.filterwarnings("ignore")

imputer = MissForest(max_iter=5, decreasing=False, missing_values=np.nan,
                     copy=True, criterion=('squared_error', 'gini'),
                     max_depth=15, min_samples_split=2, min_samples_leaf=1,
                     min_weight_fraction_leaf=0.0,
                     max_leaf_nodes=None, min_impurity_decrease=0.0,
                     bootstrap=True, oob_score=False, n_jobs=-1,
                     verbose=0, warm_start=True, class_weight=None,
                     max_features=1)

data_selected_cols = combined_table_mets_filtered_times[selected_met_cols]

imputed = imputer.fit_transform(data_selected_cols)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4


In [63]:
combined_table_mets_imputed = combined_table_mets_filtered_times.copy()

In [64]:
#if not os.path.exists('clinical_multiomics_combined_data_imputed.csv'):
data_selected_cols[:] = imputed
#else:
#    data_selected_cols = pd.read_csv('clinical_multiomics_combined_data_imputed.csv')
combined_table_mets_imputed[selected_met_cols] = data_selected_cols.values

In [65]:
combined_table_mets_imputed.shape

(1149, 1166)

### Filter rows by output outliers

Thresholds are somewhat arbitrary, based on inspection

In [66]:
# TODO: remove outliers for next_hba1c, glucose, gfr
row_has_na = combined_table_mets_set_zeros[selected_met_cols].isna().any(1)
items_to_exclude = {'HbA1C', 'GLUCOSE', 'GFR', 'INSULIN'}
upper_bounds = {'HbA1C': 12, 'GLUCOSE': 200, 'GFR': 200, 'INSULIN': 60}
lower_bounds = {'HbA1C': 0, 'GLUCOSE': 0, 'GFR': 0, 'INSULIN': 0}

for k in items_to_exclude:
    item_data = combined_table_mets_imputed[chems_to_column[k]]
    mean = np.mean(item_data)
    std = np.std(item_data)
    row_has_na = row_has_na | (item_data > upper_bounds[k]) | (item_data < lower_bounds[k])
    item_data = combined_table_mets_set_medians['next_' + k]
    mean = np.mean(item_data)
    std = np.std(item_data)
    row_has_na = row_has_na | (item_data > upper_bounds[k]) | (item_data < lower_bounds[k])

combined_table_imputed_outliers = combined_table_mets_imputed[~row_has_na]
combined_table_mets_zeros_times_outliers = combined_table_mets_set_zeros[~row_has_na]

In [67]:
combined_table_imputed_outliers.shape

(1131, 1166)

### Create data arrays for training/stats

In [68]:
#  try taking only the first patient?
first_items = []
prev_client = None
for c in combined_table_imputed_outliers.public_client_id:
    if c == prev_client:
        first_items.append(False)
    else:
        prev_client = c
        first_items.append(True)
first_items = np.array(first_items)

combined_table_imputed_outliers = combined_table_imputed_outliers[first_items]
combined_table_mets_zeros_times_outliers = combined_table_mets_zeros_times_outliers[first_items]

In [69]:
print(combined_table_imputed_outliers.shape)

(1131, 1166)


#### Convert deltas to discrete values for classification

In [84]:
for key, val in chems_to_column.items():
    if key != 'GFR':
        pos = (combined_table_imputed_outliers['d_'+key] >= combined_table_imputed_outliers[val]*0.05)
        neg = (combined_table_imputed_outliers['d_'+key] < combined_table_imputed_outliers[val]*0.05)
    else:
        pos = (combined_table_imputed_outliers['d_'+key] <= combined_table_imputed_outliers[val]*-0.05)
        neg = (combined_table_imputed_outliers['d_'+key] > combined_table_imputed_outliers[val]*-0.05)
    combined_table_imputed_outliers['d_' + key + '_class'] = np.zeros(pos.shape) + pos
    combined_table_imputed_outliers['d_' + key + '_class'][combined_table_imputed_outliers['d_'+key].isna()] = pd.NA

In [85]:
for key, val in chems_to_column.items():
    if key != 'GFR':
        pos = (combined_table_imputed_outliers['d_1y_'+key] >= combined_table_imputed_outliers[val]*0.05)
        neg = (combined_table_imputed_outliers['d_1y_'+key] < combined_table_imputed_outliers[val]*0.05)
    else:
        pos = (combined_table_imputed_outliers['d_1y_'+key] <= combined_table_imputed_outliers[val]*-0.05)
        neg = (combined_table_imputed_outliers['d_1y_'+key] > combined_table_imputed_outliers[val]*-0.05)
    combined_table_imputed_outliers['d_1y_' + key + '_class'] = np.zeros(pos.shape) + pos
    combined_table_imputed_outliers['d_1y_' + key + '_class'][combined_table_imputed_outliers['d_1y_'+key].isna()] = pd.NA

## Saving table, saving variable lists

In [86]:
!mkdir results_2023_10_18

mkdir: cannot create directory ‘results_2023_10_18’: File exists


In [87]:
combined_table_imputed_outliers.to_csv('results_2023_10_18/combined_data_table.csv', index=None)

In [88]:
# save lists of chems, prots, mets
np.savetxt('results_2023_10_18/selected_columns_full.txt', selected_columns_full, fmt='%s')
np.savetxt('results_2023_10_18/chem_subset_cols.txt', chem_subset_cols, fmt='%s')
np.savetxt('results_2023_10_18/selected_chem_bp_cols.txt', selected_chem_bp_cols, fmt='%s')
np.savetxt('results_2023_10_18/selected_prot_cols.txt', selected_prot_cols, fmt='%s')
np.savetxt('results_2023_10_18/selected_met_cols.txt', selected_met_cols, fmt='%s')


## Aggregate data statistics

In [89]:
combined_table_imputed_outliers.agg({'age': ['count', 'min', 'mean', 'median', 'max', 'std'],
                    'bmi': ['count', 'min', 'mean', 'median', 'max', 'std'],
                    'GLYCOHEMOGLOBIN A1C': ['count', 'min', 'mean', 'median', 'max', 'std'],
                    'GLUCOSE': ['count', 'min', 'mean', 'median', 'max', 'std'],
                    'GFR, MDRD': ['count', 'min', 'mean', 'median', 'max', 'std'],
                    'INSULIN': ['count', 'min', 'mean', 'median', 'max', 'std'],
                    'HOMA-IR': ['count', 'min', 'mean', 'median', 'max', 'std'],}).T.style.format(precision=2)

Unnamed: 0,count,min,mean,median,max,std
age,1131.0,18.0,49.53,49.0,87.0,11.29
bmi,1131.0,17.74,27.63,26.15,53.35,6.04
GLYCOHEMOGLOBIN A1C,1131.0,3.6,5.51,5.5,8.3,0.43
GLUCOSE,1131.0,70.0,93.24,91.0,199.0,12.18
"GFR, MDRD",1131.0,41.0,90.17,90.0,131.0,15.17
INSULIN,1131.0,1.4,10.67,8.9,55.1,7.34
HOMA-IR,1131.0,0.25,2.56,1.98,20.34,2.16


In [90]:
combined_table_imputed_outliers.agg({'d_HbA1C': ['count', 'min', 'mean', 'median', 'max', 'std'],
                    'd_GLUCOSE': ['count', 'min', 'mean', 'median', 'max', 'std'],
                    'd_GFR': ['count', 'min', 'mean', 'median', 'max', 'std'],
                    'd_INSULIN': ['count', 'min', 'mean', 'median', 'max', 'std'],
                    'd_HOMA-IR': ['count', 'min', 'mean', 'median', 'max', 'std'],}).T.style.format(precision=2)

Unnamed: 0,count,min,mean,median,max,std
d_HbA1C,1131.0,-1.5,-0.05,-0.1,1.4,0.29
d_GLUCOSE,1131.0,-49.0,-0.48,0.0,39.0,7.78
d_GFR,1131.0,-30.0,1.04,0.0,45.0,9.44
d_INSULIN,1131.0,-31.5,-0.89,-0.4,22.7,4.97
d_HOMA-IR,1131.0,-11.09,-0.24,-0.12,7.09,1.43


In [91]:
combined_table_imputed_outliers.agg({'d_1y_HbA1C': ['count', 'min', 'mean', 'median', 'max', 'std'],
                    'd_1y_GLUCOSE': ['count', 'min', 'mean', 'median', 'max', 'std'],
                    'd_1y_GFR': ['count', 'min', 'mean', 'median', 'max', 'std'],
                    'd_1y_INSULIN': ['count', 'min', 'mean', 'median', 'max', 'std'],
                    'd_1y_HOMA-IR': ['count', 'min', 'mean', 'median', 'max', 'std'],}).T.style.format(precision=2)

Unnamed: 0,count,min,mean,median,max,std
d_1y_HbA1C,639.0,-1.9,-0.11,-0.1,1.0,0.31
d_1y_GLUCOSE,639.0,-57.0,-0.18,1.0,51.0,8.89
d_1y_GFR,639.0,-28.0,-0.05,0.0,53.0,10.44
d_1y_INSULIN,639.0,-33.2,-0.98,-0.6,54.4,5.41
d_1y_HOMA-IR,639.0,-12.9,-0.25,-0.11,23.29,1.8
