# Full Moderation/Mediation Analysis
##### Including all metabolites and multiple forms of activity

## Make Dataframes 
#### Saving one with the raw data for future demographics use, & saving one with log-transformed & standardized data

In [1]:
import warnings
from openpyxl import Workbook
from arivale_data_interface import get_snapshot
import pandas as pd 
warnings.simplefilter("ignore")


#begin dataframe, starting with activity
activity = get_snapshot("wearables_monthly") #use monthly
active = activity[["public_client_id", "days_in_program", "days_since_first_draw", 
                   "activities_distance", "activities_minutesFairlyActive", "activities_minutesLightlyActive",
                  "activities_minutesSedentary", "activities_minutesVeryActive",
                  "heartrate_resting", "sleep_efficiency"]] # includes mulitple forms of activity this time, if you change this, must change chemistry features
#add in client info
clients = get_snapshot("clients")[["public_client_id", "sex", "age", "race"]] # does earlier tasks in one step
active = pd.merge(active, clients, on="public_client_id", how="inner") #merge based on client id
#now education
education = get_snapshot("assessments_education")[["public_client_id", "assessment:education:education:int"]]
active = pd.merge(active, education, on="public_client_id", how="left")
#BMI
weights = get_snapshot("weight")[["BMI_CALC", "public_client_id", "days_in_program"]].dropna()
active = pd.merge_asof(
    active[active.public_client_id.isin(weights.public_client_id)].sort_values(by="days_in_program"), 
    weights.sort_values(by="days_in_program"), 
    by="public_client_id", on="days_in_program", direction="nearest")
#ancestry information
ancestry = get_snapshot("genetics_ancestry")[["public_client_id", "PC1", "PC2", "PC3", "PC4", "PC5"]]
active = pd.merge(active, ancestry, on = "public_client_id", how = "inner")
#blood pressure
bp_data = get_snapshot("blood_pressure")[["public_client_id", "days_in_program", 
                                          "MEAN_ARTERIAL_BLOOD_PRESSURE", "PULSE_PRESSURE",
                                          "diastolic", "systolic"]].dropna()
active = pd.merge_asof(
    active[active.public_client_id.isin(bp_data.public_client_id)].sort_values(by="days_in_program"), 
    bp_data.sort_values(by="days_in_program"), 
    by="public_client_id", on="days_in_program", direction="nearest")

#now, add metabolites to a separate dataframe
metabolites = get_snapshot("metabolomics_corrected").sort_values(by="days_in_program")
active = active.sort_values(by="days_in_program")
metabolite_features = metabolites.columns[8:] # gives an index of just the column names
metabolites.days_in_program = metabolites.days_in_program.astype("float64") #so merging works
#merge metabolite and activity data into one frame
act_met_merged = pd.merge_asof(
    active, metabolites, 
    by="public_client_id", 
    on="days_in_program", 
    direction="nearest", 
    tolerance=30.0).dropna(subset=metabolite_features, how="all") 
metabolite_features = metabolites.columns[metabolites.columns.isin(metabolite_features)]
act_met_merged.sort_values(by = "days_in_program", inplace = True)
#add chemistry data to its own df
chemistry = get_snapshot("chemistries").sort_values(by="days_in_program")
act_met_merged = act_met_merged.sort_values(by="days_in_program")
chemistry_features = chemistry.columns[12:] #index for chemistry values
chemistry.days_in_program = chemistry.days_in_program.astype("float64")
#finally, merge chemistry and act/met data into one frame
mac_raw = pd.merge_asof(
    act_met_merged, chemistry, 
    by="public_client_id", 
    on="days_in_program", 
    direction="nearest", 
    tolerance=30.0).dropna(subset=chemistry_features, how="all") 
chemistry_features = chemistry.columns[chemistry.columns.isin(chemistry_features)]
mac_raw.sort_values(by = "days_in_program", inplace = True)
#mac_raw #named mac_raw because the data is not transformed yet

ModuleNotFoundError: No module named 'openpyxl'

In [2]:
# next, do the data cleaning that can come before log transforming & standardizing
import numpy as np
import scipy.stats.mstats

# first need to preprocess data, remove metabolites that aren't in at least 75% of samples
bad = mac_raw[metabolite_features].isnull().sum() / mac_raw.shape[0] > 0.25
metabolite_features = bad[~bad].index 
mac_raw = mac_raw.drop(columns=bad[bad].index)
# same with chemistries that aren't in at least 75% of samples
bad_2 = mac_raw[chemistry_features].isnull().sum() / mac_raw.shape[0] > 0.25
chemistry_features = bad_2[~bad_2].index 
mac_raw = mac_raw.drop(columns=bad_2[bad_2].index)
# drop duplicates so that there is only one entry per client
mac_raw.drop_duplicates(subset = 'public_client_id', keep = 'first', inplace = True)
#mac_raw

In [3]:
# fix the chemistry names
mac_raw.rename(columns= {"HOMA-IR" : "HOMA_IR", "LDL-CHOL CALCULATION" : "LDL_CHOL CALCULATION", 
                            "OMEGA-3 INDEX" : "OMEGA_3 INDEX", "OMEGA-6/OMEGA-3 RATIO" : "OMEGA_6/OMEGA_3 RATIO",
                            "VITAMIN D, 25-OH TOT" : "VITAMIN D, 25_OH TOT"},
                 inplace = True)
mac_raw.rename(columns= {'A/G RATIO' : 'A/G_RATIO', 'ADIPONECTIN, SERUM' : 'ADIPONECTIN_SERUM',
                           'ALAT (SGPT)' : 'ALAT_SGPT', 'ALKALINE PHOSPHATE' : 'ALKALINE_PHOSPHATE',
                           'ARACHIDONIC ACID' : 'ARACHIDONIC_ACID' , 'ASAT (SGOT)' : 'ASAT_SGOT',
                           'BASOPHILS ABSOLUTE' : 'BASOPHILS_ABSOLUTE', 'BILIRUBIN, TOTAL' : 'BILI_TOT',
                           'BUN/CREAT RATIO' : 'BUN/CREAT_RATIO', 'CARBON DIOXIDE (CO2)' : 'CARBON_DIOXIDE',
                           'CHOLESTEROL, TOTAL' : 'CHOLESTEROL_TOTAL', 'CREATININE ENZ, SER' : 'CREATININE_ENZ_SER',
                           'CRP HIGH SENSITIVITY' : 'CRP_HIGH_SENSITIVITY', 'EOSINOPHILS ABSOLUTE' : 'EOSINOPHILS_ABSOLUTE',
                           'GFR, MDRD' : 'GFR_MDRD', 'GFR, MDRD, AFRICAN AM' : 'GFR_MDRD_AFRICAN_AM', 'GLYCOHEMOGLOBIN A1C' : 'GLYCOHEMOGLOBIN_A1C',
                           'HDL CHOL DIRECT' : 'HDL_CHOL_DIRECT', 'HDL PARTICLE NUMBER' : 'HDL_PARTICLE_NUMBER',
                           'HOMOCYSTEINE, SERUM' : 'HOMOCYSTEINE_SERUM', 'IMMATURE GRANULOCYTES' : 'IMMATURE_GRANULOCYTES',
                           'IMMATURE GRANULOCYTES ABSOLUTE' : 'IMMATURE_GRANULOCYTES_ABSOLUTE', 'LDL PARTICLE NUMBER' : 'LDL_PARTICLE_NUMBER',
                           'LDL SMALL' : 'LDL_SMALL', 'LDL_CHOL CALCULATION' : 'LDL_CHOL_CALCULATION', 'LYMPHOCYTES ABSOLUTE' : 'LYMPHOCYTES_ABSOLUTE',
                           'MERCURY, BLOOD' : 'MERCURY_BLOOD', 'METHYLMALONIC ACID' : 'METHYLMALONIC_ACID', 'MONOCYTES ABSOLUTE' : 'MONOCYTES_ABSOLUTE',
                           'OMEGA_3 INDEX' : 'OMEGA_3_INDEX', 'OMEGA_6/OMEGA_3 RATIO' : 'OMEGA_6/OMEGA_3_RATIO', 'PLATELET COUNT THOUSAND' : 'PLATELET_COUNT_THOUSAND',
                           'PROTEIN, TOTAL SERUM' : 'PROTEIN_TOTAL_SERUM', 'RED CELL COUNT' : 'RED_CELL_COUNT', 'TOTAL NEUTROPHILS' : 'TOTAL_NEUTROPHILS',
                           'TOTAL NEUTROPHILS AB' : 'TOTAL_NEUTROPHILS_AB', 'Triglyceride HDL Ratio' : 'TRIGLYCERIDE_HDL_RATIO',
                           'UREA NITROGEN' : 'UREA_NITROGEN', 'URIC ACID' : 'URIC_ACID', 'VITAMIN D, 25_OH TOT' : 'VIT_D_25_OH_TOT',
                           'WHITE CELL COUNT' : 'WHITE_CELL_COUNT'},
                 inplace = True)
mac_raw.rename(columns= {'A/G_RATIO' : 'A_G_RATIO', 'ALKALINE PHOSPHATASE' : 'ALKALINE_PHOSPHATASE',
                            'BUN/CREAT_RATIO' : 'BUN_CREAT_RATIO', 'OMEGA_6/OMEGA_3_RATIO' : 'OMEGA6_OMEGA3_RATIO'},
                  inplace = True)
mac_raw.drop(columns=['BASOPHILS', 'BASOPHILS_ABSOLUTE', 'EOSINOPHILS', 'EOSINOPHILS_ABSOLUTE',
                         'MERCURY_BLOOD', 'IMMATURE_GRANULOCYTES_ABSOLUTE', 'IMMATURE_GRANULOCYTES',
                         'EPA'], inplace = True)

In [4]:
chemistry_features = mac_raw.columns[941:] # this number may change if new activites or other columns are added in/taken out
#chemistry_features

In [5]:
# now to log-standardize and transform data! save as new dataframe
# make a copy of dataframe to save as a new name
mac_str = mac_raw.copy()
# log-transform and standardize all metabolites
mac_str[metabolite_features] = np.log(mac_str[metabolite_features])
mac_str[metabolite_features] = mac_str[metabolite_features].apply(lambda x: (x - x.mean()) / x.std())

# same as above, except log-transforming and standardizing chemistries
mac_str[chemistry_features] = np.log(mac_str[chemistry_features])
mac_str[chemistry_features] = mac_str[chemistry_features].apply(lambda x: (x - x.mean()) / x.std())

# add in word "metabolite" so that regressions work
mac_str.rename(columns=dict(zip(metabolite_features, "metabolite_" + metabolite_features)), inplace=True)
metabolite_features = "metabolite_" + metabolite_features

# drop duplicates so that there is only one entry per client
mac_str.drop_duplicates(subset = 'public_client_id', keep = 'first', inplace = True)
mac_str

Unnamed: 0,public_client_id,days_in_program,days_since_first_draw_x,activities_distance,activities_minutesFairlyActive,activities_minutesLightlyActive,activities_minutesSedentary,activities_minutesVeryActive,heartrate_resting,sleep_efficiency,...,RED_CELL_COUNT,SODIUM,TOTAL_NEUTROPHILS,TOTAL_NEUTROPHILS_AB,TRIGLYCERIDES,TRIGLYCERIDE_HDL_RATIO,UREA_NITROGEN,URIC_ACID,VIT_D_25_OH_TOT,WHITE_CELL_COUNT
0,01158621,-29.0,-30.0,3.135200,37.400000,156.440000,941.040000,12.000000,,92.800000,...,1.047910,1.328344,0.820022,1.378087,-1.264120,-0.383763,0.425495,1.102395,-4.109235,1.364113
1,01902561,-28.0,-30.0,4.237586,16.000000,260.724138,703.551724,6.448276,62.965517,95.321429,...,0.721279,0.874160,-1.070264,-0.873164,-0.056226,-0.161282,0.662078,0.619610,-4.815407,-0.355066
2,01821552,-27.0,-30.0,3.283871,7.096774,218.806452,740.806452,22.354839,,95.500000,...,-0.321276,-0.507835,-0.571219,-0.419871,0.705014,0.181459,-1.060844,0.353903,0.649371,-0.280640
3,01285674,-26.0,-28.0,3.830000,5.500000,169.000000,1249.000000,16.500000,78.000000,,...,-0.369989,-0.507835,-1.335612,-1.130343,-1.305653,-1.377679,0.172584,-0.081842,0.142965,-0.668446
4,01893068,-26.0,-30.0,3.055000,42.500000,268.500000,720.000000,37.500000,59.500000,90.500000,...,-0.152459,0.874160,-0.221516,-0.217854,-0.687495,-0.248253,-0.099079,-0.660231,-1.315732,-0.207659
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5172,01673066,268.0,221.0,4.704737,14.526316,239.315789,846.421053,16.578947,61.888889,95.000000,...,-0.567033,-0.043884,0.820022,-0.029316,-0.104531,0.484472,-0.392496,-2.573437,0.213422,-0.508491
5262,01277016,302.0,276.0,2.814194,13.580645,197.838710,601.741935,15.129032,59.903226,95.433333,...,0.743359,0.874160,-0.335918,-0.637441,1.093099,0.803891,-0.711460,0.619610,0.010569,-0.668446
5276,01533942,309.0,238.0,6.661667,69.333333,132.833333,769.333333,134.000000,62.666667,91.750000,...,-0.894966,0.416766,0.109610,0.546264,-0.056226,0.155274,1.292041,0.928247,1.041576,0.688381
6079,01211854,436.0,408.0,5.370000,42.200000,124.133333,1166.533333,107.133333,57.172414,,...,0.475421,2.672081,-5.934347,-0.122014,-1.305653,-0.642197,0.172584,0.554853,1.493267,3.321862


In [6]:
#save both dataframes to csv
mac_raw.to_csv('mac_raw_df.csv', index = False)
mac_str.to_csv('mac_str_df.csv', index = False)

## Regression Analyses

### Make Association Bins
#### Genetic, Microbe, Hybrid 
##### Goal is to create them as "features" so they can be easily used in regressions

In [6]:
# create dataframes for the genetic and microbe metabolites
genetic_assoc = pd.read_csv('genetic_associations.csv')
microbe_assoc = pd.read_csv('sig_metabolite_taxon.csv')

# drop the duplicate metabolites that are present in each dataframe
genetic_assoc.drop_duplicates(subset=['metabolite_id'], inplace=True)
microbe_assoc.drop_duplicates(subset=['metabolite'], inplace=True)

# pivot tables to allow the metabolites to be "columns"
genetic_assoc = genetic_assoc[['metabolite_id', 'rsid', 'p']].copy()
genetic_assoc = genetic_assoc.pivot_table(index='rsid', columns='metabolite_id', values='p')
microbe_assoc = microbe_assoc[['metabolite', 'r', 'p']].copy()
microbe_assoc = microbe_assoc.pivot_table(index='r', columns='metabolite', values='p')

# turn these column names in each dataframe into "features"
genetic_features = genetic_assoc.columns[0:]
genetic_features = mac_str.columns[mac_str.columns.isin(genetic_features)]
microbe_features = microbe_assoc.columns[0:]
microbe_features = mac_str.columns[mac_str.columns.isin(microbe_features)]

# turn into list so I can run loop through them to sort into bins
genetic_list = genetic_features.tolist()
microbe_list = microbe_features.tolist()

# see which metabolites are in both genetic and microbe
hybrid = []
for elem in genetic_list:
    if elem in genetic_list and elem in microbe_list:
        hybrid.append(elem)
        
# pull out only genetic and microbe only
genetic_only = [i for i in genetic_list if i not in hybrid]
microbe_only = [i for i in microbe_list if i not in hybrid]

In [12]:
#pwd

In [8]:
# just checking for accuracy
print(len(hybrid))
print(len(genetic_only))
print(len(microbe_only))

332
274
155


In [7]:
# now, need to turn these lists into index "features"
gen_only_features = mac_str.columns[mac_str.columns.isin(genetic_only)]
microbe_only_features = mac_str.columns[mac_str.columns.isin(microbe_only)]
hybrid_features = mac_str.columns[mac_str.columns.isin(hybrid)]

##### I now have three indexes, one for each association "bin" that I can use in my regressions: <br>gen_only_features, microbe_only_features, and hybrid_features. <br>Each only includes metabolites that are in that specific bin 

### Initial Regression
#### What metabolites & chemistries are associated

In [8]:
from rich.progress import track

from statsmodels.formula.api import ols
from statsmodels.formula.api import glm
import statsmodels.api as sm
import statsmodels.genmod.families.links as links
from statsmodels.stats.multitest import multipletests
from statsmodels.stats.mediation import Mediation

In [9]:
## uploading meta names so it can be added to data frames
meta_names = get_snapshot("metabolomics_metadata")
meta_names.rename(columns={'CHEMICAL_ID': 'feature'}, inplace=True)
meta_names.drop(meta_names.columns[[4,5,6,7,8,9,10,11,12,13,14,15,16]],axis = 1, inplace=True)
meta_names['feature'] = 'metabolite_' + meta_names['feature'].astype(str)
meta_names

Unnamed: 0,feature,SUB_PATHWAY,SUPER_PATHWAY,BIOCHEMICAL_NAME
0,metabolite_35,Glutamate Metabolism,Amino Acid,S-1-pyrroline-5-carboxylate
1,metabolite_50,Polyamine Metabolism,Amino Acid,spermidine
2,metabolite_55,Nicotinate and Nicotinamide Metabolism,Cofactors and Vitamins,1-methylnicotinamide
3,metabolite_62,"Fatty Acid, Dihydroxy",Lipid,"12,13-DiHOME"
4,metabolite_71,Tryptophan Metabolism,Amino Acid,5-hydroxyindoleacetate
...,...,...,...,...
1343,metabolite_999954834,,,X - 24806
1344,metabolite_999954839,,,X - 24811
1345,metabolite_999954840,,,X - 24812
1346,metabolite_100002397,,,alpha-ketoglutaramate*


##### HDL Cholesterol ~ metabolites + [covar]

In [10]:
# GENETIC
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"HDL_CHOL_DIRECT ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = gen_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
gen_hdl_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
gen_hdl_sig = pd.merge(gen_hdl_sig, meta_names, on = "feature", how = "inner")
## adding in column with association bin so we can merge all three into one, cuts down on number of dataframes, and so i can do FDR correction in future
gen_hdl_sig['assoc_bin']='genetic'
gen_hdl_sig

Unnamed: 0,feature,beta,t_statistic,p,n,q,SUB_PATHWAY,SUPER_PATHWAY,BIOCHEMICAL_NAME,assoc_bin
0,metabolite_55,0.058805,2.843437,4.518099e-03,1656.0,9.903674e-03,Nicotinate and Nicotinamide Metabolism,Cofactors and Vitamins,1-methylnicotinamide,genetic
1,metabolite_179,0.069933,3.138879,1.729995e-03,1466.0,4.232308e-03,"Fatty Acid, Dihydroxy",Lipid,"9,10-DiHOME",genetic
2,metabolite_197,-0.058326,-2.562955,1.047392e-02,1533.0,2.079605e-02,"Methionine, Cysteine, SAM and Taurine Metabolism",Amino Acid,S-adenosylhomocysteine (SAH),genetic
3,metabolite_212,-0.064420,-2.947296,3.252086e-03,1610.0,7.364228e-03,Polyamine Metabolism,Amino Acid,5-methylthioadenosine (MTA),genetic
4,metabolite_234,-0.086382,-4.078564,4.744685e-05,1681.0,1.604992e-04,Alanine and Aspartate Metabolism,Amino Acid,aspartate,genetic
...,...,...,...,...,...,...,...,...,...,...
152,metabolite_999946390,-0.055947,-2.534417,1.135514e-02,1662.0,2.238351e-02,,,X - 11308,genetic
153,metabolite_999946460,-0.139586,-6.290866,4.046006e-10,1641.0,2.771514e-09,,,X - 11444,genetic
154,metabolite_999946601,-0.053628,-2.486684,1.299330e-02,1642.0,2.507157e-02,,,X - 11470,genetic
155,metabolite_999946633,-0.077918,-3.754487,1.797355e-04,1656.0,5.077065e-04,,,X - 12844,genetic


In [11]:
# MICROBE
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"HDL_CHOL_DIRECT ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = microbe_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
mic_hdl_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
mic_hdl_sig = pd.merge(mic_hdl_sig, meta_names, on = "feature", how = "inner")
mic_hdl_sig['assoc_bin']='microbe'

In [12]:
# HYBRID
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"HDL_CHOL_DIRECT ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = hybrid_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
hyb_hdl_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
hyb_hdl_sig = pd.merge(hyb_hdl_sig, meta_names, on = "feature", how = "inner")
hyb_hdl_sig['assoc_bin']='hybrid'

In [13]:
# merge these three into one dataframe
hdl_sig = pd.concat([gen_hdl_sig, mic_hdl_sig, hyb_hdl_sig])
hdl_sig

Unnamed: 0,feature,beta,t_statistic,p,n,q,SUB_PATHWAY,SUPER_PATHWAY,BIOCHEMICAL_NAME,assoc_bin
0,metabolite_55,0.058805,2.843437,4.518099e-03,1656.0,9.903674e-03,Nicotinate and Nicotinamide Metabolism,Cofactors and Vitamins,1-methylnicotinamide,genetic
1,metabolite_179,0.069933,3.138879,1.729995e-03,1466.0,4.232308e-03,"Fatty Acid, Dihydroxy",Lipid,"9,10-DiHOME",genetic
2,metabolite_197,-0.058326,-2.562955,1.047392e-02,1533.0,2.079605e-02,"Methionine, Cysteine, SAM and Taurine Metabolism",Amino Acid,S-adenosylhomocysteine (SAH),genetic
3,metabolite_212,-0.064420,-2.947296,3.252086e-03,1610.0,7.364228e-03,Polyamine Metabolism,Amino Acid,5-methylthioadenosine (MTA),genetic
4,metabolite_234,-0.086382,-4.078564,4.744685e-05,1681.0,1.604992e-04,Alanine and Aspartate Metabolism,Amino Acid,aspartate,genetic
...,...,...,...,...,...,...,...,...,...,...
214,metabolite_999947671,0.056495,2.752564,5.977621e-03,1665.0,1.033630e-02,,,X - 18921,hybrid
215,metabolite_999947788,0.101035,4.686124,3.021835e-06,1600.0,8.574780e-06,,,X - 13431,hybrid
216,metabolite_999949515,0.115902,5.277082,1.489556e-07,1633.0,5.052937e-07,,,X - 23639,hybrid
217,metabolite_999949592,0.132822,6.040220,1.899534e-09,1653.0,7.507684e-09,,,X - 11315,hybrid


In [60]:
# this is creating the file
hdl_sig.to_excel("MAC_Project_data.xlsx", sheet_name='hdl_sig')

In [14]:
## turn features column into index feature to use for regression down the line
hdl_met_to_col = hdl_sig[['feature', 'assoc_bin', 'p']].copy()
hdl_met_to_col = hdl_met_to_col.pivot_table(index='assoc_bin', columns='feature', values='p')

hdl_features = hdl_met_to_col.columns[0:]
hdl_features

Index(['metabolite_100000036', 'metabolite_100000042', 'metabolite_100000054',
       'metabolite_100000263', 'metabolite_100000265', 'metabolite_100000406',
       'metabolite_100000437', 'metabolite_100000442', 'metabolite_100000453',
       'metabolite_100000463',
       ...
       'metabolite_999947804', 'metabolite_999947905', 'metabolite_999948094',
       'metabolite_999949463', 'metabolite_999949515', 'metabolite_999949555',
       'metabolite_999949592', 'metabolite_999949681', 'metabolite_999952877',
       'metabolite_999953267'],
      dtype='object', name='feature', length=454)

##### Hematocrit ~ metabolites + [covar]

In [15]:
# GENETIC
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"HEMATOCRIT ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = gen_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
gen_hem_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
gen_hem_sig = pd.merge(gen_hem_sig, meta_names, on = "feature", how = "inner")
gen_hem_sig['assoc_bin']='genetic'

In [16]:
# MICROBE
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"HEMATOCRIT ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = microbe_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
mic_hem_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
mic_hem_sig = pd.merge(mic_hem_sig, meta_names, on = "feature", how = "inner")
mic_hem_sig['assoc_bin']='microbe'

In [17]:
# HYBRID
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"HEMATOCRIT ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = hybrid_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
hyb_hem_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
hyb_hem_sig = pd.merge(hyb_hem_sig, meta_names, on = "feature", how = "inner")
hyb_hem_sig['assoc_bin']='hybrid'

In [21]:
# merge these three into one dataframe
hem_sig = pd.concat([gen_hem_sig, mic_hem_sig, hyb_hem_sig])

In [None]:
with pd.ExcelWriter('MAC_Project_data.xlsx',
                    mode='a') as writer:  
    hem_sig.to_excel(writer, sheet_name='hem_sig')

In [22]:
## turn features column into index feature to use for regression down the line
hem_met_to_col = hem_sig[['feature', 'assoc_bin', 'p']].copy()
hem_met_to_col = hem_met_to_col.pivot_table(index='assoc_bin', columns='feature', values='p')

hem_features = hem_met_to_col.columns[0:]
hem_features

Index(['metabolite_100000036', 'metabolite_100000039', 'metabolite_100000042',
       'metabolite_100000257', 'metabolite_100000276', 'metabolite_100000447',
       'metabolite_100000551', 'metabolite_100000616', 'metabolite_100000665',
       'metabolite_100000706',
       ...
       'metabolite_999947006', 'metabolite_999947642', 'metabolite_999947650',
       'metabolite_999947788', 'metabolite_999948076', 'metabolite_999949515',
       'metabolite_999952286', 'metabolite_999952502', 'metabolite_999952909',
       'metabolite_999953267'],
      dtype='object', name='feature', length=332)

##### Triglycerides ~ metabolites + [covar]

In [23]:
# GENETIC
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"TRIGLYCERIDES ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = gen_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
gen_tri_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
gen_tri_sig = pd.merge(gen_tri_sig, meta_names, on = "feature", how = "inner")

In [24]:
# MICROBE
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"TRIGLYCERIDES ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = microbe_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
mic_tri_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
mic_tri_sig = pd.merge(mic_tri_sig, meta_names, on = "feature", how = "inner")

In [25]:
# HYBRID
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"TRIGLYCERIDES ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = hybrid_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
hyb_tri_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
hyb_tri_sig = pd.merge(hyb_tri_sig, meta_names, on = "feature", how = "inner")

In [26]:
## now, putting the association bins + merging + excel sheet into one code block
gen_tri_sig['assoc_bin']='genetic'
mic_tri_sig['assoc_bin']='microbe'
hyb_tri_sig['assoc_bin']='hybrid'

tri_sig = pd.concat([gen_tri_sig, mic_tri_sig, hyb_tri_sig])

In [None]:
with pd.ExcelWriter('MAC_Project_data.xlsx',
                    mode='a') as writer:  
    tri_sig.to_excel(writer, sheet_name='tri_sig')

In [27]:
## turn features column into index feature to use for regression down the line
tri_met_to_col = tri_sig[['feature', 'assoc_bin', 'p']].copy()
tri_met_to_col = tri_met_to_col.pivot_table(index='assoc_bin', columns='feature', values='p')

tri_features = tri_met_to_col.columns[0:]
tri_features

Index(['metabolite_100000010', 'metabolite_100000014', 'metabolite_100000036',
       'metabolite_100000042', 'metabolite_100000054', 'metabolite_100000257',
       'metabolite_100000265', 'metabolite_100000276', 'metabolite_100000406',
       'metabolite_100000436',
       ...
       'metabolite_999948094', 'metabolite_999949515', 'metabolite_999949521',
       'metabolite_999949555', 'metabolite_999949557', 'metabolite_999949592',
       'metabolite_999949681', 'metabolite_999952286', 'metabolite_999952877',
       'metabolite_999952909'],
      dtype='object', name='feature', length=479)

##### Red Cell Count ~ metabolites + [covar]

In [28]:
# GENETIC
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"RED_CELL_COUNT ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = gen_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
gen_rbc_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
gen_rbc_sig = pd.merge(gen_rbc_sig, meta_names, on = "feature", how = "inner")

In [29]:
# MICROBE
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"RED_CELL_COUNT ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = microbe_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
mic_rbc_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
mic_rbc_sig = pd.merge(mic_rbc_sig, meta_names, on = "feature", how = "inner")

In [30]:
# HYBRID
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"RED_CELL_COUNT ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = hybrid_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
hyb_rbc_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
hyb_rbc_sig = pd.merge(hyb_rbc_sig, meta_names, on = "feature", how = "inner")

In [31]:
## now, putting the association bins + merging + excel sheet into one code block
gen_rbc_sig['assoc_bin']='genetic'
mic_rbc_sig['assoc_bin']='microbe'
hyb_rbc_sig['assoc_bin']='hybrid'

rbc_sig = pd.concat([gen_rbc_sig, mic_rbc_sig, hyb_rbc_sig])

In [None]:
with pd.ExcelWriter('MAC_Project_data.xlsx',
                    mode='a') as writer:  
    rbc_sig.to_excel(writer, sheet_name='rbc_sig')

In [32]:
## turn features column into index feature to use for regression down the line
rbc_met_to_col = rbc_sig[['feature', 'assoc_bin', 'p']].copy()
rbc_met_to_col = rbc_met_to_col.pivot_table(index='assoc_bin', columns='feature', values='p')

rbc_features = rbc_met_to_col.columns[0:]
rbc_features

Index(['metabolite_100000036', 'metabolite_100000039', 'metabolite_100000257',
       'metabolite_100000263', 'metabolite_100000442', 'metabolite_100000551',
       'metabolite_100000656', 'metabolite_100000657', 'metabolite_100000707',
       'metabolite_100000776',
       ...
       'metabolite_999946977', 'metabolite_999946997', 'metabolite_999947417',
       'metabolite_999947642', 'metabolite_999947804', 'metabolite_999947905',
       'metabolite_999949515', 'metabolite_999949557', 'metabolite_999949592',
       'metabolite_999952502'],
      dtype='object', name='feature', length=216)

##### A1C ~ metabolites + [covar]

In [33]:
# GENETIC
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"GLYCOHEMOGLOBIN_A1C ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = gen_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
gen_a1c_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
gen_a1c_sig = pd.merge(gen_a1c_sig, meta_names, on = "feature", how = "inner")

In [34]:
# MICROBE
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"GLYCOHEMOGLOBIN_A1C ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = microbe_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
mic_a1c_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
mic_a1c_sig = pd.merge(mic_a1c_sig, meta_names, on = "feature", how = "inner")

In [35]:
# HYBRID
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"GLYCOHEMOGLOBIN_A1C ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = hybrid_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
hyb_a1c_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
hyb_a1c_sig = pd.merge(hyb_a1c_sig, meta_names, on = "feature", how = "inner")

In [36]:
## now, putting the association bins + merging + excel sheet into one code block
gen_a1c_sig['assoc_bin']='genetic'
mic_a1c_sig['assoc_bin']='microbe'
hyb_a1c_sig['assoc_bin']='hybrid'

a1c_sig = pd.concat([gen_a1c_sig, mic_a1c_sig, hyb_a1c_sig])

In [None]:
with pd.ExcelWriter('MAC_Project_data.xlsx',
                    mode='a') as writer:  
    a1c_sig.to_excel(writer, sheet_name='a1c_sig')

In [37]:
## turn features column into index feature to use for regression down the line
a1c_met_to_col = a1c_sig[['feature', 'assoc_bin', 'p']].copy()
a1c_met_to_col = a1c_met_to_col.pivot_table(index='assoc_bin', columns='feature', values='p')

a1c_features = a1c_met_to_col.columns[0:]
a1c_features

Index(['metabolite_100000007', 'metabolite_100000036', 'metabolite_100000263',
       'metabolite_100000276', 'metabolite_100000282', 'metabolite_100000406',
       'metabolite_100000551', 'metabolite_100000580', 'metabolite_100000656',
       'metabolite_100000657',
       ...
       'metabolite_999946707', 'metabolite_999946970', 'metabolite_999946977',
       'metabolite_999947006', 'metabolite_999947642', 'metabolite_999947905',
       'metabolite_999947993', 'metabolite_999949515', 'metabolite_999949592',
       'metabolite_999952877'],
      dtype='object', name='feature', length=311)

##### Insulin ~ metabolites + [covar]

In [38]:
# GENETIC
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"INSULIN ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = gen_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
gen_ins_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
gen_ins_sig = pd.merge(gen_ins_sig, meta_names, on = "feature", how = "inner")

In [39]:
# MICROBE
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"INSULIN ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = microbe_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
mic_ins_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
mic_ins_sig = pd.merge(mic_ins_sig, meta_names, on = "feature", how = "inner")

In [40]:
# HYBRID
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"INSULIN ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = hybrid_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
hyb_ins_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
hyb_ins_sig = pd.merge(hyb_ins_sig, meta_names, on = "feature", how = "inner")

In [41]:
## now, putting the association bins + merging + excel sheet into one code block
gen_ins_sig['assoc_bin']='genetic'
mic_ins_sig['assoc_bin']='microbe'
hyb_ins_sig['assoc_bin']='hybrid'

ins_sig = pd.concat([gen_ins_sig, mic_ins_sig, hyb_ins_sig])

In [None]:
with pd.ExcelWriter('MAC_Project_data.xlsx',
                    mode='a') as writer:  
    ins_sig.to_excel(writer, sheet_name='ins_sig')

In [42]:
## turn features column into index feature to use for regression down the line
ins_met_to_col = ins_sig[['feature', 'assoc_bin', 'p']].copy()
ins_met_to_col = ins_met_to_col.pivot_table(index='assoc_bin', columns='feature', values='p')

ins_features = ins_met_to_col.columns[0:]
ins_features

Index(['metabolite_100000036', 'metabolite_100000054', 'metabolite_100000263',
       'metabolite_100000265', 'metabolite_100000282', 'metabolite_100000406',
       'metabolite_100000436', 'metabolite_100000445', 'metabolite_100000463',
       'metabolite_100000467',
       ...
       'metabolite_999947971', 'metabolite_999947993', 'metabolite_999948076',
       'metabolite_999949515', 'metabolite_999949555', 'metabolite_999949557',
       'metabolite_999949592', 'metabolite_999949681', 'metabolite_999952502',
       'metabolite_999952909'],
      dtype='object', name='feature', length=426)

##### Protein ~ metabolites + [covar]

In [43]:
# GENETIC
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"PROTEIN_TOTAL_SERUM ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = gen_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
gen_pro_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
gen_pro_sig = pd.merge(gen_pro_sig, meta_names, on = "feature", how = "inner")

In [44]:
# MICROBE
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"PROTEIN_TOTAL_SERUM ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = microbe_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
mic_pro_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
mic_pro_sig = pd.merge(mic_pro_sig, meta_names, on = "feature", how = "inner")

In [45]:
# HYBRID
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"PROTEIN_TOTAL_SERUM ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = hybrid_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
hyb_pro_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
hyb_pro_sig = pd.merge(hyb_pro_sig, meta_names, on = "feature", how = "inner")

In [46]:
## now, putting the association bins + merging + excel sheet into one code block
gen_pro_sig['assoc_bin']='genetic'
mic_pro_sig['assoc_bin']='microbe'
hyb_pro_sig['assoc_bin']='hybrid'

pro_sig = pd.concat([gen_pro_sig, mic_pro_sig, hyb_pro_sig])

In [None]:
with pd.ExcelWriter('MAC_Project_data.xlsx',
                    mode='a') as writer:  
    pro_sig.to_excel(writer, sheet_name='pro_sig')

In [47]:
## turn features column into index feature to use for regression down the line
pro_met_to_col = pro_sig[['feature', 'assoc_bin', 'p']].copy()
pro_met_to_col = pro_met_to_col.pivot_table(index='assoc_bin', columns='feature', values='p')

pro_features = pro_met_to_col.columns[0:]
pro_features

Index(['metabolite_100000007', 'metabolite_100000014', 'metabolite_100000036',
       'metabolite_100000039', 'metabolite_100000263', 'metabolite_100000265',
       'metabolite_100000276', 'metabolite_100000442', 'metabolite_100000551',
       'metabolite_100000580',
       ...
       'metabolite_999948047', 'metabolite_999948076', 'metabolite_999948094',
       'metabolite_999949515', 'metabolite_999949557', 'metabolite_999949637',
       'metabolite_999949681', 'metabolite_999952502', 'metabolite_999952877',
       'metabolite_999952909'],
      dtype='object', name='feature', length=377)

##### Urea Nitrogen ~ metabolites + [covar]

In [48]:
# GENETIC
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"UREA_NITROGEN ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = gen_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
gen_ure_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
gen_ure_sig = pd.merge(gen_ure_sig, meta_names, on = "feature", how = "inner")

In [49]:
# MICROBE
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"UREA_NITROGEN ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = microbe_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
mic_ure_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
mic_ure_sig = pd.merge(mic_ure_sig, meta_names, on = "feature", how = "inner")

In [50]:
# HYBRID
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"UREA_NITROGEN ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = hybrid_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
hyb_ure_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
hyb_ure_sig = pd.merge(hyb_ure_sig, meta_names, on = "feature", how = "inner")

In [51]:
## now, putting the association bins + merging + excel sheet into one code block
gen_ure_sig['assoc_bin']='genetic'
mic_ure_sig['assoc_bin']='microbe'
hyb_ure_sig['assoc_bin']='hybrid'

ure_sig = pd.concat([gen_ure_sig, mic_ure_sig, hyb_ure_sig])

In [None]:
with pd.ExcelWriter('MAC_Project_data.xlsx',
                    mode='a') as writer:  
    ure_sig.to_excel(writer, sheet_name='ure_sig')

In [52]:
## turn features column into index feature to use for regression down the line
ure_met_to_col = ure_sig[['feature', 'assoc_bin', 'p']].copy()
ure_met_to_col = ure_met_to_col.pivot_table(index='assoc_bin', columns='feature', values='p')

ure_features = ure_met_to_col.columns[0:]
ure_features

Index(['metabolite_100000007', 'metabolite_100000036', 'metabolite_100000042',
       'metabolite_100000054', 'metabolite_100000263', 'metabolite_100000265',
       'metabolite_100000463', 'metabolite_100000467', 'metabolite_100000491',
       'metabolite_100000551',
       ...
       'metabolite_999948001', 'metabolite_999949463', 'metabolite_999949512',
       'metabolite_999949515', 'metabolite_999949521', 'metabolite_999949637',
       'metabolite_999952025', 'metabolite_999952286', 'metabolite_999952640',
       'metabolite_999952877'],
      dtype='object', name='feature', length=359)

##### White Cell Count ~ metabolites + [covar]

In [53]:
# GENETIC
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"WHITE_CELL_COUNT ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = gen_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
gen_wbc_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
gen_wbc_sig = pd.merge(gen_wbc_sig, meta_names, on = "feature", how = "inner")

In [54]:
# MICROBE
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"WHITE_CELL_COUNT ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = microbe_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
mic_wbc_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
mic_wbc_sig = pd.merge(mic_wbc_sig, meta_names, on = "feature", how = "inner")

In [55]:
# HYBRID
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"WHITE_CELL_COUNT ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = hybrid_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
hyb_wbc_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
hyb_wbc_sig = pd.merge(hyb_wbc_sig, meta_names, on = "feature", how = "inner")

In [56]:
## now, putting the association bins + merging + excel sheet into one code block
gen_wbc_sig['assoc_bin']='genetic'
mic_wbc_sig['assoc_bin']='microbe'
hyb_wbc_sig['assoc_bin']='hybrid'

wbc_sig = pd.concat([gen_wbc_sig, mic_wbc_sig, hyb_wbc_sig])

In [None]:
with pd.ExcelWriter('MAC_Project_data.xlsx',
                    mode='a') as writer:  
    wbc_sig.to_excel(writer, sheet_name='wbc_sig')

In [57]:
## turn features column into index feature to use for regression down the line
wbc_met_to_col = wbc_sig[['feature', 'assoc_bin', 'p']].copy()
wbc_met_to_col = wbc_met_to_col.pivot_table(index='assoc_bin', columns='feature', values='p')

wbc_features = wbc_met_to_col.columns[0:]
wbc_features

Index(['metabolite_100000007', 'metabolite_100000010', 'metabolite_100000014',
       'metabolite_100000036', 'metabolite_100000039', 'metabolite_100000406',
       'metabolite_100000442', 'metabolite_100000463', 'metabolite_100000551',
       'metabolite_100000580',
       ...
       'metabolite_999947642', 'metabolite_999947804', 'metabolite_999947905',
       'metabolite_999947955', 'metabolite_999949515', 'metabolite_999949521',
       'metabolite_999949592', 'metabolite_999949637', 'metabolite_999952877',
       'metabolite_999953267'],
      dtype='object', name='feature', length=252)

##### GFR ~ metabolites + [covar]

In [58]:
# GENETIC
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"GFR_MDRD ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = gen_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
gen_gfr_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
gen_gfr_sig = pd.merge(gen_gfr_sig, meta_names, on = "feature", how = "inner")

In [59]:
# MICROBE
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"GFR_MDRD ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = microbe_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
mic_gfr_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
mic_gfr_sig = pd.merge(mic_gfr_sig, meta_names, on = "feature", how = "inner")

In [60]:
# HYBRID
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"GFR_MDRD ~ {feature} + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature],
        "t_statistic": fitted.tvalues[feature],
        "p": fitted.pvalues[feature],
        "n": fitted.nobs
        }, index=[feature])

args = hybrid_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
hyb_gfr_sig = tests[tests.q <= 0.05]
## merge in names + pathway info to this data
hyb_gfr_sig = pd.merge(hyb_gfr_sig, meta_names, on = "feature", how = "inner")

In [61]:
## now, putting the association bins + merging + excel sheet into one code block
gen_gfr_sig['assoc_bin']='genetic'
mic_gfr_sig['assoc_bin']='microbe'
hyb_gfr_sig['assoc_bin']='hybrid'

gfr_sig = pd.concat([gen_gfr_sig, mic_gfr_sig, hyb_gfr_sig])

In [None]:
with pd.ExcelWriter('MAC_Project_data.xlsx',
                    mode='a') as writer:  
    gfr_sig.to_excel(writer, sheet_name='gfr_sig')

In [62]:
## turn features column into index feature to use for regression down the line
gfr_met_to_col = gfr_sig[['feature', 'assoc_bin', 'p']].copy()
gfr_met_to_col = gfr_met_to_col.pivot_table(index='assoc_bin', columns='feature', values='p')

gfr_features = gfr_met_to_col.columns[0:]
gfr_features

Index(['metabolite_100000014', 'metabolite_100000036', 'metabolite_100000042',
       'metabolite_100000096', 'metabolite_100000257', 'metabolite_100000263',
       'metabolite_100000265', 'metabolite_100000282', 'metabolite_100000406',
       'metabolite_100000445',
       ...
       'metabolite_999949521', 'metabolite_999949555', 'metabolite_999949557',
       'metabolite_999949592', 'metabolite_999949637', 'metabolite_999952025',
       'metabolite_999952286', 'metabolite_999952640', 'metabolite_999952877',
       'metabolite_999952909'],
      dtype='object', name='feature', length=446)

### Mediation

### Moderation

#### Now, need to loop through OLS regressions using activity * metabolite interaction term, looking for a significant interaction term after fdr correction

In [68]:
## note, to do an interaction term, add in the two terms plus :, so activity:metabolite
## testing
## hdl first
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"HDL_CHOL_DIRECT ~ {feature} + activities_distance + {feature}:activities_distance + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params[feature':activities_distance'],
        "t_statistic": fitted.tvalues[feature:'activities_distance'],
        "p": fitted.pvalues['feature:activities_distance'],
        "n": fitted.nobs
        }, index=[feature])

args = hdl_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
hdl_mod = tests[tests.q <= 0.05]
hdl_mod


SyntaxError: invalid syntax (1570778631.py, line 15)

In [66]:
## note, to do an interaction term, add in the two terms plus :, so activity:metabolite
## testing
## hdl first
import statsmodels.formula.api as smf
hdl_mod =  smf.ols('HDL_CHOL_DIRECT ~ metabolite_100000265 + activities_distance + metabolite_100000265:activities_distance + sex + age + BMI_CALC + PC1 +PC2 + PC3 + PC4 + PC5', data=mac_str)
hdl_res = hdl_mod.fit()
hdl_res.summary()

0,1,2,3
Dep. Variable:,HDL_CHOL_DIRECT,R-squared:,0.345
Model:,OLS,Adj. R-squared:,0.34
Method:,Least Squares,F-statistic:,77.89
Date:,"Wed, 13 Sep 2023",Prob (F-statistic):,6.68e-141
Time:,12:50:55,Log-Likelihood:,-1983.3
No. Observations:,1640,AIC:,3991.0
Df Residuals:,1628,BIC:,4055.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.8525,0.156,5.458,0.000,0.546,1.159
sex[T.M],-0.7089,0.045,-15.735,0.000,-0.797,-0.621
metabolite_100000265,-0.1700,0.055,-3.115,0.002,-0.277,-0.063
activities_distance,0.0599,0.014,4.252,0.000,0.032,0.088
metabolite_100000265:activities_distance,0.0133,0.014,0.986,0.324,-0.013,0.040
age,0.0158,0.002,8.762,0.000,0.012,0.019
BMI_CALC,-0.0563,0.003,-17.035,0.000,-0.063,-0.050
PC1,3.7771,1.543,2.448,0.014,0.750,6.804
PC2,0.4642,1.307,0.355,0.722,-2.098,3.027

0,1,2,3
Omnibus:,10.55,Durbin-Watson:,2.008
Prob(Omnibus):,0.005,Jarque-Bera (JB):,10.817
Skew:,0.168,Prob(JB):,0.00448
Kurtosis:,3.214,Cond. No.,4670.0


In [91]:
data = {'coef': hdl_res.params,
        'std err': hdl_res.bse,
        't': hdl_res.tvalues,
        'P>|t|': hdl_res.pvalues,
        '[0.025': hdl_res.conf_int()[0],
        '0.975]': hdl_res.conf_int()[1]}
hdl_data = pd.DataFrame(data).round(3)
hdl_data = hdl_data.drop(labels=['sex[T.M]','age','PC1','PC2','PC3','PC4','PC5','BMI_CALC'])
hdl_data

Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.852,0.156,5.458,0.0,0.546,1.159
metabolite_100000265,-0.17,0.055,-3.115,0.002,-0.277,-0.063
activities_distance,0.06,0.014,4.252,0.0,0.032,0.088
metabolite_100000265:activities_distance,0.013,0.014,0.986,0.324,-0.013,0.04


#### Unused Regressions

##### Metabolites ~ Distance + [covar] 

In [9]:
# GENETIC
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"{feature} ~ activities_distance + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params["activities_distance"],
        "t_statistic": fitted.tvalues["activities_distance"],
        "p": fitted.pvalues["activities_distance"],
        "n": fitted.nobs
        }, index=[feature])

args = gen_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
gene_dist_signifs = tests[tests.q <= 0.05]
gene_dist_signifs

Unnamed: 0,feature,beta,t_statistic,p,n,q
metabolite_55,metabolite_55,0.061625,3.664017,0.0002561816,1651.0,0.010028
metabolite_1242,metabolite_1242,-0.052163,-3.139676,0.001721643,1642.0,0.028392
metabolite_1538,metabolite_1538,-0.088196,-5.289383,1.392653e-07,1649.0,3.8e-05
metabolite_100000054,metabolite_100000054,-0.052076,-3.283774,0.001047693,1521.0,0.022833
metabolite_100000263,metabolite_100000263,0.083847,5.025297,5.579951e-07,1639.0,7.6e-05
metabolite_100000282,metabolite_100000282,-0.061054,-3.591109,0.000339929,1505.0,0.011643
metabolite_100001197,metabolite_100001197,0.077654,4.5443,5.914836e-06,1651.0,0.00054
metabolite_100001253,metabolite_100001253,-0.054117,-3.174542,0.001529736,1586.0,0.027943
metabolite_100001395,metabolite_100001395,0.067098,4.362821,1.364144e-05,1647.0,0.000934
metabolite_100001426,metabolite_100001426,-0.050164,-3.065432,0.002210548,1583.0,0.030285


In [10]:
len(gene_dist_signifs)

26

In [11]:
# MICROBE
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"{feature} ~ activities_distance + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params["activities_distance"],
        "t_statistic": fitted.tvalues["activities_distance"],
        "p": fitted.pvalues["activities_distance"],
        "n": fitted.nobs
        }, index=[feature])

args = microbe_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
microbe_dist_signifs = tests[tests.q <= 0.05]
microbe_dist_signifs

Unnamed: 0,feature,beta,t_statistic,p,n,q
metabolite_62,metabolite_62,0.044474,2.531619,0.01145655,1489.0,0.04439415
metabolite_181,metabolite_181,0.043987,2.663517,0.007808743,1646.0,0.0327123
metabolite_922,metabolite_922,-0.079317,-4.929866,9.129863e-07,1530.0,2.830257e-05
metabolite_1001,metabolite_1001,-0.064381,-3.809932,0.0001441223,1654.0,0.001661116
metabolite_1135,metabolite_1135,-0.066476,-3.546977,0.000402407,1424.0,0.003465171
metabolite_1668,metabolite_1668,-0.048804,-2.709987,0.006805796,1499.0,0.03013995
metabolite_100000014,metabolite_100000014,0.063503,3.800753,0.0001495716,1638.0,0.001661116
metabolite_100000436,metabolite_100000436,-0.051888,-2.850818,0.004423077,1445.0,0.02142428
metabolite_100000447,metabolite_100000447,0.047208,2.567077,0.01036308,1358.0,0.04118659
metabolite_100000467,metabolite_100000467,-0.060036,-3.510827,0.000458922,1634.0,0.003743838


In [12]:
len(microbe_dist_signifs)

40

In [13]:
# HYBRID
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"{feature} ~ activities_distance + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params["activities_distance"],
        "t_statistic": fitted.tvalues["activities_distance"],
        "p": fitted.pvalues["activities_distance"],
        "n": fitted.nobs
        }, index=[feature])

args = hybrid_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
hybrid_dist_signifs = tests[tests.q <= 0.05]
hybrid_dist_signifs

Unnamed: 0,feature,beta,t_statistic,p,n,q
metabolite_240,metabolite_240,0.059287,3.850025,1.226092e-04,1662.0,0.001313
metabolite_250,metabolite_250,0.040474,2.461910,1.392143e-02,1662.0,0.048793
metabolite_302,metabolite_302,-0.047559,-2.670878,7.646839e-03,1515.0,0.029868
metabolite_391,metabolite_391,0.044794,2.762376,5.802438e-03,1646.0,0.025018
metabolite_480,metabolite_480,-0.078410,-4.821319,1.558035e-06,1651.0,0.000043
...,...,...,...,...,...,...
metabolite_999947670,metabolite_999947670,0.058621,3.263538,1.127685e-03,1368.0,0.007641
metabolite_999949515,metabolite_999949515,0.062541,3.904017,9.849561e-05,1628.0,0.001168
metabolite_999949592,metabolite_999949592,0.081916,5.203601,2.200966e-07,1649.0,0.000009
metabolite_999952502,metabolite_999952502,-0.076613,-4.469971,8.357701e-06,1655.0,0.000146


In [14]:
len(hybrid_dist_signifs)

99

##### Metabolites ~ Resting Heart Rate + [covar]

In [20]:
# GENETIC
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"{feature} ~ heartrate_resting + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params["heartrate_resting"],
        "t_statistic": fitted.tvalues["heartrate_resting"],
        "p": fitted.pvalues["heartrate_resting"],
        "n": fitted.nobs
        }, index=[feature])

args = gen_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
gene_hr_signifs = tests[tests.q <= 0.05]
gene_hr_signifs

Unnamed: 0,feature,beta,t_statistic,p,n,q
metabolite_35,metabolite_35,-0.010017,-2.611695,9.120033e-03,1230.0,4.164815e-02
metabolite_179,metabolite_179,-0.009582,-2.564127,1.047356e-02,1126.0,4.628636e-02
metabolite_234,metabolite_234,0.027326,8.098629,1.258188e-15,1323.0,3.447436e-13
metabolite_278,metabolite_278,0.012809,3.666698,2.561612e-04,1243.0,2.807527e-03
metabolite_279,metabolite_279,0.009594,2.816278,4.931841e-03,1309.0,2.702649e-02
...,...,...,...,...,...,...
metabolite_100020204,metabolite_100020204,-0.009836,-2.966780,3.065310e-03,1288.0,1.755789e-02
metabolite_100020241,metabolite_100020241,-0.021674,-5.911384,4.366437e-09,1265.0,1.495505e-07
metabolite_100021123,metabolite_100021123,-0.013384,-3.652579,2.704023e-04,1255.0,2.849624e-03
metabolite_999946970,metabolite_999946970,-0.008996,-2.557442,1.066113e-02,1266.0,4.636747e-02


In [21]:
len(gene_hr_signifs)

66

In [22]:
gene_hr_signifs.sort_values(by="beta", ascending=True)

Unnamed: 0,feature,beta,t_statistic,p,n,q
metabolite_100000787,metabolite_100000787,-0.024991,-7.371307,3.001062e-13,1306.0,4.111454e-11
metabolite_1140,metabolite_1140,-0.024005,-6.450451,1.574563e-10,1297.0,8.628604e-09
metabolite_100020241,metabolite_100020241,-0.021674,-5.911384,4.366437e-09,1265.0,1.495505e-07
metabolite_100001294,metabolite_100001294,-0.021486,-6.210764,7.099935e-10,1299.0,3.242304e-08
metabolite_100001662,metabolite_100001662,-0.020792,-6.135072,1.131238e-09,1296.0,4.427987e-08
...,...,...,...,...,...,...
metabolite_100001856,metabolite_100001856,0.018887,5.268388,1.612702e-07,1293.0,4.418803e-06
metabolite_482,metabolite_482,0.019663,5.678763,1.676385e-08,1293.0,5.103662e-07
metabolite_100009028,metabolite_100009028,0.023353,6.563809,7.585937e-11,1297.0,5.196367e-09
metabolite_313,metabolite_313,0.024166,6.714689,2.833906e-11,1281.0,2.588301e-09


In [23]:
# MICROBE
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"{feature} ~ heartrate_resting + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params["heartrate_resting"],
        "t_statistic": fitted.tvalues["heartrate_resting"],
        "p": fitted.pvalues["heartrate_resting"],
        "n": fitted.nobs
        }, index=[feature])

args = microbe_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
microbe_hr_signifs = tests[tests.q <= 0.05]
microbe_hr_signifs

Unnamed: 0,feature,beta,t_statistic,p,n,q
metabolite_62,metabolite_62,-0.016372,-4.516791,6.926421e-06,1158.0,0.0001064549
metabolite_424,metabolite_424,0.013792,3.849186,0.000124233,1318.0,0.00101348
metabolite_439,metabolite_439,0.010598,2.923452,0.003521398,1316.0,0.01559476
metabolite_561,metabolite_561,0.022687,7.141582,1.535328e-12,1303.0,7.932529e-11
metabolite_891,metabolite_891,0.011027,3.084442,0.002082611,1302.0,0.01041306
metabolite_922,metabolite_922,0.025897,7.841544,9.797522e-15,1206.0,1.518616e-12
metabolite_1135,metabolite_1135,0.011889,3.172043,0.001553348,1172.0,0.008598893
metabolite_1231,metabolite_1231,0.009533,2.68273,0.007394831,1312.0,0.02729045
metabolite_1539,metabolite_1539,0.014575,4.149323,3.55196e-05,1306.0,0.0003932527
metabolite_100000014,metabolite_100000014,-0.012832,-3.554651,0.0003922993,1287.0,0.002643756


In [24]:
len(microbe_hr_signifs)

50

In [25]:
microbe_hr_signifs.sort_values(by="beta", ascending=True)

Unnamed: 0,feature,beta,t_statistic,p,n,q
metabolite_62,metabolite_62,-0.016372,-4.516791,6.926421e-06,1158.0,0.0001064549
metabolite_100001977,metabolite_100001977,-0.014597,-4.445798,9.523537e-06,1276.0,0.0001135499
metabolite_100000014,metabolite_100000014,-0.012832,-3.554651,0.0003922993,1287.0,0.002643756
metabolite_999952877,metabolite_999952877,-0.012546,-3.531914,0.0004277325,1248.0,0.002762439
metabolite_999947708,metabolite_999947708,-0.012451,-3.033814,0.002478426,991.0,0.01129871
metabolite_999948001,metabolite_999948001,-0.012043,-3.307125,0.0009706729,1203.0,0.005786704
metabolite_100005352,metabolite_100005352,-0.011894,-3.648112,0.0002747175,1299.0,0.002027677
metabolite_100001384,metabolite_100001384,-0.011845,-3.406382,0.0006814107,1142.0,0.004224746
metabolite_999947905,metabolite_999947905,-0.011726,-3.574013,0.0003644282,1309.0,0.002567563
metabolite_999946674,metabolite_999946674,-0.011412,-3.18858,0.001465318,1256.0,0.008412012


In [26]:
# HYBRID
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"{feature} ~ heartrate_resting + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params["heartrate_resting"],
        "t_statistic": fitted.tvalues["heartrate_resting"],
        "p": fitted.pvalues["heartrate_resting"],
        "n": fitted.nobs
        }, index=[feature])

args = hybrid_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
hybrid_hr_signifs = tests[tests.q <= 0.05]
hybrid_hr_signifs

Unnamed: 0,feature,beta,t_statistic,p,n,q
metabolite_93,metabolite_93,0.013660,3.799834,1.515510e-04,1298.0,7.509690e-04
metabolite_240,metabolite_240,-0.008136,-2.585905,9.819835e-03,1316.0,2.694368e-02
metabolite_250,metabolite_250,-0.010034,-2.904678,3.738441e-03,1311.0,1.170908e-02
metabolite_297,metabolite_297,0.022105,6.213295,6.972472e-10,1310.0,1.929051e-08
metabolite_391,metabolite_391,-0.013563,-3.803073,1.496255e-04,1294.0,7.509690e-04
...,...,...,...,...,...,...
metabolite_999946613,metabolite_999946613,-0.010366,-2.989303,2.849844e-03,1282.0,9.370930e-03
metabolite_999946620,metabolite_999946620,-0.008554,-2.372567,1.781500e-02,1265.0,4.447053e-02
metabolite_999949515,metabolite_999949515,-0.010483,-3.136742,1.747642e-03,1279.0,6.430393e-03
metabolite_999949592,metabolite_999949592,-0.018319,-5.492037,4.782783e-08,1297.0,6.616183e-07


In [27]:
len(hybrid_hr_signifs)

136

In [28]:
hybrid_hr_signifs.sort_values(by="beta", ascending=True)

Unnamed: 0,feature,beta,t_statistic,p,n,q
metabolite_100009007,metabolite_100009007,-0.023871,-7.832992,9.787206e-15,1319.0,1.624676e-12
metabolite_563,metabolite_563,-0.020061,-5.680185,1.661494e-08,1298.0,2.903242e-07
metabolite_100009020,metabolite_100009020,-0.019934,-6.263689,5.100057e-10,1310.0,1.539290e-08
metabolite_100009009,metabolite_100009009,-0.019917,-5.986767,2.763000e-09,1314.0,5.395976e-08
metabolite_100001083,metabolite_100001083,-0.018990,-5.520755,4.084457e-08,1287.0,5.895826e-07
...,...,...,...,...,...,...
metabolite_100010925,metabolite_100010925,0.022389,6.026545,2.290685e-09,1097.0,4.918355e-08
metabolite_100010940,metabolite_100010940,0.022712,7.028306,3.375265e-12,1302.0,3.735294e-10
metabolite_100009026,metabolite_100009026,0.022874,6.757444,2.121908e-11,1298.0,1.408947e-09
metabolite_823,metabolite_823,0.023889,6.855432,1.099508e-11,1299.0,9.125919e-10


##### Metabolites ~ Sleep Efficiency + [covar]

In [29]:
# GENETIC
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"{feature} ~ sleep_efficiency + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params["sleep_efficiency"],
        "t_statistic": fitted.tvalues["sleep_efficiency"],
        "p": fitted.pvalues["sleep_efficiency"],
        "n": fitted.nobs
        }, index=[feature])

args = gen_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
gene_sleep_signifs = tests[tests.q <= 0.05]
gene_sleep_signifs

Unnamed: 0,feature,beta,t_statistic,p,n,q


In [30]:
# MICROBE
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"{feature} ~ sleep_efficiency + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params["sleep_efficiency"],
        "t_statistic": fitted.tvalues["sleep_efficiency"],
        "p": fitted.pvalues["sleep_efficiency"],
        "n": fitted.nobs
        }, index=[feature])

args = microbe_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
microbe_sleep_signifs = tests[tests.q <= 0.05]
microbe_sleep_signifs

Unnamed: 0,feature,beta,t_statistic,p,n,q


In [31]:
# HYBRID
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"{feature} ~ sleep_efficiency + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params["sleep_efficiency"],
        "t_statistic": fitted.tvalues["sleep_efficiency"],
        "p": fitted.pvalues["sleep_efficiency"],
        "n": fitted.nobs
        }, index=[feature])

args = hybrid_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
hybrid_sleep_signifs = tests[tests.q <= 0.05]
hybrid_sleep_signifs

Unnamed: 0,feature,beta,t_statistic,p,n,q


##### Sedentary Minutes ~ Metabolites + [covar]

In [36]:
# GENETIC
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"{feature} ~ activities_minutesSedentary + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params["activities_minutesSedentary"],
        "t_statistic": fitted.tvalues["activities_minutesSedentary"],
        "p": fitted.pvalues["activities_minutesSedentary"],
        "n": fitted.nobs
        }, index=[feature])

args = gen_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
gene_seden_signifs = tests[tests.q <= 0.05]
gene_seden_signifs

Unnamed: 0,feature,beta,t_statistic,p,n,q


In [37]:
# MICROBE
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"{feature} ~ activities_minutesSedentary + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params["activities_minutesSedentary"],
        "t_statistic": fitted.tvalues["activities_minutesSedentary"],
        "p": fitted.pvalues["activities_minutesSedentary"],
        "n": fitted.nobs
        }, index=[feature])

args = microbe_only_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
microbe_seden_signifs = tests[tests.q <= 0.05]
microbe_seden_signifs

Unnamed: 0,feature,beta,t_statistic,p,n,q


In [38]:
# HYBRID
def get_results(feature):
    
    """Get a single association.
    
    As long as `args` and `df` are assigned this can be used on
    any data set.
    """
    formula = f"{feature} ~ activities_minutesSedentary + C(sex) + age + BMI_CALC + PC1 + PC2 + PC3 + PC4 + PC5"
    fitted = ols(formula, data=df).fit()
    return pd.DataFrame({
        "feature": feature,
        "beta": fitted.params["activities_minutesSedentary"],
        "t_statistic": fitted.tvalues["activities_minutesSedentary"],
        "p": fitted.pvalues["activities_minutesSedentary"],
        "n": fitted.nobs
        }, index=[feature])

args = hybrid_features
df = mac_str
results = map(get_results, track(args))
results = list(results)
tests = pd.concat(results)
tests["q"] = multipletests(tests.p, method="fdr_bh")[1]
hybrid_seden_signifs = tests[tests.q <= 0.05]
hybrid_seden_signifs

Unnamed: 0,feature,beta,t_statistic,p,n,q
metabolite_100002784,metabolite_100002784,0.000445,3.814546,0.000142,1596.0,0.047033
