# Residuals for metabolites

Since it is difficult to control for confounders within a train/validation split, we will remove common confounder effects from all metabolites before running the split. For that we start by merging the covariates into our initial sample table.

In [1]:
import warnings
warnings.simplefilter("ignore")

## Merge covariates

The covariates we want to control for are sex, age, BMI, metabolomics batch, stool sample vendor, and the first 5 principal components of genetic ancestry. We can get sex and age from the microbiome table, and derive the stool vendor from the sample IDs in the same table.

In [2]:
import arivale_data_interface as adi
import pandas as pd

# Sample table linking each participant to genome, blood and stool sample IDs.
# Client IDs are read as strings so their leading zeros are preserved.
with_all = pd.read_csv(
    "data/all_feature_types.csv",
    dtype={"public_client_id": str, "blood_days_in_program": "float64"},
)

# Sex and age come from the 16S metadata. The part of `id` before the first
# "|" is used as the stool sample ID; every sample starts out labelled as
# DNAGenotek and is relabelled below if its ID starts with "AV15".
clients = (
    pd.read_csv("/proj/arivale/microbiome/16S_processed/metadata.csv")[["sex", "age", "id"]]
    .assign(
        stool_sample_id=lambda meta: meta["id"].str.split("|").str[0],
        stool_vendor="DNAGenotek",
    )
)
from_second_genome = clients["id"].str.startswith("AV15")
clients.loc[from_second_genome, "stool_vendor"] = "SecondGenome"
clients = clients.drop(columns="id")

# Inner join (pandas default): samples without matching metadata are dropped.
merged = with_all.merge(clients.drop_duplicates(), on="stool_sample_id")
merged

Unnamed: 0,public_client_id,genome_id,blood_sample_id,blood_days_in_program,stool_sample_id,stool_days_in_program,sex,age,stool_vendor
0,01000261,A477AV494-007,A477AV558-002,65.0,AV15-1592,77.0,F,49.0,SecondGenome
1,01002183,A595AV283-007,A595AV320-002,13.0,AV15-1668,28.0,F,67.0,SecondGenome
2,01002412,A294AU436-007,A294AU415-002,13.0,AV15-1446,13.0,F,52.0,SecondGenome
3,01003555,DS-267525,A972BH438-002,15.0,22001612560494,8.0,M,47.0,DNAGenotek
4,01003758,DS-279359,A706BO778-007,417.0,22001805511668,432.0,F,32.0,DNAGenotek
...,...,...,...,...,...,...,...,...,...
1564,01995656,DS-268965,A308BH458-003,38.0,22001612560620,41.0,F,48.0,DNAGenotek
1565,01995874,A968AR531-005,A968AR545-002,28.0,AV15-1032,28.0,M,69.0,SecondGenome
1566,01997759,DS-282699,A850BJ121-008,6.0,22001701511388,7.0,F,44.0,DNAGenotek
1567,01997909,A162AU015-007,A162AU063-002,10.0,AV15-1368,10.0,M,51.0,SecondGenome


Now we will merge in the BMI from the weight history, using for each blood draw the weight record that is closest in time.

In [3]:
weight_columns = ["public_client_id", "WEIGHT_CALC", "BMI_CALC", "days_in_program"]
weights = adi.get_snapshot("weight")[weight_columns].dropna()

# merge_asof requires both inputs to be sorted by their time keys.
samples_by_draw_day = merged.sort_values(by="blood_days_in_program")
weights_by_day = weights.sort_values(by="days_in_program")

# Per participant, attach the weight record closest in time to the blood draw.
# No tolerance is given, so the nearest record is used however far away it is.
merged = pd.merge_asof(
    samples_by_draw_day,
    weights_by_day,
    left_on="blood_days_in_program",
    right_on="days_in_program",
    by="public_client_id",
    direction="nearest",
)
# The weight timestamp was only needed for matching.
merged = merged.drop(columns=["days_in_program"])

  and should_run_async(code)


In [4]:
# Inspect the table after the weight merge; WEIGHT_CALC and BMI_CALC should now be present.
merged

  and should_run_async(code)


Unnamed: 0,public_client_id,genome_id,blood_sample_id,blood_days_in_program,stool_sample_id,stool_days_in_program,sex,age,stool_vendor,WEIGHT_CALC,BMI_CALC
0,01859111,DS-280477,A926BI442-002,0.0,22001612561271,15.0,M,29.0,DNAGenotek,190.0,27.259184
1,01789546,A976AR206-005,A976AR221-002,1.0,AV15-1094,1.0,F,87.0,SecondGenome,117.0,18.692937
2,01764086,339149,A976AR212-002,1.0,AV15-1184,21.0,M,73.0,SecondGenome,207.6,29.784245
3,01725364,DS-265606,A579BF775-003,1.0,22001612560477,13.0,M,56.0,DNAGenotek,260.0,38.391094
4,01363795,DS-280282,A439BJ778-007,1.0,22001701510768,8.0,F,40.0,DNAGenotek,230.0,34.458948
...,...,...,...,...,...,...,...,...,...,...,...
1564,01003758,DS-279359,A706BO778-007,417.0,22001805511668,432.0,F,32.0,DNAGenotek,150.0,24.207989
1565,01179898,DS-268522,A308BM821-002,420.0,22001701512279,435.0,M,51.0,DNAGenotek,253.6,34.390586
1566,01198770,DS-283161,A021BP586-003,425.0,22001805511646,433.0,M,45.0,DNAGenotek,165.0,24.363579
1567,01178677,DS-269561,A581BM818-003,428.0,22001803513473,446.0,F,62.0,DNAGenotek,141.0,24.199951


Now we add in the metabolomics batch.

In [5]:
# Look up the BATCH_DATE of each metabolomics sample and attach it to the
# covariates via the blood sample ID (inner join: unmatched rows are dropped).
batch_columns = ["sample_id", "BATCH_DATE"]
mets = adi.get_snapshot("metabolomics_samples")[batch_columns]
merged = merged.merge(mets, left_on="blood_sample_id", right_on="sample_id")

  and should_run_async(code)


And finally the ancestry.

In [6]:
# First five ancestry principal components, keeping only the first record
# found for each participant.
pc_columns = ["PC1", "PC2", "PC3", "PC4", "PC5"]
ancestry = (
    adi.get_snapshot("genetics_ancestry")[["public_client_id"] + pc_columns]
    .drop_duplicates(subset=["public_client_id"])
)
# Inner join: participants without ancestry components are dropped.
merged = merged.merge(ancestry, on="public_client_id")
merged

  and should_run_async(code)


Unnamed: 0,public_client_id,genome_id,blood_sample_id,blood_days_in_program,stool_sample_id,stool_days_in_program,sex,age,stool_vendor,WEIGHT_CALC,BMI_CALC,sample_id,BATCH_DATE,PC1,PC2,PC3,PC4,PC5
0,01859111,DS-280477,A926BI442-002,0.0,22001612561271,15.0,M,29.0,DNAGenotek,190.0,27.259184,A926BI442-002,2017-12-14,0.006696,-0.002037,-0.002971,0.004407,0.002763
1,01789546,A976AR206-005,A976AR221-002,1.0,AV15-1094,1.0,F,87.0,SecondGenome,117.0,18.692937,A976AR221-002,2016-05-27,0.006875,-0.001322,-0.002180,0.001956,0.002735
2,01764086,339149,A976AR212-002,1.0,AV15-1184,21.0,M,73.0,SecondGenome,207.6,29.784245,A976AR212-002,2016-05-27,0.002915,0.004413,0.011031,-0.052312,-0.000650
3,01725364,DS-265606,A579BF775-003,1.0,22001612560477,13.0,M,56.0,DNAGenotek,260.0,38.391094,A579BF775-003,2019-03-04,0.006246,-0.002646,-0.004597,0.004217,0.001866
4,01363795,DS-280282,A439BJ778-007,1.0,22001701510768,8.0,F,40.0,DNAGenotek,230.0,34.458948,A439BJ778-007,2018-10-31,0.006186,-0.001918,-0.001869,0.003208,0.002379
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1564,01003758,DS-279359,A706BO778-007,417.0,22001805511668,432.0,F,32.0,DNAGenotek,150.0,24.207989,A706BO778-007,2018-10-31,0.006133,-0.000789,-0.004071,0.006642,0.001519
1565,01179898,DS-268522,A308BM821-002,420.0,22001701512279,435.0,M,51.0,DNAGenotek,253.6,34.390586,A308BM821-002,2018-09-28,0.006350,-0.002005,-0.002000,0.007206,0.003287
1566,01198770,DS-283161,A021BP586-003,425.0,22001805511646,433.0,M,45.0,DNAGenotek,165.0,24.363579,A021BP586-003,2018-10-31,0.006631,-0.001006,-0.002126,0.000773,0.002054
1567,01178677,DS-269561,A581BM818-003,428.0,22001803513473,446.0,F,62.0,DNAGenotek,141.0,24.199951,A581BM818-003,2018-10-31,-0.022400,-0.011723,-0.009898,0.000202,0.005539


Now we have everything assembled to fit our residuals.

## Processing metabolomics data

Now we will filter our metabolomics data, log-transform them, and remove the effects of our covariates by regressing them out and keeping the residuals.

In [7]:
# Keep only the metabolomics samples that appear in the initial sample table.
metabolites = adi.get_snapshot("metabolomics_corrected")
in_sample_table = metabolites["sample_id"].isin(with_all["blood_sample_id"])
metabolites = metabolites.loc[in_sample_table]

# Everything from the ninth column onwards is treated as a metabolite.
# NOTE(review): this relies on the snapshot's column order — confirm that the
# first eight columns are all metadata.
metabolite_features = metabolites.columns[8:]

# Discard metabolites that are missing in more than 25% of the kept samples.
missing_fraction = metabolites[metabolite_features].isnull().sum() / len(metabolites)
bad = missing_fraction > 0.25
metabolite_features = bad.index[~bad.values]
metabolites = metabolites.drop(columns=bad.index[bad.values])
metabolite_features.shape

  and should_run_async(code)


(930,)

Now we log transform.

In [8]:
import numpy as np

# Natural-log transform of every retained metabolite column.
log_values = np.log(metabolites[metabolite_features])
metabolites[metabolite_features] = log_values

# Prefix the metabolite columns with "metabolite_", rename the sample key so
# it matches the covariate table, and drop the timing columns.
prefixed_features = "metabolite_" + metabolite_features
metabolites = (
    metabolites
    .rename(columns=dict(zip(metabolite_features, prefixed_features)))
    .rename(columns={"sample_id": "blood_sample_id"})
    .drop(columns=["days_in_program", "days_since_first_call", "days_since_first_draw", "month", "weekday", "season"])
)
metabolite_features = prefixed_features

# Attach the covariates (inner join on participant and blood sample).
# NOTE(review): the covariate table displayed earlier has 1568 rows, but the
# train/validation split at the end reports 1000 + 569 = 1569 rows. This
# suggests at least one (public_client_id, blood_sample_id) pair occurs more
# than once in the metabolomics snapshot and is duplicated here — verify.
metabolites = pd.merge(merged, metabolites, on=["public_client_id", "blood_sample_id"])
metabolites.to_csv("data/metabolites_raw.csv", index=False)

  and should_run_async(code)


And finally we regress out the covariates and keep the residuals.

In [9]:
from rich.progress import track
import statsmodels.formula.api as smf

# Covariate model fitted separately to every metabolite; "{}" is replaced by
# the metabolite column name. Adjacent literals concatenate to one string.
formula = (
    "{} ~ C(sex) + C(BATCH_DATE) + C(stool_vendor) + BMI_CALC"
    " + scale(age) + I(scale(age)**2)"
    " + scale(age)*C(sex) + I(scale(age)**2)*C(sex)"
    " + PC1 + PC2 + PC3 + PC4 + PC5"
)

# Residuals overwrite the metabolite columns of a copy, so the log-scale
# values in `metabolites` stay untouched.
metabolites_resid = metabolites.copy()
metrics = pd.DataFrame(index=metabolite_features, columns=["r2", "p"], dtype="float64")

for met in track(metabolite_features):
    model = smf.ols(formula.format(met), data=metabolites).fit()
    # Record R² and the p-value of the overall F-test for this metabolite.
    metrics.loc[met, ["r2", "p"]] = [model.rsquared, model.f_pvalue]
    # Column assignment aligns on the row index; rows the fit left out
    # (presumably those with missing values) become NaN — TODO confirm.
    metabolites_resid[met] = model.resid

  from collections import Mapping


Output()

In [10]:
# Summarise the distribution of R² and overall F-test p-values across all fitted metabolite models.
metrics.describe()

  and should_run_async(code)


Unnamed: 0,r2,p
count,930.0,930.0
mean,0.124294,0.0005797584
std,0.067177,0.007287394
min,0.018925,9.958736e-250
25%,0.075115,1.917506e-40
50%,0.112036,7.298625000000001e-25
75%,0.160782,1.478513e-13
max,0.583349,0.1648566


And we save the residuals for later use.

In [11]:
# The residual table is written together with its row index as first column.
# NOTE(review): the other sample tables in this notebook are saved with
# index=False — confirm that readers of this file expect the index column.
metabolites_resid.to_csv("data/metabolites_residuals.csv", index=True)
# Fit statistics per metabolite; the row labels (metabolite names) are kept.
metrics.to_csv("data/confounder_fits.csv")

  and should_run_async(code)


## Create train/validation data

Finally we create our train and validation data.

In [12]:
import random
import numpy as np

# Draw a reproducible random subset of 1000 rows for training; every row that
# was not drawn goes into the validation set.
random.seed(42)
all_rows = metabolites_resid.index.tolist()
train_idx = random.sample(all_rows, k=1000)
in_train = metabolites_resid.index.isin(train_idx)
train = metabolites_resid.loc[train_idx]
valid = metabolites_resid.loc[~in_train]

train.to_csv("data/train.csv", index=False)

# Header-less, tab-separated phenotype table for the training rows: the
# genome ID twice, followed by one column per metabolite.
# NOTE(review): any missing residuals are written as empty fields (pandas
# default na_rep) — confirm the consumer of this file accepts that.
phenotype_columns = ["genome_id"] * 2 + list(metabolite_features)
phenotype = train[phenotype_columns]
phenotype.to_csv("data/train_phenotype.tsv", sep="\t", header=False, index=False)
valid.to_csv("data/valid.csv", index=False)

# 1-based lookup table in the same order as the metabolite columns of the
# phenotype file.
metabolite_indices = pd.DataFrame({
    "index": list(range(1, len(metabolite_features) + 1)),
    "metabolite": metabolite_features,
})
metabolite_indices.to_csv("data/met_indices.csv", index=False)

print(f"{train.shape[0]} training samples and {valid.shape[0]} validation samples.")

1000 training samples and 569 validation samples.
