# Genetic associations

Here we will get a curated list of previously identified SNP <-> metabolite associations from two very large studies spanning more than 30K individuals.

In [23]:
import pandas as pd
import warnings

warnings.simplefilter("ignore")

fin = pd.read_excel("data/Yin2022_S3.xlsx", "Sheet1", skiprows=17)
fin["metabolite_id"] = fin["Metabolite ID"].str.replace("^C", "metabolite_")
ukb = pd.read_excel("data/Hysi_Mangino_2022_S3.xlsx", "Table S3")

Now we merge this in with our metabolon data.

In [41]:
import arivale_data_interface as adi

met_meta = adi.get_snapshot("metabolomics_metadata")[["CHEMICAL_ID", "BIOCHEMICAL_NAME"]]
met_meta["metabolite_id"] = "metabolite_" + met_meta.CHEMICAL_ID.astype(str)
id_to_name = pd.Series(met_meta.BIOCHEMICAL_NAME.values, index=met_meta.metabolite_id.values)
name_to_id = pd.Series(met_meta.metabolite_id.values, index=met_meta.BIOCHEMICAL_NAME.values)

In [44]:
fin_matched = fin[fin.metabolite_id.isin(name_to_id)][["rsID", "metabolite_id", "P", "Putative causal gene", ]]
fin_matched.columns = ["rsid", "metabolite_id", "p", "gene"]
fin_matched["study"] = "Yin et. al. 2022"
fin_matched["biochemical_name"] = id_to_name[fin_matched.metabolite_id].values
fin_matched

Unnamed: 0,rsid,metabolite_id,p,gene,study,biochemical_name
0,rs561463382,metabolite_1442,6.853000e-37,unknown,Yin et. al. 2022,beta-hydroxyisovalerate
2,rs4646080,metabolite_100000096,7.368000e-47,AGMAT,Yin et. al. 2022,4-guanidinobutanoate
3,rs3980014,metabolite_100000096,1.837000e-201,AGMAT,Yin et. al. 2022,4-guanidinobutanoate
4,rs113177297,metabolite_100002769,1.285000e-15,AGMAT,Yin et. al. 2022,argininate*
5,rs61757683,metabolite_35,9.200000e-36,ALDH4A1,Yin et. al. 2022,S-1-pyrroline-5-carboxylate
...,...,...,...,...,...,...
2023,rs5905042,metabolite_100002017,3.263000e-57,unknown,Yin et. al. 2022,"5alpha-androstan-3alpha,17beta-diol disulfate"
2025,rs5905042,metabolite_100002026,2.693000e-15,unknown,Yin et. al. 2022,"androstenediol (3alpha, 17alpha) monosulfate (2)"
2026,rs5905042,metabolite_100006005,1.652000e-57,unknown,Yin et. al. 2022,"5alpha-androstan-3alpha,17beta-diol monosulfat..."
2028,,metabolite_189,5.932000e-11,TMLHE,Yin et. al. 2022,"N6,N6,N6-trimethyllysine"


In [46]:
ukb_matched = ukb[ukb.Metabolite.isin(id_to_name)][["SNP ID", "P", "Gene", "Metabolite"]]
ukb_matched.columns = ["rsid", "p", "gene", "biochemical_name"]
ukb_matched["study"] = "Hysi, Mangino et. al. 2022"
ukb_matched["metabolite_id"] = name_to_id[ukb_matched.biochemical_name].values
ukb_matched

Unnamed: 0,rsid,p,gene,biochemical_name,study,metabolite_id
0,rs7604682,0.000000e+00,ALMS1,N-acetylphenylalanine,"Hysi, Mangino et. al. 2022",metabolite_100001256
1,rs887829,0.000000e+00,UGT1A8,"bilirubin (Z,Z)","Hysi, Mangino et. al. 2022",metabolite_1090
2,rs55826256,0.000000e+00,ACOT6,X - 24309,"Hysi, Mangino et. al. 2022",metabolite_999952504
5,chr2:234664586,0.000000e+00,UGT1A8,biliverdin,"Hysi, Mangino et. al. 2022",metabolite_250
6,rs12367888,0.000000e+00,SLCO1B1,glycochenodeoxycholate glucuronide (1),"Hysi, Mangino et. al. 2022",metabolite_100009264
...,...,...,...,...,...,...
901,rs2286963,9.260000e-247,ACADL,X - 13431,"Hysi, Mangino et. al. 2022",metabolite_999947788
903,rs45446698,3.560000e-286,CYP3A5,epiandrosterone sulfate,"Hysi, Mangino et. al. 2022",metabolite_100001287
907,rs887829,3.670000e-301,UGT1A8,"bilirubin (E,Z or Z,E)*","Hysi, Mangino et. al. 2022",metabolite_100001951
909,rs102275,1.350000e-309,TMEM258,1-stearoyl-2-arachidonoyl-GPC (18:0/20:4),"Hysi, Mangino et. al. 2022",metabolite_100001869


In [51]:
associations = pd.concat([fin_matched, ukb_matched]).drop_duplicates(subset=["rsid", "metabolite_id"])
associations.to_csv("data/genetic_associations.csv", index=False)
associations

Unnamed: 0,rsid,metabolite_id,p,gene,study,biochemical_name
0,rs561463382,metabolite_1442,6.853000e-37,unknown,Yin et. al. 2022,beta-hydroxyisovalerate
2,rs4646080,metabolite_100000096,7.368000e-47,AGMAT,Yin et. al. 2022,4-guanidinobutanoate
3,rs3980014,metabolite_100000096,1.837000e-201,AGMAT,Yin et. al. 2022,4-guanidinobutanoate
4,rs113177297,metabolite_100002769,1.285000e-15,AGMAT,Yin et. al. 2022,argininate*
5,rs61757683,metabolite_35,9.200000e-36,ALDH4A1,Yin et. al. 2022,S-1-pyrroline-5-carboxylate
...,...,...,...,...,...,...
897,rs35754645,metabolite_100003696,5.600000e-228,UGT1A8,"Hysi, Mangino et. al. 2022",succinimide
898,rs99780,metabolite_100008914,4.590000e-231,FADS2,"Hysi, Mangino et. al. 2022",1-palmitoyl-2-arachidonoyl-GPC (16:0/20:4n6)
901,rs2286963,metabolite_999947788,9.260000e-247,ACADL,"Hysi, Mangino et. al. 2022",X - 13431
909,rs102275,metabolite_100001869,1.350000e-309,TMEM258,"Hysi, Mangino et. al. 2022",1-stearoyl-2-arachidonoyl-GPC (18:0/20:4)


## Compare with the identified markers

Let's see how many of the markers we can identify in our dataset.

In [54]:
from pyplink import PyPlink

genotype = PyPlink('input_bed/all_chr/all_genomes_09112019_all_chr')
markers = genotype.get_geno_marker()

TypeError: get_geno_marker() missing 1 required positional argument: 'marker'

In [56]:
markers = genotype.get_bim()

In [57]:
markers

Unnamed: 0_level_0,chrom,pos,cm,a1,a2
snp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rs3107975,1,55326,0,C,T
rs62641298;rs2462495;rs62641298,1,79033,0,G,A
rs114608975,1,86028,0,C,T
rs200369601,1,92675,0,G,A
rs56055731;rs8179466;rs56055731,1,234313,0,T,C
...,...,...,...,...,...
rs114553188,22,51222100,0,T,G
rs375798137;rs6010090;rs375798137,22,51223637,0,A,G
rs9616985,22,51229805,0,C,T
rs376461333,22,51232488,0,G,A


In [61]:
markers["rsid"] = markers.index.str.split(";").str[0]

In [63]:
found = associations[associations.rsid.isin(markers.rsid.unique())]

In [69]:
associations[associations.biochemical_name.str.contains("deoxychol")]

Unnamed: 0,rsid,metabolite_id,p,gene,study,biochemical_name
443,rs13121671,metabolite_100020550,0.0,UGT2B15|UGT2B17,Yin et. al. 2022,deoxycholic acid glucuronide
836,rs10504255,metabolite_302,3.66e-13,CYP7A1,Yin et. al. 2022,deoxycholate
1003,rs146405172,metabolite_100009264,2.941e-14,ABCC2,Yin et. al. 2022,glycochenodeoxycholate glucuronide (1)
1249,rs189206203,metabolite_100006641,2.805e-11,SLCO1A2|SLCO1B1|SLCO1B3,Yin et. al. 2022,glycochenodeoxycholate 3-sulfate
1275,rs4149056,metabolite_100006642,1.498e-85,SLCO1A2|SLCO1B1|SLCO1B3,Yin et. al. 2022,glycodeoxycholate 3-sulfate
1276,rs4149056,metabolite_100006644,1.6589999999999998e-42,SLCO1A2|SLCO1B1|SLCO1B3,Yin et. al. 2022,taurodeoxycholic acid 3-sulfate
1277,rs4149056,metabolite_100009264,0.0,SLCO1A2|SLCO1B1|SLCO1B3,Yin et. al. 2022,glycochenodeoxycholate glucuronide (1)
1282,rs4149056,metabolite_100021712,1.246e-53,SLCO1A2|SLCO1B1|SLCO1B3,Yin et. al. 2022,deoxycholic acid 12-sulfate*
1301,rs73079476,metabolite_100020550,9.416e-45,SLCO1A2|SLCO1B1|SLCO1B3,Yin et. al. 2022,deoxycholic acid glucuronide
1322,rs35380692,metabolite_100006642,4.009e-28,SLCO1A2|SLCO1B1|SLCO1B3,Yin et. al. 2022,glycodeoxycholate 3-sulfate


In [85]:
associations[~associations.rsid.isin(found.rsid) & associations.biochemical_name.str.contains("deoxychol")].rsid.unique()

array(['rs146405172', 'rs189206203', 'rs35380692', 'rs530831645',
       'rs10670440', 'rs12367888', 'chr4:69375591', 'rs11519274'],
      dtype=object)

In [72]:
markers[markers.rsid == "rs367714993"]

Unnamed: 0_level_0,chrom,pos,cm,a1,a2,rsid
snp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [73]:
markers[markers.index.str.contains("rs35380692")]

Unnamed: 0_level_0,chrom,pos,cm,a1,a2,rsid
snp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [82]:
markers[(markers.chrom == 12) & (markers.pos>=21343000)]

Unnamed: 0_level_0,chrom,pos,cm,a1,a2,rsid
snp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
rs2417966,12,21343229,0,G,C,rs2417966
rs2169968,12,21343404,0,G,A,rs2169968
rs73079476,12,21343833,0,C,A,rs73079476
rs67981690,12,21343886,0,G,A,rs67981690
rs10770792,12,21344543,0,A,T,rs10770792
...,...,...,...,...,...,...
rs150985013,12,133832310,0,T,C,rs150985013
rs149717378,12,133832593,0,A,C,rs149717378
rs148530687,12,133833832,0,T,C,rs148530687
rs145462682,12,133838329,0,A,G,rs145462682
