# GWAS data collection

In [113]:
import pandas as pd
from collections import Counter, defaultdict
import json
import requests
from tqdm import tqdm
import re
import copy

Prerequisites:

Obtain the last version of `All associations v1.0.2 - with added ontology annotations, GWAS Catalog study accession numbers and genotyping technology` from GWAS Catalog website: 
https://www.ebi.ac.uk/gwas/docs/file-downloads

GWAS gives us gene-disease associations, therefore we map genes and diseases.

## Opening GWAS `.tsv`

In [2]:
gwas_df = pd.read_table('gwas_catalog_v1.0.2-associations_e110_r2023-09-10.tsv', dtype=str)

In [5]:
gwas_df

Unnamed: 0,DATE ADDED TO CATALOG,PUBMEDID,FIRST AUTHOR,DATE,JOURNAL,LINK,STUDY,DISEASE/TRAIT,INITIAL SAMPLE SIZE,REPLICATION SAMPLE SIZE,...,PVALUE_MLOG,P-VALUE (TEXT),OR or BETA,95% CI (TEXT),PLATFORM [SNPS PASSING QC],CNV,MAPPED_TRAIT,MAPPED_TRAIT_URI,STUDY ACCESSION,GENOTYPING TECHNOLOGY
0,2017-11-30,29059683,Michailidou K,2017-10-23,Nature,www.ncbi.nlm.nih.gov/pubmed/29059683,Association analysis identifies 65 new breast ...,Breast cancer,"76,192 European ancestry cases, 63,082 Europea...","46,785 European ancestry cases, 42,892 Europea...",...,14.301029995663981,(EA),0.0483,[0.036-0.06] unit increase,Illumina [~ 11800000] (imputed),N,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,GCST004988,Genome-wide genotyping array
1,2017-11-30,29059683,Michailidou K,2017-10-23,Nature,www.ncbi.nlm.nih.gov/pubmed/29059683,Association analysis identifies 65 new breast ...,Breast cancer,"76,192 European ancestry cases, 63,082 Europea...","46,785 European ancestry cases, 42,892 Europea...",...,24.0,(EA),0.2482,[0.2-0.3] unit decrease,Illumina [~ 11800000] (imputed),N,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,GCST004988,Genome-wide genotyping array
2,2017-11-30,29059683,Michailidou K,2017-10-23,Nature,www.ncbi.nlm.nih.gov/pubmed/29059683,Association analysis identifies 65 new breast ...,Breast cancer,"76,192 European ancestry cases, 63,082 Europea...","46,785 European ancestry cases, 42,892 Europea...",...,6.096910013008056,(EA),0.0327,[0.02-0.046] unit decrease,Illumina [~ 11800000] (imputed),N,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,GCST004988,Genome-wide genotyping array
3,2017-11-30,29059683,Michailidou K,2017-10-23,Nature,www.ncbi.nlm.nih.gov/pubmed/29059683,Association analysis identifies 65 new breast ...,Breast cancer,"76,192 European ancestry cases, 63,082 Europea...","46,785 European ancestry cases, 42,892 Europea...",...,42.045757490560675,(EA),0.0878,[0.075-0.1] unit decrease,Illumina [~ 11800000] (imputed),N,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,GCST004988,Genome-wide genotyping array
4,2017-11-30,29059683,Michailidou K,2017-10-23,Nature,www.ncbi.nlm.nih.gov/pubmed/29059683,Association analysis identifies 65 new breast ...,Breast cancer,"76,192 European ancestry cases, 63,082 Europea...","46,785 European ancestry cases, 42,892 Europea...",...,10.0,(EA),0.0481,[0.033-0.063] unit decrease,Illumina [~ 11800000] (imputed),N,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,GCST004988,Genome-wide genotyping array
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552111,2023-05-04,35120996,Rhee EP,2022-02-01,Kidney Int,www.ncbi.nlm.nih.gov/pubmed/35120996,Trans-ethnic genome-wide association study of ...,Asparagine levels in chronic kidney disease,822 European ancestry individuals,,...,15.522878745280337,,0.18,[0.14-0.22] unit increase,Illumina [9097073] (imputed),N,asparagine measurement,http://www.ebi.ac.uk/efo/EFO_0009766,GCST90257491,Genome-wide genotyping array
552112,2023-05-04,35120996,Rhee EP,2022-02-01,Kidney Int,www.ncbi.nlm.nih.gov/pubmed/35120996,Trans-ethnic genome-wide association study of ...,Bilirubin levels in chronic kidney disease,822 European ancestry individuals,,...,14.522878745280337,,0.31,[0.23-0.39] unit increase,Illumina [9097073] (imputed),N,bilirubin measurement,http://www.ebi.ac.uk/efo/EFO_0004570,GCST90257494,Genome-wide genotyping array
552113,2023-05-04,35120996,Rhee EP,2022-02-01,Kidney Int,www.ncbi.nlm.nih.gov/pubmed/35120996,Trans-ethnic genome-wide association study of ...,Biliverdin levels in chronic kidney disease,822 European ancestry individuals,,...,18.221848749616356,,0.27,[0.21-0.33] unit increase,Illumina [9097073] (imputed),N,biliverdin measurement,http://www.ebi.ac.uk/efo/EFO_0021033,GCST90257495,Genome-wide genotyping array
552114,2023-05-04,35120996,Rhee EP,2022-02-01,Kidney Int,www.ncbi.nlm.nih.gov/pubmed/35120996,Trans-ethnic genome-wide association study of ...,Dimethylguanido valerate levels in chronic kid...,822 European ancestry individuals,,...,14.522878745280337,,0.6,[0.44-0.76] unit increase,Illumina [9097073] (imputed),N,carboxylic acid measurement,http://www.ebi.ac.uk/efo/EFO_0010468,GCST90257503,Genome-wide genotyping array


In [6]:
gwas_df.columns

Index(['DATE ADDED TO CATALOG', 'PUBMEDID', 'FIRST AUTHOR', 'DATE', 'JOURNAL',
       'LINK', 'STUDY', 'DISEASE/TRAIT', 'INITIAL SAMPLE SIZE',
       'REPLICATION SAMPLE SIZE', 'REGION', 'CHR_ID', 'CHR_POS',
       'REPORTED GENE(S)', 'MAPPED_GENE', 'UPSTREAM_GENE_ID',
       'DOWNSTREAM_GENE_ID', 'SNP_GENE_IDS', 'UPSTREAM_GENE_DISTANCE',
       'DOWNSTREAM_GENE_DISTANCE', 'STRONGEST SNP-RISK ALLELE', 'SNPS',
       'MERGED', 'SNP_ID_CURRENT', 'CONTEXT', 'INTERGENIC',
       'RISK ALLELE FREQUENCY', 'P-VALUE', 'PVALUE_MLOG', 'P-VALUE (TEXT)',
       'OR or BETA', '95% CI (TEXT)', 'PLATFORM [SNPS PASSING QC]', 'CNV',
       'MAPPED_TRAIT', 'MAPPED_TRAIT_URI', 'STUDY ACCESSION',
       'GENOTYPING TECHNOLOGY'],
      dtype='object')

In [7]:
gwas_short_df = gwas_df[
    [
        'PUBMEDID', 
        'DATE',
        'DISEASE/TRAIT','MAPPED_TRAIT', 'MAPPED_TRAIT_URI',
        'REPORTED GENE(S)', 'MAPPED_GENE'
    ]
].dropna(subset=['MAPPED_GENE', 'MAPPED_TRAIT_URI'])

In [8]:
gwas_short_df

Unnamed: 0,PUBMEDID,DATE,DISEASE/TRAIT,MAPPED_TRAIT,MAPPED_TRAIT_URI,REPORTED GENE(S),MAPPED_GENE
0,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,FGF10
1,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,KREMEN1
2,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,TRPS1
4,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,PPP4R3A - CATSPERB
5,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,NF2 - CABP7
...,...,...,...,...,...,...,...
552111,35120996,2022-02-01,Asparagine levels in chronic kidney disease,asparagine measurement,http://www.ebi.ac.uk/efo/EFO_0009766,,ASPG
552112,35120996,2022-02-01,Bilirubin levels in chronic kidney disease,bilirubin measurement,http://www.ebi.ac.uk/efo/EFO_0004570,,"UGT1A9, UGT1A7, UGT1A10, UGT1A8, UGT1A6"
552113,35120996,2022-02-01,Biliverdin levels in chronic kidney disease,biliverdin measurement,http://www.ebi.ac.uk/efo/EFO_0021033,,"UGT1A9, UGT1A5, UGT1A10, UGT1A6, UGT1A7, UGT1A..."
552114,35120996,2022-02-01,Dimethylguanido valerate levels in chronic kid...,carboxylic acid measurement,http://www.ebi.ac.uk/efo/EFO_0010468,,AGXT2


### Transforming GWAS records to binary interactions (`gwas_short_binary_df`)

In [49]:
gwas_short_binary_df = gwas_short_df.copy()

gwas_short_binary_df['gene'] = gwas_short_df['MAPPED_GENE'].apply(
    lambda x: re.split(', | - ', x)
)

gwas_short_binary_df['trait'] = gwas_short_df['MAPPED_TRAIT_URI'].apply(lambda x: x.split(', '))

In [51]:
gwas_short_binary_df = (
    gwas_short_binary_df
        .explode('trait')
        .explode('gene')
)

In [123]:
gwas_short_binary_df

Unnamed: 0,PUBMEDID,DATE,DISEASE/TRAIT,MAPPED_TRAIT,MAPPED_TRAIT_URI,REPORTED GENE(S),MAPPED_GENE,gene,trait
0,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,FGF10,FGF10,http://www.ebi.ac.uk/efo/EFO_0000305
1,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,KREMEN1,KREMEN1,http://www.ebi.ac.uk/efo/EFO_0000305
2,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,TRPS1,TRPS1,http://www.ebi.ac.uk/efo/EFO_0000305
4,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,PPP4R3A - CATSPERB,PPP4R3A,http://www.ebi.ac.uk/efo/EFO_0000305
4,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,PPP4R3A - CATSPERB,CATSPERB,http://www.ebi.ac.uk/efo/EFO_0000305
...,...,...,...,...,...,...,...,...,...
552113,35120996,2022-02-01,Biliverdin levels in chronic kidney disease,biliverdin measurement,http://www.ebi.ac.uk/efo/EFO_0021033,,"UGT1A9, UGT1A5, UGT1A10, UGT1A6, UGT1A7, UGT1A...",UGT1A8,http://www.ebi.ac.uk/efo/EFO_0021033
552113,35120996,2022-02-01,Biliverdin levels in chronic kidney disease,biliverdin measurement,http://www.ebi.ac.uk/efo/EFO_0021033,,"UGT1A9, UGT1A5, UGT1A10, UGT1A6, UGT1A7, UGT1A...",UGT1A4,http://www.ebi.ac.uk/efo/EFO_0021033
552113,35120996,2022-02-01,Biliverdin levels in chronic kidney disease,biliverdin measurement,http://www.ebi.ac.uk/efo/EFO_0021033,,"UGT1A9, UGT1A5, UGT1A10, UGT1A6, UGT1A7, UGT1A...",UGT1A3,http://www.ebi.ac.uk/efo/EFO_0021033
552114,35120996,2022-02-01,Dimethylguanido valerate levels in chronic kid...,carboxylic acid measurement,http://www.ebi.ac.uk/efo/EFO_0010468,,AGXT2,AGXT2,http://www.ebi.ac.uk/efo/EFO_0010468


## Opening MRCONSO

In [84]:
mrconso_path = (
    '../../UMLS_Metathesaurus/mrconso_and_semtypes_2022AA_df.pkl'
)

In [85]:
mrconso_st_df = pd.read_pickle(mrconso_path)

## Obtaining mappings for traits from EBI (online)

In [9]:
gwas_df['MAPPED_TRAIT_URI'].sample(10)

249534                 http://www.ebi.ac.uk/efo/EFO_0010228
423150    http://www.ebi.ac.uk/efo/EFO_0006336, http://w...
503297                 http://www.ebi.ac.uk/efo/EFO_0803452
142185            http://purl.obolibrary.org/obo/HP_0000155
546456                 http://www.ebi.ac.uk/efo/EFO_0004339
489979                 http://www.ebi.ac.uk/efo/EFO_0022284
274808         http://purl.obolibrary.org/obo/MONDO_0005148
479777                 http://www.ebi.ac.uk/efo/EFO_0004612
301456    http://www.ebi.ac.uk/efo/EFO_0004574, http://w...
478440                 http://www.ebi.ac.uk/efo/EFO_0004340
Name: MAPPED_TRAIT_URI, dtype: object

In [10]:
gwas_traits_s_list = gwas_short_df['MAPPED_TRAIT_URI'].apply(lambda x: x.split(', '))

In [11]:
gwas_traits_list = []
for ch in gwas_traits_s_list:
    gwas_traits_list += ch

In [12]:
gwas_traits_list = list(set(gwas_traits_list))
len(gwas_traits_list)

8284

In [13]:
Counter([p[7:].split('/')[0] for p in gwas_traits_list])

Counter({'www.ebi.ac.uk': 7088,
         'purl.obolibrary.org': 1187,
         'www.orpha.net': 9})

In [14]:
gwas_traits_ebi_list = [l for l in gwas_traits_list if 'ebi' in l]

In [15]:
gwas_traits_ebi_list[:3]

['http://www.ebi.ac.uk/efo/EFO_0803182',
 'http://www.ebi.ac.uk/efo/EFO_0600065',
 'http://www.ebi.ac.uk/efo/EFO_0021713']

In [30]:
gwas_ebi_traits_raw_list = []
bad_links = []

for ent in tqdm(gwas_traits_ebi_list[:]):
    link = f'https://www.ebi.ac.uk/ols/api/ontologies/efo/terms?iri={ent}'
    
    try:
        req = requests.get(link).text

        gwas_ebi_traits_raw_list.append(
            json.loads(req)['_embedded']['terms'][0]
        )
    except:
        bad_links.append(ent)

100%|█████████████████████████████████████████████████████████████████████████████| 7088/7088 [50:46<00:00,  2.33it/s]


In [31]:
len(bad_links)

292

In [32]:
bad_links[:2]

['http://www.ebi.ac.uk/efo/EFO_0022273',
 'http://www.ebi.ac.uk/efo/EFO_0022245']

### Creating mappings dataframe

In [33]:
gwas_ebi_traits_raw_df = pd.DataFrame(gwas_ebi_traits_raw_list)

In [55]:
gwas_ebi_traits_raw_df['obo_xref']

0       [{'database': 'PMID', 'id': '29875488', 'descr...
1       [{'database': 'PMID', 'id': '34309184', 'descr...
2       [{'database': 'PMID', 'id': '33634981', 'descr...
4       [{'database': 'PMID', 'id': '35347128', 'descr...
5       [{'database': 'PMID', 'id': '35347128', 'descr...
                              ...                        
6790    [{'database': 'PMID', 'id': '23823483', 'descr...
6791    [{'database': 'PMID', 'id': ' 32355309', 'desc...
6792    [{'database': 'PMID', 'id': '34396400', 'descr...
6793    [{'database': 'PMID', 'id': '35347128', 'descr...
6795    [{'database': 'PMID', 'id': '28240269', 'descr...
Name: obo_xref, Length: 11438, dtype: object

In [35]:
gwas_ebi_traits_raw_df['db_ids'] = (
    gwas_ebi_traits_raw_df['annotation']
        .apply(lambda x: x.get('database_cross_reference'))
)

In [36]:
gwas_ebi_traits_raw_df = (
    gwas_ebi_traits_raw_df
        .dropna(
            subset=['db_ids']
        )
        .explode('db_ids')
)

In [38]:
gwas_ebi_traits_raw_df['pref'] = gwas_ebi_traits_raw_df['db_ids'].apply(
    lambda x: x.split(':')[0]
)

In [39]:
gwas_ebi_traits_raw_df['ext_id'] = gwas_ebi_traits_raw_df['db_ids'].apply(
    lambda x: x.split(':')[1] if len(x.split(':')) > 1 else None
)

In [40]:
gwas_ebi_traits_raw_df['ext_id']

0        29875488
1        34309184
2        33634981
4        35347128
5        35347128
          ...    
6790     23823483
6791     32355309
6792     34396400
6793     35347128
6795     28240269
Name: ext_id, Length: 11438, dtype: object

In [56]:
gwas_ebi_traits_raw_df

Unnamed: 0,iri,lang,description,synonyms,annotation,label,ontology_name,ontology_prefix,ontology_iri,is_obsolete,...,obo_id,in_subset,obo_definition_citation,obo_xref,obo_synonym,is_preferred_root,_links,db_ids,pref,ext_id
0,http://www.ebi.ac.uk/efo/EFO_0803182,en,[Quantification of transmembrane protein 119 i...,[],{'database_cross_reference': ['PMID:29875488']...,transmembrane protein 119 measurement,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,EFO:0803182,,,"[{'database': 'PMID', 'id': '29875488', 'descr...",,False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,PMID:29875488,PMID,29875488
1,http://www.ebi.ac.uk/efo/EFO_0600065,en,[Any process that results in a change in state...,[],{'database_cross_reference': ['PMID:34309184']...,response to tolvaptan,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,EFO:0600065,,,"[{'database': 'PMID', 'id': '34309184', 'descr...",,False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,PMID:34309184,PMID,34309184
2,http://www.ebi.ac.uk/efo/EFO_0021713,en,[Quantification of the ratio of 3-Indolepropio...,[],{'database_cross_reference': ['PMID:33634981']...,3-Indolepropionic acid to Hippuric acid ratio,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,EFO:0021713,,,"[{'database': 'PMID', 'id': '33634981', 'descr...",,False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,PMID:33634981,PMID,33634981
4,http://www.ebi.ac.uk/efo/EFO_0800125,en,[Quantification of the amount of beta-hydroxyi...,[],{'database_cross_reference': ['PMID:35347128']...,beta-hydroxyisovalerate measurement,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,EFO:0800125,,,"[{'database': 'PMID', 'id': '35347128', 'descr...",,False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,PMID:35347128,PMID,35347128
5,http://www.ebi.ac.uk/efo/EFO_0800773,en,[Quantification of the amount of X-17340 in a ...,[],{'database_cross_reference': ['PMID:35347128']...,X-17340 measurement,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,EFO:0800773,,,"[{'database': 'PMID', 'id': '35347128', 'descr...",,False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,PMID:35347128,PMID,35347128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6790,http://www.ebi.ac.uk/efo/EFO_0010398,en,[The quantification of sphingomyelin 24:1 leve...,[SM 24:1],{'database_cross_reference': ['PMID:23823483']...,sphingomyelin 24:1 measurement,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,EFO:0010398,,,"[{'database': 'PMID', 'id': '23823483', 'descr...",,False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,PMID:23823483,PMID,23823483
6791,http://www.ebi.ac.uk/efo/EFO_0010749,en,[Quantification of some aspect of motor functi...,[motor control measurement],{'database_cross_reference': ['PMID: 32355309'...,motor function measurement,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,EFO:0010749,,,"[{'database': 'PMID', 'id': ' 32355309', 'desc...",,False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,PMID: 32355309,PMID,32355309
6792,http://www.ebi.ac.uk/efo/EFO_0021795,en,"[Infertility caused by fallopian tube damage, ...",[],"{'creator': ['Zoe May Pendlington'], 'database...",tubal factor infertility,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,EFO:0021795,,,"[{'database': 'PMID', 'id': '34396400', 'descr...",,False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,PMID:34396400,PMID,34396400
6793,http://www.ebi.ac.uk/efo/EFO_0800175,en,[Quantification of the amount of quinolinate i...,[],{'database_cross_reference': ['PMID:35347128']...,quinolinate measurement,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,EFO:0800175,,,"[{'database': 'PMID', 'id': '35347128', 'descr...",,False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,PMID:35347128,PMID,35347128


In [43]:
gwas_ebi_traits_raw_df[
    gwas_ebi_traits_raw_df['pref'] == 'UMLS'
]

Unnamed: 0,iri,lang,description,synonyms,annotation,label,ontology_name,ontology_prefix,ontology_iri,is_obsolete,...,obo_id,in_subset,obo_definition_citation,obo_xref,obo_synonym,is_preferred_root,_links,db_ids,pref,ext_id
19,http://www.ebi.ac.uk/efo/EFO_0003870,en,[A congenital or acquired aneurysm within the ...,"[Artery Aneurysms, Basilar, Aneurysm, Intracra...","{'database_cross_reference': ['DOID:10941', 'U...",brain aneurysm,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,EFO:0003870,,[{'definition': 'A congenital or acquired aneu...,"[{'database': 'UMLS', 'id': 'C0007766', 'descr...","[{'name': 'brain aneurysm', 'scope': 'hasExact...",False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,UMLS:C0007766,UMLS,C0007766
42,http://www.ebi.ac.uk/efo/EFO_0004145,en,[A disease of the muscle in which the muscle f...,[myopathy],"{'comment': ['Editor note: TODO check this, re...",myopathy,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,EFO:0004145,,[{'definition': 'A disease of the muscle in wh...,"[{'database': 'ICD9', 'id': '359.8', 'descript...","[{'name': 'myopathy', 'scope': 'hasExactSynony...",False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,UMLS:C0026848,UMLS,C0026848
89,http://www.ebi.ac.uk/efo/EFO_1000824,en,[A disorder caused by a lack of blood flow or ...,"[perinatal depression, Asphyxia - birth, Unspe...",{'comment': ['Pathophysiology: There are three...,asphyxia neonatorum,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,EFO:1000824,[gard_rare],[{'definition': 'A disorder caused by a lack o...,"[{'database': 'DOID', 'id': '11088', 'descript...","[{'name': 'perinatal depression', 'scope': 'ha...",False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,UMLS:C0004045,UMLS,C0004045
95,http://www.ebi.ac.uk/efo/EFO_0004286,en,[Occlusion of the lumen of a vein by a thrombu...,"[VTE, venous thromboembolism]",{'database_cross_reference': ['MedDRA:10066899...,venous thromboembolism,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,EFO:0004286,,[{'definition': 'Occlusion of the lumen of a v...,"[{'database': 'ICD10', 'id': 'I82', 'descripti...","[{'name': 'venous thromboembolism', 'scope': '...",False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,UMLS:C1861172,UMLS,C1861172
99,http://www.ebi.ac.uk/efo/EFO_0003060,en,[A group of at least three distinct histologic...,"[non-small cell carcinoma of lung, NSCLC - non...",{'IAO_0000589': ['non-small cell lung carcinom...,non-small cell lung carcinoma,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,EFO:0003060,,[{'definition': 'A group of at least three dis...,"[{'database': 'MONDO', 'id': '0005233', 'descr...",[{'name': 'NSCLC - non-small cell lung cancer'...,False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,UMLS:C0007131,UMLS,C0007131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6719,http://www.ebi.ac.uk/efo/EFO_0006859,en,[A primary or metastatic malignant neoplasm af...,"[malignant neoplasm of the head and neck, head...",{'closeMatch': ['http://identifiers.org/mesh/D...,head and neck malignant neoplasia,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,EFO:0006859,,[{'definition': 'A primary or metastatic malig...,"[{'database': 'MedDRA', 'id': '10067821', 'des...","[{'name': 'cancer of head and neck', 'scope': ...",False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,UMLS:C0278996,UMLS,C0278996
6759,http://www.ebi.ac.uk/efo/EFO_1001506,en,[An angle-closure glaucoma characterized by cl...,"[primary angle-closure glaucoma, primary angle...","{'database_cross_reference': ['ICD9:365.2', 'U...",primary angle closure glaucoma,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,EFO:1001506,,[{'definition': 'An angle-closure glaucoma cha...,"[{'database': 'ICD9', 'id': '365.20', 'descrip...","[{'name': 'primary angle closure glaucoma', 's...",False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,UMLS:C0017606,UMLS,C0017606
6762,http://www.ebi.ac.uk/efo/EFO_0006788,en,[A category of psychiatric disorders which are...,"[anxiety disorder, neurotic anxiety states, an...","{'database_cross_reference': ['NCIt:C2878', 'D...",anxiety disorder,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,EFO:0006788,,[{'definition': 'A category of psychiatric dis...,"[{'database': 'NCIt', 'id': 'C2878', 'descript...","[{'name': 'anxiety disorder', 'scope': 'hasExa...",False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,UMLS:C0003469,UMLS,C0003469
6766,http://www.ebi.ac.uk/efo/EFO_0000284,en,[Increase in constituent cells in the PROSTATE...,"[Adenomas, Prostatic, Benign Hyperplasia of Pr...",{'IAO_0000589': ['benign prostatic hyperplasia...,benign prostatic hyperplasia,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,EFO:0000284,,[{'definition': 'A non-cancerous nodular enlar...,"[{'database': 'MONDO', 'id': '0010811', 'descr...","[{'name': 'benign hyperplasia of prostate', 's...",False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,UMLS:C1704272,UMLS,C1704272


In [42]:
gwas_ebi_traits_raw_df['pref'].value_counts()

PMID                                           4851
MedDRA                                          673
ICD9                                            604
MONDO                                           475
NCIt                                            471
SNOMEDCT                                        460
DOID                                            419
MeSH                                            417
SCTID                                           407
NCIT                                            385
UMLS                                            384
MESH                                            356
OMIM                                            345
ICD10                                           336
Orphanet                                        159
GARD                                            120
ICD10CM                                         116
EFO                                              87
HP                                               86
Wikipedia   

In [72]:
# come from: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/index.html

# keys refer to gwas_ebi_traits_raw_df['pref'], values refer to Metathesaurus SAB column

umls_external_codes = {
    'ICD9': ['ICD9CM', 'MTHICD9'],
    'MedDRA': ['MDR'],
    'ICD10': ['ICD10'],
    #'UMLS': ['MTHMST'],
    'NCIt': ['NCI'],
    'NCIT': ['NCI'],
    'MeSH': ['MSH'],
    'MESH': ['MSH'],
    'ICD10CM': ['ICD10CM'],
    'SNOMEDCT': ['SNOMEDCT_US'],
    'OMIM': ['OMIM'],
}

In [44]:
gwas_ebi_traits_raw_df.to_pickle(
    'gwas_ebi_traits_raw_df.pkl'
)

In [None]:
gwas_ebi_traits_raw_df = pd.read_pickle('gwas_ebi_traits_raw_df.pkl')

### Splitting mappings to UMLS (direct) and non-UMLS (mrconso)

In [79]:
gwas_ebi_traits_umls_subs_cui_df = (
    gwas_ebi_traits_raw_df[
        gwas_ebi_traits_raw_df['pref'] == 'UMLS'
    ]
)

gwas_ebi_traits_umls_subs_noncui_df = (
    gwas_ebi_traits_raw_df[
        gwas_ebi_traits_raw_df['pref'].isin(umls_external_codes.keys())
    ]
)

#### Direct mappings to UMLS

In [80]:
ebi_link_to_umls_dict = (
    gwas_ebi_traits_umls_subs_cui_df
    [['iri', 'ext_id']]
    .groupby('iri')
    .agg(set)
    ['ext_id']
    .to_dict()
)

In [81]:
ebi_link_to_umls_dict['http://www.ebi.ac.uk/efo/EFO_0000095']

{'C0023434', 'C0855095'}

#### Mappings with MRCONSO

In [82]:
# total unique traits

len(gwas_ebi_traits_raw_df['iri'].drop_duplicates())

5717

In [83]:
# mrconso-compatible unique traits

len(gwas_ebi_traits_umls_subs_noncui_df['iri'].drop_duplicates())

848

In [86]:
mrconso_st_df

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF,sem_types
0,C0000005,ENG,P,L0000005,PF,S0007492,Y,A26634265,,M0019694,D012711,MSH,PEP,D012711,(131)I-Macroaggregated Albumin,,N,256.0,"{Amino Acid, Peptide, or Protein, Indicator, R..."
1,C0000005,ENG,S,L0270109,PF,S0007491,Y,A26634266,,M0019694,D012711,MSH,ET,D012711,(131)I-MAA,,N,256.0,"{Amino Acid, Peptide, or Protein, Indicator, R..."
10,C0000039,ENG,P,L0000039,PF,S17175117,N,A28315139,9194921.0,1926948,,RXNORM,IN,1926948,"1,2-dipalmitoylphosphatidylcholine",,N,256.0,"{Organic Chemical, Pharmacologic Substance}"
11,C0000039,ENG,P,L0000039,PF,S17175117,Y,A28572604,,,,MTH,PN,NOCODE,"1,2-dipalmitoylphosphatidylcholine",,N,256.0,"{Organic Chemical, Pharmacologic Substance}"
12,C0000039,ENG,P,L0000039,VC,S0007564,Y,A0016515,,M0023172,D015060,MSH,MH,D015060,"1,2-Dipalmitoylphosphatidylcholine",,N,,"{Organic Chemical, Pharmacologic Substance}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16857339,C5574640,ENG,S,L17663434,PF,S21286067,Y,A33944162,,Q9004,,HCPCS,AB,Q9004,Va whole health partner serv,,Y,,{Health Care Activity}
16857340,C5574641,ENG,P,L17663430,PF,S21286031,Y,A33944163,,S1091,,HCPCS,PT,S1091,"Stent, non-coronary, temporary, with delivery ...",,N,,{Medical Device}
16857341,C5574641,ENG,S,L17663429,PF,S21286030,Y,A33944560,,S1091,,HCPCS,AB,S1091,Stent non-coronary propel,,Y,,{Medical Device}
16857342,C5574642,ENG,P,L17662944,PF,S21285763,Y,A33944049,,S9432,,HCPCS,PT,S9432,Medical foods for non-inborn errors of metabolism,,N,,{Food}


In [91]:
gwas_ebi_traits_umls_subs_noncui_mrconso_df = pd.merge(
    left=gwas_ebi_traits_umls_subs_noncui_df,
    right=mrconso_st_df[['CUI', 'STR', 'CODE', 'SAB']],
    left_on='ext_id',
    right_on='CODE',
    how='left'
)

In [93]:
gwas_ebi_traits_umls_subs_noncui_mrconso_df['SAB']

0                MSH
1                MSH
2                MSH
3                MSH
4                MSH
            ...     
25879            MSH
25880            MSH
25881    SNOMEDCT_US
25882    SNOMEDCT_US
25883            MDR
Name: SAB, Length: 25884, dtype: object

In [105]:
# enforcing that target codes come from restricted list of sources

gwas_ebi_traits_umls_subs_noncui_mrconso_df = (
    gwas_ebi_traits_umls_subs_noncui_mrconso_df[
        gwas_ebi_traits_umls_subs_noncui_mrconso_df['SAB']
            .isin(sum(umls_external_codes.values(), []))
    ]
)

In [106]:
gwas_ebi_traits_umls_subs_noncui_mrconso_df

Unnamed: 0,iri,lang,description,synonyms,annotation,label,ontology_name,ontology_prefix,ontology_iri,is_obsolete,...,obo_synonym,is_preferred_root,_links,db_ids,pref,ext_id,CUI,STR,CODE,SAB
0,http://www.ebi.ac.uk/efo/EFO_1000965,en,[A systemic non-thrombocytopenic purpura cause...,"[Autoimmune purpura (disorder) [Ambiguous], Al...","{'database_cross_reference': ['DOID:11123', 'M...",Henoch-Schoenlein purpura,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,,False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,MeSH:D011695,MeSH,D011695,C0034152,Henoch-Schoenlein Purpura,D011695,MSH
1,http://www.ebi.ac.uk/efo/EFO_1000965,en,[A systemic non-thrombocytopenic purpura cause...,"[Autoimmune purpura (disorder) [Ambiguous], Al...","{'database_cross_reference': ['DOID:11123', 'M...",Henoch-Schoenlein purpura,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,,False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,MeSH:D011695,MeSH,D011695,C0034152,"Purpura, Henoch-Schonlein",D011695,MSH
2,http://www.ebi.ac.uk/efo/EFO_1000965,en,[A systemic non-thrombocytopenic purpura cause...,"[Autoimmune purpura (disorder) [Ambiguous], Al...","{'database_cross_reference': ['DOID:11123', 'M...",Henoch-Schoenlein purpura,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,,False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,MeSH:D011695,MeSH,D011695,C0034152,"Purpura, Schonlein-Henoch",D011695,MSH
3,http://www.ebi.ac.uk/efo/EFO_1000965,en,[A systemic non-thrombocytopenic purpura cause...,"[Autoimmune purpura (disorder) [Ambiguous], Al...","{'database_cross_reference': ['DOID:11123', 'M...",Henoch-Schoenlein purpura,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,,False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,MeSH:D011695,MeSH,D011695,C0034152,Henoch-Schonlein Purpura,D011695,MSH
4,http://www.ebi.ac.uk/efo/EFO_1000965,en,[A systemic non-thrombocytopenic purpura cause...,"[Autoimmune purpura (disorder) [Ambiguous], Al...","{'database_cross_reference': ['DOID:11123', 'M...",Henoch-Schoenlein purpura,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,,False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,MeSH:D011695,MeSH,D011695,C0034152,Henoch Schonlein Purpuras,D011695,MSH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25879,http://www.ebi.ac.uk/efo/EFO_0004277,en,"[Tissue NECROSIS in any area of the brain, inc...","[ANTERIOR CEREBRAL CIRC INFARCT, Infarction, A...","{'database_cross_reference': ['DOID:3454', 'IC...",brain infarction,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,"[{'name': 'brain infarction', 'scope': 'hasExa...",False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,MESH:D020520,MESH,D020520,C0751955,"Infarcts, Brain",D020520,MSH
25880,http://www.ebi.ac.uk/efo/EFO_0004277,en,"[Tissue NECROSIS in any area of the brain, inc...","[ANTERIOR CEREBRAL CIRC INFARCT, Infarction, A...","{'database_cross_reference': ['DOID:3454', 'IC...",brain infarction,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,"[{'name': 'brain infarction', 'scope': 'hasExa...",False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,MESH:D020520,MESH,D020520,C0751955,"Infarct, Brain",D020520,MSH
25881,http://www.ebi.ac.uk/efo/EFO_0004277,en,"[Tissue NECROSIS in any area of the brain, inc...","[ANTERIOR CEREBRAL CIRC INFARCT, Infarction, A...","{'database_cross_reference': ['DOID:3454', 'IC...",brain infarction,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,"[{'name': 'brain infarction', 'scope': 'hasExa...",False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,SNOMEDCT:230693009,SNOMEDCT,230693009,C0393953,Anterior cerebral circulation infarction,230693009,SNOMEDCT_US
25882,http://www.ebi.ac.uk/efo/EFO_0004277,en,"[Tissue NECROSIS in any area of the brain, inc...","[ANTERIOR CEREBRAL CIRC INFARCT, Infarction, A...","{'database_cross_reference': ['DOID:3454', 'IC...",brain infarction,efo,EFO,http://www.ebi.ac.uk/efo/efo.owl,False,...,"[{'name': 'brain infarction', 'scope': 'hasExa...",False,{'self': {'href': 'https://www.ebi.ac.uk/ols/a...,SNOMEDCT:230693009,SNOMEDCT,230693009,C0393953,Anterior cerebral circulation infarction (diso...,230693009,SNOMEDCT_US


In [108]:
ebi_link_to_umls_mrconso_dict = (
    gwas_ebi_traits_umls_subs_noncui_mrconso_df
    [['iri', 'CUI']]
    .groupby('iri')
    .agg(set)
    ['CUI']
    .to_dict()
)

In [110]:
len(ebi_link_to_umls_mrconso_dict)

848

#### Merging direct and indirect mappings (`iri -> {CUI}`)

In [114]:
len(ebi_link_to_umls_dict), len(ebi_link_to_umls_mrconso_dict)

(324, 848)

In [120]:
iri_to_cui_dict = defaultdict(set)

for iri, cuis in ebi_link_to_umls_dict.items():
    iri_to_cui_dict[iri].update(cuis)
    
for iri, cuis in ebi_link_to_umls_mrconso_dict.items():
    iri_to_cui_dict[iri].update(cuis)

In [121]:
len(iri_to_cui_dict)

849

## Mapping to gene names UMLS

In [140]:
gwas_short_binary_df['gene_lower'] = gwas_short_binary_df['gene'].str.lower()

In [141]:
gwas_gene_lowercase_list = list(
    gwas_short_binary_df['gene_lower']
        .drop_duplicates()
)
len(gwas_gene_lowercase_list)

30811

In [130]:
pref_semtypes = {
    'Amino Acid, Peptide, or Protein',
    'Gene or Genome'
}

In [139]:
mrconso_st_df['STR_lower'] = mrconso_st_df['STR'].str.lower()

In [142]:
mrconso_gene_st_df = mrconso_st_df[
    (
          mrconso_st_df['STR_lower'].isin(gwas_gene_lowercase_list)
        & mrconso_st_df['sem_types'].apply(lambda x: pref_semtypes.intersection(x))
    )
]
len(mrconso_gene_st_df)

57943

In [144]:
mrconso_gene_to_cui_dict = (
    mrconso_gene_st_df[['CUI', 'STR_lower']]
        .groupby('STR_lower')
        .agg(set)
        ['CUI']
        .to_dict()
)

In [146]:
mrconso_gene_to_cui_dict['a1cf']

{'C0104364', 'C2240226'}

## Adding mappings to `gwas_short_binary_df`

In [124]:
gwas_short_binary_df['trait_cui'] = (
    gwas_short_binary_df['trait'].apply(lambda x: iri_to_cui_dict.get(x))
)

In [147]:
gwas_short_binary_df

Unnamed: 0,PUBMEDID,DATE,DISEASE/TRAIT,MAPPED_TRAIT,MAPPED_TRAIT_URI,REPORTED GENE(S),MAPPED_GENE,gene,trait,trait_cui,gene_lower
0,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,FGF10,FGF10,http://www.ebi.ac.uk/efo/EFO_0000305,"{C0678222, C0346153, C0006142, C3809918, C1861...",fgf10
1,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,KREMEN1,KREMEN1,http://www.ebi.ac.uk/efo/EFO_0000305,"{C0678222, C0346153, C0006142, C3809918, C1861...",kremen1
2,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,TRPS1,TRPS1,http://www.ebi.ac.uk/efo/EFO_0000305,"{C0678222, C0346153, C0006142, C3809918, C1861...",trps1
4,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,PPP4R3A - CATSPERB,PPP4R3A,http://www.ebi.ac.uk/efo/EFO_0000305,"{C0678222, C0346153, C0006142, C3809918, C1861...",ppp4r3a
4,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,PPP4R3A - CATSPERB,CATSPERB,http://www.ebi.ac.uk/efo/EFO_0000305,"{C0678222, C0346153, C0006142, C3809918, C1861...",catsperb
...,...,...,...,...,...,...,...,...,...,...,...
552113,35120996,2022-02-01,Biliverdin levels in chronic kidney disease,biliverdin measurement,http://www.ebi.ac.uk/efo/EFO_0021033,,"UGT1A9, UGT1A5, UGT1A10, UGT1A6, UGT1A7, UGT1A...",UGT1A8,http://www.ebi.ac.uk/efo/EFO_0021033,,ugt1a8
552113,35120996,2022-02-01,Biliverdin levels in chronic kidney disease,biliverdin measurement,http://www.ebi.ac.uk/efo/EFO_0021033,,"UGT1A9, UGT1A5, UGT1A10, UGT1A6, UGT1A7, UGT1A...",UGT1A4,http://www.ebi.ac.uk/efo/EFO_0021033,,ugt1a4
552113,35120996,2022-02-01,Biliverdin levels in chronic kidney disease,biliverdin measurement,http://www.ebi.ac.uk/efo/EFO_0021033,,"UGT1A9, UGT1A5, UGT1A10, UGT1A6, UGT1A7, UGT1A...",UGT1A3,http://www.ebi.ac.uk/efo/EFO_0021033,,ugt1a3
552114,35120996,2022-02-01,Dimethylguanido valerate levels in chronic kid...,carboxylic acid measurement,http://www.ebi.ac.uk/efo/EFO_0010468,,AGXT2,AGXT2,http://www.ebi.ac.uk/efo/EFO_0010468,,agxt2


In [148]:
gwas_short_binary_df['gene_cui'] = (
    gwas_short_binary_df['gene_lower'].apply(lambda x: mrconso_gene_to_cui_dict.get(x))
)

In [149]:
gwas_short_binary_df

Unnamed: 0,PUBMEDID,DATE,DISEASE/TRAIT,MAPPED_TRAIT,MAPPED_TRAIT_URI,REPORTED GENE(S),MAPPED_GENE,gene,trait,trait_cui,gene_lower,gene_cui
0,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,FGF10,FGF10,http://www.ebi.ac.uk/efo/EFO_0000305,"{C0678222, C0346153, C0006142, C3809918, C1861...",fgf10,"{C1432620, C1333535}"
1,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,KREMEN1,KREMEN1,http://www.ebi.ac.uk/efo/EFO_0000305,"{C0678222, C0346153, C0006142, C3809918, C1861...",kremen1,{C1424993}
2,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,TRPS1,TRPS1,http://www.ebi.ac.uk/efo/EFO_0000305,"{C0678222, C0346153, C0006142, C3809918, C1861...",trps1,{C1421175}
4,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,PPP4R3A - CATSPERB,PPP4R3A,http://www.ebi.ac.uk/efo/EFO_0000305,"{C0678222, C0346153, C0006142, C3809918, C1861...",ppp4r3a,{C1822785}
4,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,PPP4R3A - CATSPERB,CATSPERB,http://www.ebi.ac.uk/efo/EFO_0000305,"{C0678222, C0346153, C0006142, C3809918, C1861...",catsperb,{C1426973}
...,...,...,...,...,...,...,...,...,...,...,...,...
552113,35120996,2022-02-01,Biliverdin levels in chronic kidney disease,biliverdin measurement,http://www.ebi.ac.uk/efo/EFO_0021033,,"UGT1A9, UGT1A5, UGT1A10, UGT1A6, UGT1A7, UGT1A...",UGT1A8,http://www.ebi.ac.uk/efo/EFO_0021033,,ugt1a8,"{C1122343, C1421332}"
552113,35120996,2022-02-01,Biliverdin levels in chronic kidney disease,biliverdin measurement,http://www.ebi.ac.uk/efo/EFO_0021033,,"UGT1A9, UGT1A5, UGT1A10, UGT1A6, UGT1A7, UGT1A...",UGT1A4,http://www.ebi.ac.uk/efo/EFO_0021033,,ugt1a4,{C1421328}
552113,35120996,2022-02-01,Biliverdin levels in chronic kidney disease,biliverdin measurement,http://www.ebi.ac.uk/efo/EFO_0021033,,"UGT1A9, UGT1A5, UGT1A10, UGT1A6, UGT1A7, UGT1A...",UGT1A3,http://www.ebi.ac.uk/efo/EFO_0021033,,ugt1a3,{C1421327}
552114,35120996,2022-02-01,Dimethylguanido valerate levels in chronic kid...,carboxylic acid measurement,http://www.ebi.ac.uk/efo/EFO_0010468,,AGXT2,AGXT2,http://www.ebi.ac.uk/efo/EFO_0010468,,agxt2,{C1422631}


In [150]:
gwas_short_binary_cui_df = (
    gwas_short_binary_df
        .explode('gene_cui')
        .explode('trait_cui')
        .dropna(subset=['trait_cui', 'gene_cui'])
)

In [151]:
gwas_short_binary_cui_df

Unnamed: 0,PUBMEDID,DATE,DISEASE/TRAIT,MAPPED_TRAIT,MAPPED_TRAIT_URI,REPORTED GENE(S),MAPPED_GENE,gene,trait,trait_cui,gene_lower,gene_cui
0,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,FGF10,FGF10,http://www.ebi.ac.uk/efo/EFO_0000305,C0678222,fgf10,C1432620
0,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,FGF10,FGF10,http://www.ebi.ac.uk/efo/EFO_0000305,C0346153,fgf10,C1432620
0,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,FGF10,FGF10,http://www.ebi.ac.uk/efo/EFO_0000305,C0006142,fgf10,C1432620
0,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,FGF10,FGF10,http://www.ebi.ac.uk/efo/EFO_0000305,C3809918,fgf10,C1432620
0,29059683,2017-10-23,Breast cancer,breast carcinoma,http://www.ebi.ac.uk/efo/EFO_0000305,NR,FGF10,FGF10,http://www.ebi.ac.uk/efo/EFO_0000305,C1861906,fgf10,C1432620
...,...,...,...,...,...,...,...,...,...,...,...,...
552112,35120996,2022-02-01,Bilirubin levels in chronic kidney disease,bilirubin measurement,http://www.ebi.ac.uk/efo/EFO_0004570,,"UGT1A9, UGT1A7, UGT1A10, UGT1A8, UGT1A6",UGT1A10,http://www.ebi.ac.uk/efo/EFO_0004570,C0344395,ugt1a10,C1311129
552112,35120996,2022-02-01,Bilirubin levels in chronic kidney disease,bilirubin measurement,http://www.ebi.ac.uk/efo/EFO_0004570,,"UGT1A9, UGT1A7, UGT1A10, UGT1A8, UGT1A6",UGT1A10,http://www.ebi.ac.uk/efo/EFO_0004570,C0344395,ugt1a10,C1421323
552112,35120996,2022-02-01,Bilirubin levels in chronic kidney disease,bilirubin measurement,http://www.ebi.ac.uk/efo/EFO_0004570,,"UGT1A9, UGT1A7, UGT1A10, UGT1A8, UGT1A6",UGT1A8,http://www.ebi.ac.uk/efo/EFO_0004570,C0344395,ugt1a8,C1122343
552112,35120996,2022-02-01,Bilirubin levels in chronic kidney disease,bilirubin measurement,http://www.ebi.ac.uk/efo/EFO_0004570,,"UGT1A9, UGT1A7, UGT1A10, UGT1A8, UGT1A6",UGT1A8,http://www.ebi.ac.uk/efo/EFO_0004570,C0344395,ugt1a8,C1421332


## Extracting pairs and saving

In [152]:
gwas_pairs = list({
    tuple(sorted(p)) for p in list(
        zip(
            gwas_short_binary_cui_df['trait_cui'],
            gwas_short_binary_cui_df['gene_cui']
        )
    )
})
len(gwas_pairs)

642544

In [153]:
with open('../../benchmark_data/01_cui_pairs_json/gwas_cui_pairs.json', 'w') as f:
    json.dump(gwas_pairs, f)