# KEGG data collection

Link: https://www.genome.jp/kegg/pathway.html

In [220]:
from Bio.KEGG import REST
from tqdm import tqdm
from collections import defaultdict, Counter
import requests
import numpy as np
import pandas as pd
import json
import random
import time
import re
from itertools import combinations

In [92]:
tqdm.pandas()

## Opening ready-to-use files

### UMLS MRCONSO mapping

In [121]:
mrconso_path = (
    '../../UMLS_Metathesaurus/mrconso_and_semtypes_2022AA_df.pkl'
)

In [122]:
mrconso_st_df = pd.read_pickle(mrconso_path)

In [123]:
mrconso_st_df

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF,sem_types
0,C0000005,ENG,P,L0000005,PF,S0007492,Y,A26634265,,M0019694,D012711,MSH,PEP,D012711,(131)I-Macroaggregated Albumin,,N,256.0,"{Indicator, Reagent, or Diagnostic Aid, Amino ..."
1,C0000005,ENG,S,L0270109,PF,S0007491,Y,A26634266,,M0019694,D012711,MSH,ET,D012711,(131)I-MAA,,N,256.0,"{Indicator, Reagent, or Diagnostic Aid, Amino ..."
10,C0000039,ENG,P,L0000039,PF,S17175117,N,A28315139,9194921.0,1926948,,RXNORM,IN,1926948,"1,2-dipalmitoylphosphatidylcholine",,N,256.0,"{Organic Chemical, Pharmacologic Substance}"
11,C0000039,ENG,P,L0000039,PF,S17175117,Y,A28572604,,,,MTH,PN,NOCODE,"1,2-dipalmitoylphosphatidylcholine",,N,256.0,"{Organic Chemical, Pharmacologic Substance}"
12,C0000039,ENG,P,L0000039,VC,S0007564,Y,A0016515,,M0023172,D015060,MSH,MH,D015060,"1,2-Dipalmitoylphosphatidylcholine",,N,,"{Organic Chemical, Pharmacologic Substance}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16857339,C5574640,ENG,S,L17663434,PF,S21286067,Y,A33944162,,Q9004,,HCPCS,AB,Q9004,Va whole health partner serv,,Y,,{Health Care Activity}
16857340,C5574641,ENG,P,L17663430,PF,S21286031,Y,A33944163,,S1091,,HCPCS,PT,S1091,"Stent, non-coronary, temporary, with delivery ...",,N,,{Medical Device}
16857341,C5574641,ENG,S,L17663429,PF,S21286030,Y,A33944560,,S1091,,HCPCS,AB,S1091,Stent non-coronary propel,,Y,,{Medical Device}
16857342,C5574642,ENG,P,L17662944,PF,S21285763,Y,A33944049,,S9432,,HCPCS,PT,S9432,Medical foods for non-inborn errors of metabolism,,N,,{Food}


In [47]:
mesh_to_umls_dict = (
    mrconso_st_df[
        mrconso_st_df['SAB'] == 'MSH'
    ]
    [['CUI', 'CODE']]
        .groupby('CODE')
        .agg(set)['CUI']
        .to_dict()
)

### UMLS CAS mapping

In [124]:
cas_to_cui_dict = pd.read_pickle(
    '../../pubchem_mappings/pc_cas_to_CUI_full_dict.pkl'
)

In [140]:
cas_to_cui_dict = pd.read_pickle(
    '../../chemidplus_local/cas_to_cui_meshdb_dict.pkl'
)

In [141]:
len(cas_to_cui_dict)

80344

In [127]:
len(cas_to_cui_dict)

132597

In [142]:
cas_to_cui_dict['56124-62-0']

{'C0068314', 'C0101466', 'C0133123', 'C0724176', 'C1519947'}

## Obtaining raw pathways from KEGG db (online)

The prefix has the following meaning:

- **map**
    manually drawn reference pathway
    
- **ko**
    reference pathway highlighting KOs (KEGG Orthology)
    
- **ec**
    reference metabolic pathway highlighting EC numbers
    
- **rn**
    reference metabolic pathway highlighting reactions
    
- **org**
    organism-specific pathway generated by converting KOs to gene identifiers

In [21]:
# list of prefixes to obtain

orgs_list = [
    'map',
    'ec',
    'rn',
    'ko',
    'hsa', #human
]

In [22]:
pathways_raw_list = []

for org in tqdm(orgs_list):
    pathways_raw_list.append(
        REST.kegg_list(
            database="pathway", 
            org=org)
        .read()
    )

100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:07<00:00,  1.60s/it]


In [24]:
# Filter all human pathways for repair pathways
pathways = dict()

for org_pw in pathways_raw:

    for line in org_pw.rstrip().split("\n"):
        entry, description = line.split("\t")
        pathways[entry] = description

In [25]:
len(pathways)

1741

In [30]:
pathways['hsa00010']

'Glycolysis / Gluconeogenesis - Homo sapiens (human)'

In [36]:
rawKEGGPathways_list = []

for pw in tqdm(pathways):
    pathway_file = REST.kegg_get(pw).read()
    rawKEGGPathways_list.append(pathway_file)

100%|█████████████████████████████████████████████████████████████████████████████| 1741/1741 [47:30<00:00,  1.64s/it]


In [40]:
with open('rawKEGGPathways_list.json', 'w') as f:
    json.dump(rawKEGGPathways_list, f)

In [233]:
# How a single record looks like

rawKEGGPathways_list[20].split('\n')

['ENTRY       map00620                    Pathway',
 'NAME        Pyruvate metabolism',
 'CLASS       Metabolism; Carbohydrate metabolism',
 'PATHWAY_MAP map00620  Pyruvate metabolism',
 'MODULE      M00168  CAM (Crassulacean acid metabolism), dark [PATH:map00620]',
 '            M00169  CAM (Crassulacean acid metabolism), light [PATH:map00620]',
 '            M00172  C4-dicarboxylic acid cycle, NADP - malic enzyme type [PATH:map00620]',
 '            M00307  Pyruvate oxidation, pyruvate => acetyl-CoA [PATH:map00620]',
 '            M00579  Phosphate acetyltransferase-acetate kinase pathway, acetyl-CoA => acetate [PATH:map00620]',
 'DBLINKS     GO: 0006090',
 'REFERENCE   ',
 '  AUTHORS   Nishizuka Y, Seyama Y, Ikai A, Ishimura Y, Kawaguchi A (eds).',
 '  TITLE     [Cellular Functions and Metabolic Maps] (In Japanese)',
 '  JOURNAL   Tokyo Kagaku Dojin (1997)',
 'REL_PATHWAY map00010  Glycolysis / Gluconeogenesis',
 '            map00020  Citrate cycle (TCA cycle)',
 '            map00

## Parsing raw pathways

In [None]:
with open('rawKEGGPathways_list.json', 'r') as f:
    rawKEGGPathways_list = json.load(f)

In [50]:
interestingSections = ['NAME', 'DISEASE', 'DRUG', 'GENE', 'COMPOUND']

In [52]:
records_dict = defaultdict(lambda: defaultdict(list))
for pw in tqdm(rawKEGGPathways_list[:]):
    currentSection = None
    for line in pw.rstrip().split('\n'):
        section = line[:12].strip()
        if not section == '':
            currentSection = section
        if currentSection == 'ENTRY':
            currentName = ' '.join(line[12:].split())
        if currentSection in interestingSections:
            records_dict[currentName][currentSection].append(line[12:])

100%|██████████████████████████████████████████████████████████████████████████| 1741/1741 [00:00<00:00, 11783.54it/s]


In [62]:
records_dict['hsa05225 Pathway']['NAME']

['Hepatocellular carcinoma - Homo sapiens (human)']

In [61]:
records_dict['hsa05225 Pathway']['COMPOUND']

['C00027  Hydrogen peroxide',
 'C00039  DNA',
 'C00076  Calcium cation',
 'C00165  Diacylglycerol',
 'C00533  Nitric oxide',
 'C00704  Superoxide',
 'C01245  D-myo-Inositol 1,4,5-trisphosphate',
 'C01471  Acrolein',
 'C05981  Phosphatidylinositol-3,4,5-trisphosphate',
 'C06793  Vinyl chloride',
 'C06800  Aflatoxin B1',
 'C16844  Hydroxyl radical',
 'C21641  Hydroperoxyl radical',
 'C21642  4-Hydroxynonenal']

## Getting mappings from KEGG for diseases, compounds, drugs (online)

In [63]:
kegg_ent_ids_list = []

for pw_key in records_dict:
    for name in records_dict[pw_key]:
        if name in ['DRUG', 'COMPOUND', 'DISEASE']:
            for ent in records_dict[pw_key][name]:
                kegg_ent_ids_list.append(ent)

In [66]:
kegg_ent_ids_set = set([ent.split(' ')[0] for ent in kegg_ent_ids_list])

kegg_ent_ids_list = list(kegg_ent_ids_set)

len(kegg_ent_ids_set)

12799

In [68]:
# splitting unique ids into chunks for faster querying

n_per_request = 10

kegg_ent_ids_chunked = [
    kegg_ent_ids_list[i:i+n_per_request] for i in range(
        0,
        len(kegg_ent_ids_list),
        n_per_request
    )
]

In [69]:
kegg_ent_ids_chunked[5]

['D05523',
 'D02462',
 'C00170',
 'C16201',
 'C22163',
 'D12445',
 'C12003',
 'C06027',
 'D00921',
 'C03272']

In [81]:
chunk_raw_responses_list = []
chunk_raw_bad_responses_list = []

for ids_chunk in tqdm(kegg_ent_ids_chunked):
    try:
        chunk_raw_response = REST.kegg_get(ids_chunk).read().split('///\n')
        chunk_raw_responses_list += chunk_raw_response
    except Exception as e:
        print(e)
        chunk_raw_bad_responses_list.append(ids_chunk)
    time.sleep(1)

100%|█████████████████████████████████████████████████████████████████████████████| 1280/1280 [50:54<00:00,  2.39s/it]


### Parsing KEGG online mappings, creating a dataframe

In [82]:
split_pos = 12

hooks = {'ENTRY', 'NAME', 'DBLINKS'}

kegg_ents_list = []

for ent in chunk_raw_responses_list:
    lines = ent.split('\n')
    cur_category = ''
    
    ent_dict = defaultdict(list)
    
    for line in lines:
        prefix, content = line[:12].strip(), line[12:]
        
        if prefix:
            cur_category = prefix
        
        if cur_category in hooks:
            ent_dict[cur_category].append(content)
        
    kegg_ents_list.append(ent_dict)

In [83]:
kegg_ents_list[3]

defaultdict(list,
            {'ENTRY': ['C05349                      Compound'],
             'NAME': ['Ciprofloxacin'],
             'DBLINKS': ['CAS: 85721-33-1',
              'PubChem: 7727',
              'ChEBI: 100241',
              'PDB-CCD: CPF',
              'NIKKAJI: J21.732H']})

In [84]:
kegg_ents_parsed_list = []

for ent in kegg_ents_list:
    ent_dict = dict()
    for key in ent:
        if key == 'ENTRY':
            entry_parsed = ' '.join(ent[key]).split()
            #if len(entry_parsed) > 2:
                #print(entry_parsed)
            #entry_id, entry_type = entry_parsed
            
            ent_dict['entry_id'] = entry_parsed[0]
            ent_dict['entry_type'] = '|'.join(entry_parsed[1:])
        
        if key == 'NAME':
            ent_dict['name'] = ''.join(ent[key]).replace(';', '|')
            
        if key == 'DBLINKS':
            dblinks_dict = dict()
            for dbl_line in ent[key]:
                if dbl_line:
                    db_name, db_id = dbl_line.split(': ')
                    dblinks_dict[db_name] = db_id
        
            #ent_dict['dblinks'] = dblinks_dict
            ent_dict.update(dblinks_dict)
    if ent_dict:
        kegg_ents_parsed_list.append(ent_dict)

In [85]:
kegg_ents_df = pd.DataFrame(kegg_ents_parsed_list)

In [86]:
kegg_ents_df

Unnamed: 0,entry_id,entry_type,name,CAS,PubChem,ChEBI,3DMET,NIKKAJI,LigandBox,PDB-CCD,...,LIPIDMAPS,JCGGDB,LipidBank,KNApSAcK,GlycoEpitope,ICD-11,ICD-10,MeSH,OMIM,MedlinePlus
0,C01073,Compound,N-Acetyl-beta-alanine,3025-95-4,4311,16682,B00235,J135.888J,,,...,,,,,,,,,,
1,D05267,Drug,Orbofiban acetate (USAN),165800-05-5,47206976,,,,D05267,,...,,,,,,,,,,
2,D05972,Drug,Sultamicillin (USAN/INN),76497-13-7,47207630,,,J34.488E,D05972,,...,,,,,,,,,,
3,C05349,Compound,Ciprofloxacin,85721-33-1,7727,100241,,J21.732H,,CPF,...,,,,,,,,,,
4,D00079,Drug,Dinoprostone (JAN/USP/INN)|Cervidil (TN)|Prepi...,363-24-6,7847147,15551,,J9.243F,D00079,P2E,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12794,D03493,Drug,Ciglitazone (USAN/INN),74772-77-3,17397629,64227,,J20.561C,D03493,,...,,,,,,,,,,
12795,C17953,Compound,(5S)-Albaflavenol,,96024174,51480,,J2.799.619A,,,...,LMPR0103700009,,,,,,,,,
12796,C10557,Compound,Dihydroanhydropodorhizol|Yatein|(-)-Yatein|(3R...,40456-50-6,12740,4553,B03955,J257.868I,,YTN,...,,,,C00002598,,,,,,
12797,C15096,Compound,"GV 150013X|N-Phenyl-N'-[(3R)-2,3,4,5-tetrahydr...",,17396093,79615,,J924.335F,,,...,,,,,,,,,,


In [87]:
kegg_ents_df['entry_type'].value_counts()

Compound        6342
Drug            5991
Glycan           330
Disease           80
Mixture|Drug      56
Name: entry_type, dtype: int64

In [88]:
kegg_ents_df.to_pickle('kegg_ent_mappings_df.pkl')

### Mapping `Disease` (with ICD-10 codes)

In [99]:
kegg_ents_df[
    kegg_ents_df['entry_type'] == 'Disease'
].notna().sum()

entry_id        80
entry_type      80
name            80
CAS              0
PubChem          0
ChEBI            0
3DMET            0
NIKKAJI          0
LigandBox        0
PDB-CCD          0
CCSD             0
GlyTouCan        0
LIPIDMAPS        0
JCGGDB           0
LipidBank        0
KNApSAcK         0
GlycoEpitope     0
ICD-11          79
ICD-10          78
MeSH            71
OMIM            41
MedlinePlus     19
dtype: int64

In [152]:
kegg_disease_mapping_df = kegg_ents_df[
    kegg_ents_df['entry_type'] == 'Disease'
][['entry_id', 'ICD-10']].drop_duplicates().dropna()

In [165]:
kegg_disease_mapping_df['ICD-10'] = (
    kegg_disease_mapping_df['ICD-10'].apply(lambda x: x.split(' '))
)
kegg_disease_mapping_df = kegg_disease_mapping_df.explode('ICD-10')

In [167]:
kegg_disease_mapping_mrconso_df = pd.merge(
    left=kegg_disease_mapping_df,
    right=mrconso_st_df[['CUI', 'STR', 'CODE', 'SAB']],
    left_on='ICD-10',
    right_on='CODE',
    how='left'
)

In [168]:
kegg_disease_mapping_mrconso_df

Unnamed: 0,entry_id,ICD-10,CUI,STR,CODE,SAB
0,H00081,E06.3,C0342125,Hashitoxicosis (transient),E06.3,ICD10CM
1,H00081,E06.3,C0677606,Lymphadenoid goiter,E06.3,ICD10CM
2,H00081,E06.3,C0677607,Hashimoto's thyroiditis,E06.3,ICD10CM
3,H00081,E06.3,C0677607,Struma lymphomatosa,E06.3,ICD10CM
4,H00081,E06.3,C0920350,Autoimmune thyroiditis,E06.3,ICD10
...,...,...,...,...,...,...
664,H01563,B24,C0019693,Unspecified human immunodeficiency virus [HIV]...,B24,ICD10AM
665,H01563,B24,C0019693,Unspecified human immunodeficiency virus [HIV]...,B24,ICD10
666,H01563,B24,C2500275,Imaging @ Heart @ Ultrasonography,B24,ICD10PCS
667,H01563,B24,C2500275,"Imaging, Heart, Ultrasonography",B24,ICD10PCS


In [169]:
kegg_disease_mapping_mrconso_df[
    kegg_disease_mapping_mrconso_df['CUI'].isna()
]

Unnamed: 0,entry_id,ICD-10,CUI,STR,CODE,SAB
519,H00330,U82.1,,,,
607,H00331,U83.0,,,,


In [173]:
kegg_disease_mapping_mrconso_df = kegg_disease_mapping_mrconso_df.dropna()
kegg_disease_mapping_mrconso_df = kegg_disease_mapping_mrconso_df[
    kegg_disease_mapping_mrconso_df['SAB'].str.contains('ICD10')
]

In [174]:
kegg_disease_mapping_mrconso_df['SAB'].value_counts()

ICD10CM      307
ICD10        106
ICD10AM      104
ICD10PCS      57
ICD10AE        9
ICD10AMAE      9
Name: SAB, dtype: int64

In [175]:
kegg_disease_mapping_mrconso_df

Unnamed: 0,entry_id,ICD-10,CUI,STR,CODE,SAB
0,H00081,E06.3,C0342125,Hashitoxicosis (transient),E06.3,ICD10CM
1,H00081,E06.3,C0677606,Lymphadenoid goiter,E06.3,ICD10CM
2,H00081,E06.3,C0677607,Hashimoto's thyroiditis,E06.3,ICD10CM
3,H00081,E06.3,C0677607,Struma lymphomatosa,E06.3,ICD10CM
4,H00081,E06.3,C0920350,Autoimmune thyroiditis,E06.3,ICD10
...,...,...,...,...,...,...
664,H01563,B24,C0019693,Unspecified human immunodeficiency virus [HIV]...,B24,ICD10AM
665,H01563,B24,C0019693,Unspecified human immunodeficiency virus [HIV]...,B24,ICD10
666,H01563,B24,C2500275,Imaging @ Heart @ Ultrasonography,B24,ICD10PCS
667,H01563,B24,C2500275,"Imaging, Heart, Ultrasonography",B24,ICD10PCS


#### `kegg_to_umls_disease_agg_dict`

In [176]:
kegg_to_umls_disease_agg_dict = (
    kegg_disease_mapping_mrconso_df[['entry_id', 'CUI']]
        .groupby('entry_id')
        .agg(set)
        ['CUI']
)

In [178]:
kegg_to_umls_disease_agg_dict['H00004']

{'C0023473', 'C1292771', 'C2861579', 'C2861580'}

### Mapping `Drug` and `Compound`

#### -> CAS

In [105]:
kegg_ents_df[
    kegg_ents_df['entry_type'].isin(
        {'Drug', 'Compound'}
    )
].notna().sum().sort_values()

CCSD                0
MeSH                0
ICD-10              0
ICD-11              0
GlycoEpitope        0
LipidBank           0
JCGGDB              0
GlyTouCan           0
OMIM                0
MedlinePlus         0
LIPIDMAPS        1369
KNApSAcK         1558
PDB-CCD          2499
3DMET            2802
LigandBox        4118
NIKKAJI          7454
ChEBI            7888
CAS              8660
PubChem         11620
name            12333
entry_type      12333
entry_id        12333
dtype: int64

In [118]:
kegg_entry_to_CAS_df = (
    kegg_ents_df[
        kegg_ents_df['entry_type']
            .isin({'Drug', 'Compound'})
    ]
    [['entry_id', 'CAS']]
    .dropna()
    .drop_duplicates()
)
len(kegg_entry_to_CAS_df)

8660

In [119]:
kegg_entry_to_CAS_df['CAS'] = kegg_entry_to_CAS_df['CAS'].apply(lambda x: x.split(' '))
kegg_entry_to_CAS_df = kegg_entry_to_CAS_df.explode('CAS')

#### --> CUI (`kegg_to_umls_compdrug_agg_dict`)

In [144]:
kegg_entry_to_CAS_df['cui'] = (
    kegg_entry_to_CAS_df['CAS'].apply(
        lambda x: cas_to_cui_dict.get(x)
    )
)
kegg_entry_to_CAS_df = (
    kegg_entry_to_CAS_df.explode('cui')
)

In [145]:
kegg_entry_to_CAS_df

Unnamed: 0,entry_id,CAS,cui
0,C01073,3025-95-4,
1,D05267,165800-05-5,C1456730
1,D05267,165800-05-5,C0769787
1,D05267,165800-05-5,C0769784
2,D05972,76497-13-7,C0110853
...,...,...,...
12792,D12691,16008-36-9,
12793,D12202,1924598-82-2,C4763386
12794,D03493,74772-77-3,C0055721
12794,D03493,74772-77-3,C0101496


In [146]:
kegg_to_umls_compdrug_agg_dict = (
    kegg_entry_to_CAS_df
        .dropna()
        .groupby('entry_id')
        .agg(set)
        ['cui']
)

In [147]:
kegg_to_umls_compdrug_agg_dict['C00002']

{'C0001480',
 'C0004221',
 'C0006618',
 'C0024471',
 'C0109657',
 'C0126945',
 'C0701945',
 'C0701946',
 'C0949213'}

### Mapping `Gene`

In [181]:
gene_names_kegg_dict = dict()

for pw_name in records_dict:
    if 'GENE' in records_dict[pw_name]:
        genes_list_raw = records_dict[pw_name]['GENE']
        
        for raw_gene in genes_list_raw:
            gene_id = re.split('  |; ', raw_gene)[1] 
            gene_names_kegg_dict[raw_gene] = gene_id

In [182]:
len(gene_names_kegg_dict)

8573

In [183]:
kegg_id_to_str_dict = defaultdict(set)

for k in gene_names_kegg_dict:
    v = gene_names_kegg_dict[k]
    kegg_id_to_str_dict[v].add(k)

In [192]:
gene_names_kegg_df = (
    pd.DataFrame(gene_names_kegg_dict.items(), columns=['raw_name', 'raw_symbol'])
)

gene_names_kegg_df['symbol_stripped'] = (
    gene_names_kegg_df['raw_symbol'].apply(
        lambda x: x.split('.')[0].lower()
    )
)

In [193]:
gene_names_kegg_df

Unnamed: 0,raw_name,raw_symbol,symbol_stripped
0,3101 HK3; hexokinase 3 [KO:K00844] [EC:2.7.1.1],HK3,hk3
1,3098 HK1; hexokinase 1 [KO:K00844] [EC:2.7.1.1],HK1,hk1
2,3099 HK2; hexokinase 2 [KO:K00844] [EC:2.7.1.1],HK2,hk2
3,80201 HKDC1; hexokinase domain containing 1 [...,HKDC1,hkdc1
4,2645 GCK; glucokinase [KO:K12407] [EC:2.7.1.2],GCK,gck
...,...,...,...
8568,133522 PPARGC1B; PPARG coactivator 1 beta [KO...,PPARGC1B,ppargc1b
8569,10724 OGA; O-GlcNAcase [KO:K15719] [EC:3.2.1....,OGA,oga
8570,"9070 ASH2L; ASH2 like, histone lysine methylt...",ASH2L,ash2l
8571,"5929 RBBP5; RB binding protein 5, histone lys...",RBBP5,rbbp5


#### -> CUI (`kegg_to_umls_gene_dict`; using gene symbol)

In [199]:
pref_semtypes = {
    'Amino Acid, Peptide, or Protein',
    'Gene or Genome'
}

In [200]:
mrconso_st_gene_df = (
    mrconso_st_df[
        ~(
            mrconso_st_df['sem_types']
                .apply(lambda x: 1 if pref_semtypes.intersection(x) else None)
                .isna()
        )
    ]
    .dropna(subset=['STR'])
)

In [201]:
mrconso_st_gene_df['str_lower'] = mrconso_st_gene_df['STR'].apply(lambda x: x.lower())

In [203]:
kegg_to_umls_gene_cols = [
    'raw_name', 'raw_symbol', 'symbol_stripped', 'CUI', 'CODE', 'SAB', 
    #'sem_types'
]

In [204]:
kegg_to_umls_gene_df = pd.merge(
    gene_names_kegg_df,
    mrconso_st_gene_df,
    left_on='symbol_stripped',
    right_on='str_lower',
    how='inner'
)[
   kegg_to_umls_gene_cols 
].drop_duplicates()

kegg_to_umls_gene_df

Unnamed: 0,raw_name,raw_symbol,symbol_stripped,CUI,CODE,SAB
0,3101 HK3; hexokinase 3 [KO:K00844] [EC:2.7.1.1],HK3,hk3,C1415556,HGNC:4925,HGNC
1,3101 HK3; hexokinase 3 [KO:K00844] [EC:2.7.1.1],HK3,hk3,C1415556,142570,OMIM
2,3101 HK3; hexokinase 3 [KO:K00844] [EC:2.7.1.1],HK3,hk3,C1428139,HGNC:23576,HGNC
3,3101 HK3; hexokinase 3 [KO:K00844] [EC:2.7.1.1],HK3,hk3,C1428139,607825,OMIM
4,3101 HK3; hexokinase 3 [KO:K00844] [EC:2.7.1.1],HK3,hk3,C1705954,C49721,NCI
...,...,...,...,...,...,...
27891,"9070 ASH2L; ASH2 like, histone lysine methylt...",ASH2L,ash2l,C1412580,604782,OMIM
27892,"5929 RBBP5; RB binding protein 5, histone lys...",RBBP5,rbbp5,C1419290,HGNC:9888,HGNC
27893,"5929 RBBP5; RB binding protein 5, histone lys...",RBBP5,rbbp5,C1419290,600697,OMIM
27894,79798 ARMC5; armadillo repeat containing 5 [K...,ARMC5,armc5,C1538935,HGNC:25781,HGNC


In [205]:
kegg_to_umls_gene_dict = dict(
    kegg_to_umls_gene_df[['raw_name', 'symbol_stripped', 'CUI']]
        .groupby('raw_name')
        .agg(set)
        ['CUI']
)

In [206]:
len(kegg_to_umls_gene_dict)

8445

In [209]:
kegg_to_umls_gene_dict['10  NAT2; N-acetyltransferase 2 [KO:K00622] [EC:2.3.1.5]']

{'C0796518', 'C1261253', 'C1421999'}

## Putting all mappings together, perform filtering, compose pairs

In [211]:
records_dict['hsa00010 Pathway']

defaultdict(list,
            {'NAME': ['Glycolysis / Gluconeogenesis - Homo sapiens (human)'],
             'DRUG': ['D00123  Cyanamide (JP18)',
              'D00131  Disulfiram (JP18/USP/INN)',
              'D07257  Lonidamine (INN)',
              'D08970  Piragliatin (USAN)',
              'D11342  Dorzagliatin (USAN)',
              'D11408  Mitapivat sulfate (USAN)',
              'D12320  Cadisegliatin (USAN)',
              'D12362  Etavopivat (USAN/INN)'],
             'GENE': ['3101  HK3; hexokinase 3 [KO:K00844] [EC:2.7.1.1]',
              '3098  HK1; hexokinase 1 [KO:K00844] [EC:2.7.1.1]',
              '3099  HK2; hexokinase 2 [KO:K00844] [EC:2.7.1.1]',
              '80201  HKDC1; hexokinase domain containing 1 [KO:K00844] [EC:2.7.1.1]',
              '2645  GCK; glucokinase [KO:K12407] [EC:2.7.1.2]',
              '2821  GPI; glucose-6-phosphate isomerase [KO:K01810] [EC:5.3.1.9]',
              '5213  PFKM; phosphofructokinase, muscle [KO:K00850] [EC:2.7.1.11]',
    

In [214]:
records_filtered = dict()
records_mapped = dict()

for k in records_dict:
    pw_filt_dict = defaultdict(list)
    pw_mapped_dict = defaultdict(list)
    
    for category in records_dict[k]:
        kegg_ids = records_dict[k][category]
        if category == 'NAME':
            pw_filt_dict[category] = kegg_ids
            pw_mapped_dict[category] = kegg_ids
        
        elif category in {'DRUG', 'COMPOUND'}:
            for record in kegg_ids:
                kegg_id = record.split(' ')[0]
                if kegg_id in kegg_to_umls_compdrug_agg_dict:
                    mapped_cuis = kegg_to_umls_compdrug_agg_dict[kegg_id]
                    pw_filt_dict[category].append(kegg_id)
                    pw_mapped_dict[category].append(mapped_cuis)
        
        elif category == 'DISEASE':
            for record in kegg_ids:
                kegg_id = record.split(' ')[0]
                if kegg_id in kegg_to_umls_disease_agg_dict:
                    mapped_cuis = kegg_to_umls_disease_agg_dict[kegg_id]
                    pw_filt_dict[category].append(kegg_id)
                    pw_mapped_dict[category].append(mapped_cuis)
                    
        elif category == 'GENE':
            for record in kegg_ids:
                kegg_id = record
                if kegg_id in kegg_to_umls_gene_dict:
                    mapped_cuis = kegg_to_umls_gene_dict[kegg_id]
                    pw_filt_dict[category].append(kegg_id)
                    pw_mapped_dict[category].append(mapped_cuis)
    
    records_filtered[k] = pw_filt_dict
    records_mapped[k] = pw_mapped_dict

### Constructing cliques

In [216]:
records_umls_cliques_dict = dict()

for k in records_mapped:
    rec_dict = dict()
    all_cuis = set()
    rec_dict['all_umls_set'] = set()
    rec_dict['all_kegg_set'] = set()
    for category in records_mapped[k]:
        if category == 'NAME':
            rec_dict['NAME'] = records_mapped[k][category]
        else:
            for i, kegg_ids_group in enumerate(records_mapped[k][category]):
                kegg_real_id = records_filtered[k][category][i]
                rec_dict['all_umls_set'].update(kegg_ids_group)
                rec_dict['all_kegg_set'].add(kegg_real_id)
    
    records_umls_cliques_dict[k] = rec_dict

In [217]:
records_umls_cliques_df = pd.DataFrame(records_umls_cliques_dict).T

In [218]:
records_umls_cliques_df

Unnamed: 0,all_umls_set,all_kegg_set,NAME
map01100 Global Pathway,{},{},[Metabolic pathways]
map01110 Global Pathway,{},{},[Biosynthesis of secondary metabolites]
map01120 Global Pathway,{},{},[Microbial metabolism in diverse environments]
map01200 Global Pathway,{},{},[Carbon metabolism]
map01210 Global Pathway,{},{},[2-Oxocarboxylic acid metabolism]
...,...,...,...
hsa04934 Pathway,"{C0039635, C1415321, C0286910, C3256589, C1529...","{C00020, 5929 RBBP5; RB binding protein 5, hi...",[Cushing syndrome - Homo sapiens (human)]
hsa01521 Pathway,"{C4053633, C3827073, C4519548, C2346824, C5545...","{D11417, D11772, D09371, D11980, D11773, D1124...",[EGFR tyrosine kinase inhibitor resistance - H...
hsa01524 Pathway,{},{},[Platinum drug resistance - Homo sapiens (human)]
hsa01523 Pathway,"{C4288736, C1831845, C3640818, C2700602}","{D12183, D10954}",[Antifolate resistance - Homo sapiens (human)]


In [221]:
records_umls_cliques_df['cliques'] = records_umls_cliques_df['all_umls_set'].apply(
    lambda x: list(combinations(x, 2))
)

In [222]:
records_umls_cliques_df['cliques_kegg'] = records_umls_cliques_df['all_kegg_set'].apply(
    lambda x: list(combinations(x, 2))
)

In [223]:
records_umls_cliques_df

Unnamed: 0,all_umls_set,all_kegg_set,NAME,cliques,cliques_kegg
map01100 Global Pathway,{},{},[Metabolic pathways],[],[]
map01110 Global Pathway,{},{},[Biosynthesis of secondary metabolites],[],[]
map01120 Global Pathway,{},{},[Microbial metabolism in diverse environments],[],[]
map01200 Global Pathway,{},{},[Carbon metabolism],[],[]
map01210 Global Pathway,{},{},[2-Oxocarboxylic acid metabolism],[],[]
...,...,...,...,...,...
hsa04934 Pathway,"{C0039635, C1415321, C0286910, C3256589, C1529...","{C00020, 5929 RBBP5; RB binding protein 5, hi...",[Cushing syndrome - Homo sapiens (human)],"[(C0039635, C1415321), (C0039635, C0286910), (...","[(C00020, 5929 RBBP5; RB binding protein 5, h..."
hsa01521 Pathway,"{C4053633, C3827073, C4519548, C2346824, C5545...","{D11417, D11772, D09371, D11980, D11773, D1124...",[EGFR tyrosine kinase inhibitor resistance - H...,"[(C4053633, C3827073), (C4053633, C4519548), (...","[(D11417, D11772), (D11417, D09371), (D11417, ..."
hsa01524 Pathway,{},{},[Platinum drug resistance - Homo sapiens (human)],[],[]
hsa01523 Pathway,"{C4288736, C1831845, C3640818, C2700602}","{D12183, D10954}",[Antifolate resistance - Homo sapiens (human)],"[(C4288736, C1831845), (C4288736, C3640818), (...","[(D12183, D10954)]"


In [224]:
records_all_pairs_raw_list = [] 
for k in records_umls_cliques_df['cliques']:
    records_all_pairs_raw_list += k

In [225]:
records_all_pairs_kegg_raw_list = [] 
for k in records_umls_cliques_df['cliques_kegg']:
    records_all_pairs_kegg_raw_list += k

In [226]:
records_all_sorted_pairs_list = [
    tuple(sorted(pair)) for pair in records_all_pairs_raw_list
]

In [227]:
records_all_kegg_sorted_pairs_list = [
    tuple(sorted(pair)) for pair in records_all_pairs_kegg_raw_list
]

In [228]:
len(set(records_all_kegg_sorted_pairs_list))

4931638

In [229]:
records_all_sorted_pairs_set = set(records_all_sorted_pairs_list)

In [230]:
len(records_all_sorted_pairs_set)

35236404

In [231]:
with open('../../benchmark_data/01_cui_pairs_json/kegg_cui_pairs.json', 'w') as f:
    json.dump(list(records_all_sorted_pairs_set), f)