# Drugcentral data collection

Prerequisites:
- (SQL) postgres dump: https://unmtid-shinyapps.net/download/drugcentral.dump.08222022.sql.gz
- (SQL) Run sql server: `postgres -D ~/software/postgresql/data`
- (SQL) Check if it works with postgres CLI: `psql -h /tmp/ -d drugcentral_102422`
- (TSV) file: https://unmtid-shinyapps.net/download/DrugCentral/2021_09_01/drug.target.interaction.tsv.gz

In [None]:
!wget https://unmtid-shinyapps.net/download/DrugCentral/2021_09_01/drug.target.interaction.tsv.gz --no-check-certificate

In [2]:
import psycopg2
import pandas as pd

from tqdm import tqdm
import time
import json

import numpy as np

from collections import Counter, defaultdict

## Opening ready-to-use files

### Exported `.tsv` table 

In [15]:
drugcentral_export_df = pd.read_table('drug.target.interaction.tsv.gz')
len(drugcentral_export_df)

19378

In [16]:
drugcentral_export_df

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,ACT_COMMENT,ACT_SOURCE,RELATION,MOA,MOA_SOURCE,ACT_SOURCE_URL,MOA_SOURCE_URL,ACTION_TYPE,TDL,ORGANISM
0,levobupivacaine,4,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,KCNH2_HUMAN,4.890,,IC50,Inhibition of wild-type human ERG channel expr...,CHEMBL,=,,,,,,Tclin,Homo sapiens
1,levobupivacaine,4,Sodium channel protein type 1 subunit alpha,Ion channel,P35498,SCN1A,SCN1A_HUMAN,5.790,,IC50,,WOMBAT-PK,=,,,,,,Tclin,Homo sapiens
2,levobupivacaine,4,Sodium channel protein type 4 subunit alpha,Ion channel,P35499,SCN4A,SCN4A_HUMAN,,,,,WOMBAT-PK,,1.0,CHEMBL,,https://www.ebi.ac.uk/chembl/compound/inspect/...,BLOCKER,Tclin,Homo sapiens
3,levobupivacaine,4,Prostaglandin E2 receptor EP1 subtype,GPCR,P34995,PTGER1,PE2R1_HUMAN,,,,,WOMBAT-PK,,,,,,,Tclin,Homo sapiens
4,levobupivacaine,4,Cytochrome P450 2D6,Enzyme,P10635,CYP2D6,CP2D6_HUMAN,6.707,,IC50,"DRUGMATRIX: CYP450, 2D6 enzyme inhibition (sub...",DRUG MATRIX,=,,,,,,Tclin,Homo sapiens
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19373,samidorphan,5460,Delta-type opioid receptor,GPCR,P41143,OPRD1,OPRD_HUMAN,8.590,,Ki,,DRUG LABEL,=,,,https://www.accessdata.fda.gov/drugsatfda_docs...,,PARTIAL AGONIST,Tclin,Homo sapiens
19374,sotorasib,5461,GTPase KRas,Enzyme,P01116,KRAS,RASK_HUMAN,7.030,,IC50,KRAS G12C mutation,DRUG LABEL,=,1.0,DRUG LABEL,https://www.accessdata.fda.gov/drugsatfda_docs...,https://www.accessdata.fda.gov/drugsatfda_docs...,INHIBITOR,Tchem,Homo sapiens
19375,ibrexafungerp,5462,"Beta-1,3-glucan synthase catalytic subunit 1",Enzyme,O13428,GSC1,O13428_CANAX,8.350,,IC50,"MoA - inhibits the biosynthesis of beta-(1,3)-...",SCIENTIFIC LITERATURE,=,1.0,DRUG LABEL,https://pubmed.ncbi.nlm.nih.gov/24323472,https://www.accessdata.fda.gov/drugsatfda_docs...,INHIBITOR,,Candida albicans
19376,ibrexafungerp,5462,"1,3-Beta-D-glucan-UDP glucosyltransferase",Enzyme,Q6FTN8,FKS1,Q6FTN8_CANGA,7.830,,IC50,"MoA - inhibits the biosynthesis of beta-(1,3)-...",SCIENTIFIC LITERATURE,=,1.0,SCIENTIFIC LITERATURE,https://pubmed.ncbi.nlm.nih.gov/24323472,https://www.accessdata.fda.gov/drugsatfda_docs...,INHIBITOR,,Candida glabrata


In [17]:
drugcentral_export_df = drugcentral_export_df[
    [
        'DRUG_NAME', 
        'STRUCT_ID', 
        'TARGET_NAME', 
        'TARGET_CLASS',
        'ACCESSION',
        'GENE',
        'ACT_SOURCE'
    ]
].dropna()

drugcentral_export_df['GENE'] = (
    drugcentral_export_df['GENE']
        .apply(lambda x: set(x.split('|')))
)
drugcentral_export_df = drugcentral_export_df.explode('GENE')

In [18]:
drugcentral_export_df

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,ACT_SOURCE
0,levobupivacaine,4,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,CHEMBL
1,levobupivacaine,4,Sodium channel protein type 1 subunit alpha,Ion channel,P35498,SCN1A,WOMBAT-PK
2,levobupivacaine,4,Sodium channel protein type 4 subunit alpha,Ion channel,P35499,SCN4A,WOMBAT-PK
3,levobupivacaine,4,Prostaglandin E2 receptor EP1 subtype,GPCR,P34995,PTGER1,WOMBAT-PK
4,levobupivacaine,4,Cytochrome P450 2D6,Enzyme,P10635,CYP2D6,DRUG MATRIX
...,...,...,...,...,...,...,...
19373,samidorphan,5460,Delta-type opioid receptor,GPCR,P41143,OPRD1,DRUG LABEL
19374,sotorasib,5461,GTPase KRas,Enzyme,P01116,KRAS,DRUG LABEL
19375,ibrexafungerp,5462,"Beta-1,3-glucan synthase catalytic subunit 1",Enzyme,O13428,GSC1,SCIENTIFIC LITERATURE
19376,ibrexafungerp,5462,"1,3-Beta-D-glucan-UDP glucosyltransferase",Enzyme,Q6FTN8,FKS1,SCIENTIFIC LITERATURE


### UMLS MRCONSO mapping

In [30]:
mrconso_path = (
    '../../UMLS_Metathesaurus/mrconso_and_semtypes_2022AA_df.pkl'
)

In [31]:
mrconso_st_df = pd.read_pickle(mrconso_path)

In [32]:
mrconso_st_df

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF,sem_types
0,C0000005,ENG,P,L0000005,PF,S0007492,Y,A26634265,,M0019694,D012711,MSH,PEP,D012711,(131)I-Macroaggregated Albumin,,N,256.0,"{Pharmacologic Substance, Indicator, Reagent, ..."
1,C0000005,ENG,S,L0270109,PF,S0007491,Y,A26634266,,M0019694,D012711,MSH,ET,D012711,(131)I-MAA,,N,256.0,"{Pharmacologic Substance, Indicator, Reagent, ..."
10,C0000039,ENG,P,L0000039,PF,S17175117,N,A28315139,9194921.0,1926948,,RXNORM,IN,1926948,"1,2-dipalmitoylphosphatidylcholine",,N,256.0,"{Organic Chemical, Pharmacologic Substance}"
11,C0000039,ENG,P,L0000039,PF,S17175117,Y,A28572604,,,,MTH,PN,NOCODE,"1,2-dipalmitoylphosphatidylcholine",,N,256.0,"{Organic Chemical, Pharmacologic Substance}"
12,C0000039,ENG,P,L0000039,VC,S0007564,Y,A0016515,,M0023172,D015060,MSH,MH,D015060,"1,2-Dipalmitoylphosphatidylcholine",,N,,"{Organic Chemical, Pharmacologic Substance}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16857339,C5574640,ENG,S,L17663434,PF,S21286067,Y,A33944162,,Q9004,,HCPCS,AB,Q9004,Va whole health partner serv,,Y,,{Health Care Activity}
16857340,C5574641,ENG,P,L17663430,PF,S21286031,Y,A33944163,,S1091,,HCPCS,PT,S1091,"Stent, non-coronary, temporary, with delivery ...",,N,,{Medical Device}
16857341,C5574641,ENG,S,L17663429,PF,S21286030,Y,A33944560,,S1091,,HCPCS,AB,S1091,Stent non-coronary propel,,Y,,{Medical Device}
16857342,C5574642,ENG,P,L17662944,PF,S21285763,Y,A33944049,,S9432,,HCPCS,PT,S9432,Medical foods for non-inborn errors of metabolism,,N,,{Food}


In [47]:
mesh_to_umls_dict = (
    mrconso_st_df[
        mrconso_st_df['SAB'] == 'MSH'
    ]
    [['CUI', 'CODE']]
        .groupby('CODE')
        .agg(set)['CUI']
        .to_dict()
)

### UMLS CAS mapping

In [74]:
cas_to_cui_dict = pd.read_pickle(
    '../../pubchem_mappings/pc_cas_to_CUI_full_dict.pkl'
)

In [75]:
len(cas_to_cui_dict)

132597

In [76]:
cas_to_cui_dict['56124-62-0']

{'C0068314'}

In [154]:
cas_to_cui_dict = pd.read_pickle(
    '../../chemidplus_obsolete/cas_to_cui_meshdb_dict.pkl'
)

In [155]:
len(cas_to_cui_dict)

80344

## SQL interface for tables

In [3]:

# Connection parameters, yours will be different
param_dic = {
    "host"      : "localhost",
    "database"  : "drugcentral_102422",
    "user"      : "tyagin",
}


In [4]:
def connect(params_dic):
    """ Connect to the PostgreSQL database server """
    conn = None
    try:
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = psycopg2.connect(**params_dic)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        sys.exit(1) 
    print("Connection successful")
    return conn

In [5]:
def postgresql_to_dataframe(conn, select_query, ):
    """
    Tranform a SELECT query into a pandas dataframe
    """
    cursor = conn.cursor()
    try:
        cursor.execute(select_query)
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        cursor.close()
        return 1
    
    # Naturally we get a list of tupples
    tupples = cursor.fetchall()
    cursor.close()
    
    # We just need to turn it into a pandas dataframe
    df = pd.DataFrame(tupples)
    return df

In [6]:
conn = connect(param_dic)

Connecting to the PostgreSQL database...
Connection successful


In [7]:
conn.set_client_encoding('UTF8')

### getting table names

In [19]:
tables_cols_df = postgresql_to_dataframe(
    conn,
    """SELECT *
      FROM information_schema.columns
      WHERE table_schema = 'public' 
      order by ordinal_position;
    """
)

In [20]:
tables_cols_agg_df = tables_cols_df[[2,3]].groupby(2).agg(list)

In [21]:
tables_cols_agg_df

Unnamed: 0_level_0,3
2,Unnamed: 1_level_1
act_table_full,"[act_id, struct_id, target_id, target_name, ta..."
action_type,"[id, action_type, description, parent_type]"
active_ingredient,"[id, active_moiety_unii, active_moiety_name, u..."
approval,"[id, struct_id, approval, type, applicant, orp..."
approval_type,"[id, descr]"
...,...
vetomop,"[omopid, struct_id, species, relationship_type..."
vetprod,"[prodid, appl_type, appl_no, trade_name, appli..."
vetprod2struct,"[prodid, struct_id]"
vetprod_type,"[id, appl_type, description]"


In [22]:
tables_cols_dict = tables_cols_agg_df[3].to_dict()

### querying ALL tables

In [23]:
def Query_table(table_name, limit=0):
    
    if limit:
        sql_query = f"""
            SELECT * from {table_name} limit {limit};
        """
    else:
        sql_query = f"""
            SELECT * from {table_name};
        """
    
    sql_df = postgresql_to_dataframe(
        conn,
        sql_query
    )
        
    sql_df = (
        sql_df.rename(columns=dict(enumerate(tables_cols_dict[table_name])))
    )
    
    return sql_df

In [24]:
tables_dict = dict()
for tname in tqdm(tables_cols_dict):
    if tname[0] != '_':
        tables_dict[tname] = Query_table(tname, limit=None)

100%|█████████████████████████████████████████████████████████████████████████████████| 71/71 [00:56<00:00,  1.26it/s]


### Extracting and merging specific tables 

#### Indications table (`indications_df`: `Drug CAS -> Disease`)

In [25]:
indications_df = pd.merge(
    tables_dict['omop_relationship_doid_view'],
    tables_dict['structures'][['id', 'name', 'cas_reg_no']],
    left_on='struct_id',
    right_on='id',
    how='inner'
)

In [26]:
indications_df

Unnamed: 0,id_x,struct_id,concept_id,relationship_name,concept_name,umls_cui,snomed_full_name,cui_semantic_type,snomed_conceptid,doid,id_y,name,cas_reg_no
0,144492,564,21000286,indication,Gonococcal meningitis,C0153225,Gonococcal meningitis,T047,151004.0,,564,ceftriaxone,73384-59-5
1,146703,564,21001035,indication,Streptococcal meningitis,C0154639,Streptococcal meningitis,T047,4510004.0,DOID:11574,564,ceftriaxone,73384-59-5
2,143317,564,21013411,off-label use,Pyrexia of unknown origin,C0015970,Pyrexia of unknown origin,T184,7520000.0,,564,ceftriaxone,73384-59-5
3,144496,564,21000101,indication,Septicemia due to Escherichia coli,C0276088,Septicemia due to Escherichia coli,T047,9323009.0,,564,ceftriaxone,73384-59-5
4,151436,564,21000110,indication,Bacterial septicemia,C0684256,Bacterial septicemia,T047,10001005.0,DOID:0040085,564,ceftriaxone,73384-59-5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
41815,172341,2316,21003003,indication,Diagnostic Test for Thyroid Dysfunction,,,,,,2316,protirelin,24305-27-9
41816,172342,2316,21003002,indication,Pituitary Function Studies,,,,,,2316,protirelin,24305-27-9
41817,134154,2959,21000623,indication,Partial Central Diabetes Insipidus,,,,,,2959,lypressin,50-57-7
41818,174027,5392,40249397,indication,Metastatic non-small cell lung cancer,C0278987,Metastatic non-small cell lung cancer,T191,,,5392,capmatinib,1029712-80-8


#### Artificially created connections between targets (`drugcentral_export_df`) and drugs (`indications_df`)

In [39]:
drug_target_artificial_df = pd.merge(
    indications_df[['snomed_full_name', 'umls_cui', 'struct_id', 'name', 'cas_reg_no']],
    drugcentral_export_df[['DRUG_NAME', 'STRUCT_ID', 'TARGET_NAME', 'GENE']],
    left_on='struct_id',
    right_on='STRUCT_ID',
    how='inner'
).dropna()

len(drug_target_artificial_df)

406142

In [40]:
drug_target_artificial_df

Unnamed: 0,snomed_full_name,umls_cui,struct_id,name,cas_reg_no,DRUG_NAME,STRUCT_ID,TARGET_NAME,GENE
0,Gonococcal meningitis,C0153225,564,ceftriaxone,73384-59-5,ceftriaxone,564,Peptidoglycan synthase FtsI,ftsI
1,Gonococcal meningitis,C0153225,564,ceftriaxone,73384-59-5,ceftriaxone,564,Penicillin-binding protein 1A,mrcA
2,Gonococcal meningitis,C0153225,564,ceftriaxone,73384-59-5,ceftriaxone,564,Penicillin-binding protein 1B,mrcB
3,Gonococcal meningitis,C0153225,564,ceftriaxone,73384-59-5,ceftriaxone,564,Penicillin-binding protein 2,mrdA
4,Gonococcal meningitis,C0153225,564,ceftriaxone,73384-59-5,ceftriaxone,564,D-amino-acid oxidase,DAO
...,...,...,...,...,...,...,...,...,...
445676,Metastatic urothelial carcinoma,C4288754,5366,enfortumab vedotin,1346452-25-2,enfortumab vedotin,5366,Tubulin beta,TUBB3
445677,Metastatic urothelial carcinoma,C4288754,5366,enfortumab vedotin,1346452-25-2,enfortumab vedotin,5366,Nectin-4,NECTIN4
445678,Metastatic human epidermal growth factor 2 pos...,C4721209,5434,margetuximab,1350624-75-7,margetuximab,5434,Receptor tyrosine-protein kinase erbB-2,ERBB2
445679,Metastatic human epidermal growth factor 2 pos...,C4721209,5434,margetuximab,1350624-75-7,margetuximab,5434,Low affinity immunoglobulin gamma Fc region re...,FCGR3A


#### Pharma class (`pharm_action_df`: `Drug CAS -> MeSH indication`)

In [51]:
pharm_action_df = pd.merge(
    tables_dict['pharma_class'],
    tables_dict['structures'][['id', 'name', 'cas_reg_no']],
    left_on='struct_id',
    right_on='id',
    #how='inner'
)

In [52]:
pharm_action_df

Unnamed: 0,id_x,struct_id,type,name_x,class_code,source,id_y,name_y,cas_reg_no
0,449782,3024,PA,Analgesics,D000700,MeSH,3024,bevonium,33371-53-8
1,449783,3024,PA,"Analgesics, Non-Narcotic",D018712,MeSH,3024,bevonium,33371-53-8
2,449784,3024,PA,Anti-Inflammatory Agents,D000893,MeSH,3024,bevonium,33371-53-8
3,449785,3024,PA,"Anti-Inflammatory Agents, Non-Steroidal",D000894,MeSH,3024,bevonium,33371-53-8
4,449786,3024,PA,Antirheumatic Agents,D018501,MeSH,3024,bevonium,33371-53-8
...,...,...,...,...,...,...,...,...,...
25342,463923,4867,PA,Enzyme Inhibitors,D004791,MeSH,4867,dexrabeprazole,177795-60-7
25343,463925,4867,PA,Gastrointestinal Agents,D005765,MeSH,4867,dexrabeprazole,177795-60-7
25344,463927,4867,PA,Proton Pump Inhibitors,D054328,MeSH,4867,dexrabeprazole,177795-60-7
25345,463931,1604,PA,Anti-Arrhythmia Agents,D000889,MeSH,1604,lorajmine,47562-08-3


#### Adverse effects (`faers`): (`adverse_eff_meddra_cui_df`: `Drug CAS -> MedDRA adv effect`)

In [28]:
adverse_eff_df = pd.merge(
    tables_dict['faers'][['struct_id', 'meddra_name', 'meddra_code']],
    tables_dict['structures'][['id', 'name', 'cas_reg_no']],
    left_on='struct_id',
    right_on='id',
    how='inner'
)

In [29]:
adverse_eff_df

Unnamed: 0,struct_id,meddra_name,meddra_code,id,name,cas_reg_no
0,4185,Abdominal discomfort,10000059,4185,vemurafenib,918504-65-1
1,4185,Acne,10000496,4185,vemurafenib,918504-65-1
2,4185,Acrochordon,10000591,4185,vemurafenib,918504-65-1
3,4185,Actinic keratosis,10000614,4185,vemurafenib,918504-65-1
4,4185,Alopecia,10001760,4185,vemurafenib,918504-65-1
...,...,...,...,...,...,...
310049,4714,Metaphyseal corner fracture,10079667,4714,strontium ranelate,135459-87-9
310050,2063,Negative pressure pulmonary oedema,10080589,2063,parecoxib,198470-84-7
310051,5324,Product administration error,10081576,5324,brexanolone,516-54-1
310052,1574,Product dose omission issue,10084406,1574,levorphanol,77-07-6


##### Mapping MEDDRA to UMLS

In [33]:
mrconso_meddra_to_umls_df = (
    mrconso_st_df[(
      (mrconso_st_df['SAB'] == 'MDR') # MEDDRA - MDR in UMLS
    )]
    [['CUI', 'STR', 'CODE']]
    .drop_duplicates()
)
mrconso_meddra_to_umls_df['CODE'] = mrconso_meddra_to_umls_df['CODE'].astype(str)

In [34]:
mrconso_meddra_to_umls_df

Unnamed: 0,CUI,STR,CODE
5277,C0000727,Acute abdomen,10000647
5298,C0000727,Syndrome abdominal acute,10042784
5301,C0000727,Abdominal syndrome acute,10000096
5430,C0000729,Abdominal cramps,10000057
5448,C0000729,Abdominal cramp,10000056
...,...,...,...
16807021,C5554411,Digital dermoscopy,10086170
16807035,C5554412,Auto-injector viewing window blocked,10086312
16807049,C5554413,Tumarkin's otolithic crisis,10086331
16807063,C5554414,Allergy to PEGylated drug,10086350


In [35]:
adverse_eff_df['meddra_code'] = adverse_eff_df['meddra_code'].astype(str)

In [36]:
adverse_eff_meddra_cui_df = pd.merge(
    adverse_eff_df,
    mrconso_meddra_to_umls_df,
    left_on='meddra_code',
    right_on='CODE'
).rename(columns={'CUI': 'meddra_cui'})

In [37]:
adverse_eff_meddra_cui_df

Unnamed: 0,struct_id,meddra_name,meddra_code,id,name,cas_reg_no,meddra_cui,STR,CODE
0,4185,Abdominal discomfort,10000059,4185,vemurafenib,918504-65-1,C0232487,Abdominal discomfort,10000059
1,323,Abdominal discomfort,10000059,323,benzocaine,94-09-7,C0232487,Abdominal discomfort,10000059
2,4952,Abdominal discomfort,10000059,4952,canakinumab,914613-48-2,C0232487,Abdominal discomfort,10000059
3,5056,Abdominal discomfort,10000059,5056,patiromer calcium,1415477-49-4,C0232487,Abdominal discomfort,10000059
4,633,Abdominal discomfort,10000059,633,ciclesonide,126544-47-6,C0232487,Abdominal discomfort,10000059
...,...,...,...,...,...,...,...,...,...
341936,5123,Scan myocardial perfusion abnormal,10061501,5123,technetium Tc 99m tetrofosmin,127455-27-0,C0853562,Scan myocardial perfusion abnormal,10061501
341937,5084,Factor IX inhibition,10051778,5084,eftrenonacog alfa,1270012-74-2,C0948167,Factor IX inhibition,10051778
341938,5308,ADAMTS13 activity abnormal,10074493,5308,caplacizumab,915810-67-2,C3805010,ADAMTS13 activity abnormal,10074493
341939,2805,Bladder instillation procedure,10072199,2805,valrubicin,56124-62-0,C3267064,Bladder instillation procedure,10072199


## Mapping all tables to UMLS CUIs

### `pharm_action_df` -> `pharm_action_mapped_df`

In [156]:
pharm_action_df

Unnamed: 0,id_x,struct_id,type,name_x,class_code,source,id_y,name_y,cas_reg_no,class_code_cui,struct_cui
0,449782,3024,PA,Analgesics,D000700,MeSH,3024,bevonium,33371-53-8,"{C1704390, C0002771}",{C0053533}
1,449783,3024,PA,"Analgesics, Non-Narcotic",D018712,MeSH,3024,bevonium,33371-53-8,{C0242937},{C0053533}
2,449784,3024,PA,Anti-Inflammatory Agents,D000893,MeSH,3024,bevonium,33371-53-8,{C0003209},{C0053533}
3,449785,3024,PA,"Anti-Inflammatory Agents, Non-Steroidal",D000894,MeSH,3024,bevonium,33371-53-8,"{C0003211, C0002773, C0085847}",{C0053533}
4,449786,3024,PA,Antirheumatic Agents,D018501,MeSH,3024,bevonium,33371-53-8,"{C0242708, C4505471, C0003191}",{C0053533}
...,...,...,...,...,...,...,...,...,...,...,...
25342,463923,4867,PA,Enzyme Inhibitors,D004791,MeSH,4867,dexrabeprazole,177795-60-7,{C0014432},{C0378482}
25343,463925,4867,PA,Gastrointestinal Agents,D005765,MeSH,4867,dexrabeprazole,177795-60-7,"{C0282187, C0012237, C0017173}",{C0378482}
25344,463927,4867,PA,Proton Pump Inhibitors,D054328,MeSH,4867,dexrabeprazole,177795-60-7,{C0358591},{C0378482}
25345,463931,1604,PA,Anti-Arrhythmia Agents,D000889,MeSH,1604,lorajmine,47562-08-3,"{C0003302, C0003195, C0027048}","{C0045017, C0024001}"


In [157]:
pharm_action_df['class_code_cui'] = (
    pharm_action_df['class_code'].apply(
        lambda x: mesh_to_umls_dict.get(x)
    )
)

pharm_action_df['struct_cui'] = (
    pharm_action_df['cas_reg_no'].apply(
        lambda x: cas_to_cui_dict.get(x)
    )
)

In [158]:
pharm_action_mapped_df = (
    pharm_action_df
        .explode('class_code_cui')
        .explode('struct_cui')
        .dropna()
)

In [159]:
pharm_action_mapped_df

Unnamed: 0,id_x,struct_id,type,name_x,class_code,source,id_y,name_y,cas_reg_no,class_code_cui,struct_cui
0,449782,3024,PA,Analgesics,D000700,MeSH,3024,bevonium,33371-53-8,C1704390,C0053533
0,449782,3024,PA,Analgesics,D000700,MeSH,3024,bevonium,33371-53-8,C1704390,C1112078
0,449782,3024,PA,Analgesics,D000700,MeSH,3024,bevonium,33371-53-8,C1704390,C0950157
0,449782,3024,PA,Analgesics,D000700,MeSH,3024,bevonium,33371-53-8,C1704390,C0969847
0,449782,3024,PA,Analgesics,D000700,MeSH,3024,bevonium,33371-53-8,C1704390,C0106176
...,...,...,...,...,...,...,...,...,...,...,...
25346,463932,1604,PA,Cardiovascular Agents,D002317,MeSH,1604,lorajmine,47562-08-3,C4704878,C0888886
25346,463932,1604,PA,Cardiovascular Agents,D002317,MeSH,1604,lorajmine,47562-08-3,C4704878,C0888885
25346,463932,1604,PA,Cardiovascular Agents,D002317,MeSH,1604,lorajmine,47562-08-3,C0007220,C0045017
25346,463932,1604,PA,Cardiovascular Agents,D002317,MeSH,1604,lorajmine,47562-08-3,C0007220,C0888886


### `adverse_eff_meddra_cui_df` -> `adverse_eff_mapped_df`

In [160]:
adverse_eff_meddra_cui_df

Unnamed: 0,struct_id,meddra_name,meddra_code,id,name,cas_reg_no,meddra_cui,STR,CODE,struct_cui
0,4185,Abdominal discomfort,10000059,4185,vemurafenib,918504-65-1,C0232487,Abdominal discomfort,10000059,{C3192263}
1,323,Abdominal discomfort,10000059,323,benzocaine,94-09-7,C0232487,Abdominal discomfort,10000059,"{C0103263, C0055342, C2930485, C0005059, C0002..."
2,4952,Abdominal discomfort,10000059,4952,canakinumab,914613-48-2,C0232487,Abdominal discomfort,10000059,{C2718773}
3,5056,Abdominal discomfort,10000059,5056,patiromer calcium,1415477-49-4,C0232487,Abdominal discomfort,10000059,{C4045522}
4,633,Abdominal discomfort,10000059,633,ciclesonide,126544-47-6,C0232487,Abdominal discomfort,10000059,{C0907850}
...,...,...,...,...,...,...,...,...,...,...
341936,5123,Scan myocardial perfusion abnormal,10061501,5123,technetium Tc 99m tetrofosmin,127455-27-0,C0853562,Scan myocardial perfusion abnormal,10061501,{C0211492}
341937,5084,Factor IX inhibition,10051778,5084,eftrenonacog alfa,1270012-74-2,C0948167,Factor IX inhibition,10051778,{C4041753}
341938,5308,ADAMTS13 activity abnormal,10074493,5308,caplacizumab,915810-67-2,C3805010,ADAMTS13 activity abnormal,10074493,{C3713057}
341939,2805,Bladder instillation procedure,10072199,2805,valrubicin,56124-62-0,C3267064,Bladder instillation procedure,10072199,{C0068314}


In [161]:
adverse_eff_meddra_cui_df['struct_cui'] = (
    adverse_eff_meddra_cui_df['cas_reg_no']
        .apply(lambda x: cas_to_cui_dict.get(x))
)

In [162]:
adverse_eff_mapped_df = (
    adverse_eff_meddra_cui_df.explode('struct_cui').dropna()
)

In [163]:
adverse_eff_mapped_df

Unnamed: 0,struct_id,meddra_name,meddra_code,id,name,cas_reg_no,meddra_cui,STR,CODE,struct_cui
0,4185,Abdominal discomfort,10000059,4185,vemurafenib,918504-65-1,C0232487,Abdominal discomfort,10000059,C3192267
0,4185,Abdominal discomfort,10000059,4185,vemurafenib,918504-65-1,C0232487,Abdominal discomfort,10000059,C1832009
0,4185,Abdominal discomfort,10000059,4185,vemurafenib,918504-65-1,C0232487,Abdominal discomfort,10000059,C4083043
0,4185,Abdominal discomfort,10000059,4185,vemurafenib,918504-65-1,C0232487,Abdominal discomfort,10000059,C3192263
0,4185,Abdominal discomfort,10000059,4185,vemurafenib,918504-65-1,C0232487,Abdominal discomfort,10000059,C4310441
...,...,...,...,...,...,...,...,...,...,...
341939,2805,Bladder instillation procedure,10072199,2805,valrubicin,56124-62-0,C3267064,Bladder instillation procedure,10072199,C0724176
341939,2805,Bladder instillation procedure,10072199,2805,valrubicin,56124-62-0,C3267064,Bladder instillation procedure,10072199,C0068314
341939,2805,Bladder instillation procedure,10072199,2805,valrubicin,56124-62-0,C3267064,Bladder instillation procedure,10072199,C1519947
341939,2805,Bladder instillation procedure,10072199,2805,valrubicin,56124-62-0,C3267064,Bladder instillation procedure,10072199,C0133123


### `indications_df` -> `indications_mapped_df`

In [164]:
indications_df

Unnamed: 0,id_x,struct_id,concept_id,relationship_name,concept_name,umls_cui,snomed_full_name,cui_semantic_type,snomed_conceptid,doid,id_y,name,cas_reg_no,struct_cui
0,144492,564,21000286,indication,Gonococcal meningitis,C0153225,Gonococcal meningitis,T047,151004.0,,564,ceftriaxone,73384-59-5,"{C0007561, C0081371, C3536856}"
1,146703,564,21001035,indication,Streptococcal meningitis,C0154639,Streptococcal meningitis,T047,4510004.0,DOID:11574,564,ceftriaxone,73384-59-5,"{C0007561, C0081371, C3536856}"
2,143317,564,21013411,off-label use,Pyrexia of unknown origin,C0015970,Pyrexia of unknown origin,T184,7520000.0,,564,ceftriaxone,73384-59-5,"{C0007561, C0081371, C3536856}"
3,144496,564,21000101,indication,Septicemia due to Escherichia coli,C0276088,Septicemia due to Escherichia coli,T047,9323009.0,,564,ceftriaxone,73384-59-5,"{C0007561, C0081371, C3536856}"
4,151436,564,21000110,indication,Bacterial septicemia,C0684256,Bacterial septicemia,T047,10001005.0,DOID:0040085,564,ceftriaxone,73384-59-5,"{C0007561, C0081371, C3536856}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41815,172341,2316,21003003,indication,Diagnostic Test for Thyroid Dysfunction,,,,,,2316,protirelin,24305-27-9,{C0040162}
41816,172342,2316,21003002,indication,Pituitary Function Studies,,,,,,2316,protirelin,24305-27-9,{C0040162}
41817,134154,2959,21000623,indication,Partial Central Diabetes Insipidus,,,,,,2959,lypressin,50-57-7,"{C0042413, C0024328}"
41818,174027,5392,40249397,indication,Metastatic non-small cell lung cancer,C0278987,Metastatic non-small cell lung cancer,T191,,,5392,capmatinib,1029712-80-8,{C4053698}


In [165]:
indications_df['struct_cui'] = (
    indications_df['cas_reg_no'].apply(
        lambda x: cas_to_cui_dict.get(x)
    )
)

In [166]:
indications_df.head(3)

Unnamed: 0,id_x,struct_id,concept_id,relationship_name,concept_name,umls_cui,snomed_full_name,cui_semantic_type,snomed_conceptid,doid,id_y,name,cas_reg_no,struct_cui
0,144492,564,21000286,indication,Gonococcal meningitis,C0153225,Gonococcal meningitis,T047,151004.0,,564,ceftriaxone,73384-59-5,"{C1564647, C1564651, C1564652, C0733786, C1564..."
1,146703,564,21001035,indication,Streptococcal meningitis,C0154639,Streptococcal meningitis,T047,4510004.0,DOID:11574,564,ceftriaxone,73384-59-5,"{C1564647, C1564651, C1564652, C0733786, C1564..."
2,143317,564,21013411,off-label use,Pyrexia of unknown origin,C0015970,Pyrexia of unknown origin,T184,7520000.0,,564,ceftriaxone,73384-59-5,"{C1564647, C1564651, C1564652, C0733786, C1564..."


In [167]:
indications_mapped_df = indications_df.explode('struct_cui')

In [168]:
indications_mapped_df

Unnamed: 0,id_x,struct_id,concept_id,relationship_name,concept_name,umls_cui,snomed_full_name,cui_semantic_type,snomed_conceptid,doid,id_y,name,cas_reg_no,struct_cui
0,144492,564,21000286,indication,Gonococcal meningitis,C0153225,Gonococcal meningitis,T047,151004.0,,564,ceftriaxone,73384-59-5,C1564647
0,144492,564,21000286,indication,Gonococcal meningitis,C0153225,Gonococcal meningitis,T047,151004.0,,564,ceftriaxone,73384-59-5,C1564651
0,144492,564,21000286,indication,Gonococcal meningitis,C0153225,Gonococcal meningitis,T047,151004.0,,564,ceftriaxone,73384-59-5,C1564652
0,144492,564,21000286,indication,Gonococcal meningitis,C0153225,Gonococcal meningitis,T047,151004.0,,564,ceftriaxone,73384-59-5,C0733786
0,144492,564,21000286,indication,Gonococcal meningitis,C0153225,Gonococcal meningitis,T047,151004.0,,564,ceftriaxone,73384-59-5,C1564653
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41819,151427,3290,21003116,contraindication,Sensation Disturbance of Genitals,,,,,,3290,oxyquinoline,148-24-3,C0086760
41819,151427,3290,21003116,contraindication,Sensation Disturbance of Genitals,,,,,,3290,oxyquinoline,148-24-3,C0000665
41819,151427,3290,21003116,contraindication,Sensation Disturbance of Genitals,,,,,,3290,oxyquinoline,148-24-3,C1563803
41819,151427,3290,21003116,contraindication,Sensation Disturbance of Genitals,,,,,,3290,oxyquinoline,148-24-3,C0000666


### `drugcentral_export_df` -> `drugcentral_export_mapped_df`

#### struct_id -> CAS -> CUI

In [169]:
structid_to_cas_df = tables_dict['structures'][
    tables_dict['structures']['cas_reg_no'].notna()
][['id', 'name', 'cas_reg_no']]

In [170]:
structid_to_cas_df['struct_cui'] = (
    structid_to_cas_df['cas_reg_no'].apply(
        lambda x: cas_to_cui_dict.get(x)
    )
    .dropna()
)

In [171]:
structid_to_cas_df

Unnamed: 0,id,name,cas_reg_no,struct_cui
0,5392,capmatinib,1029712-80-8,"{C5543774, C5543775, C2983764, C4053698, C5543..."
1,5393,selpercatinib,2152628-33-4,"{C5227683, C5435284, C5235396, C4525531}"
2,5394,ripretinib,1442472-39-0,{C5139749}
4,5395,fluoroestradiol F 18,94153-53-4,
5,5146,ferumoxsil,171544-35-7,"{C0257926, C0257928, C0380253, C0380254}"
...,...,...,...,...
4922,5224,dupilumab,1190264-60-8,"{C3660995, C3660996, C3658854, C4325130}"
4923,5109,urokinase,9039-53-6,"{C0086987, C0701941, C0701940, C0042071, C0086..."
4924,5433,ansuvimab,2375952-29-5,"{C5433066, C5432354, C5432324, C5433440}"
4925,5434,margetuximab,1350624-75-7,"{C4053695, C5433071, C2984521}"


In [172]:
structid_to_cas_df = structid_to_cas_df.dropna().explode('struct_cui')

#### Gene name -> CUI

In [173]:
drugcentral_export_df['GENE_lowcase'] = drugcentral_export_df['GENE'].apply(lambda x: x.lower())

In [174]:
drugcentral_all_genes_set = set(drugcentral_export_df['GENE_lowcase'])
len(drugcentral_all_genes_set)

2145

In [175]:
pref_semtypes = {
    'Amino Acid, Peptide, or Protein',
    'Gene or Genome'
}

In [176]:
mrconso_gene_st_df = mrconso_st_df[
    (
          mrconso_st_df['STR'].apply(lambda x: str(x).lower()).isin(drugcentral_all_genes_set)
        & mrconso_st_df['sem_types'].apply(lambda x: pref_semtypes.intersection(x))
    )
]
len(mrconso_gene_st_df)

7526

In [177]:
mrconso_gene_st_df['STR_lowcase'] = mrconso_gene_st_df['STR'].apply(lambda x: str(x).lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mrconso_gene_st_df['STR_lowcase'] = mrconso_gene_st_df['STR'].apply(lambda x: str(x).lower())


In [178]:
len(mrconso_gene_st_df['STR_lowcase'].drop_duplicates())

1889

In [179]:
gene_symb_to_cui_agg_df = (
    mrconso_gene_st_df[
        ['CUI', 'STR_lowcase']
    ]
        .groupby('STR_lowcase')
        .agg(set)
        .reset_index()
)

In [180]:
gene_symb_to_cui_agg_df

Unnamed: 0,STR_lowcase,CUI
0,aak1,{C1426329}
1,abat,{C1412057}
2,abca1,"{C1412058, C3711161}"
3,abcb1,"{C1704939, C1738970, C0376622}"
4,abcb11,"{C1412070, C1313221}"
...,...,...
1884,zacn,{C1825810}
1885,zak,{C1868689}
1886,zap70,"{C1705831, C0246976, C1421567}"
1887,zmpste24,{C1421586}


#### Merging

In [181]:
drugcentral_export_w_gene_cui_df = (
    pd.merge(
        drugcentral_export_df,
        gene_symb_to_cui_agg_df,
        left_on='GENE_lowcase',
        right_on='STR_lowcase'
    )
    .rename(columns={'CUI': 'gene_CUI'})
    .explode('gene_CUI')
)

In [182]:
drugcentral_export_w_gene_cui_df

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,ACT_SOURCE,GENE_lowcase,STR_lowcase,gene_CUI
0,levobupivacaine,4,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,CHEMBL,kcnh2,kcnh2,C1416572
1,alosetron,129,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,CHEMBL,kcnh2,kcnh2,C1416572
2,amiodarone,176,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,WOMBAT-PK,kcnh2,kcnh2,C1416572
3,amitriptyline,180,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,WOMBAT-PK,kcnh2,kcnh2,C1416572
4,amodiaquine,186,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,CHEMBL,kcnh2,kcnh2,C1416572
...,...,...,...,...,...,...,...,...,...,...
21341,evinacumab,5449,Angiopoietin-related protein 3,Secreted,Q9Y5C1,ANGPTL3,DRUG LABEL,angptl3,angptl3,C1412401
21342,pegcetacoplan,5457,Complement C3,Unclassified,P01024,C3,SCIENTIFIC LITERATURE,c3,c3,C1332656
21342,pegcetacoplan,5457,Complement C3,Unclassified,P01024,C3,SCIENTIFIC LITERATURE,c3,c3,C1571455
21343,piflufolastat F-18,5458,Glutamate carboxypeptidase 2,Enzyme,Q04609,FOLH1,UNKNOWN,folh1,folh1,C1333570


In [183]:
drugcentral_export_mapped_df = (
    pd.merge(
        drugcentral_export_w_gene_cui_df,
        structid_to_cas_df,
        left_on='STRUCT_ID',
        right_on='id'
    )
    .dropna()
    .drop_duplicates()
)

In [184]:
drugcentral_export_mapped_df

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,ACT_SOURCE,GENE_lowcase,STR_lowcase,gene_CUI,id,name,cas_reg_no,struct_cui
0,levobupivacaine,4,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,CHEMBL,kcnh2,kcnh2,C1416572,4,levobupivacaine,27262-47-1,C0873119
1,levobupivacaine,4,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,CHEMBL,kcnh2,kcnh2,C1416572,4,levobupivacaine,27262-47-1,C0875986
2,levobupivacaine,4,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,CHEMBL,kcnh2,kcnh2,C1416572,4,levobupivacaine,27262-47-1,C0873118
3,levobupivacaine,4,Sodium channel protein type 1 subunit alpha,Ion channel,P35498,SCN1A,WOMBAT-PK,scn1a,scn1a,C1419856,4,levobupivacaine,27262-47-1,C0873119
4,levobupivacaine,4,Sodium channel protein type 1 subunit alpha,Ion channel,P35498,SCN1A,WOMBAT-PK,scn1a,scn1a,C1419856,4,levobupivacaine,27262-47-1,C0875986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238484,pegcetacoplan,5457,Complement C3,Unclassified,P01024,C3,SCIENTIFIC LITERATURE,c3,c3,C1332656,5457,pegcetacoplan,2019171-69-6,C5418501
238485,pegcetacoplan,5457,Complement C3,Unclassified,P01024,C3,SCIENTIFIC LITERATURE,c3,c3,C1571455,5457,pegcetacoplan,2019171-69-6,C5455506
238486,pegcetacoplan,5457,Complement C3,Unclassified,P01024,C3,SCIENTIFIC LITERATURE,c3,c3,C1571455,5457,pegcetacoplan,2019171-69-6,C5418501
238487,piflufolastat F-18,5458,Glutamate carboxypeptidase 2,Enzyme,Q04609,FOLH1,UNKNOWN,folh1,folh1,C1333570,5458,piflufolastat F-18,1207181-29-0,C3492634


### `drug_target_artificial_df` -> `drug_target_artificial_mapped_df`

In [185]:
drug_target_artificial_mapped_df = pd.merge(
    indications_mapped_df[['snomed_full_name', 'umls_cui', 'struct_id', 'name', 'cas_reg_no', 'struct_cui']],
    drugcentral_export_mapped_df[['DRUG_NAME', 'STRUCT_ID', 'TARGET_NAME', 'GENE', 'gene_CUI', 'struct_cui']],
    left_on='struct_cui',
    right_on='struct_cui',
    how='inner'
).dropna()

In [186]:
drug_target_artificial_mapped_df

Unnamed: 0,snomed_full_name,umls_cui,struct_id,name,cas_reg_no,struct_cui,DRUG_NAME,STRUCT_ID,TARGET_NAME,GENE,gene_CUI
0,Gonococcal meningitis,C0153225,564,ceftriaxone,73384-59-5,C1564647,ceftriaxone,564,D-amino-acid oxidase,DAO,C1413903
1,Gonococcal meningitis,C0153225,564,ceftriaxone,73384-59-5,C1564647,ceftriaxone,564,D-amino-acid oxidase,DAO,C1450094
2,Gonococcal meningitis,C0153225,564,ceftriaxone,73384-59-5,C1564647,ceftriaxone,564,D-amino-acid oxidase,DAO,C1412100
3,Streptococcal meningitis,C0154639,564,ceftriaxone,73384-59-5,C1564647,ceftriaxone,564,D-amino-acid oxidase,DAO,C1413903
4,Streptococcal meningitis,C0154639,564,ceftriaxone,73384-59-5,C1564647,ceftriaxone,564,D-amino-acid oxidase,DAO,C1450094
...,...,...,...,...,...,...,...,...,...,...,...
6001063,Metastatic non-small cell lung cancer,C0278987,5392,capmatinib,1029712-80-8,C5244866,capmatinib,5392,Hepatocyte growth factor receptor,MET,C1419433
6001064,Metastatic non-small cell lung cancer,C0278987,5392,capmatinib,1029712-80-8,C5244866,capmatinib,5392,Hepatocyte growth factor receptor,MET,C1417123
6001065,Metastatic non-small cell lung cancer,C0278987,5392,capmatinib,1029712-80-8,C5244866,capmatinib,5392,Hepatocyte growth factor receptor,MET,C1704823
6001066,Metastatic non-small cell lung cancer,C0278987,5392,capmatinib,1029712-80-8,C5244866,capmatinib,5392,Hepatocyte growth factor receptor,MET,C1822773


## Extracting pairs of UMLS terms

In [188]:
adverse_eff_mapped_pairs_df = (
    adverse_eff_mapped_df[
        ['struct_cui', 'meddra_cui']
    ]
    .dropna()
)

adverse_eff_mapped_pairs = list({
    tuple(sorted(p)) for p in list(
        zip(
            adverse_eff_mapped_pairs_df['struct_cui'],
            adverse_eff_mapped_pairs_df['meddra_cui']
        )
    )
})
len(adverse_eff_mapped_pairs)

2338731

In [189]:
pharm_action_mapped_pairs_df = (
    pharm_action_mapped_df[
        ['struct_cui', 'class_code_cui']
    ]
    .dropna()
)

pharm_action_mapped_pairs = list({
    tuple(sorted(p)) for p in list(
        zip(
            pharm_action_mapped_pairs_df['struct_cui'],
            pharm_action_mapped_pairs_df['class_code_cui']
        )
    )
})
len(pharm_action_mapped_pairs)

158387

In [191]:
indications_mapped_pairs_df = (
    indications_mapped_df[
        ['struct_cui', 'umls_cui']
    ]
    .dropna()
)

indications_mapped_pairs = list({
    tuple(sorted(p)) for p in list(
        zip(
            indications_mapped_pairs_df['struct_cui'],
            indications_mapped_pairs_df['umls_cui']
        )
    )
})
len(indications_mapped_pairs)

257304

In [192]:
drugcentral_export_mapped_pairs_df = (
    drugcentral_export_mapped_df[
        ['struct_cui', 'gene_CUI']
    ]
    .dropna()
)

drugcentral_export_mapped_pairs = list({
    tuple(sorted(p)) for p in list(
        zip(
            drugcentral_export_mapped_pairs_df['struct_cui'],
            drugcentral_export_mapped_pairs_df['gene_CUI']
        )
    )
})
len(drugcentral_export_mapped_pairs)

188791

In [193]:
drug_target_artificial_mapped_pairs_df = (
    drug_target_artificial_mapped_df[
        ['struct_cui', 'gene_CUI']
    ]
    .dropna()
)

drug_target_artificial_mapped_pairs = list({
    tuple(sorted(p)) for p in list(
        zip(
            drug_target_artificial_mapped_pairs_df['struct_cui'],
            drug_target_artificial_mapped_pairs_df['gene_CUI']
        )
    )
})
len(drug_target_artificial_mapped_pairs)

168860

### Summarizing and saving

In [194]:
len(adverse_eff_mapped_pairs)

2338731

In [195]:
len(pharm_action_mapped_pairs)

158387

In [196]:
len(indications_mapped_pairs)

257304

In [197]:
len(drugcentral_export_mapped_pairs)

188791

In [198]:
len(drug_target_artificial_mapped_pairs)

168860

In [199]:
drugcentral_pairs_all = set()

for pair_list in tqdm(
    [
        adverse_eff_mapped_pairs,
        pharm_action_mapped_pairs,
        indications_mapped_pairs,
        drugcentral_export_mapped_pairs,
        drug_target_artificial_mapped_pairs
    ]
):
    drugcentral_pairs_all.update(pair_list)
    
len(drugcentral_pairs_all)

100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 10.02it/s]


2900561

In [201]:
with open('../../benchmark_data/01_cui_pairs_json/drugcentral_cui_pairs.json', 'w') as f:
    json.dump(list(drugcentral_pairs_all), f)