# CTD Data Collection

Link: http://ctdbase.org/downloads/

In [164]:
import pandas as pd
import json

Used identifiers:

- ChemicalID (MeSH identifier)

- DiseaseID (MeSH or OMIM identifier)

- Gene Symbol

In [67]:
chem_ids_set = set()
gene_symbol_set = set()
dis_ids_set = set()

## Chem - gene

In [2]:
chem_gene_df = pd.read_csv(
    'CTD_chem_gene_ixns.csv.gz', 
    comment='#',
    header=None
)

chem_gene_df.columns = (
    'ChemicalName,ChemicalID,CasRN,GeneSymbol,GeneID,GeneForms,'
    'Organism,OrganismID,Interaction,InteractionActions,PubMedIDs'
    .split(',')
)

In [3]:
chem_gene_df.head(3)

Unnamed: 0,ChemicalName,ChemicalID,CasRN,GeneSymbol,GeneID,GeneForms,Organism,OrganismID,Interaction,InteractionActions,PubMedIDs
0,10074-G5,C534883,,AR,367,protein,Homo sapiens,9606.0,10074-G5 affects the reaction [MYC protein res...,affects^reaction|increases^expression,32184358
1,10074-G5,C534883,,AR,367,protein,Homo sapiens,9606.0,10074-G5 inhibits the reaction [EPHB2 protein ...,decreases^reaction|increases^expression,32184358
2,10074-G5,C534883,,AR,367,protein,Homo sapiens,9606.0,10074-G5 results in decreased expression of AR...,decreases^expression,32184358


In [4]:
chem_gene_df = chem_gene_df[
    [
        'ChemicalName', 
        'ChemicalID', 
        'GeneSymbol', 
        'PubMedIDs'
    ]
]

In [46]:
chem_gene_df

Unnamed: 0,ChemicalName,ChemicalID,GeneSymbol,PubMedIDs
0,10074-G5,C534883,AR,32184358
1,10074-G5,C534883,AR,32184358
2,10074-G5,C534883,AR,32184358
3,10074-G5,C534883,AR,32184358
4,10074-G5,C534883,EPHB2,32184358
...,...,...,...,...
2430018,Zymosan,D015054,VEGFA,17724436
2430019,Zymosan,D015054,VEGFA,17724436
2430020,Zymosan,D015054,XIAP,16803582
2430021,zymosterol,C015582,CYP27A1,14622972


In [68]:
gene_symbol_set.update(chem_gene_df['GeneSymbol'].dropna())

In [69]:
chem_ids_set.update(chem_gene_df['ChemicalID'].dropna())

## Chem - disease

In [6]:
chem_dis_df = pd.read_csv(
    'CTD_chemicals_diseases.csv.gz', 
    comment='#',
    header=None
)

chem_dis_df.columns = (
    'ChemicalName,ChemicalID,CasRN,DiseaseName,DiseaseID,'
    'DirectEvidence,InferenceGeneSymbol,InferenceScore,OmimIDs,PubMedIDs'
    .split(',')
)

In [7]:
chem_dis_df.head(3)

Unnamed: 0,ChemicalName,ChemicalID,CasRN,DiseaseName,DiseaseID,DirectEvidence,InferenceGeneSymbol,InferenceScore,OmimIDs,PubMedIDs
0,06-Paris-LA-66 protocol,C046983,,Precursor Cell Lymphoblastic Leukemia-Lymphoma,MESH:D054198,therapeutic,,,,4519131
1,10074-G5,C534883,,Adenocarcinoma,MESH:D000230,,MYC,4.08,,26432044
2,10074-G5,C534883,,Adenocarcinoma of Lung,MESH:D000077192,,MYC,4.31,,26656844|27602772


In [8]:
chem_dis_df = chem_dis_df[
    chem_dis_df['DirectEvidence'].notna()
]

In [143]:
chem_dis_df = chem_dis_df[
    [
        'ChemicalName', 
        'ChemicalID',
        'DiseaseName',
        'DiseaseID', 
        'PubMedIDs',
    ]
]

In [71]:
chem_ids_set.update(chem_dis_df['ChemicalID'].dropna())
dis_ids_set.update(chem_dis_df['DiseaseID'].dropna())

## Gene - disease

In [10]:
gene_dis_df = pd.read_csv(
    'CTD_genes_diseases.csv.gz', 
    comment='#',
    header=None
)

  gene_dis_df = pd.read_csv(


In [11]:
gene_dis_df.columns = (
    'GeneSymbol,GeneID,DiseaseName,DiseaseID,'
    'DirectEvidence,InferenceChemicalName,InferenceScore,OmimIDs,PubMedIDs'
    .split(',')
)

In [12]:
gene_dis_df.head(3)

Unnamed: 0,GeneSymbol,GeneID,DiseaseName,DiseaseID,DirectEvidence,InferenceChemicalName,InferenceScore,OmimIDs,PubMedIDs
0,11-BETA-HSD3,100174880,"Abnormalities, Drug-Induced",MESH:D000014,,Endocrine Disruptors,5.21,,22659286
1,11-BETA-HSD3,100174880,Amyotrophic Lateral Sclerosis,MESH:D000690,,"Water Pollutants, Chemical",4.75,,33562464
2,11-BETA-HSD3,100174880,Anemia,MESH:D000740,,"Water Pollutants, Chemical",4.28,,26546277


In [None]:
gene_symbol_set

In [13]:
gene_dis_df = gene_dis_df[
    gene_dis_df['DirectEvidence'].notna()
]

In [14]:
gene_dis_df

Unnamed: 0,GeneSymbol,GeneID,DiseaseName,DiseaseID,DirectEvidence,InferenceChemicalName,InferenceScore,OmimIDs,PubMedIDs
4984,A,50518,Dermatitis,MESH:D003872,marker/mechanism,,,,32937126
4989,A,50518,Diabetes Mellitus,MESH:D003920,marker/mechanism,,,,1473152
4995,A,50518,"Diabetes Mellitus, Type 2",MESH:D003924,marker/mechanism,,,,8146154
5037,A,50518,Edema,MESH:D004487,marker/mechanism,,,,32937126
5079,A,50518,Failure to Thrive,MESH:D005183,marker/mechanism,,,,32937126
...,...,...,...,...,...,...,...,...,...
98649752,ZW10,9183,Osteosarcoma,MESH:D012516,marker/mechanism,,,,14767549
98656933,ZWILCH,55055,Weight Gain,MESH:D015430,marker/mechanism,,,,19030233
98658201,ZWINT,11130,"Carcinoma, Hepatocellular",MESH:D006528,marker/mechanism,,,,28284560
98683205,ZYX,7791,Colorectal Neoplasms,MESH:D015179,marker/mechanism,,,,30697742


In [129]:
gene_dis_red_df = gene_dis_df[
    [
        'GeneSymbol',
        'DiseaseName',
        'DiseaseID', 
        'PubMedIDs',
    ]
]

In [37]:
gene_dis_red_df['DiseaseID_mesh'] = gene_dis_red_df['DiseaseID'].apply(
    lambda x: x.split(':')[1]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_dis_red_df['DiseaseID_mesh'] = gene_dis_red_df['DiseaseID'].apply(


In [72]:
gene_symbol_set.update(gene_dis_df['GeneSymbol'])
dis_ids_set.update(gene_dis_df['DiseaseID'])

## Mapping

### UMLS (MRCONSO)

In [16]:
mrconso_path = (
    '../../UMLS_Metathesaurus/mrconso_and_semtypes_2022AA_df.pkl'
)

In [17]:
mrconso_st_df = pd.read_pickle(mrconso_path)

In [58]:
gene_str_to_cui_dict = (
    mrconso_st_gene_df[['CUI', 'STR']]
        .groupby('STR')
        .agg(set)
        ['CUI']
        .to_dict()
)

### dis

In [76]:
dis_ids_nopref_set = {v.split(':')[1] for v in dis_ids_set}

In [83]:
dis_map_raw_df = mrconso_st_df[
    (mrconso_st_df['CODE'].isin(dis_ids_nopref_set)) & \
    (mrconso_st_df['SAB'].isin({'MSH', 'OMIM'}))
]

In [84]:
dis_map_raw_df

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF,sem_types
5270,C0000727,ENG,P,L0000727,PF,S0009054,N,A0017734,,M0000006,D000006,MSH,MH,D000006,"Abdomen, Acute",,N,256.0,{Sign or Symptom}
5293,C0000727,ENG,P,L0000727,VO,S0009056,Y,A0017736,,M0000006,D000006,MSH,PM,D000006,"Abdomens, Acute",,N,,{Sign or Symptom}
5294,C0000727,ENG,P,L0000727,VO,S0010628,Y,A0019567,,M0000006,D000006,MSH,PM,D000006,Acute Abdomens,,N,256.0,{Sign or Symptom}
5297,C0000727,ENG,P,L0000727,VW,S0010627,Y,A0019566,,M0000006,D000006,MSH,PM,D000006,Acute Abdomen,,N,256.0,{Sign or Symptom}
5422,C0000729,ENG,P,L0000729,PF,S0009058,N,A7755642,,M0004741,D003085,MSH,PEP,D003085,Abdominal Cramps,,N,256.0,{Sign or Symptom}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16827023,C5561936,ENG,P,L17400003,PF,S21011918,Y,A33637593,,,617119,OMIM,PT,617119,BARDET-BIEDL SYNDROME 22,,N,,{Disease or Syndrome}
16827024,C5561936,ENG,S,L17398496,PF,S21011921,Y,A33635476,,,617119,OMIM,ACR,617119,BBS22,,N,,{Disease or Syndrome}
16827025,C5561936,ENG,S,L17400002,PF,S21011917,Y,A33637859,,,617119,OMIM,ETAL,617119,"BARDET-BIEDL SYNDROME 20, FORMERLY",,N,,{Disease or Syndrome}
16827026,C5561936,ENG,S,L17400291,PF,S21011920,Y,A33637158,,,617119,OMIM,ACR,617119,"BBS20, FORMERLY",,N,,{Disease or Syndrome}


In [122]:
dis_to_cui_dict = (
    dis_map_raw_df[['CUI', 'CODE']]
        .groupby('CODE')
        .agg(set)
        ['CUI']
        .to_dict()
)

---
---
---

### chem

In [86]:
chem_map_raw_df = mrconso_st_df[
    (mrconso_st_df['CODE'].isin(chem_ids_set)) & \
    (mrconso_st_df['SAB'] == 'MSH')
]

In [87]:
chem_map_raw_df

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF,sem_types
12,C0000039,ENG,P,L0000039,VC,S0007564,Y,A0016515,,M0023172,D015060,MSH,MH,D015060,"1,2-Dipalmitoylphosphatidylcholine",,N,,"{Organic Chemical, Pharmacologic Substance}"
13,C0000039,ENG,P,L0000039,VC,S1357296,Y,A1317708,,M0023172,D015060,MSH,PM,D015060,"1,2 Dipalmitoylphosphatidylcholine",,N,,"{Organic Chemical, Pharmacologic Substance}"
14,C0000039,ENG,S,L0000035,PF,S0007560,Y,A26674543,,M0023172,D015060,MSH,ET,D015060,"1,2-Dihexadecyl-sn-Glycerophosphocholine",,N,,"{Organic Chemical, Pharmacologic Substance}"
15,C0000039,ENG,S,L0000035,VO,S1357276,Y,A1317687,,M0023172,D015060,MSH,PM,D015060,"1,2 Dihexadecyl sn Glycerophosphocholine",,N,,"{Organic Chemical, Pharmacologic Substance}"
16,C0000039,ENG,S,L0000038,PF,S0007563,Y,A26661070,,M0023172,D015060,MSH,ET,D015060,"1,2-Dipalmitoyl-Glycerophosphocholine",,N,,"{Organic Chemical, Pharmacologic Substance}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16833703,C5566019,ENG,P,L17403925,PF,S21019533,Y,A33642513,,M000753670,D000090983,MSH,PEP,D000090983,TAK-919,,N,,"{Pharmacologic Substance, Nucleic Acid, Nucleo..."
16833704,C5566019,ENG,P,L17403925,VO,S21019531,Y,A33644063,,M000753670,D000090983,MSH,PM,D000090983,TAK 919,,N,,"{Pharmacologic Substance, Nucleic Acid, Nucleo..."
16833705,C5566019,ENG,S,L17403478,PF,S21019534,Y,A33647962,,M000753670,D000090983,MSH,PM,D000090983,TAK919,,N,,"{Pharmacologic Substance, Nucleic Acid, Nucleo..."
16833707,C5566020,ENG,P,L17402586,PF,S21020536,Y,A33647180,,M000753669,D000090983,MSH,PEP,D000090983,mRNA-1273.211,,N,,"{Pharmacologic Substance, Nucleic Acid, Nucleo..."


In [113]:
chemid_to_cui_dict = (
    chem_map_raw_df[['CUI', 'CODE']]
        .groupby('CODE')
        .agg(set)
        ['CUI']
        .to_dict()
)

### gene

In [101]:
pref_semtypes = {
    'Amino Acid, Peptide, or Protein',
    'Gene or Genome'
}

In [102]:
mrconso_st_gene_df = mrconso_st_df[
    (
        mrconso_st_df['sem_types'].apply(
            lambda x: True if len(x.intersection(pref_semtypes)) > 0 else False
        )
    ) & \
    (
        mrconso_st_df['STR'].isin(gene_symbol_set)
    )
    
]

In [103]:
mrconso_st_gene_df

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF,sem_types
1874,C0000340,ENG,S,L1762315,PF,S2008627,Y,A26620705,,M0023210,D015090,MSH,ET,D015090,CYP27B1,,N,,"{Amino Acid, Peptide, or Protein, Enzyme}"
25154,C0001414,ENG,S,L3554326,PF,S4183677,Y,A12093502,,M0000353,D000228,MSH,DEV,D000228,APRT,,Y,,"{Amino Acid, Peptide, or Protein, Enzyme}"
40994,C0001899,ENG,S,L0580028,PF,S0650857,N,A0706549,,,,SNMI,SY,F-68DC8,GPT,9.0,N,,"{Amino Acid, Peptide, or Protein, Enzyme}"
40995,C0001899,ENG,S,L0580028,PF,S0650857,Y,A3024571,94685018.0,56935002,,SNOMEDCT_US,SY,56935002,GPT,9.0,N,,"{Amino Acid, Peptide, or Protein, Enzyme}"
41015,C0001899,ENG,S,L1224297,PF,S0789224,N,A0848791,,,,AOD,NP,0000028307,ALT,,N,,"{Amino Acid, Peptide, or Protein, Enzyme}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16794687,C5552914,ENG,S,L6470640,PF,S7473284,Y,A33471231,,C181686,,NCI,SY,C181686,TSHB,,N,,"{Hormone, Amino Acid, Peptide, or Protein}"
16794704,C5552915,ENG,S,L6447586,PF,S7547316,Y,A33464576,,C179706,,NCI,SY,C179706,PHB2,,N,,"{Biologically Active Substance, Amino Acid, Pe..."
16795230,C5553014,ENG,S,L6449395,PF,S7564764,Y,A33467396,,C179242,,NCI,SY,C179242,ST3GAL2,,N,,"{Amino Acid, Peptide, or Protein, Enzyme}"
16795298,C5553028,ENG,S,L1224407,PF,S7481998,Y,A33478821,,C178161,,NCI,SY,C178161,C1S,,N,,"{Amino Acid, Peptide, or Protein, Enzyme}"


In [112]:
gene_symbol_to_cui_dict = (
    mrconso_st_gene_df[['CUI', 'STR']]
        .groupby('STR')
        .agg(set)
    ['CUI']
        .to_dict()
)

---
---
---

## Applying mapping to pairs

In [117]:
chem_gene_df['chem_cui'] = chem_gene_df['ChemicalID'].apply(
    lambda x: chemid_to_cui_dict.get(x)
)

chem_gene_df['gene_cui'] = chem_gene_df['GeneSymbol'].apply(
    lambda x: gene_symbol_to_cui_dict.get(x)
)

In [118]:
chem_gene_df

Unnamed: 0,ChemicalName,ChemicalID,GeneSymbol,PubMedIDs,chem_umls,chem_cui,gene_cui
0,10074-G5,C534883,AR,32184358,{C2607706},{C2607706},"{C1705240, C1514768, C1704903, C1367578, C1412..."
1,10074-G5,C534883,AR,32184358,{C2607706},{C2607706},"{C1705240, C1514768, C1704903, C1367578, C1412..."
2,10074-G5,C534883,AR,32184358,{C2607706},{C2607706},"{C1705240, C1514768, C1704903, C1367578, C1412..."
3,10074-G5,C534883,AR,32184358,{C2607706},{C2607706},"{C1705240, C1514768, C1704903, C1367578, C1412..."
4,10074-G5,C534883,EPHB2,32184358,{C2607706},{C2607706},"{C1333340, C1705767}"
...,...,...,...,...,...,...,...
2430018,Zymosan,D015054,VEGFA,17724436,"{C0043553, C0043552}","{C0043553, C0043552}",{C1823619}
2430019,Zymosan,D015054,VEGFA,17724436,"{C0043553, C0043552}","{C0043553, C0043552}",{C1823619}
2430020,Zymosan,D015054,XIAP,16803582,"{C0043553, C0043552}","{C0043553, C0043552}",{C1337109}
2430021,zymosterol,C015582,CYP27A1,14622972,{C0078861},{C0078861},"{C3538880, C1413864}"


In [136]:
chem_gene_exploded_df = (
    chem_gene_df
        .dropna()
        .explode('chem_cui')
        .explode('gene_cui')
)

In [137]:
chem_gene_exploded_df

Unnamed: 0,ChemicalName,ChemicalID,GeneSymbol,PubMedIDs,chem_umls,chem_cui,gene_cui
0,10074-G5,C534883,AR,32184358,{C2607706},C2607706,C1705240
0,10074-G5,C534883,AR,32184358,{C2607706},C2607706,C1514768
0,10074-G5,C534883,AR,32184358,{C2607706},C2607706,C1704903
0,10074-G5,C534883,AR,32184358,{C2607706},C2607706,C1367578
0,10074-G5,C534883,AR,32184358,{C2607706},C2607706,C1412322
...,...,...,...,...,...,...,...
2430020,Zymosan,D015054,XIAP,16803582,"{C0043553, C0043552}",C0043553,C1337109
2430020,Zymosan,D015054,XIAP,16803582,"{C0043553, C0043552}",C0043552,C1337109
2430021,zymosterol,C015582,CYP27A1,14622972,{C0078861},C0078861,C3538880
2430021,zymosterol,C015582,CYP27A1,14622972,{C0078861},C0078861,C1413864


---
---

In [145]:
chem_dis_df['chem_cui'] = chem_dis_df['ChemicalID'].apply(
    lambda x: chemid_to_cui_dict.get(x)
)

chem_dis_df['dis_cui'] = chem_dis_df['DiseaseID'].apply(
    lambda x: dis_to_cui_dict.get(x.split(':')[1])
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chem_dis_df['chem_cui'] = chem_dis_df['ChemicalID'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chem_dis_df['dis_cui'] = chem_dis_df['DiseaseID'].apply(


In [146]:
chem_dis_df

Unnamed: 0,ChemicalName,ChemicalID,DiseaseName,DiseaseID,PubMedIDs,chem_cui,dis_cui
0,06-Paris-LA-66 protocol,C046983,Precursor Cell Lymphoblastic Leukemia-Lymphoma,MESH:D054198,4519131,"{C0625683, C0625684}","{C1961102, C0023452, C0023453}"
97,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthracenone",C112297,Hyperkinesis,MESH:D006948,19098162,"{C0754793, C0674998}","{C0751217, C3887506}"
113,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthracenone",C112297,Seizures,MESH:D012640,26348896,"{C0754793, C0674998}","{C3495874, C0751123, C0149958, C0422854, C4316..."
162,"10,11-dihydro-10-hydroxycarbamazepine",C039775,Epilepsy,MESH:D004827,17516704,"{C0061838, C0089910, C0090008}","{C0086237, C0236018, C0014544, C0751111}"
218,"10,11-dihydroxy-N-n-propylnorapomorphine",C425777,Hyperkinesis,MESH:D006948,15765258,{C0966094},"{C0751217, C3887506}"
...,...,...,...,...,...,...,...
8078393,Zymosan,D015054,Peritonitis,MESH:D010538,11766996|11985352|14761945|15259001|15770054|1...,"{C0043553, C0043552}","{C0031154, C1449646, C1449647}"
8078742,Zymosan,D015054,Sepsis,MESH:D018805,11441115,"{C0043553, C0043552}","{C1719672, C0036690, C0243026, C0034189}"
8078765,Zymosan,D015054,Shock,MESH:D012769,21323892,"{C0043553, C0043552}","{C0020683, C0036974}"
8079009,Zymosan,D015054,Uveitis,MESH:D014605,11006244,"{C0043553, C0043552}",{C0042164}


In [147]:
chem_dis_exploded_df = (
    chem_dis_df
        .dropna()
        .explode('chem_cui')
        .explode('dis_cui')
)

In [148]:
chem_dis_exploded_df

Unnamed: 0,ChemicalName,ChemicalID,DiseaseName,DiseaseID,PubMedIDs,chem_cui,dis_cui
0,06-Paris-LA-66 protocol,C046983,Precursor Cell Lymphoblastic Leukemia-Lymphoma,MESH:D054198,4519131,C0625683,C1961102
0,06-Paris-LA-66 protocol,C046983,Precursor Cell Lymphoblastic Leukemia-Lymphoma,MESH:D054198,4519131,C0625683,C0023452
0,06-Paris-LA-66 protocol,C046983,Precursor Cell Lymphoblastic Leukemia-Lymphoma,MESH:D054198,4519131,C0625683,C0023453
0,06-Paris-LA-66 protocol,C046983,Precursor Cell Lymphoblastic Leukemia-Lymphoma,MESH:D054198,4519131,C0625684,C1961102
0,06-Paris-LA-66 protocol,C046983,Precursor Cell Lymphoblastic Leukemia-Lymphoma,MESH:D054198,4519131,C0625684,C0023452
...,...,...,...,...,...,...,...
8078765,Zymosan,D015054,Shock,MESH:D012769,21323892,C0043552,C0036974
8079009,Zymosan,D015054,Uveitis,MESH:D014605,11006244,C0043553,C0042164
8079009,Zymosan,D015054,Uveitis,MESH:D014605,11006244,C0043552,C0042164
8079065,zymostenol,C056855,Chondrodysplasia Punctata,MESH:D002806,18176751,C0165064,C0008445


---
---

In [149]:
gene_dis_red_df['gene_cui'] = gene_dis_red_df['GeneSymbol'].apply(
    lambda x: gene_symbol_to_cui_dict.get(x)
)

gene_dis_red_df['dis_cui'] = gene_dis_red_df['DiseaseID'].apply(
    lambda x: dis_to_cui_dict.get(x.split(':')[1])
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_dis_red_df['gene_cui'] = gene_dis_red_df['GeneSymbol'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_dis_red_df['dis_cui'] = gene_dis_red_df['DiseaseID'].apply(


In [150]:
gene_dis_red_exploded_df = (
    gene_dis_red_df
        .dropna()
        .explode('gene_cui')
        .explode('dis_cui')
)

In [151]:
gene_dis_red_exploded_df

Unnamed: 0,GeneSymbol,DiseaseName,DiseaseID,PubMedIDs,gene_cui,dis_cui
7797,A1BG,Hepatomegaly,MESH:D006529,28108177,C1412045,C0019209
9517,A1BG,Schizophrenia,MESH:D012559,25821032,C1412045,C0036341
14684,A2M,Acute Kidney Injury,MESH:D058186,23052191,C3813120,C0022660
14684,A2M,Acute Kidney Injury,MESH:D058186,23052191,C3813120,C2609414
14684,A2M,Acute Kidney Injury,MESH:D058186,23052191,C3813120,C1565662
...,...,...,...,...,...,...
98656933,ZWILCH,Weight Gain,MESH:D015430,19030233,C1824085,C0043094
98658201,ZWINT,"Carcinoma, Hepatocellular",MESH:D006528,28284560,C1421869,C2239176
98683205,ZYX,Colorectal Neoplasms,MESH:D015179,30697742,C1421874,C0009404
98683205,ZYX,Colorectal Neoplasms,MESH:D015179,30697742,C1421874,C0009402


### Creating connections

In [157]:
ctd_all_pairs = []

In [158]:
ctd_all_pairs += list(
    zip(
        chem_gene_exploded_df['chem_cui'],
        chem_gene_exploded_df['gene_cui']
    )
)

In [159]:
ctd_all_pairs += list(
    zip(
        chem_dis_exploded_df['chem_cui'],
        chem_dis_exploded_df['dis_cui']
    )
)

In [160]:
ctd_all_pairs += list(
    zip(
        gene_dis_red_exploded_df['gene_cui'],
        gene_dis_red_exploded_df['dis_cui']
    )
)

In [162]:
ctd_all_pairs_set = {
    tuple(sorted(pair)) for pair in ctd_all_pairs
}

In [163]:
len(ctd_all_pairs_set)

9754308

## Saving

In [165]:
with open('ctd_all_pairs_set.json', 'w') as f:
    json.dump(list(ctd_all_pairs_set), f)