In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

DCDB - Drug Combination Database

In [2]:
dcdb = pd.read_csv('raw_database/dcdb.txt', sep='\t')
dcdb_id = pd.read_csv('raw_database/dcdb_components_identifier.txt', sep='\t')
dc2dcu = pd.read_csv('raw_database/DC_TO_DCU.txt', sep='\t')
dcu_usage = pd.read_csv('raw_database/DC_USAGE.txt', sep='\t')

In [3]:
print(len(dcdb))
dcdb.head()

1363


Unnamed: 0,DrugCombination_ID,Components_Name,Componets_ID
0,DC000348,Bismuth Subsalicylate; Metronidazole; Tetracyc...,DCC0187/DCC0235/DCC0338
1,DC000349,Brimonidine; Timolol,DCC0072/DCC0106
2,DC000350,Betamethasone; Calcipotriol,DCC0095/DCC0358
3,DC000351,Betamethasone; Clotrimazole,DCC0033/DCC0095
4,DC000352,Cerulenin; Levodopa,DCC0274/DCC0326


In [4]:
# leave only "EFFICACIOUS" combinations
dc2dcu_dict = {}
for i in tqdm(range(len(dc2dcu))):
    dc = dc2dcu['DC_ID'][i]
    dcu = dc2dcu['DCU_ID'][i]
    if dc not in dc2dcu_dict:
        dc2dcu_dict[dc] = set([dcu])
    else:
        dc2dcu_dict[dc].add(dcu)

efficacious_dcu = set()
for i in tqdm(range(len(dcu_usage))):
    dcu = dcu_usage['DCU_ID'][i]
    if dcu_usage['EFFICACY'][i] == 'Efficacious':
        efficacious_dcu.add(dcu)

efficacious_dc = set()
for key in dc2dcu_dict.keys():
    if dc2dcu_dict[key].issubset(efficacious_dcu):
        efficacious_dc.add(key)

100%|██████████| 1793/1793 [00:00<00:00, 63971.72it/s]
100%|██████████| 1813/1813 [00:00<00:00, 91084.41it/s]


In [5]:
dcdb = dcdb.loc[dcdb['DrugCombination_ID'].isin(efficacious_dc)]
print(len(dcdb))

1037


In [6]:
print(len(dcdb_id))
dcdb_id.head()
# sum(dcdb_id['DrugBank'].isna()) # 162

876


Unnamed: 0,DCC_ID,Name,CAS_Number,BindingDB,ChEBI,DrugBank,KEGG Compound,KEGG Drug,PDB,PharmGKB,PubChem Compound,PubChem Substance
0,DCC1838,Methyclothiazide,CAS:135-07-9,,,DB00232,,D00656,,,,7847722.0
1,DCC0413,Garenoxacin,CAS:194804-75-6,,,,,D02540,,,124093.0,
2,DCC0520,Betaine,CAS:107-43-7,,,DB01494,,D07523,,,247.0,
3,DCC0639,Atrasentan,CAS:173937-91-2,,,,,D03009,,,17397165.0,
4,DCC0029,Lovastatin,CAS:75330-75-5,,40303.0,DB00227,C07074,D00359,803.0,PA450272,53232.0,46508223.0


In [7]:
dcdb_dict = dict(zip(dcdb_id['DCC_ID'], dcdb_id['DrugBank']))
bad_keys = []
for key, value in dcdb_dict.items():
    if type(value) != str:
        bad_keys.append(key)
    if type(value) == str and not value.startswith('DB'):
        bad_keys.append(key)
print(len(bad_keys))
    

171


In [8]:
for key in bad_keys:
    del dcdb_dict[key]

In [9]:
dcdb_lst = []
for x in tqdm(dcdb['Componets_ID']):
    dc_ids = x.split('/')
    db_ids = []
    for id_ in dc_ids:
        db_id = dcdb_dict.get(id_) # return None if not exist in the dictionary
        db_ids.append(db_id)
    if (np.nan not in db_ids) and (None not in db_ids):
        dcdb_lst.append(set(db_ids))
print(len(dcdb_lst))

100%|██████████| 1037/1037 [00:00<00:00, 120742.12it/s]

825





In [10]:
dcdb_lst_final = []
for x in dcdb_lst:
    if x not in dcdb_lst_final:
        dcdb_lst_final.append(x)
print(len(dcdb_lst_final))

825


C-DCDB (Continuous Drug combination database)

In [1]:
import sqlite3
import ast

In [88]:
con = sqlite3.connect('raw_database/c_dcdb.sqlite')
cursor = con.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())

[('aact_combs',), ('aact_combs_with_identifiers',), ('all_combs_unormalized',), ('conditions',), ('design_group',), ('mesh_terms',), ('orangebook_combs',), ('patents_ipc',), ('patents_metadata',), ('references',), ('transformed_patents_drug',), ('trials',), ('web_preview',)]


In [6]:
df = pd.read_sql_query("SELECT * from all_combs_unormalized", con)

In [90]:
df

Unnamed: 0,index,drugs,drugbank_identifiers,pubchem_identifiers,source_id,source
0,0,"[[""foscarnet"", ""Foscavir""], [""ganciclovir"", ""C...","[""DB00529"", ""DB01004""]","[""-1"", ""-1""]",NCT00000134,clinicaltrials.gov
1,1,"[[""ganciclovir""], [""foscarnet"", ""Foscavir""]]","[""DB01004"", ""DB00529""]","[""-1"", ""-1""]",NCT00000136,clinicaltrials.gov
2,2,"[[""ganciclovir""], [""foscarnet"", ""Foscavir""]]","[""DB01004"", ""DB00529""]","[""-1"", ""-1""]",NCT00000136,clinicaltrials.gov
3,3,"[[""cycloserine""], [""clozapine""]]","[""DB00260"", ""DB00363""]","[""CID6234"", ""CID2818""]",NCT00000372,clinicaltrials.gov
4,4,"[[""PTH protein, human"", ""teriparatide""], [""ale...","[""DB06285"", ""DB00630""]","[""CID16133850"", ""CID2088""]",NCT00000400,clinicaltrials.gov
...,...,...,...,...,...,...
43860,43860,"[""DESOGESTREL"", ""ETHINYL ESTRADIOL""]","[""DB00304"", ""DB00977""]","[""CID40973"", ""CID5991""]",,orangebook
43861,43861,"[""ETHINYL ESTRADIOL"", ""NORGESTIMATE""]","[""DB00977"", ""DB00957""]","[""CID5991"", ""CID6540478""]",,orangebook
43862,43862,"[""ETHINYL ESTRADIOL"", ""FERROUS FUMARATE"", ""NOR...","[""DB00977"", ""DB14491"", ""DB00717""]","[""CID5991"", ""CID6433164"", ""-1""]",,orangebook
43863,43863,"[""ETHINYL ESTRADIOL"", ""FERROUS FUMARATE"", ""NOR...","[""DB00977"", ""DB14491"", ""DB00717""]","[""CID5991"", ""CID6433164"", ""CID6230""]",,orangebook


In [97]:
c_dcdb_lst = []
for x in df['drugbank_identifiers']:
    x = ast.literal_eval(x)
    c_dcdb_lst.append(set(x))
print(len(c_dcdb_lst))

43865


In [98]:
c_dcdb_lst_final = []
for x in c_dcdb_lst:
    flag = 0
    for y in x:
        if not y.startswith('DB'):
            flag += 1
    if flag != 0:
        continue

    if x not in c_dcdb_lst_final:
        c_dcdb_lst_final.append(x)

print(len(c_dcdb_lst_final))

15336


Leave drug combinations of only Two drugs

In [11]:
dcdb_lst_dual = []
for x in dcdb_lst_final:
    if len(x) == 2:
        dcdb_lst_dual.append(list(x))
print(len(dcdb_lst_dual))

598


In [109]:
c_dcdb_lst_dual = []
for x in c_dcdb_lst_final:
    if len(x) == 2:
        c_dcdb_lst_dual.append(list(x))
print(len(c_dcdb_lst_dual))

9092


Export to tsv file

In [12]:
pd.DataFrame(dcdb_lst_dual, columns=['drug_1', 'drug_2']).to_csv('ori_labels/DCDB_dual.tsv', sep='\t', index=None)

In [117]:
pd.DataFrame(c_dcdb_lst_dual, columns=['drug_1', 'drug_2']).to_csv('ori_labels/C_DCDB_dual.tsv', sep='\t', index=None)

In [13]:
dcdb_dual = pd.read_csv('ori_labels/DCDB_dual.tsv', sep='\t')
dcdb_dual.head()

Unnamed: 0,drug_1,drug_2
0,DB00373,DB00484
1,DB00443,DB02300
2,DB00443,DB00257
3,DB01235,DB01034
4,DB01222,DB00983


In [28]:
c_dcdb_dual = pd.read_csv('ori_labels/C_DCDB_dual.tsv', sep='\t')
c_dcdb_dual.head()

Unnamed: 0,drug_1,drug_2
0,DB00529,DB01004
1,DB00363,DB00260
2,DB06285,DB00630
3,DB01104,DB00704
4,DB00375,DB00227
