In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import dok_matrix, save_npz
import os
import json

In [3]:
OUTPUT_PATH = "./DDDB/DrugToDisease_DGIDB_naming.tsv"
#get the claim_name to name correspondence
DGIDB = pd.read_csv("./DGIDB/DrugToGene.tsv", sep="\t")
DDDB = pd.read_csv("./DDDB/DrugToDisease.tsv", sep="\t")

#normalize columns
DGIDB['drug_name'] = DGIDB['drug_name'].str.upper()
DGIDB['drug_claim_name'] = DGIDB['drug_claim_name'].str.upper()

DDDB['ndfrt_preferred_label'] = DDDB['ndfrt_preferred_label'].str.upper()
DDDB_drug_name_array = DDDB['ndfrt_preferred_label'].unique()
print(DDDB_drug_name_array)

DDDB.iloc[38:51]

['DONEPEZIL' 'MESALAMINE' 'TESTOSTERONE' 'DIAZEPAM' 'HYDROCODONE'
 'BACLOFEN' 'GLIMEPIRIDE' 'IBUPROFEN' 'MINOCYCLINE' 'BISOPROLOL'
 'LEVOTHYROXINE' 'CALCIFEDIOL' 'URSODIOL' 'VENLAFAXINE' 'FLUVASTATIN'
 'FLUVOXAMINE' 'METFORMIN' 'VINORELBINE' 'TRAMADOL' 'CETIRIZINE'
 'IRINOTECAN' 'GEMCITABINE' 'ATORVASTATIN' 'AZELASTINE' 'FEXOFENADINE'
 'TAMSULOSIN' 'QUETIAPINE' 'CLOPIDOGREL' 'SILDENAFIL' 'MONTELUKAST'
 'SEVELAMER' 'ESCITALOPRAM' 'ATOMOXETINE' 'PREGABALIN' 'COLCHICINE'
 'PHENOBARBITAL' 'ISONIAZID' 'CODEINE' 'ACETAMINOPHEN' 'NITROGLYCERIN'
 'MORPHINE' 'ASPIRIN' 'TRIAMCINOLONE' 'ESTRADIOL' 'CAPTOPRIL'
 'HYDROCHLOROTHIAZIDE' 'SULFAMETHOXAZOLE' 'TRIMETHOPRIM' 'DOXYCYCLINE'
 'LEVODOPA' 'CLONAZEPAM' 'BUMETANIDE' 'FLUOROURACIL'
 'ISOSORBIDE DINITRATE' 'FUROSEMIDE' 'METRONIDAZOLE' 'LORAZEPAM'
 'CLONIDINE' 'PHENYTOIN' 'CHLORPROMAZINE' 'CARBAMAZEPINE' 'PREDNISONE'
 'SPIRONOLACTONE' 'CHLORTHALIDONE' 'ALLOPURINOL' 'SULFASALAZINE'
 'TRIAMTERENE' 'THIOTEPA' 'PYRAZINAMIDE' 'DEXAMETHASONE' 'PERPHENAZIN

Unnamed: 0,NDF-RT,SNOMED,ndfrt_preferred_label,snomed_disease
38,N0000006993,25064002,IBUPROFEN,Headache
39,N0000006993,37796009,IBUPROFEN,Migraine
40,N0000006993,65754002,IBUPROFEN,Primary dysmenorrhea
41,N0000006993,69896004,IBUPROFEN,Rheumatoid arthritis
42,N0000006993,83330001,IBUPROFEN,Patent ductus arteriosus
43,N0000006993,90560007,IBUPROFEN,Gout
44,N0000006993,128139000,IBUPROFEN,Inflammatory disorder
45,N0000006993,190905008,IBUPROFEN,Cystic fibrosis
46,N0000006993,386661006,IBUPROFEN,Fever (finding)
47,N0000006993,396275006,IBUPROFEN,Osteoarthritis (disorder)


In [5]:
#building correspondence
claim2canon = (
    DGIDB.dropna(subset=["drug_claim_name", "drug_name"])
         .drop_duplicates("drug_claim_name")
         .set_index("drug_claim_name")["drug_name"]
         .to_dict()
)

DDDB["ndfrt_preferred_label"] = (
    DDDB["ndfrt_preferred_label"].map(claim2canon).fillna(DDDB["ndfrt_preferred_label"])
)



In [6]:
#view result
DDDB.iloc[38:51]

Unnamed: 0,NDF-RT,SNOMED,ndfrt_preferred_label,snomed_disease
38,N0000006993,25064002,"IBUPROFEN, SODIUM SALT",Headache
39,N0000006993,37796009,"IBUPROFEN, SODIUM SALT",Migraine
40,N0000006993,65754002,"IBUPROFEN, SODIUM SALT",Primary dysmenorrhea
41,N0000006993,69896004,"IBUPROFEN, SODIUM SALT",Rheumatoid arthritis
42,N0000006993,83330001,"IBUPROFEN, SODIUM SALT",Patent ductus arteriosus
43,N0000006993,90560007,"IBUPROFEN, SODIUM SALT",Gout
44,N0000006993,128139000,"IBUPROFEN, SODIUM SALT",Inflammatory disorder
45,N0000006993,190905008,"IBUPROFEN, SODIUM SALT",Cystic fibrosis
46,N0000006993,386661006,"IBUPROFEN, SODIUM SALT",Fever (finding)
47,N0000006993,396275006,"IBUPROFEN, SODIUM SALT",Osteoarthritis (disorder)


In [9]:
#view missing drug names

# Normalize to lowercase for fair comparison
DGIDB_names = set(DGIDB['drug_name'].str.lower())

# Filter DDDB to keep only the "missing" ones
missing_rows = DDDB[~DDDB['ndfrt_preferred_label'].str.lower().isin(DGIDB_names)]

# Show them
missing_unique = (
    missing_rows['ndfrt_preferred_label']
    .drop_duplicates()
    .sort_values()
)
print(missing_unique.to_string(index=False))


                  CHLORIDE (AS POTASSIUM)
                               COLESTIPOL
                    ELECTROLYTES/PEG-3350
                     ESTROGENS,CONJUGATED
                 INSULIN,ASPART PROTAMINE
                    INSULIN,DETEMIR,HUMAN
              INSULIN,GLARGINE,HUMAN/RDNA
                INSULIN,LISPRO,HUMAN/RDNA
OMEGA-3-ACID ETHYL ESTERS 1000MG CAP,ORAL
                                SEVELAMER
                          STAVUDINE (D4T)
                             UNKNOWN DRUG


In [11]:
#create the updated DDDB tsv file
DDDB.to_csv(OUTPUT_PATH, sep="\t", index=False)