In [None]:
import json
import pickle
from collections import defaultdict
from xml.etree import ElementTree as ET

import pandas as pd
from tqdm import tqdm

In [None]:
# Path (relative to the notebook's working directory) of the XML export to read.
xml_file = "../data/raw/full database.xml"

# Parse the whole document into memory; `root` is the element whose children
# are iterated by the extraction loop further down.
tree = ET.parse(source=xml_file)
root = tree.getroot()

In [None]:
# XML namespace prefix, prepended to every tag name looked up below.
ns = '{http://www.drugbank.ca}'

# name / synonym (original casing) -> set of every SMILES string seen for it
drug2smilesL = defaultdict(set)
# lower-cased name / synonym -> one SMILES string (first one met in file order)
drug2smiles = {}
# one [dbid, name, smiles, synonyms] row per drug that has a SMILES string
drug_info_list = []

# The lookup paths do not change between drugs, so build them once.
dbid_path = ns + "drugbank-id[@primary='true']"
name_path = ns + "name"
smiles_path = f"{ns}calculated-properties/{ns}property[{ns}kind='SMILES']/{ns}value"
synonym_path = f"{ns}synonyms/{ns}synonym[@language='english']"

# Iterate the root element itself rather than enumerate(root): the element has
# a length, so tqdm can display a total, and the loop index was never used.
for drug in tqdm(root):
    smiles = drug.findtext(smiles_path)
    if smiles is None:
        # No calculated SMILES property for this entry: nothing to record.
        continue

    dbid = drug.findtext(dbid_path)
    name = drug.findtext(name_path)
    # Synonym elements without text would yield None, which cannot be
    # lower-cased and is useless as a lookup key, so they are dropped here.
    synonyms = [syn.text for syn in drug.findall(synonym_path) if syn.text]

    for alias in (name, *synonyms):
        if not alias:
            # Missing or empty name: skip instead of creating a None/"" key.
            continue
        drug2smilesL[alias].add(smiles)
        # setdefault keeps the first SMILES seen for a lower-cased alias.
        # Picking an element out of the set instead would depend on string
        # hash order, which changes from one interpreter run to the next.
        drug2smiles.setdefault(alias.lower(), smiles)

    drug_info_list.append([dbid, name, smiles, synonyms])

## Debug: dump the last drug element visited by the loop above
# xml_string = ET.tostring(drug, encoding='utf-8').decode('utf-8')
# print(xml_string)

In [None]:
# Tabulate the per-drug rows collected above, one column per row field.
drugsmiles_df = pd.DataFrame(
    data=drug_info_list,
    columns=['dbid', 'name', 'smiles', 'synonyms'],
)

# NOTE: the output is tab-separated even though the file name ends in .csv.
drugsmiles_df.to_csv(
    path_or_buf="../data/drugbank_drugsmiles.csv",
    sep='\t',
    index=False,
)


In [None]:
# Persist the alias -> SMILES lookup as a pickle. The `with` block guarantees
# the file is flushed and closed even if serialisation raises.
with open("../data/drug2smiles.pkl", 'wb') as pkl_file:
    pickle.dump(drug2smiles, pkl_file)


In [None]:
# Persist the same alias -> SMILES lookup as JSON (`json` is imported in the
# notebook's top import cell). The `with` block guarantees the file is flushed
# and closed even if serialisation raises.
with open("../data/drug2smiles.json", "w") as json_file:
    json.dump(drug2smiles, json_file)
