In [14]:
EXTRACT = False
if EXTRACT == True:
    
    # Classes #
    class Drug:
        """
        Record of the drug-level fields extracted for one database entry.

        All attributes except ``target`` are copied from a feature dictionary
        at construction time; ``target`` starts as an empty list and grows
        through ``addTarget``.
        """
        def __init__(self, features):
            """
            Copy the drug fields out of ``features``.

            ``features`` must provide the keys 'id', 'name', 'synm', 'kgd',
            'sclass', 'itrc', 'ext_id', 'pathways', 'atc_codes' and
            'atc_levels'; a missing key raises KeyError.
            """
            # (attribute name, key in ``features``) for the fields set before ``target``.
            leading_fields = (
                ('id', 'id'),
                ('name', 'name'),
                ('synonyms', 'synm'),
                ('kingdom', 'kgd'),
                ('superclass', 'sclass'),
                ('interaction', 'itrc'),
                ('external_id', 'ext_id'),
                ('pathways', 'pathways'),
            )
            for attribute, key in leading_fields:
                setattr(self, attribute, features[key])

            self.target = []

            # The ATC fields use the same name as attribute and as key.
            for attribute in ('atc_codes', 'atc_levels'):
                setattr(self, attribute, features[attribute])

        def getDrugfeatures(self):
            """Return the drug fields as a dict keyed with 'dg_'-prefixed names (``target`` is not included)."""
            return {
                "dg_id": self.id,
                "dg_name": self.name,
                "dg_synm": self.synonyms,
                "dg_kingdom": self.kingdom,
                "dg_superclass": self.superclass,
                "dg_interactions": self.interaction,
                "dg_ext_id": self.external_id,
                "dg_pathways": self.pathways,
                "dg_atc_codes": self.atc_codes,
                "dg_atc_levels": self.atc_levels,
            }

        def addTarget(self, feature_target):
            """Append ``feature_target`` to this drug's ``target`` list."""
            self.target.append(feature_target)

    # Imports used by the extraction step #

    import time
    import xml.etree.ElementTree as ET

    import pandas as pd
    from tqdm import tqdm

    # Parameters and required variables #

    dB_file = 'data.xml'    # XML file that is parsed below
    saveFile = 'data.csv'   # CSV file the extracted table is written to
    organism = 'Humans'     # a target is kept only when its organism text equals this

    # Parse the XML once; every direct child of the root is treated as one drug entry.
    xtree = ET.parse(dB_file)
    xroot = xtree.getroot()
    drugs = [entry for entry in xroot]

    def _tag_name(element):
        """Return the element's tag without a leading '{namespace}' part."""
        return element.tag.rsplit('}', 1)[-1]

    # One output row per (drug, target) pair whose target organism equals `organism`.
    drug_targets = []
    for drug_elem in tqdm(drugs):
        idDB = drug_elem[0].text  # Drug Bank ID (text of the entry's first child)

        # Reset every per-drug field so that an entry lacking an element does not
        # silently inherit the value parsed for the previous entry.
        drug_name = None
        drug_synm = None
        drug_class_kingdom = None
        drug_class_superclass = None
        drug_interaction = None
        drug_external_id = None
        drug_pathway = None
        drug_atc_codes = []
        drug_atc_levels = []
        targets = []

        for feature in drug_elem:
            # Compare the exact tag name instead of searching the element's repr,
            # which also contains the namespace and a memory address.
            tag = _tag_name(feature)

            if tag == 'name':  # drug name
                drug_name = feature.text

            elif tag == 'synonyms':  # drug's synonyms
                drug_synm = ';'.join([synm.text for synm in feature])

            elif tag == 'classification':  # type of drug
                # Positional lookup: the 3rd and 4th children are read as
                # kingdom and superclass.
                drug_class_kingdom = feature[2].text
                drug_class_superclass = feature[3].text

            elif tag == 'drug-interactions':  # interaction with other drugs
                drug_interaction = ';'.join([di[0].text for di in feature])

            elif tag == 'external-identifiers':  # other drug's IDs
                # Each identifier is rendered as "<first child text>:<second child text>".
                drug_external_id = ';'.join(
                    [ext_id[0].text + ":" + ext_id[1].text for ext_id in feature])

            elif tag == 'pathways':  # related pathways
                # The second child of each pathway is the text that is kept.
                drug_pathway = ';'.join([pathway[1].text for pathway in feature])

            elif tag == 'atc-codes':  # ATC codes
                # One 'code' attribute per <atc-code>, plus the 'code' attributes
                # of its child <level> elements (one inner list per ATC code).
                drug_atc_codes = [atc_code.attrib['code'] for atc_code in feature]
                drug_atc_levels = [[level.attrib['code'] for level in atc_code]
                                   for atc_code in feature]

            elif tag == 'targets':  # drug's targets
                targets = list(feature)

        # get all drug-related information in a dictionary
        drug_dict = {"id": idDB,
                     "name": drug_name,
                     "synm": drug_synm,
                     "kgd": drug_class_kingdom,
                     "sclass": drug_class_superclass,
                     "itrc": drug_interaction,
                     "ext_id": drug_external_id,
                     "pathways": drug_pathway,
                     "atc_codes": drug_atc_codes,
                     "atc_levels": drug_atc_levels}
        drug = Drug(drug_dict)

        # get information of polypeptide targets
        for target in targets:
            # Locate the child elements of interest. They are reset for every
            # target so nothing carries over from the previous target.
            organism_elem = None
            name_elem = None
            actions_elem = None
            polypeptide_elem = None
            for feature in target:
                tag = _tag_name(feature)
                if tag == 'organism':
                    organism_elem = feature
                elif tag == 'name':
                    name_elem = feature
                elif tag == 'actions':
                    actions_elem = feature
                elif tag == 'polypeptide':
                    polypeptide_elem = feature

            # Keep only targets of the requested organism.
            if organism_elem is None or organism_elem.text != organism:
                continue

            target_name = name_elem.text if name_elem is not None else None
            actions = None
            if actions_elem is not None:
                actions = ';'.join([action.text for action in actions_elem])

            # Polypeptide details default to None and are filled only from
            # elements that are actually present.
            gene_name = None
            cell_loc = None
            uniprot = None
            if polypeptide_elem is not None:
                for feature in polypeptide_elem:
                    tag = _tag_name(feature)
                    if tag == 'gene-name':
                        gene_name = feature.text
                    elif tag == 'cellular-location':
                        cell_loc = feature.text
                    elif tag == 'external-identifiers':
                        for ext_id in feature:
                            if ext_id[0].text == "UniProtKB":
                                uniprot = ext_id[1].text

            # Key order is kept as-is because it defines the CSV column order.
            row = {
                "dg_id": drug.id,
                "dg_name": drug.name,
                "dg_synm": drug.synonyms,
                "dg_kingdom": drug.kingdom,
                "dg_superclass": drug.superclass,
                "dg_interactions": drug.interaction,
                "dg_ext_id": drug.external_id,
                "dg_pathways": drug.pathways,
                "target_name": target_name,
                "target_uniprot": uniprot,
                "target_gene_name": gene_name,
                "action": actions,
                "cell_loc": cell_loc,
                "dg_atc_codes": drug.atc_codes,
                "dg_atc_levels": drug.atc_levels
            }

            drug_targets.append(row)


    # Build one table from the collected row dictionaries and write it to the CSV
    # named by saveFile (the DataFrame index is written as the first column).
    dt = pd.DataFrame(drug_targets)
    dt.to_csv(saveFile)

100%|██████████| 16581/16581 [00:05<00:00, 2798.66it/s]


In [28]:
# pandas is imported here because the extraction cell only imports it when
# EXTRACT is True; without this import a fresh kernel fails on `pd`.
import pandas as pd

# Read back the table written by the extraction step and list its columns.
df = pd.read_csv("data.csv")
df.columns


Index(['Unnamed: 0', 'dg_id', 'dg_name', 'dg_synm', 'dg_kingdom',
       'dg_superclass', 'dg_interactions', 'dg_ext_id', 'dg_pathways',
       'target_name', 'target_uniprot', 'target_gene_name', 'action',
       'cell_loc', 'dg_atc_codes', 'dg_atc_levels'],
      dtype='object')

In [67]:
# Display the table loaded from data.csv (one row per drug/target pair).
df


Unnamed: 0.1,Unnamed: 0,dg_id,dg_name,dg_synm,dg_kingdom,dg_superclass,dg_interactions,dg_ext_id,dg_pathways,target_name,target_uniprot,target_gene_name,action,cell_loc,dg_atc_codes,dg_atc_levels
0,0,DB00001,Lepirudin,"[Leu1, Thr2]-63-desulfohirudin;Desulfatohirudi...",Organic Compounds,Organic Acids,DB06605;DB06695;DB01254;DB01609;DB01586;DB0212...,Drugs Product Database (DPD):11916;PubChem Sub...,Lepirudin Action Pathway,Prothrombin,P00734,F2,inhibitor,Secreted,['B01AE02'],"[['B01AE', 'B01A', 'B01', 'B']]"
1,1,DB00002,Cetuximab,Cetuximab;Cétuximab;Cetuximabum,Organic Compounds,Organic Acids,DB00255;DB00269;DB00286;DB00655;DB00783;DB0089...,Drugs Product Database (DPD):13175;PubChem Sub...,Cetuximab Action Pathway,Epidermal growth factor receptor,P00533,EGFR,binder,Cell membrane,['L01FE01'],"[['L01FE', 'L01F', 'L01', 'L']]"
2,2,DB00002,Cetuximab,Cetuximab;Cétuximab;Cetuximabum,Organic Compounds,Organic Acids,DB00255;DB00269;DB00286;DB00655;DB00783;DB0089...,Drugs Product Database (DPD):13175;PubChem Sub...,Cetuximab Action Pathway,Low affinity immunoglobulin gamma Fc region re...,O75015,FCGR3B,binder,Cell membrane,['L01FE01'],"[['L01FE', 'L01F', 'L01', 'L']]"
3,3,DB00002,Cetuximab,Cetuximab;Cétuximab;Cetuximabum,Organic Compounds,Organic Acids,DB00255;DB00269;DB00286;DB00655;DB00783;DB0089...,Drugs Product Database (DPD):13175;PubChem Sub...,Cetuximab Action Pathway,Complement C1q subcomponent subunit A,P02745,C1QA,binder,Secreted,['L01FE01'],"[['L01FE', 'L01F', 'L01', 'L']]"
4,4,DB00002,Cetuximab,Cetuximab;Cétuximab;Cetuximabum,Organic Compounds,Organic Acids,DB00255;DB00269;DB00286;DB00655;DB00783;DB0089...,Drugs Product Database (DPD):13175;PubChem Sub...,Cetuximab Action Pathway,Complement C1q subcomponent subunit B,P02746,C1QB,binder,Secreted,['L01FE01'],"[['L01FE', 'L01F', 'L01', 'L']]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15022,15022,DB17635,Nedosiran,DCR-PHXC free acid;Nedosiran,Organic Compounds,Organic Acids,,Wikipedia:Nedosiran;RxCUI:2675287,,L-lactate dehydrogenase A chain,P00338,LDHA,antisense oligonucleotide,Cytoplasm,[],[]
15023,15023,DB17851,Flotufolastat F-18,18F-rhPSMA-7.3;Flotufolastat (18F);Flotufolast...,Organic Compounds,Organic Acids,DB00499;DB00665;DB01128;DB04839;DB08899;DB1185...,RxCUI:2637946,,Glutamate carboxypeptidase 2,Q04609,FOLH1,binder,Cell membrane,[],[]
15024,15024,DB18236,(R)-9b,,Organic Compounds,Organic Acids,,,,Activated CDC42 kinase 1,Q07912,TNK2,inhibitor,Cell membrane,[],[]
15025,15025,DB18680,Lovotibeglogene autotemcel,Lovo-cel,Organic Compounds,Organic Acids,,Wikipedia:Lovotibeglogene_autotemcel;RxCUI:267...,,Hemoglobin subunit beta,,,gene replacement,,[],[]


In [72]:
# Group the target name / UniProt columns per drug ID (each becomes a list)
# and write the result as a JSON array of records.
df_target = (
    df[['dg_id', 'target_name', 'target_uniprot']]
    .groupby('dg_id')
    .agg(list)
    .reset_index()
)
df_target.to_json("data_target.json", orient="records")

In [None]:
import ast
from collections import Counter

import pandas as pd

# NOTE(review): no visible cell writes data_atc.csv; it presumably holds the
# dg_id / dg_name / dg_atc_codes / dg_atc_levels columns of `df` — TODO confirm.
df_atc = pd.read_csv("data_atc.csv")
# Keep a single row per drug: rows sharing a dg_id collapse to the first one.
df_atc = df_atc.drop_duplicates(subset='dg_id')
# Drop drugs whose serialized ATC code list is empty.
df_atc = df_atc[df_atc['dg_atc_codes'] != '[]'].reset_index(drop=True)
# The list columns come back from the CSV as strings. Parse them as Python
# literals with ast.literal_eval, which (unlike eval) never executes code.
df_atc['dg_atc_codes'] = df_atc['dg_atc_codes'].apply(ast.literal_eval)
df_atc['dg_atc_levels'] = df_atc['dg_atc_levels'].apply(ast.literal_eval)

# Occurrence table: every ATC code of every drug, followed by every level code
# of every ATC code, counted together in one dictionary.
atc_code_hits = [code for codes in df_atc['dg_atc_codes'] for code in codes]
atc_level_hits = [level
                  for per_code in df_atc['dg_atc_levels']
                  for levels in per_code
                  for level in levels]
flatten_atc = atc_code_hits + atc_level_hits
counts_dict = dict(Counter(flatten_atc))



In [77]:
# Display the per-drug ATC table (one row per dg_id, list-valued code/level columns).
df_atc

Unnamed: 0,dg_id,dg_name,dg_atc_codes,dg_atc_levels
0,DB00001,Lepirudin,[B01AE02],"[[B01AE, B01A, B01, B]]"
1,DB00002,Cetuximab,[L01FE01],"[[L01FE, L01F, L01, L]]"
2,DB00003,Dornase alfa,[R05CB13],"[[R05CB, R05C, R05, R]]"
3,DB00004,Denileukin diftitox,[L01XX29],"[[L01XX, L01X, L01, L]]"
4,DB00005,Etanercept,[L04AB01],"[[L04AB, L04A, L04, L]]"
...,...,...,...,...
1987,DB16740,Prolgolimab,[L01FF08],"[[L01FF, L01F, L01, L]]"
1988,DB16900,Betibeglogene autotemcel,[B06AX02],"[[B06AX, B06A, B06, B]]"
1989,DB17083,Linzagolix,[H01CC04],"[[H01CC, H01C, H01, H]]"
1990,DB17538,Atidarsagene autotemcel,[A16AB21],"[[A16AB, A16A, A16, A]]"


In [60]:
import math
def cal_code_ts(code1, code2, level1, level2, counts=None):
    """
    Score the similarity of two ATC codes from their shared level codes.

    The score is ``2 * log(c_mica) / (log(c_1) + log(c_2))`` where ``c_1`` and
    ``c_2`` are the occurrence counts of the two codes and ``c_mica`` is the
    count of the longest level code present in both ``level1`` and ``level2``.

    Parameters
    ----------
    code1, code2 : str
        The two ATC codes being compared.
    level1, level2 : iterable of str
        Level codes belonging to ``code1`` and ``code2`` respectively.
    counts : mapping, optional
        Occurrence count per code / level code. Defaults to the notebook-level
        ``counts_dict``.

    Returns
    -------
    int or float
        0 when the codes share no level, when any required count is missing
        or zero, or when the denominator is zero; otherwise the ratio above.

    NOTE(review): raw occurrence counts are used where the variable names
    suggest probabilities. With counts the ratio is not bounded by 1 (the
    example call in this notebook returned about 4.39) — confirm whether the
    counts should be normalized before taking logs.
    """
    if counts is None:
        counts = counts_dict

    common_levels = set(level1) & set(level2)
    if not common_levels:
        return 0

    # Use the longest shared level code. Picking an arbitrary element of the
    # set would make the result depend on set iteration order.
    mica = max(common_levels, key=len)

    # .get() lets the zero check below handle unseen codes instead of a KeyError.
    p_mica = counts.get(mica, 0)
    p_c1 = counts.get(code1, 0)
    p_c2 = counts.get(code2, 0)
    if p_c1 == 0 or p_c2 == 0 or p_mica == 0:
        return 0  # If any count is 0, return similarity as 0 (no shared information)

    denominator = math.log(p_c1) + math.log(p_c2)
    if denominator == 0:
        # Both codes have a count of 1, so log(1) + log(1) == 0 and the ratio
        # is undefined; return 0 like the other degenerate cases.
        return 0

    return (2 * math.log(p_mica)) / denominator

def cal_ts(code_l1, code_l2, level_l1, level_l2):
    """
    Return the best pairwise score between two drugs' ATC codes.

    Every code of the first drug is compared with every code of the second
    drug through ``cal_code_ts``; ``level_l1[i]`` / ``level_l2[j]`` are the
    level lists matching ``code_l1[i]`` / ``code_l2[j]``. The largest score is
    returned, or 0 when there is no pair to compare.
    """
    best = 0
    for i, code_a in enumerate(code_l1):
        for j, code_b in enumerate(code_l2):
            pair_score = cal_code_ts(code_a, code_b, level_l1[i], level_l2[j])
            # Keep the running maximum; on a tie the newer score is kept.
            if pair_score >= best:
                best = pair_score
    return best

# Example: score the drugs at rows 0 and 89 of df_atc.
idx1, idx2 = 0, 89
drug_row_a = df_atc.iloc[idx1]
drug_row_b = df_atc.iloc[idx2]
cal_ts(
    drug_row_a['dg_atc_codes'],
    drug_row_b['dg_atc_codes'],
    drug_row_a['dg_atc_levels'],
    drug_row_b['dg_atc_levels'],
)

4.389424549607846

In [None]:
# Apply the function to df_atc to get a matrix of ts
