## ATC_FDA DRUG-SET LIBRARY
### Drug-set Labels : ATC Codes
#### ALL DATABASES ACCESSED 02/26/19
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import json
import pandas as pd
import xml.etree.ElementTree as et
import requests
import time
import pubchempy as pcp
from collections import defaultdict
import csv

### MATCHING ATC CODES TO FDA APPROVED DRUG NAMES
#### DATABASE FOR ATC CODES AND PAIRED DRUGS: https://www.genome.jp/kegg/drug/
#### DATABASE FOR FDA APPROVED DRUGS: http://www.drugbank.ca
#### Input Files : drug links-approved.csv(http://www.drugbank.ca) | br08303.json (https://www.genome.jp/kegg/drug/)

In [2]:
# Iterate through each row and extract all approved drug names from DrugBank database csv file #
drugbank_drugs = []

drugbank = open('input/drug links-approved.csv')
drugbank_csv = csv.reader(drugbank)
for row in drugbank_csv:
    drugbank_drugs.append(row[1])
    drugbank_drugs = [x.lower() for x in drugbank_drugs]

In [3]:
# Iterate through each branch of the ATC Code Database json file and extract all ATC codes paired to drug names #
with open('input/br08303.json') as json_data:
    atc_codes = json.load(json_data)

drugsetlibrary = {}
atc_drugs = []

l1 = []
l2 = []
l3 = []
l4 = []

# Retrieve level 1-4 names & children #
for a in range(0, len(atc_codes["children"])):
    
    level1 = atc_codes["children"][a]
    l1.append(level1["name"])
 
    # Retrieve level 2-4 names & children from the level 1 array #
    for b in range(0, len(level1["children"])):
        
        level2 = level1["children"][b] 
        l2.append(level2["name"])
        
        # Some level 2 names do not contain children, therefore this if statement is required, and will be required in subsequent iterations #
        if "children" in level2.keys():
            
            # Retrieve level 3-4 names & children from the level 2 array #
            for c in range(0, len(level2["children"])):
            
                level3 = level2["children"][c]
                l3.append(level3["name"])
                
                if "children" in level3.keys():
                    
                    # Retrieve level 4 names & children from the level 3 array #
                    for d in range(0, len(level3["children"])):
                
                        level4 = level3["children"][d] 
                        l4.append(level4["name"])


                        # Pairing the level 4 ATC code keys with an empty array #
                        sep = '['
                        drugsetlibrary[(level4["name"].split(sep,1)[0])] = []
                    
                        # Retrieving all children from the final array #
                        if 'children' in level4.keys():
                            for e in range(0, len(level4["children"])):
                    
                                # Extracting all drug names #
                                drug_names = level4["children"][e]
                                drug_names = str((drug_names["name"].lower()))

                                # Editing each drug string to remove unnecessary characters # 
                                drug_names = drug_names[8:]

                                sep = '['
                                drug_names = drug_names.split(sep,1)[0]
                                drug_names = drug_names.strip(' ')
                                
                                # Making a list of drugs with ATC codes #
                                atc_drugs.append(drug_names)
                                
                                # Final output of matched level 4 ATC codes with drug names #
                                drugsetlibrary[(level4["name"].split(sep,1)[0])].append(drug_names)
                        else:
                            continue
                else:
                    continue
            else:
                continue
                
# Editing the dictionary with only FDA approved ATC drugs from the FDA approved drug list #
fda_drugs = list(set([z for z in atc_drugs if z in drugbank_drugs]))
drugsetlibrary = {i: list(set(v) & set(fda_drugs)) for i,v in drugsetlibrary.items()}

# Removing any ATC codes not paired with a drug #
drugsetlibrary = {k: v for k,v in drugsetlibrary.items() if v}

In [5]:
len(fda_drugs)

1834

In [6]:
len(drugsetlibrary)

559

In [7]:
# Exporting FDA Approved Drug Set Library as a csv file #
df = pd.DataFrame.from_dict(drugsetlibrary, orient='index')
df = df[df.columns[0:]].apply(
    lambda x: ','.join(x.dropna().astype(str).astype(str)),
    axis = 1
)
df.to_csv('/Users/maayanlab/Documents/DrugSetEnrichment/Drugsetlibraries/Drugbank/ATC_FDA/atc_fda_drugsetlibrary.csv')

In [8]:
df.head()

A01AA Caries prophylactic agents                                                 stannous fluoride,sodium fluoride
A01AB Antiinfectives and antiseptics for local oral treatment    clotrimazole,chlortetracycline,hydrogen peroxi...
A01AC Corticosteroids for local oral treatment                          triamcinolone,hydrocortisone,dexamethasone
A01AD Other agents for local oral treatment                      amlexanox,becaplermin,epinephrine,acetylsalicy...
A02AA Magnesium compounds                                        magnesium hydroxide,magnesium carbonate,magnes...
dtype: object

In [34]:
# Exporting a list of FDA approved drugs as a txt file #
dataFile = open('/Users/maayanlab/Documents/DrugSetEnrichment/Drugsetlibraries/Drugbank/ATC_FDA/fda_drugs.txt', 'w')
for eachitem in fda_drugs:
    dataFile.write("%s\n" % eachitem)
dataFile.close()