# Anatomical Therapeutic Chemical Classification
Source: eHealth DSI – Master Value Set Catalogue (MVC) 2.2.2 Operation Ready

URL: https://ec.europa.eu/cefdigital/wiki/pages/viewpage.action?pageId=35208905

Fetched: 11/15/2018

Created: 11/15/2018

Updated: 12/19/2018

In [1]:
import pandas as pd
import re
import csv
import pubchempy as pcp
from Bio import Entrez
import Levenshtein as lv
import requests
from ratelimit import limits, sleep_and_retry
import time

# Contact address and API key that Biopython's Entrez module sends with its requests.
# NOTE(review): the API key below is a credential committed in plain text --
# consider loading it from the environment instead and rotating this key.
Entrez.email = "JohnErol.Evangelista@mssm.edu"
Entrez.api_key = "83a36edabe6b67dec8d71446a47b7c3b3e09"

### Get CSV file

In [2]:
# Load the eHealth drug table. The file carries a .csv extension but is read
# with a tab separator.
drug_list = pd.read_csv("/Users/maayan/SignatureCommonsMetadata/sigsets/eHealth-Drugs.csv", sep="\t")
drug_list

Unnamed: 0,Code System ID,Code System Version,Concept Code,Description (FSN)
0,2.16.840.1.113883.6.73,2017-01,A,ALIMENTARY TRACT AND METABOLISM
1,2.16.840.1.113883.6.73,2017-01,A01,STOMATOLOGICAL PREPARATIONS
2,2.16.840.1.113883.6.73,2017-01,A01A,STOMATOLOGICAL PREPARATIONS
3,2.16.840.1.113883.6.73,2017-01,A01AA,Caries prophylactic agents
4,2.16.840.1.113883.6.73,2017-01,A01AA01,sodium fluoride
5,2.16.840.1.113883.6.73,2017-01,A01AA02,sodium monofluorophosphate
6,2.16.840.1.113883.6.73,2017-01,A01AA03,olaflur
7,2.16.840.1.113883.6.73,2017-01,A01AA04,stannous fluoride
8,2.16.840.1.113883.6.73,2017-01,A01AA30,combinations
9,2.16.840.1.113883.6.73,2017-01,A01AA51,"sodium fluoride, combinations"


In [3]:
# One named group per ATC level. The alternatives are listed from the longest
# code shape to the shortest, so a full substance code is not claimed by one
# of the shorter parent-level alternatives.
patterns = (
    '(?P<Chemical_Substance>[A-Z][0-9]+[A-Z][A-Z][0-9]+)'
    '|(?P<Chemical_Subgroup>[A-Z][0-9]+[A-Z][A-Z])'
    '|(?P<Pharmacological_Subgroup>[A-Z][0-9]+[A-Z])'
    '|(?P<Therapeutical_Subgroup>[A-Z][0-9]+)'
    '|(?P<Anatomical_Main_Group>[A-Z])'
)

In [4]:
drug_regex = {}

In [5]:
pattern_re = re.compile(patterns)

In [6]:
# Match every Concept Code against the level patterns: the result has one
# column per named group in `patterns`.
level_matches = drug_list["Concept Code"].str.extract(patterns, expand=True)
# Reverse the column order so the levels read from the last group in
# `patterns` to the first.
concept_code = level_matches[level_matches.columns[::-1]]
concept_code

Unnamed: 0,Anatomical_Main_Group,Therapeutical_Subgroup,Pharmacological_Subgroup,Chemical_Subgroup,Chemical_Substance
0,A,,,,
1,,A01,,,
2,,,A01A,,
3,,,,A01AA,
4,,,,,A01AA01
5,,,,,A01AA02
6,,,,,A01AA03
7,,,,,A01AA04
8,,,,,A01AA30
9,,,,,A01AA51


In [7]:
# Convert to Boolean
concept_code_boolean = concept_code.notnull()
concept_code_boolean

Unnamed: 0,Anatomical_Main_Group,Therapeutical_Subgroup,Pharmacological_Subgroup,Chemical_Subgroup,Chemical_Substance
0,True,False,False,False,False
1,False,True,False,False,False
2,False,False,True,False,False
3,False,False,False,True,False
4,False,False,False,False,True
5,False,False,False,False,True
6,False,False,False,False,True
7,False,False,False,False,True
8,False,False,False,False,True
9,False,False,False,False,True


In [8]:
# For each classification level, build a "code|description" Series holding
# only the rows flagged for that level; d_name keeps the level names in the
# same order as the Series in d_list.
d_name = list(concept_code_boolean.columns)
d_list = []
for level in d_name:
    members = drug_list[concept_code_boolean[level]]
    d_list.append(members["Concept Code"] + "|" + members["Description (FSN)"])

In [9]:
# Put the per-level Series side by side, one column per level labelled from
# d_name, and mark every cell that has no value with "-".
drug_classified = pd.concat(d_list, axis=1, keys=d_name).fillna("-")
drug_classified

Unnamed: 0,Anatomical_Main_Group,Therapeutical_Subgroup,Pharmacological_Subgroup,Chemical_Subgroup,Chemical_Substance
0,A|ALIMENTARY TRACT AND METABOLISM,-,-,-,-
1,-,A01|STOMATOLOGICAL PREPARATIONS,-,-,-
2,-,-,A01A|STOMATOLOGICAL PREPARATIONS,-,-
3,-,-,-,A01AA|Caries prophylactic agents,-
4,-,-,-,-,A01AA01|sodium fluoride
5,-,-,-,-,A01AA02|sodium monofluorophosphate
6,-,-,-,-,A01AA03|olaflur
7,-,-,-,-,A01AA04|stannous fluoride
8,-,-,-,-,A01AA30|combinations
9,-,-,-,-,"A01AA51|sodium fluoride, combinations"


In [10]:
# Fill "-" with parent values based on concept code: walk the rows in order
# and carry the most recent entry of every level down to the rows below it.

# prev_row_val[i] is the latest "code|description" seen for column i
prev_row_val = ["-"] * len(drug_classified.columns)
for row_index, row in drug_classified.iterrows():
    for i, (key, value) in enumerate(zip(drug_classified.columns, row.values)):
        if value != "-" and value != prev_row_val[i]:
            prev_row_val[i] = value
            # A new entry at this level invalidates everything carried for
            # the more specific levels to its right.
            prev_row_val[(i+1):] = ["-"] * (len(prev_row_val) - (i + 1))
        # Write through .at: the chained form drug_classified[key][row_index]
        # may assign into a temporary copy and leave the frame unchanged.
        drug_classified.at[row_index, key] = prev_row_val[i]

In [11]:
drug_classified

Unnamed: 0,Anatomical_Main_Group,Therapeutical_Subgroup,Pharmacological_Subgroup,Chemical_Subgroup,Chemical_Substance
0,A|ALIMENTARY TRACT AND METABOLISM,-,-,-,-
1,A|ALIMENTARY TRACT AND METABOLISM,A01|STOMATOLOGICAL PREPARATIONS,-,-,-
2,A|ALIMENTARY TRACT AND METABOLISM,A01|STOMATOLOGICAL PREPARATIONS,A01A|STOMATOLOGICAL PREPARATIONS,-,-
3,A|ALIMENTARY TRACT AND METABOLISM,A01|STOMATOLOGICAL PREPARATIONS,A01A|STOMATOLOGICAL PREPARATIONS,A01AA|Caries prophylactic agents,-
4,A|ALIMENTARY TRACT AND METABOLISM,A01|STOMATOLOGICAL PREPARATIONS,A01A|STOMATOLOGICAL PREPARATIONS,A01AA|Caries prophylactic agents,A01AA01|sodium fluoride
5,A|ALIMENTARY TRACT AND METABOLISM,A01|STOMATOLOGICAL PREPARATIONS,A01A|STOMATOLOGICAL PREPARATIONS,A01AA|Caries prophylactic agents,A01AA02|sodium monofluorophosphate
6,A|ALIMENTARY TRACT AND METABOLISM,A01|STOMATOLOGICAL PREPARATIONS,A01A|STOMATOLOGICAL PREPARATIONS,A01AA|Caries prophylactic agents,A01AA03|olaflur
7,A|ALIMENTARY TRACT AND METABOLISM,A01|STOMATOLOGICAL PREPARATIONS,A01A|STOMATOLOGICAL PREPARATIONS,A01AA|Caries prophylactic agents,A01AA04|stannous fluoride
8,A|ALIMENTARY TRACT AND METABOLISM,A01|STOMATOLOGICAL PREPARATIONS,A01A|STOMATOLOGICAL PREPARATIONS,A01AA|Caries prophylactic agents,A01AA30|combinations
9,A|ALIMENTARY TRACT AND METABOLISM,A01|STOMATOLOGICAL PREPARATIONS,A01A|STOMATOLOGICAL PREPARATIONS,A01AA|Caries prophylactic agents,"A01AA51|sodium fluoride, combinations"


In [12]:
# drug_classified.to_csv("/Users/maayan/SignatureCommonsMetadata/sigsets/drug_classified1.tsv", sep="\t")

In [13]:
# Keep only the rows that carry a Chemical Substance entry.
# .copy() makes the result an independent frame, so adding a column to it
# later is not an assignment on a slice of drug_classified.
drug_final_list = drug_classified[drug_classified["Chemical_Substance"] != "-"].copy()
drug_final_list

Unnamed: 0,Anatomical_Main_Group,Therapeutical_Subgroup,Pharmacological_Subgroup,Chemical_Subgroup,Chemical_Substance
4,A|ALIMENTARY TRACT AND METABOLISM,A01|STOMATOLOGICAL PREPARATIONS,A01A|STOMATOLOGICAL PREPARATIONS,A01AA|Caries prophylactic agents,A01AA01|sodium fluoride
5,A|ALIMENTARY TRACT AND METABOLISM,A01|STOMATOLOGICAL PREPARATIONS,A01A|STOMATOLOGICAL PREPARATIONS,A01AA|Caries prophylactic agents,A01AA02|sodium monofluorophosphate
6,A|ALIMENTARY TRACT AND METABOLISM,A01|STOMATOLOGICAL PREPARATIONS,A01A|STOMATOLOGICAL PREPARATIONS,A01AA|Caries prophylactic agents,A01AA03|olaflur
7,A|ALIMENTARY TRACT AND METABOLISM,A01|STOMATOLOGICAL PREPARATIONS,A01A|STOMATOLOGICAL PREPARATIONS,A01AA|Caries prophylactic agents,A01AA04|stannous fluoride
8,A|ALIMENTARY TRACT AND METABOLISM,A01|STOMATOLOGICAL PREPARATIONS,A01A|STOMATOLOGICAL PREPARATIONS,A01AA|Caries prophylactic agents,A01AA30|combinations
9,A|ALIMENTARY TRACT AND METABOLISM,A01|STOMATOLOGICAL PREPARATIONS,A01A|STOMATOLOGICAL PREPARATIONS,A01AA|Caries prophylactic agents,"A01AA51|sodium fluoride, combinations"
11,A|ALIMENTARY TRACT AND METABOLISM,A01|STOMATOLOGICAL PREPARATIONS,A01A|STOMATOLOGICAL PREPARATIONS,A01AB|Antiinfectives and antiseptics for local...,A01AB02|hydrogen peroxide
12,A|ALIMENTARY TRACT AND METABOLISM,A01|STOMATOLOGICAL PREPARATIONS,A01A|STOMATOLOGICAL PREPARATIONS,A01AB|Antiinfectives and antiseptics for local...,A01AB03|chlorhexidine
13,A|ALIMENTARY TRACT AND METABOLISM,A01|STOMATOLOGICAL PREPARATIONS,A01A|STOMATOLOGICAL PREPARATIONS,A01AB|Antiinfectives and antiseptics for local...,A01AB04|amphotericin B
14,A|ALIMENTARY TRACT AND METABOLISM,A01|STOMATOLOGICAL PREPARATIONS,A01A|STOMATOLOGICAL PREPARATIONS,A01AB|Antiinfectives and antiseptics for local...,A01AB05|polynoxylin


In [14]:
drug_final_list.to_csv("/Users/maayan/SignatureCommonsMetadata/sigsets/drug_classified.tsv", sep="\t")

In [15]:
# Write the classification as GMT-style lines. Consecutive rows that share
# the same parent levels (every column but the last, joined with "_") are
# merged into one line: the joined parents, an empty second field, then the
# last-column values separated by tabs. No newline follows the final line.
with open("/Users/maayan/SignatureCommonsMetadata/sigsets/Anatomical_Therapeutic_Chemical_Classification.gmt", "w") as w:
    gmt_lines = [""]
    current_sig = ""
    for _, record in drug_final_list.iterrows():
        levels = record.values
        sig = "_".join(levels[:-1])
        if sig != current_sig:
            current_sig = sig
            # Start a new line, reusing the slot if the current one is blank.
            if gmt_lines[-1].strip():
                gmt_lines.append("")
            gmt_lines[-1] = sig + "\t"
        gmt_lines[-1] = gmt_lines[-1] + "\t" + levels[-1]
    w.write("\n".join(gmt_lines))

In [16]:
# Helper Functions
PERIOD = 5
@sleep_and_retry
@limits(calls=5, period=PERIOD)
def get_synonyms(term_type,term):
    """Fetch the PubChem synonym records for a name.

    Queries the PUG REST ``<term_type>/name/<term>/synonyms/JSON`` endpoint.
    A reply with status 200 or 404 is final; any other status is retried
    after a one-minute pause, up to five attempts in total.

    Args:
        term_type: PUG REST domain placed in the URL; the callers in this
            file pass "compound" or "substance".
        term: Name to look up.

    Returns:
        The list stored under ``InformationList -> Information`` in the JSON
        reply, or None when the reply has no ``InformationList`` key.

    Raises:
        Exception: if every attempt returned a status other than 200 or 404.
            Errors raised by ``requests.get`` itself propagate unchanged.
    """
    # NOTE: the @limits decorator received the value of PERIOD when this
    # function was defined, so the updates below change the global (and the
    # printed diagnostics) but not the rate the decorator enforces.
    global PERIOD
    max_attempts = 5
    url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/%s/name/%s/synonyms/JSON"%(term_type,term)
    for _ in range(max_attempts):
        res = requests.get(url)
        # The header may be absent; treat that as "no throttling information".
        throttling = res.headers.get('X-Throttling-Control', '')
        if "Black" in throttling:
            print("Blocked")
        elif "Red" in throttling:
            print("Throttling soon, sleeping for a minute...")
            time.sleep(60)
        if "Yellow" in throttling:
            print("Getting busy, increasing period")
            PERIOD = 15
        elif "Black" not in throttling and "Red" not in throttling: # Green
            if PERIOD != 5:
                print("Back to business baby!")
            PERIOD = 5

        if res.status_code == 200 or res.status_code == 404:
            # Final answer: stop here instead of issuing further requests.
            payload = res.json()
            if "InformationList" in payload:
                return payload["InformationList"]["Information"]
            return None

        # Any other status: report it, cool down, then try again.
        print(PERIOD,throttling, res.status_code)
        time.sleep(60)
    raise Exception('API response: {}'.format(res.status_code))

def get_nearest_name(term, synonyms):
    """Return the synonym closest to ``term`` and its edit distance.

    Distances come from ``lv.distance``. When several synonyms tie, the
    first one encountered wins. For an empty ``synonyms`` the result is
    ``("", 9999999999)``.
    """
    best_name, best_distance = "", 9999999999
    for candidate in synonyms:
        distance = lv.distance(term, candidate)
        if distance >= best_distance:
            # Not strictly better than what we already have.
            continue
        best_name, best_distance = candidate, distance
    return best_name, best_distance

def get_standard_name(term):
    """Return the PubChem synonym that is closest to ``term``.

    Compound records are consulted first; substance records are queried only
    when the compound lookup yields nothing. If neither lookup returns any
    record, "no match" is printed and ``term`` is returned unchanged.
    """
    records = get_synonyms("compound",term)
    if not records:
        records = get_synonyms("substance",term)
    if not records:
        print("no match: %s"%term)
        return term

    best_name = term
    best_score = 9999999999
    for record in records:
        candidate, score = get_nearest_name(term, record["Synonym"])
        if score < best_score:
            best_name, best_score = candidate, score
        if score == 0:
            # Exact match; nothing can beat it.
            break
    return best_name

In [17]:
# Function for getting standardized names of the compounds
def get_standardized_names(row):
    """Standardize every drug name in a row's Chemical_Substance entry.

    The entry has the form "code|description". The description is split into
    individual names on ",", " and " and " with ", each name is passed
    through ``get_standard_name``, and the results are joined back with ",".

    Args:
        row: Mapping (e.g. a DataFrame row) with a "Chemical_Substance" value.

    Returns:
        "code|name1,name2,..." with the standardized names. Fragments that
        are empty after stripping are skipped rather than looked up.

    Raises:
        ValueError: if the entry contains no "|".
    """
    # maxsplit=1: only the first "|" separates the code from the description.
    concept_code, drugs = row["Chemical_Substance"].split("|", 1)
    drugs = drugs.replace(" and ", ",").replace(" with ", ",").replace("ispaghula (psylla seeds)", "ispaghula")
    terms = [term.strip() for term in drugs.split(",")]
    # Skip empty fragments (e.g. from "x, and y") so no empty name is looked up.
    standardized = [get_standard_name(term) for term in terms if term]
    return concept_code + "|" + ",".join(standardized)

In [18]:
drug_final_list["Standardized_Name"] = drug_final_list.apply(get_standardized_names, axis=1)

5 Request Count status: Green (3%), Request Time status: Green (0%), Service status: Green (3%), too many requests per second or blacklisted 503


KeyboardInterrupt: 

In [None]:
drug_final_list