## Basic initial statistics drawn from the training data

#### Imports

In [45]:
import sys
sys.path.append('../')
from os import listdir,system
import re
from xml.dom.minidom import parse
from nltk.tokenize import word_tokenize
import util.evaluator as evaluator
import neptune
from dotenv import dotenv_values
from collections import Counter 

#### General methods and variables

In [128]:

## Dictionary containing information from external knowledge resources:
## maps a lowercased name to its entity type.
## WARNING: You may need to adjust the path to the resource files

# HSDB: one name per line, every entry is tagged "drug".
with open("../resources/HSDB.txt", encoding="utf-8") as hsdb_file:
    external = {line.strip().lower(): "drug" for line in hsdb_file}

# DrugBank: each line is split on "|" into a name and its type.
# Entries read here overwrite HSDB entries with the same name.
with open("../resources/DrugBank.txt", encoding="utf-8") as drugbank_file:
    for line in drugbank_file:
        name, entity_type = line.strip().lower().split("|")
        external[name] = entity_type

        
## --------- tokenize sentence ----------- 
## -- Tokenize sentence, returning tokens and span offsets

def tokenize(txt):
    """Tokenize *txt* and locate every token in the original string.

    Returns a list of ``(token, start, end)`` tuples, where ``start`` and
    ``end`` are character offsets into ``txt`` and ``end`` is inclusive.
    Tokens are searched left to right, each search starting right after
    the previously located token.
    """
    offset = 0
    tks = []
    for t in word_tokenize(txt):
        start = txt.find(t, offset)
        if start < 0 and t in ("``", "''"):
            # NLTK's word_tokenize rewrites a double quote as `` or '', so
            # the token text is not present in txt; look for the original
            # character instead so the span stays aligned with txt.
            t = '"'
            start = txt.find(t, offset)
        if start < 0:
            # The token cannot be located in txt. Skip it and leave the
            # search cursor untouched: storing -1 here would yield a bogus
            # span and make later searches restart near the beginning.
            continue
        tks.append((t, start, start + len(t) - 1))
        offset = start + len(t)
    return tks

## -----------------------------------------------
## -- check if a token is a drug part, and of which type

# Five-character word endings that classify_token treats as drug names.
suffixes = ['azole', 'idine', 'amine', 'mycin']

def classify_token(txt):
    """Return the entity type guessed for a single token, or "NONE".

    Rules are tried in order:
      1. the lowercased token is a key of ``external`` -> its stored type
      2. the token is all upper case                   -> "brand"
      3. the last five characters are in ``suffixes``  -> "drug"
    """
    # WARNING: This function must be extended with
    #          more and better rules
    lowered = txt.lower()
    if lowered in external:
        return external[lowered]
    if txt.isupper():
        return "brand"
    if txt[-5:] in suffixes:
        return "drug"
    return "NONE"

   

## --------- Entity extractor ----------- 
## -- Extract drug entities from given text and return them as
## -- a list of dictionaries with keys "offset", "text", and "type"

def extract_entities(stext) :
    """Return the drug entities found in *stext*.

    Each token produced by ``tokenize`` is classified on its own with
    ``classify_token``; tokens classified as "NONE" are dropped. Every
    remaining token becomes a dict with keys:
      "offset": "<start>-<end>" (end inclusive),
      "text":   the slice of stext covered by the token,
      "type":   the type returned by classify_token.
    """
    # WARNING: This function must be extended to
    #          deal with multi-token entities.
    entities = []
    for word, start, end in tokenize(stext):
        label = classify_token(word)
        if label == "NONE":
            continue
        entities.append({
            "offset": "-".join((str(start), str(end))),
            "text": stext[start:end + 1],
            "type": label,
        })
    return entities
      
## --------- collect annotated entity texts, grouped by type ----------- 

def read_sentences(datadir) :
    """Collect annotated entity texts from the XML files in *datadir*.

    Every file listed in ``datadir`` is parsed as XML. For each <entity>
    element, the value of its "text" attribute is appended to the list
    stored under the value of its "type" attribute.

    Returns a dict mapping entity type -> list of entity texts, with
    repeated texts kept.
    """
    by_type = {}
    for fname in listdir(datadir):
        # parse XML file, obtaining a DOM tree
        dom = parse(datadir + "/" + fname)
        for entity in dom.getElementsByTagName("entity"):
            attrs = entity.attributes
            entity_type = attrs["type"].value
            entity_text = attrs["text"].value
            by_type.setdefault(entity_type, []).append(entity_text)
    return by_type


In [132]:
# Despite the name, this holds entity texts grouped by entity type
# (see read_sentences), read from the training directory.
sentences = read_sentences('../data/train')

In [133]:
# Entity types present in the training data.
sentences.keys()

dict_keys(['group', 'drug', 'drug_n', 'brand'])

In [154]:
# Frequency of the first three characters of each 'drug' entity text
# (case-sensitive).
Counter(word[:3] for word in sentences['drug'])

Counter({'phe': 213,
         'met': 183,
         'war': 171,
         'dig': 147,
         'pro': 137,
         'flu': 127,
         'ket': 115,
         'the': 100,
         'lit': 95,
         'alc': 90,
         'rif': 88,
         'cyc': 87,
         'car': 79,
         'eth': 73,
         'cim': 71,
         'amp': 60,
         'ery': 59,
         'lev': 51,
         'chl': 50,
         'cis': 49,
         'dex': 48,
         'cef': 47,
         'mor': 45,
         'ind': 45,
         'tri': 45,
         'ace': 45,
         'nor': 44,
         'clo': 43,
         'hyd': 43,
         'Apr': 43,
         'ter': 40,
         'val': 39,
         'dil': 39,
         'ins': 38,
         'iso': 38,
         'qui': 38,
         'suc': 37,
         'ami': 37,
         'hal': 36,
         'tol': 36,
         'Ket': 36,
         'dia': 35,
         'all': 34,
         'Phe': 34,
         'itr': 34,
         'Var': 34,
         'alp': 33,
         'par': 31,
         'nif': 31,
         'mi

In [152]:
# The 30 most frequent endings (last five characters) among the 'drug_n'
# entity texts, most frequent first.
[i[0] for i in Counter(word[-5:] for word in sentences['drug_n']).most_common(30)]

['gents',
 'itors',
 'sants',
 'etics',
 'otics',
 'drugs',
 'tives',
 'lants',
 'mines',
 'roids',
 'ckers',
 'rates',
 'nists',
 'acids',
 'SAIDs',
 'zines',
 'ulant',
 'pines',
 'ogens',
 'bitor',
 'lones',
 'ssant',
 'tions',
 'sides',
 'TCA',
 'retic',
 'esics',
 'min D',
 'SSRIs',
 'eroid']

In [135]:
# Inspect the dictionary loaded from the external resource files.
external

{'formaldehyde': 'drug',
 'dexamethasone': 'drug',
 'phenobarbital': 'drug',
 'mitomycin c': 'drug',
 'mephenytoin': 'drug',
 'vitamin d2': 'drug',
 'cyclophosphamide': 'drug',
 'lactic acid': 'drug',
 'hydrocortisone': 'drug',
 'prednisolone': 'drug',
 'estriol': 'drug',
 'estradiol': 'drug',
 'ddt': 'drug',
 '2,3,6-trichlorobenzoic acid': 'drug',
 'benzo(a)pyrene': 'drug',
 'phenylbutazone': 'drug',
 'thalidomide': 'drug',
 'cocaine': 'drug',
 'lsd': 'drug',
 'mercaptopurine': 'drug',
 'desipramine': 'drug',
 'amitriptyline': 'drug',
 'imipramine': 'drug',
 'thioridazine': 'drug',
 'chlorpromazine': 'drug',
 'quinidine sulfate': 'drug',
 'reserpine': 'drug',
 'oxytocin': 'drug',
 'cephaloridine': 'group',
 'phentolamine': 'drug',
 'niclosamide': 'drug',
 'd-sorbitol': 'drug',
 'alloxan': 'drug',
 'dactinomycin': 'drug',
 'acetylsalicylic acid': 'drug',
 'l-ascorbic acid': 'drug',
 'floxuridine': 'drug',
 'd(+)-glucose': 'drug',
 'piperonyl butoxide': 'drug',
 'procainamide': 'drug',


In [136]:
# Count how often each ending (last five characters) occurs among the
# keys of the external dictionary.
suffixes_count = Counter(key[-5:] for key in external)
suffixes_count

Counter({'elief': 1573,
         'pf 15': 1532,
         'oride': 1310,
         ' acid': 1225,
         'pf 30': 1135,
         'tizer': 1132,
         'ength': 972,
         'creen': 965,
         'amide': 869,
         'beige': 795,
         'cream': 724,
         'amine': 711,
         'tment': 593,
         'ollen': 568,
         ' plus': 559,
         'pf 50': 543,
         'mg/ml': 535,
         'ction': 519,
         ' mask': 495,
         'orant': 495,
         ' wash': 492,
         'phate': 482,
         'blets': 444,
         'itors': 427,
         'azole': 401,
         'odium': 391,
         'anser': 377,
         'idine': 376,
         'otion': 367,
         'd kit': 360,
         ' mint': 327,
         ' balm': 325,
         'drops': 320,
         'ation': 303,
         'serum': 293,
         'onate': 272,
         'oxide': 269,
         'pf 20': 267,
         'cough': 255,
         'etate': 254,
         'ylate': 253,
         'drate': 252,
         'edium': 249,
     

In [137]:
# For each of the 150 most frequent endings (skipping those that contain a
# space), tally the types of the dictionary entries that END with it.
# The comparison uses key[-5:], exactly as suffixes_count was built; a plain
# substring test would also count entries where the string occurs mid-name.
# Each value is a list of (type, count) pairs, most frequent type first.
matches = {}
for suffix, _ in suffixes_count.most_common(150):
    if ' ' in suffix:
        continue
    matches[suffix] = Counter(
        value for key, value in external.items() if key[-5:] == suffix
    ).most_common()

In [138]:
# Assign every ending in `matches` to the list of its most frequent type
# (the first pair of its tally).
final_lists = {'brand': [], 'drug': [], 'group': []}
for suffix, type_counts in matches.items():
    top_type, _ = type_counts[0]
    final_lists[top_type].append(suffix)

In [139]:
# Endings whose most frequent associated type is 'drug'.
final_lists['drug']

['amide',
 'amine',
 'phate',
 'idine',
 'etate',
 'ylate',
 'drate',
 'lfate',
 'erase',
 'azine',
 'ated)',
 'henol',
 'otein',
 'thway',
 'omide',
 'zumab',
 'amate',
 'tigen',
 'tinib',
 'oline',
 'inate',
 'dione']

In [96]:
# Per type, count the external dictionary entries whose name contains at
# least one ASCII digit.
final_lists_numbers = {'brand': 0, 'drug': 0, 'group': 0}
digits = set('0123456789')

for name, entity_type in external.items():
    if not digits.isdisjoint(name):
        final_lists_numbers[entity_type] += 1

In [97]:
# Number of dictionary entries containing a digit, per type.
final_lists_numbers

{'brand': 29449, 'drug': 9582, 'group': 505}

In [94]:
# Inspect the dictionary loaded from the external resource files.
external

{'formaldehyde': 'drug',
 'dexamethasone': 'drug',
 'phenobarbital': 'drug',
 'mitomycin c': 'drug',
 'mephenytoin': 'drug',
 'vitamin d2': 'drug',
 'cyclophosphamide': 'drug',
 'lactic acid': 'drug',
 'hydrocortisone': 'drug',
 'prednisolone': 'drug',
 'estriol': 'drug',
 'estradiol': 'drug',
 'ddt': 'drug',
 '2,3,6-trichlorobenzoic acid': 'drug',
 'benzo(a)pyrene': 'drug',
 'phenylbutazone': 'drug',
 'thalidomide': 'drug',
 'cocaine': 'drug',
 'lsd': 'drug',
 'mercaptopurine': 'drug',
 'desipramine': 'drug',
 'amitriptyline': 'drug',
 'imipramine': 'drug',
 'thioridazine': 'drug',
 'chlorpromazine': 'drug',
 'quinidine sulfate': 'drug',
 'reserpine': 'drug',
 'oxytocin': 'drug',
 'cephaloridine': 'group',
 'phentolamine': 'drug',
 'niclosamide': 'drug',
 'd-sorbitol': 'drug',
 'alloxan': 'drug',
 'dactinomycin': 'drug',
 'acetylsalicylic acid': 'drug',
 'l-ascorbic acid': 'drug',
 'floxuridine': 'drug',
 'd(+)-glucose': 'drug',
 'piperonyl butoxide': 'drug',
 'procainamide': 'drug',
