In [1]:
import urllib.request, urllib.error, urllib.parse
import json
import os
import pandas
from pprint import pprint



In [2]:
# Base URL of the BioPortal REST service queried by this notebook.
REST_URL = "http://data.bioontology.org"
# SECURITY: credentials should not live in a notebook. Prefer setting the
# BIOPORTAL_API_KEY environment variable; the literal below is kept only as a
# fallback so existing runs keep working.
# NOTE(review): this key is committed in plain text -- rotate it and drop the fallback.
API_KEY = os.environ.get("BIOPORTAL_API_KEY", "a28f1d5b-0cc4-454a-8baf-1b2285cfa549")

def get_json(url):
    """Fetch `url` with the BioPortal API key and return the decoded JSON body.

    The request carries an ``Authorization: apikey token=<API_KEY>`` header.

    Parameters
    ----------
    url : str
        Address to open.

    Returns
    -------
    object
        Whatever ``json.loads`` produces for the response body.

    Raises
    ------
    urllib.error.URLError
        (including ``HTTPError``) when the request fails.
    ValueError
        When the body is not valid JSON.
    """
    opener = urllib.request.build_opener()
    opener.addheaders = [('Authorization', 'apikey token=' + API_KEY)]
    # Close the response deterministically: this function is called once per
    # row (and per class lookup), so leaving it to the GC leaks connections.
    with opener.open(url) as response:
        return json.loads(response.read())

def print_annotations(annotations, get_class=True):
    """Print a readable report for a list of annotator results.

    For every result this prints the annotated class (id, prefLabel, ontology
    link), each annotation span (from, to, match type) and, when the result
    has hierarchy entries, the details of every hierarchy class together with
    its distance from the originally annotated class.

    Parameters
    ----------
    annotations : iterable of dict
        Each item is read through the keys "annotatedClass", "annotations"
        and "hierarchy".
    get_class : bool, default True
        When true, the class details are fetched with ``get_json`` from the
        class's ``links.self`` URL; otherwise the embedded "annotatedClass"
        dict is printed as-is.

    Notes
    -----
    A class whose details cannot be fetched (``HTTPError``) is reported and
    skipped. A missing or empty prefLabel is printed as "no label" instead of
    raising ``TypeError`` on string concatenation.
    """
    #Result is the row
    for result in annotations:
        # "annotatedClass" holds the ids, links, ontology, etc. of the matched class
        class_details = result["annotatedClass"]
        if get_class:
            try:
                class_details = get_json(result["annotatedClass"]["links"]["self"])
            except urllib.error.HTTPError:
                print(f"Error retrieving {result['annotatedClass']['@id']}")
                continue
        # prefLabel may be absent or None; fall back so the concatenation is safe.
        pref_label = class_details.get("prefLabel") or "no label"
        print("Class details")
        print("\tid: " + class_details["@id"])
        print("\tprefLabel: " + pref_label)
        print("\tontology: " + class_details["links"]["ontology"])

        print("Annotation details")
        for annotation in result["annotations"]:
            print("\tfrom: " + str(annotation["from"]))
            print("\tto: " + str(annotation["to"]))
            print("\tmatch type: " + annotation["matchType"])

        if result.get("hierarchy"):
            print("\n\tHierarchy annotations")
            for annotation in result["hierarchy"]:
                try:
                    # Separate name so the outer class_details is not clobbered.
                    parent_details = get_json(annotation["annotatedClass"]["links"]["self"])
                except urllib.error.HTTPError:
                    print(f"Error retrieving {annotation['annotatedClass']['@id']}")
                    continue
                parent_label = parent_details.get("prefLabel") or "no label"
                print("\t\tClass details")
                print("\t\t\tid: " + parent_details["@id"])
                # Use the fallback label here; the raw value may be None.
                print("\t\t\tprefLabel: " + parent_label)
                print("\t\t\tontology: " + parent_details["links"]["ontology"])
                print("\t\t\tdistance from originally annotated class: " + str(annotation["distance"]))

        print("\n\n")


In [3]:
# Load the input table to annotate. The annotation loop later reads its
# Text, DB_ID, Active_ingredient and Label_ID columns.
df = pandas.read_csv('../data/unlabeled_withDBID.csv')
# Row count first, then a preview of the first rows.
print(df.shape[0])
df.head()


3726


Unnamed: 0.1,Unnamed: 0,Label_ID,Drug_Brand_Name,Active_ingredient,UNII_ID,Formatted_Text,Text,DB_ID
0,0,00352228-39a9-4cef-9e91-16c5cd009dad.xml,DIGITEK,DIGOXIN,73K4184T59,DIGITEK|DIGITEK|DIGITEK|DIGITEK,DIGITEK|®|\n |\n ...,DB00390
1,0,2a584093-fb6c-4a22-9c31-ac835d2b474a.xml,Digoxin,DIGOXIN,73K4184T59,Digoxin is a cardiac glycoside indicated for:|...,Digoxin is a cardiac glycoside indicated for:|...,DB00390
2,0,5b0297ba-497e-46af-b027-6d46d9409384.xml,Digoxin,DIGOXIN,73K4184T59,"Digoxin Tablets, USP are a cardiac glycoside i...","Digoxin Tablets, USP are a cardiac glycoside i...",DB00390
3,0,83793809-2e55-366b-e053-2991aa0a414f.xml,DIGOXIN,DIGOXIN,73K4184T59,DIGOXIN is a cardiac glycoside indicated for:|...,DIGOXIN is a cardiac glycoside indicated for:|...,DB00390
4,0,83d37d5a-ed63-e142-e053-2991aa0ad846.xml,DIGOXIN,DIGOXIN,73K4184T59,DIGOXIN is a cardiac glycoside indicated for:|...,DIGOXIN is a cardiac glycoside indicated for:|...,DB00390


In [12]:
#Now to make this run for each context available!!!!
# Send every row's Text to the BioPortal annotator and collect one record per
# annotation hit. Rows that fail are recorded in `failed_rows` instead of
# being silently dropped.

# Parallel per-annotation lists, appended in step with `data` below.
# NOTE(review): `id` shadows the builtin id(); the name is kept so anything
# relying on these lists keeps working -- consider renaming.
id = []
From = []
To = []
matchType = []
annotation2 = []
ontology = []
context = []
dbid = []
drugname = []

# One (row index, Label_ID, error repr) entry per row that could not be annotated.
failed_rows = []

#Adds additional parameters here for the bioportal search engine
additional_parameters = "&ontologies=DOID&require_exact_match=true"

data = []
total_rows = len(df)  # loop-invariant, used only for the progress percentage
for index, row in df.iterrows():

    # Progress report every 100 rows: "<index> : <percent done>"
    if index % 100 == 0:
        perc = index / total_rows * 100
        print(str(index) + " : " + str(perc))

    #Text input for the ontology search engine
    text_to_annotate = row["Text"]
    db_id = row["DB_ID"]
    drug_name = row["Active_ingredient"]
    label_id = row["Label_ID"]

    try:
        # Annotate using the provided text
        annotations = get_json(REST_URL + "/annotator?text=" + urllib.parse.quote(text_to_annotate) + additional_parameters)

        for result in annotations:
            class_details = result["annotatedClass"]

            for annotation in result["annotations"]:
                # Read every field BEFORE appending anything, so a missing key
                # cannot leave the parallel lists with different lengths.
                start = annotation["from"]
                end = annotation["to"]
                match_type = annotation["matchType"]
                matched_text = annotation["text"]
                # NOTE(review): the "\tontology: " prefix looks like a leftover from
                # print_annotations; kept unchanged to preserve the list's contents.
                ontology_entry = "\tontology: " + class_details["links"]["ontology"]
                class_id = class_details["@id"]

                From.append(start)
                To.append(end)
                matchType.append(match_type)
                annotation2.append(matched_text)
                context.append(text_to_annotate)
                ontology.append(ontology_entry)
                id.append(class_id)
                dbid.append(db_id)
                drugname.append(drug_name)
                data.append([label_id, start, end, match_type, matched_text,
                             text_to_annotate, class_id, db_id, drug_name])
    except Exception as err:
        # Best-effort batch: keep going, but remember what failed and why.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the run.
        failed_rows.append((index, label_id, repr(err)))

if failed_rows:
    print(str(len(failed_rows)) + " row(s) could not be annotated; see failed_rows")







0 : 0.0
100 : 2.6838432635534084
200 : 5.367686527106817
300 : 8.051529790660226
400 : 10.735373054213634
500 : 13.419216317767043
600 : 16.10305958132045
700 : 18.78690284487386
800 : 21.470746108427267
900 : 24.154589371980677
1000 : 26.838432635534087
1100 : 29.52227589908749
1200 : 32.2061191626409
1300 : 34.88996242619431
1400 : 37.57380568974772
1500 : 40.25764895330113
1600 : 42.941492216854535
1900 : 50.99302200751477
2000 : 53.67686527106817
2100 : 56.36070853462157
2200 : 59.04455179817498
2300 : 61.72839506172839
2500 : 67.0960815888352
2600 : 69.77992485238862
2700 : 72.46376811594203
2800 : 75.14761137949544
2900 : 77.83145464304884
3000 : 80.51529790660226
3100 : 83.19914117015567
3200 : 85.88298443370907
3300 : 88.56682769726248
3400 : 91.25067096081588
3500 : 93.9345142243693
3600 : 96.61835748792271
3700 : 99.30220075147611


In [16]:
# Number of annotation records collected by the loop above (one entry per annotation hit).
len(data)

25670

In [17]:
# Build the annotation DataFrame (newdf) from the records gathered in `data`;
# each record's fields are named, in order, by `columns`.
columns = [
    'Label_ID',
    'From',
    'To',
    'Type',
    'Annotation',
    'Context',
    'DO_ID',
    'DB_ID',
    'DrugName',
]

newdf = pandas.DataFrame(data=data, columns=columns)
newdf.head()

Unnamed: 0,Label_ID,From,To,Type,Annotation,Context,DO_ID,DB_ID,DrugName
0,2a584093-fb6c-4a22-9c31-ac835d2b474a.xml,828,846,PREF,ATRIAL FIBRILLATION,Digoxin is a cardiac glycoside indicated for:|...,http://purl.obolibrary.org/obo/DOID_0060224,DB00390,DIGOXIN
1,5b0297ba-497e-46af-b027-6d46d9409384.xml,902,920,PREF,ATRIAL FIBRILLATION,"Digoxin Tablets, USP are a cardiac glycoside i...",http://purl.obolibrary.org/obo/DOID_0060224,DB00390,DIGOXIN
2,83793809-2e55-366b-e053-2991aa0a414f.xml,412,430,PREF,ATRIAL FIBRILLATION,DIGOXIN is a cardiac glycoside indicated for:|...,http://purl.obolibrary.org/obo/DOID_0060224,DB00390,DIGOXIN
3,83793809-2e55-366b-e053-2991aa0a414f.xml,1101,1119,PREF,ATRIAL FIBRILLATION,DIGOXIN is a cardiac glycoside indicated for:|...,http://purl.obolibrary.org/obo/DOID_0060224,DB00390,DIGOXIN
4,83d37d5a-ed63-e142-e053-2991aa0ad846.xml,412,430,PREF,ATRIAL FIBRILLATION,DIGOXIN is a cardiac glycoside indicated for:|...,http://purl.obolibrary.org/obo/DOID_0060224,DB00390,DIGOXIN


In [18]:

# Row counts of the annotation table and of the input table, followed by the
# mean number of annotations per input row.
numOfContext = len(df)
numOfAnno = len(newdf)

print(numOfAnno, numOfContext, sep="\n")

print(numOfAnno / numOfContext)
                          


25670
3726
6.8894256575416


In [19]:
# Persist the annotation table. Create the target directory first: to_csv does
# not create missing folders, and failing here would discard the results of
# the long annotation run.
output_path = '../data/output/unlabeled_withBPAnnotations.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
newdf.to_csv(output_path, index=False)