In [51]:
import urllib.request, urllib.error, urllib.parse
import json
import os
import pandas
import seaborn
from pprint import pprint


REST_URL = "http://data.bioontology.org"
API_KEY = "a28f1d5b-0cc4-454a-8baf-1b2285cfa549"

def get_json(url):
    opener = urllib.request.build_opener()
    opener.addheaders = [('Authorization', 'apikey token=' + API_KEY)]
    return json.loads(opener.open(url).read())

def print_annotations(annotations, get_class=True):
    #Result is the row
    for result in annotations:
        #returns a martix called annotatedClass which holds all of the ids, links, ontologies, etc
        class_details = result["annotatedClass"]
        if get_class:
            try:
                class_details = get_json(result["annotatedClass"]["links"]["self"])
            except urllib.error.HTTPError:
                print(f"Error retrieving {result['annotatedClass']['@id']}")
                continue
        print("Class details")
        print("\tid: " + class_details["@id"])
        print("\tprefLabel: " + class_details["prefLabel"])
        print("\tontology: " + class_details["links"]["ontology"])

        print("Annotation details")
        for annotation in result["annotations"]:
            print("\tfrom: " + str(annotation["from"]))
            print("\tto: " + str(annotation["to"]))
            print("\tmatch type: " + annotation["matchType"])

        if result["hierarchy"]:
            print("\n\tHierarchy annotations")
            for annotation in result["hierarchy"]:
                try:
                    class_details = get_json(annotation["annotatedClass"]["links"]["self"])
                except urllib.error.HTTPError:
                    print(f"Error retrieving {annotation['annotatedClass']['@id']}")
                    continue
                pref_label = class_details["prefLabel"] or "no label"
                print("\t\tClass details")
                print("\t\t\tid: " + class_details["@id"])
                print("\t\t\tprefLabel: " + class_details["prefLabel"])
                print("\t\t\tontology: " + class_details["links"]["ontology"])
                print("\t\t\tdistance from originally annotated class: " + str(annotation["distance"]))

        print("\n\n")



In [59]:
#Input the data you want to work with in here
df = pandas.read_csv('GSD_DBID.csv')
print(len(df))
#Now to make this run for each context available!!!!
id = []
From = []
To = []
matchType = []
annotation2 = []
ontology = []
context = []
dbid = []
drugname = []

#Adds additional parameters here for the bioportal search engine
additional_parameters = "&ontologies=DOID&require_exact_match=true"

for index, row in df.iterrows():
    
    if index  % 200 == 0:
        perc = index/len(df) *100
        print(str(index) + " : "+ str(perc))
    
    #Text input for the ontology search engine
    text_to_annotate = row["Text"]
    db_id = row["DB_ID"]
    drug_name = row["Active_ingredient"]
    
    

    try:
        # Annotate using the provided text
        annotations = get_json(REST_URL + "/annotator?text=" + urllib.parse.quote(text_to_annotate) + additional_parameters)

        for result in annotations:
            class_details = result["annotatedClass"]

            for annotation in result["annotations"]:
                From.append(annotation["from"])
                To.append(annotation["to"])
                matchType.append(annotation["matchType"])
                annotation2.append(annotation["text"])   
                context.append(text_to_annotate)
                ontology.append("\tontology: " + class_details["links"]["ontology"])
                id.append(class_details["@id"])
                dbid.append(db_id)
                drugname.append(drug_name)
    except:
        pass

#Constructs the new dataframe (newdf) from the collected lists
newdf = pandas.DataFrame({'Id':id})
newdf['Drug Name'] = drugname
newdf['DB_ID'] = dbid
newdf['Context'] = context
newdf['Ontology'] = ontology
newdf['From'] = From
newdf['To'] = To
newdf['Type'] = matchType
newdf['Text'] = annotation2    


#Length of each of the df and the average number of annotations made per label
numOfAnno = len(newdf)
numOfContext = len(df)

print(numOfAnno)
print(numOfContext)

print(numOfAnno/numOfContext)    

3726
10 : 0.2683843263553409
14
3726
0.003757380568974772


In [56]:
newdf.head()

Unnamed: 0,Id,Drug Name,DB_ID,Context,Ontology,From,To,Type,Text
0,http://purl.obolibrary.org/obo/DOID_0060224,DIGOXIN,DB00390,Digoxin is a cardiac glycoside indicated for:|...,\tontology: http://data.bioontology.org/ontolo...,828,846,PREF,ATRIAL FIBRILLATION
1,http://purl.obolibrary.org/obo/DOID_0060224,DIGOXIN,DB00390,"Digoxin Tablets, USP are a cardiac glycoside i...",\tontology: http://data.bioontology.org/ontolo...,902,920,PREF,ATRIAL FIBRILLATION
2,http://purl.obolibrary.org/obo/DOID_0060224,DIGOXIN,DB00390,DIGOXIN is a cardiac glycoside indicated for:|...,\tontology: http://data.bioontology.org/ontolo...,412,430,PREF,ATRIAL FIBRILLATION
3,http://purl.obolibrary.org/obo/DOID_0060224,DIGOXIN,DB00390,DIGOXIN is a cardiac glycoside indicated for:|...,\tontology: http://data.bioontology.org/ontolo...,1101,1119,PREF,ATRIAL FIBRILLATION
4,http://purl.obolibrary.org/obo/DOID_0060224,DIGOXIN,DB00390,DIGOXIN is a cardiac glycoside indicated for:|...,\tontology: http://data.bioontology.org/ontolo...,412,430,PREF,ATRIAL FIBRILLATION


In [27]:
#Now use the gathered annotations and use the DOID mapping to find the disease UMLS
df = newdf

# Download and process UMLS to DOID mappings
# We use the propagated mappings here
url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/xrefs-prop-slim.tsv'
domap_df = pandas.read_csv(url)
domap_df = domap_df.query('resource == "UMLS"')
domap_df['diseaseId'] = domap_df['resource_id']
domap_df = domap_df[['doid_code', 'doid_name', 'diseaseId']]


x = []

for index, row in df.iterrows():
    y = row["Id"].split("_")
    x.append(y[1])

#df["DOID"] = x
df.insert(3, 'DOID', x)

x = []

for index, row in domap_df.iterrows():
    y = row["doid_code"].split(":")
    x.append(y[1])

domap_df["DOID"] = x


domap_df = domap_df[['doid_name', 'diseaseId', 'DOID']]   

#Merge the DOID mapping to the BPAnnotator
df2 = df.merge(domap_df, on=["DOID"], how='inner')

df2.to_csv("BPA_UMLS.csv")

  import sys


ValueError: cannot insert DOID, already exists

In [36]:
df2 = pandas.read_csv("BPAUMLS.csv")

In [45]:
#Merge the BPAnnotations with the XML file
df3 = pandas.read_csv("GSD_DBID.csv")
df4 = df2.merge(df3, on = ["DB_ID"], how = 'inner')
df5 = df4[['Label_ID', 'Drug_Brand_Name', 'Active_ingredient', 'Text_y', 'UNII_ID', 'DB_ID', 'Ontology', 'From', 'To', 'Text_x', 'diseaseId']]
df5.columns = ['Label ID', 'Drug Brand Name', 'Active Ingredient', 'Context', 'UNII ID', 'DBID', 'Ontology', 'From', 'To', 'Text', 'UMLS ID']

In [48]:
df5.head()

Unnamed: 0,Label ID,Drug Brand Name,Active Ingredient,Context,UNII ID,DBID,Ontology,From,To,Text,UMLS ID
0,092edb13-fe21-410e-b507-835f162d1de7.xml,Diclofenac Sodium,DICLOFENAC,"Diclofenac Sodium Gel, 3% is indicated for the...",144O8QL0L1,DB00586,\tontology: http://data.bioontology.org/ontolo...,419,432,OSTEOARTHRITIS,C0157946
1,09f44813-da4b-4407-a9dd-61e4802a7dbf.xml,Diclofenac Sodium,DICLOFENAC,Carefully consider the potential benefits and ...,144O8QL0L1,DB00586,\tontology: http://data.bioontology.org/ontolo...,419,432,OSTEOARTHRITIS,C0157946
2,26b04cee-7f9a-4787-a726-01c4ae03803c.xml,Diclofenac Sodium,DICLOFENAC,Diclofenac sodium topical solution is indicate...,144O8QL0L1,DB00586,\tontology: http://data.bioontology.org/ontolo...,419,432,OSTEOARTHRITIS,C0157946
3,27749712-9d85-4bde-9ae8-d948184d9426.xml,Diclofenac Sodium,Diclofenac,Diclofenac sodium topical solution is a nonste...,144O8QL0L1,DB00586,\tontology: http://data.bioontology.org/ontolo...,419,432,OSTEOARTHRITIS,C0157946
4,2c8e6412-5c93-4128-921d-d9207b98709c.xml,Diclofenac,DICLOFENAC,"Diclofenac Sodium Ophthalmic Solution, 0.1% is...",144O8QL0L1,DB00586,\tontology: http://data.bioontology.org/ontolo...,419,432,OSTEOARTHRITIS,C0157946


In [20]:
#Input the data you want to work with in here
df = pandas.read_csv('GSD_DBID.csv')
print(len(df))
#Now to make this run for each context available!!!!
id = []
From = []
To = []
matchType = []
annotation2 = []
ontology = []
context = []
dbid = []
drugname = []

#Adds additional parameters here for the bioportal search engine
additional_parameters = "&ontologies=DOID&require_exact_match=true"

for index, row in df.iterrows():
    
    print(index)
    #if index % 200 == 0:
        #perc = index/len(df) *100
        #print(str(index) + " : "+ str(perc))
    
    #Text input for the ontology search engine
    text_to_annotate = row["Formatted_Text"]
    db_id = row["DrugBank_ID"]
    drug_name = row["Active_ingredient"]
    
    print(type(text_to_annotate))
    
    if type(text_to_annotate) != "<class 'str'>":
        print(str(index) + " This is empty")
    else:
        # Annotate using the provided text
        annotations = get_json(REST_URL + "/annotator?text=" + urllib.parse.quote(text_to_annotate) + additional_parameters)

        for result in annotations:
            class_details = result["annotatedClass"]

            for annotation in result["annotations"]:
                From.append(annotation["from"])
                To.append(annotation["to"])
                matchType.append(annotation["matchType"])
                annotation2.append(annotation["text"])   
                context.append(text_to_annotate)
                ontology.append("\tontology: " + class_details["links"]["ontology"])
                id.append(class_details["@id"])
                dbid.append(db_id)
                drugname.append(drug_name)



#Constructs the new dataframe (newdf) from the collected lists
newdf = pandas.DataFrame({'Id':id})
newdf['Drug Name'] = drugname
newdf['DB_ID'] = dbid
newdf['Context'] = context
newdf['Ontology'] = ontology
newdf['From'] = From
newdf['To'] = To
newdf['Type'] = matchType
newdf['Text'] = annotation2    


#Length of each of the df and the average number of annotations made per label
numOfAnno = len(newdf)
numOfContext = len(df)

print(numOfAnno)
print(numOfContext)

print(numOfAnno/numOfContext)


16338
0
<class 'str'>
0 This is empty
1
<class 'str'>
1 This is empty
2
<class 'str'>
2 This is empty
3
<class 'str'>
3 This is empty
4
<class 'str'>
4 This is empty
5
<class 'str'>
5 This is empty
6
<class 'str'>
6 This is empty
7
<class 'str'>
7 This is empty
8
<class 'str'>
8 This is empty
9
<class 'str'>
9 This is empty
10
<class 'str'>
10 This is empty
11
<class 'str'>
11 This is empty
12
<class 'str'>
12 This is empty
13
<class 'str'>
13 This is empty
14
<class 'str'>
14 This is empty
15
<class 'str'>
15 This is empty
16
<class 'str'>
16 This is empty
17
<class 'str'>
17 This is empty
18
<class 'str'>
18 This is empty
19
<class 'float'>
19 This is empty
20
<class 'str'>
20 This is empty
21
<class 'str'>
21 This is empty
22
<class 'str'>
22 This is empty
23
<class 'str'>
23 This is empty
24
<class 'float'>
24 This is empty
25
<class 'str'>
25 This is empty
26
<class 'str'>
26 This is empty
27
<class 'str'>
27 This is empty
28
<class 'str'>
28 This is empty
29
<class 'str'>
29 This

KeyboardInterrupt: 

In [27]:
newdf = pandas.read_csv("BioAnnotateDOID.csv")
df = pandas.read_csv('GSD_DBID.csv')

In [8]:
    #Length of each of the df and the average number of annotations made per label
    numOfAnno = len(newdf)
    numOfContext = len(newdf2)

    print(numOfAnno)
    print(numOfContext)

    print(numOfAnno/numOfContext)

44482
3714
11.976844372644049


In [2]:
#Text input for the ontology search engine
text_to_annotate = "Carefully consider the potential benefits and risks of nabumetone tablets and other treatment options before deciding to use nabumetone tablets. Use the lowest effective dose for the shortest duration consistent with individual patient treatment goals (see WARNINGS).Nabumetone tablets are indicated for relief of signs and symptoms of osteoarthritis and rheumatoid arthritis."

#Adds additional parameters here for the bioportal search engine
additional_parameters = "&ontologies=MESH&semantic_types=T047"

In [3]:
# Annotate using the provided text
annotations = get_json(REST_URL + "/annotator?text=" + urllib.parse.quote(text_to_annotate) + additional_parameters)

# Print out annotation details
print_annotations(annotations)

Class details
	id: http://purl.bioontology.org/ontology/MESH/D010003
	prefLabel: Osteoarthritis
	ontology: http://data.bioontology.org/ontologies/MESH
Annotation details
	from: 337
	to: 350
	match type: PREF



Class details
	id: http://purl.bioontology.org/ontology/MESH/D001172
	prefLabel: Arthritis, Rheumatoid
	ontology: http://data.bioontology.org/ontologies/MESH
Annotation details
	from: 356
	to: 375
	match type: SYN



Class details
	id: http://purl.bioontology.org/ontology/MESH/D001168
	prefLabel: Arthritis
	ontology: http://data.bioontology.org/ontologies/MESH
Annotation details
	from: 367
	to: 375
	match type: PREF





In [6]:
newdf2 = newdf.drop_duplicates(['Context'], keep='first')

In [7]:
len(newdf2)

3714

In [42]:

newdf.to_csv("revies.csv")

In [14]:
# Annotate with hierarchy information
annotations = get_json(REST_URL + "/annotator?max_level=3&text=" + urllib.parse.quote(text_to_annotate) + additional_parameters)
print_annotations(annotations)

Class details
	id: http://purl.bioontology.org/ontology/SNOMEDCT/396275006
	prefLabel: Osteoarthritis
	ontology: http://data.bioontology.org/ontologies/SNOMEDCT
Annotation details
	from: 337
	to: 350
	match type: PREF

	Hierarchy annotations
		Class details
			id: http://purl.bioontology.org/ontology/SNOMEDCT/399269003
			prefLabel: Arthropathy
			ontology: http://data.bioontology.org/ontologies/SNOMEDCT
			distance from originally annotated class: 1
		Class details
			id: http://purl.bioontology.org/ontology/SNOMEDCT/363059001
			prefLabel: Degenerative disorder of musculoskeletal system
			ontology: http://data.bioontology.org/ontologies/SNOMEDCT
			distance from originally annotated class: 1
		Class details
			id: http://purl.bioontology.org/ontology/SNOMEDCT/928000
			prefLabel: Disorder of musculoskeletal system
			ontology: http://data.bioontology.org/ontologies/SNOMEDCT
			distance from originally annotated class: 2
		Class details
			id: http://purl.bioontology.org/ontology/SNO

In [15]:
# Annotate with prefLabel, synonym, definition returned
annotations = get_json(REST_URL + "/annotator?include=prefLabel,synonym,definition&text=" + urllib.parse.quote(text_to_annotate) + additional_parameters)
print_annotations(annotations, False)

Class details
	id: http://purl.bioontology.org/ontology/SNOMEDCT/396275006
	prefLabel: Osteoarthritis
	ontology: http://data.bioontology.org/ontologies/SNOMEDCT
Annotation details
	from: 337
	to: 350
	match type: PREF



Class details
	id: http://purl.bioontology.org/ontology/MESH/D010003
	prefLabel: Osteoarthritis
	ontology: http://data.bioontology.org/ontologies/MESH
Annotation details
	from: 337
	to: 350
	match type: PREF



Class details
	id: http://purl.bioontology.org/ontology/MESH/D001172
	prefLabel: Arthritis, Rheumatoid
	ontology: http://data.bioontology.org/ontologies/MESH
Annotation details
	from: 356
	to: 375
	match type: SYN



Class details
	id: http://purl.bioontology.org/ontology/SNOMEDCT/69896004
	prefLabel: Rheumatoid arthritis
	ontology: http://data.bioontology.org/ontologies/SNOMEDCT
Annotation details
	from: 356
	to: 375
	match type: PREF



Class details
	id: http://purl.bioontology.org/ontology/MESH/D001168
	prefLabel: Arthritis
	ontology: http://data.bioontology

In [10]:

#Text input for the ontology search engine
text_to_annotate = "Carefully consider the potential benefits and risks of nabumetone tablets and other treatment options before deciding to use nabumetone tablets. Use the lowest effective dose for the shortest duration consistent with individual patient treatment goals (see WARNINGS).Nabumetone tablets are indicated for relief of signs and symptoms of osteoarthritis and rheumatoid arthritis."

#Adds additional parameters here for the bioportal search engine
additional_parameters = "&ontologies=MESH,SNOMEDCT&semantic_types=T047"
                          
# Annotate using the provided text
annotations = get_json(REST_URL + "/annotator?text=" + urllib.parse.quote(text_to_annotate) + additional_parameters)

# Print out annotation details
print_annotations(annotations)

# Annotate with hierarchy information
annotations = get_json(REST_URL + "/annotator?max_level=3&text=" + urllib.parse.quote(text_to_annotate) + additional_parameters)
print_annotations(annotations)

# Annotate with prefLabel, synonym, definition returned
annotations = get_json(REST_URL + "/annotator?include=prefLabel,synonym,definition&text=" + urllib.parse.quote(text_to_annotate) + additional_parameters)
print_annotations(annotations, False)
                          
                          
#Annotate all current (non-NaN) context files to obtain 3 things, the id, the ontology and the text(annotations)

Class details
	id: http://purl.bioontology.org/ontology/SNOMEDCT/396275006
	prefLabel: Osteoarthritis
	ontology: http://data.bioontology.org/ontologies/SNOMEDCT
Annotation details
	from: 337
	to: 350
	match type: PREF



Class details
	id: http://purl.bioontology.org/ontology/MESH/D010003
	prefLabel: Osteoarthritis
	ontology: http://data.bioontology.org/ontologies/MESH
Annotation details
	from: 337
	to: 350
	match type: PREF



Class details
	id: http://purl.bioontology.org/ontology/MESH/D001172
	prefLabel: Arthritis, Rheumatoid
	ontology: http://data.bioontology.org/ontologies/MESH
Annotation details
	from: 356
	to: 375
	match type: SYN



Class details
	id: http://purl.bioontology.org/ontology/SNOMEDCT/69896004
	prefLabel: Rheumatoid arthritis
	ontology: http://data.bioontology.org/ontologies/SNOMEDCT
Annotation details
	from: 356
	to: 375
	match type: PREF



Class details
	id: http://purl.bioontology.org/ontology/MESH/D001168
	prefLabel: Arthritis
	ontology: http://data.bioontology