In [16]:
import pandas as pd
import requests
import json
import gzip
import sys

# Constants
# Europe PMC Annotations API endpoint (annotationsByArticleIds); the disabled
# disease-mining cell passes it to requests.get together with the article IDs.
URL = 'https://www.ebi.ac.uk/europepmc/annotations_api/annotationsByArticleIds'

In [8]:
# Index ../data/human_pubmed.tab.gz by PubMed ID.
# Only rows that split into exactly three tab-separated fields are used: the
# third field is a "; "-separated list of PubMed IDs and the first field is the
# value recorded under each of them. The header row is indexed like any other
# row (it ends up under the key 'PubMed ID').
uniprot_pmid = {}  # { pmid : list_of_uniprot_ids }
with gzip.open("../data/human_pubmed.tab.gz") as handle:
    for raw in handle:
        fields = raw.decode().strip().split("\t")
        if len(fields) != 3:
            continue
        accession, pubmed_field = fields[0], fields[2]
        for pmid in pubmed_field.split("; "):
            if pmid not in uniprot_pmid:
                uniprot_pmid[pmid] = []
            uniprot_pmid[pmid].append(accession)

In [10]:
# Remove the entry produced by the file's header row. pop() with a default
# keeps this cell idempotent: running it a second time is a no-op instead of
# raising KeyError.
uniprot_pmid.pop('PubMed ID', None)  # delete header

In [13]:
# The mining loop sends PubMed IDs to the annotations API in batches of 8, so
# this ratio is the number of requests required; a fractional part means the
# last batch holds fewer than 8 IDs.
len(uniprot_pmid) / 8 # number of queries needed to cover all PubMed IDs

9591.125

In [None]:
# The mining code below is intentionally disabled: it is wrapped in a string
# literal so that running this cell performs no network requests. It writes
# ../data/uniprot_do.json, the file the next cell reads.
'''
# DO mining
diseases = {}  # { uniprot_id : list_of_diseases }
pmids = list(uniprot_pmid.keys())

for i in range(0, len(pmids), 8):
    # Parameters definition for the query
    params = {
        "articleIds": ",".join(["MED:{}".format(pmid) for pmid in pmids[i:i+8]]),
        "type": "Diseases",
        "section": "Abstract",
        "format": "JSON"
    }
    # Make query
    r = requests.get(URL, params=params)
    print(i, r.status_code)
    obj = json.loads(r.text)
    for ele in obj:
        print(ele)
        # "annotations" may be missing or null for an article: treat it as empty
        for annotation in ele.get("annotations") or []:
            for uniprot_id in uniprot_pmid[ele["extId"]]:
                diseases.setdefault(uniprot_id, []).append(annotation["exact"])
# Save result (json.dump needs a writable file object, not a path string)
with open('../data/uniprot_do.json', 'w') as out:
    json.dump(diseases, out, indent = 1)

'''

In [19]:
# Read the stored mapping from ../data/uniprot_do.json; it is unpacked below as
# { entry_acc : iterable_of_do_ids }.
with open('../data/uniprot_do.json') as file:
    do_human_dict = json.load(file)

# Flatten the mapping into one (entry_acc, do_id) row per pair and preview the
# first rows in sorted order.
do_human = pd.DataFrame(
    [
        [entry_acc, do_id]
        for entry_acc, do_ids in do_human_dict.items()
        for do_id in do_ids
    ],
    columns=['entry_acc', 'do_id'],
)
do_human.sort_values(by=['entry_acc', 'do_id']).head()

Unnamed: 0,entry_acc,do_id
21824,A0A075B6H7,50460
21822,A0A075B6H7,60058
21825,A0A075B6H7,80322
21821,A0A075B6H7,1240
21823,A0A075B6H7,12858


In [21]:
# Parse the disease ontology (OBO flat file: stanzas of "tag: value" lines).
# Only stanzas whose id starts with "DOID:" and that are not flagged obsolete
# are kept.
do = {}  # { do_id : do_object }
obj = {}  # { id: do_id, name: term_name, omim: omim_id, is_a: list_of_parent_ids, is_obsolete: True }
with open("../data/doid.obo") as f:
    for line in f:
        line = line.strip()
        # Split on the FIRST ": " only. Values (e.g. "def:" or "name:" text) may
        # themselves contain ": "; splitting on every occurrence used to make
        # such lines look like stanza boundaries, which stored the term
        # half-built and dropped its later xref / is_a / is_obsolete tags.
        k, sep, v = line.partition(": ")
        if sep:
            if k == "id" and v.startswith("DOID:"):
                obj["id"] = v.split(":")[1]
            elif k == "xref" and "OMIM" in v:
                # Single value: a later OMIM xref overwrites an earlier one
                obj["omim"] = v.split(":")[1]
            elif k == "name":
                obj["name"] = v
            elif k == "is_a":
                # Keep only the numeric part of the first token, e.g. "DOID:4 ! disease" -> "4"
                obj.setdefault("is_a", []).append(v.split()[0].split(":")[1])
            elif k == "is_obsolete":
                obj["is_obsolete"] = True
        elif not line or line.startswith("["):
            # Blank line or "[Stanza]" header: the current stanza is finished
            if obj.get("id") and not obj.get("is_obsolete"):
                do[obj["id"]] = obj
            obj = {}