In [None]:
from collections import namedtuple
import re

import requests

In [None]:
# Base URL that the request helpers in this notebook prefix onto every API
# path; it points at port 8000 on the local machine (127.0.0.1).
api_url = "http://127.0.0.1:8000"

In [None]:
# Each corpus file is paired with the source label used for everything read
# from it; the pairs are unzipped into the two parallel lists that the
# ingestion loop zips back together.
files, source_names = (
    list(column)
    for column in zip(
        ("NCBItrainset_corpus.txt", "NCBI_Disease_Train"),
        ("NCBIdevelopset_corpus.txt", "NCBI_Disease_Dev"),
        ("NCBItestset_corpus.txt", "NCBI_Disease_Test"),
    )
)

In [None]:
# Text lines have the form "<pmid>|t|<title>" or "<pmid>|a|<abstract>". The
# groups capture the PMID, the line type ("a" or "t") and the text; "." does not
# match a newline, so the captured text excludes the line terminator.
# Raw strings are used so that "\d" and "\|" reach the regex engine as written
# instead of being invalid string escapes (which Python warns about).
text_line_re = re.compile(r"^(\d+)\|(a|t)\|(.*)")
# Mention lines start with the PMID immediately followed by a tab.
mention_line_re = re.compile(r"^\d+\t")

In [None]:
# One annotated mention, as assembled from a tab-separated mention line.
Mention = namedtuple(
    "Mention",
    [
        "pmid",        # document id from the first column
        "start_char",  # start offset of the mention within its section
        "end_char",    # end offset of the mention within its section
        "match",       # annotated text (fourth column)
        "match_type",  # annotation type (fifth column)
        "id_",         # identifier from the last column; used as the entity id
        "section",     # "title" or "summary"
    ],
)

In [None]:
def write_doc(pmid, title, abstract, source_name):
    """Register one corpus document with the API unless it already exists.

    The document id is "NCBI_Disease:" followed by the PMID. If a GET for that
    id answers with status 200, the document is treated as already present and
    nothing is posted.

    Args:
        pmid: Document identifier; used to build the id and sent as "pmid".
        title: Sent as the "title" field.
        abstract: Sent as the "summary" field.
        source_name: Sent as the "source" field.

    Returns:
        None when the document already exists, otherwise the response object of
        the POST to the documents endpoint (the caller checks its status).
    """
    id_ = f"NCBI_Disease:{pmid}"

    lookup = requests.get(f"{api_url}/documents/{id_}")
    if lookup.status_code == 200:
        return None

    # "journal", "urls" and both dates are fixed placeholder values.
    # Fields left unset (they were commented out of the payload): license, doi,
    # arxiv_id, full_text, authors, affiliations, language, keywords,
    # in_citations, out_citations, tags, other_ids.
    payload = {
        "id": id_,
        "version": "v1",
        "source": source_name,
        "journal": "string",
        "document_type": "postprint",
        "title": title,
        "publication_date": "2020-10-24",
        "update_date": "2020-10-24",
        "urls": ["string"],
        "pmid": pmid,
        "summary": abstract,
    }
    return requests.post(f"{api_url}/documents/", json=payload)

In [None]:
def write_mention(mention, source_name):
    """Register a mention, creating its entity first when the API lacks it.

    Args:
        mention: A Mention tuple; its id_, pmid, match, section, start_char and
            end_char fields are read.
        source_name: Source label stored with the mention and used, together
            with the character span, to detect an already-registered mention.

    Returns:
        None when a mention with the same (start_char, end_char, source) is
        already listed for this entity and document, otherwise the response
        object of the POST to the entity_mentions endpoint.

    Raises:
        AssertionError: If creating a missing entity does not return 200.
    """
    entity_url = f"{api_url}/entities/{mention.id_}"

    # Register the entity if it does not exist yet (lookup answers 404).
    if requests.get(entity_url).status_code == 404:
        created = requests.post(
            f"{api_url}/entities/",
            json={
                "id": mention.id_,
                "preferred_name": mention.id_,
                "entity_type": "disease",
                "synonyms": [],
                "source": "NCBI_Disease",
            },
        )
        assert created.status_code == 200

    # Skip the mention if the same span from the same source is already stored.
    doc_id = "NCBI_Disease:" + str(mention.pmid)
    known_mentions = requests.get(f"{entity_url}/documents/{doc_id}").json()
    recorded_spans = {
        (item["start_char"], item["end_char"], item["source"])
        for item in known_mentions
    }
    if (mention.start_char, mention.end_char, source_name) in recorded_spans:
        return None

    # Token offsets are not known here, so -1 is sent for both.
    return requests.post(
        f"{api_url}/entity_mentions/",
        json={
            "document_id": doc_id,
            "entity_id": mention.id_,
            "text": mention.match,
            "document_section": mention.section,
            "start_char": mention.start_char,
            "end_char": mention.end_char,
            "start_token": -1,
            "end_token": -1,
            "source": source_name,
        },
    )

In [None]:
def _store_doc(pmid, title, abstract, source_name):
    """Send one parsed document through write_doc and fail loudly on problems.

    Asserts that both a title and an abstract line were seen for the PMID and
    that write_doc either skipped the document (None) or received a 200.
    """
    assert title is not None, pmid
    assert abstract is not None, pmid
    r = write_doc(pmid, title, abstract, source_name)
    assert r is None or r.status_code == 200, pmid


def _store_mentions(mentions, source_name):
    """Send each mention of one document through write_mention, asserting success."""
    for m in mentions:
        r = write_mention(m, source_name)
        assert r is None or r.status_code == 200, m.pmid


# Each file is read line by line. A document is written once the first text
# line of the next document shows up, and a document's mentions are written
# once the first mention of the next document shows up, so a document is always
# registered before its mentions. Whatever is still pending at end of file is
# flushed after the loop.
for file, source_name in zip(files, source_names):
    previous_doc_pmid = None
    previous_mention_pmid = None
    title = None
    abstract = None
    mentions = []  # mentions collected for the document currently being read
    # Explicit encoding so character offsets do not depend on the platform
    # default; assumes the corpus files are UTF-8/ASCII -- TODO confirm.
    with open(file, "rt", encoding="utf-8") as fin:
        for line in fin:
            if line.strip() == "":
                continue
            text_line_match = text_line_re.match(line)
            if text_line_match:
                pmid = int(text_line_match.group(1))
                if previous_doc_pmid is not None and pmid != previous_doc_pmid:
                    # First text line of a new document: flush the previous one.
                    _store_doc(previous_doc_pmid, title, abstract, source_name)
                    title = None
                    abstract = None
                previous_doc_pmid = pmid
                text_type = text_line_match.group(2)
                if text_type == "t":
                    title = text_line_match.group(3)
                elif text_type == "a":
                    abstract = text_line_match.group(3)
                else:
                    # Only reachable if text_line_re is widened to accept
                    # other line types.
                    raise Exception(f"Unrecognized text type: {text_type}")
            elif mention_line_re.match(line):
                # Drop the line terminator before splitting; otherwise the last
                # column (the entity id) would carry a trailing "\n" into the
                # entity ids and request URLs.
                fields = line.rstrip("\r\n").split("\t")
                pmid, start_char, end_char, match, match_type, id_ = fields
                pmid = int(pmid)
                start_char = int(start_char)
                end_char = int(end_char)
                title_len = len(title)
                # Offsets past the title belong to the abstract and are rebased.
                # NOTE(review): if the corpus offsets count a separator
                # character between title and abstract, the rebased offsets are
                # one too large (title_len + 1 would be needed) -- confirm
                # against the corpus before changing, since stored mentions are
                # de-duplicated on these offsets.
                if start_char > title_len:
                    section = "summary"
                    start_char -= title_len
                    end_char -= title_len
                else:
                    section = "title"
                mention = Mention(pmid=pmid, start_char=start_char, end_char=end_char,
                                  match=match, match_type=match_type, id_=id_,
                                  section=section)
                if previous_mention_pmid is not None and pmid != previous_mention_pmid:
                    # First mention of a new document: flush the previous batch.
                    _store_mentions(mentions, source_name)
                    mentions = []
                previous_mention_pmid = pmid
                mentions.append(mention)
            else:
                raise Exception(f"Could not parse line:\n{line}")
    # Flush the last document and its mentions; an empty file has neither.
    if previous_doc_pmid is not None:
        _store_doc(previous_doc_pmid, title, abstract, source_name)
    _store_mentions(mentions, source_name)