In [None]:
import spacy
from spacy import displacy

import requests

In [None]:
# Show the spaCy version available in this kernel.
spacy.__version__

In [None]:
# Base URL that every `requests.get` call in this notebook is built from.
api_url = "http://127.0.0.1:8000"
# Identifier of the document fetched and inspected in the following cells.
doc_id = "NCBI_Disease:10192393"

In [None]:
# Fetch the document record for `doc_id` from the API.
doc_req = requests.get(f"{api_url}/documents/{doc_id}")
# Stop on a 4xx/5xx response instead of decoding an error payload as if it were a document.
doc_req.raise_for_status()
doc_json = doc_req.json()

In [None]:
def get_matches(doc_json, match_source, document_section="title"):
    """Collect the labelled character spans of one annotation source in a document.

    Only entities whose ``document_section`` and ``source`` fields equal the
    requested values are kept. The label of each kept entity is read from the
    ``entity_type`` field returned by ``GET {api_url}/entities/{entity_id}``,
    where ``api_url`` is the notebook-level base URL.

    Args:
        doc_json: Document record (dict) with an ``"entities"`` list. Each
            entity must provide ``document_section``, ``source``,
            ``entity_id``, ``start_char`` and ``end_char``.
        match_source: Value of the ``source`` field to keep.
        document_section: Value of the ``document_section`` field to keep.

    Returns:
        A list of ``[start_char, end_char, entity_type]`` lists ordered by
        ``start_char``. Entities with equal starts keep their input order.

    Raises:
        requests.HTTPError: If an entity lookup answers with an error status.
    """
    # entity_id -> entity_type, so an id mentioned several times in the same
    # document is requested only once per call.
    entity_types = {}
    start_end_labels = []
    for ent in doc_json["entities"]:
        if ent["document_section"] != document_section or ent["source"] != match_source:
            continue
        entity_id = ent["entity_id"]
        if entity_id not in entity_types:
            ent_req = requests.get(f"{api_url}/entities/{entity_id}")
            # Fail on an HTTP error here rather than on a missing "entity_type" key.
            ent_req.raise_for_status()
            entity_types[entity_id] = ent_req.json()["entity_type"]
        start_end_labels.append([ent["start_char"], ent["end_char"], entity_types[entity_id]])
    # Sort once after collection; the sort is stable, so the result matches
    # re-sorting after every append without the repeated work.
    start_end_labels.sort(key=lambda span: span[0])
    return start_end_labels

In [None]:
def display_matches(text, labels, title):
    """Render character spans over ``text`` using displaCy's manual entity mode.

    Args:
        text: The string the spans refer to.
        labels: Iterable of ``(start, end, label)`` triples (index 0, 1 and 2
            of each item are used).
        title: Title passed along with the text to displaCy.

    Returns:
        Whatever ``displacy.render`` returns for the single manual document.
    """
    entity_spans = []
    for span in labels:
        entity_spans.append({"start": span[0], "end": span[1], "label": span[2]})

    manual_doc = {"text": text, "ents": entity_spans, "title": title}

    # Only the "DISEASE" label is listed in `ents`, and it gets a grey gradient.
    render_options = {
        "ents": ["DISEASE"],
        "colors": {"DISEASE": "linear-gradient(90deg, #999999, #cccccc)"},
    }
    return displacy.render([manual_doc], style="ent", manual=True, options=render_options)

In [None]:
# Title text of the fetched document; the title-section spans below are drawn over it.
doc_text = doc_json["title"]

In [None]:
# Highlight the title spans stored under the "NCBI_Disease_Train" source
# (the source treated as gold in the metric-based evaluation further down).
m = display_matches(doc_text, get_matches(doc_json, "NCBI_Disease_Train"), "NCBI Disease annotation")

In [None]:
# Highlight the title spans stored under the "scispacy_en_ner_bc5cdr_md" source on the same text.
m = display_matches(doc_text, get_matches(doc_json, "scispacy_en_ner_bc5cdr_md"), "scispacy_en_ner_bc5cdr_md NER")

## Metric-based evaluation

In [None]:
# Gold title spans as a set of (start_char, end_char) pairs; the label is dropped.
gold_matches = {
    (start, end)
    for start, end, _label in get_matches(doc_json, "NCBI_Disease_Train")
}
gold_matches

In [None]:
# Predicted title spans as a set of (start_char, end_char) pairs; the label is dropped.
ner_matches = {
    (start, end)
    for start, end, _label in get_matches(doc_json, "scispacy_en_ner_bc5cdr_md")
}
ner_matches

In [None]:
import pandas as pd
import os
import sys

sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..")))

from db_api.crud import precision_recall_fscore

In [None]:
# Score the predicted title spans against the gold spans. The evaluation loop
# below unpacks this function's result as (precision, recall, fscore, tp, fp, fn).
precision_recall_fscore(ner_matches, gold_matches)

In [None]:
# Score the predicted summary spans against the gold summary spans for every
# document returned by the listing endpoint (requested with skip=0, limit=500).
docs_req = requests.get(f"{api_url}/documents/?skip=0&limit=500")
# Stop on a 4xx/5xx response instead of iterating over an error payload.
docs_req.raise_for_status()
docs_json = docs_req.json()

results = []
# The loop uses its own names so it does not overwrite the notebook-level
# `doc_json`, `ner_matches` and `gold_matches` that earlier cells rely on when re-run.
for eval_doc in docs_json:
    eval_ner_spans = {
        (x[0], x[1])
        for x in get_matches(eval_doc, "scispacy_en_ner_bc5cdr_md", document_section="summary")
    }
    eval_gold_spans = {
        (x[0], x[1])
        for x in get_matches(eval_doc, "NCBI_Disease_Train", document_section="summary")
    }

    precision, recall, fscore, tp, fp, fn = precision_recall_fscore(eval_ner_spans, eval_gold_spans)
    results.append((eval_doc["id"], precision, recall, fscore, tp, fp, fn,
                    len(eval_gold_spans), len(eval_ner_spans)))

In [None]:
# One row per evaluated document; the column order mirrors the tuples appended to `results`.
results_df = pd.DataFrame(results, columns=["id", "precision", "recall", "fscore", "tp", "fp", "fn",
                                            "len_gold_matches", "len_ner_matches"])

In [None]:
# Order documents by number of gold spans, largest first. Rebinding to the
# sorted copy (instead of inplace=True) avoids mutating the frame built above.
results_df = results_df.sort_values(by=['len_gold_matches'], ascending=False)

In [None]:
# Display the per-document scores table.
results_df

In [None]:
# Summary-section spans of document NCBI_Disease:10556285 from the "NCBI_Disease_Train" source.
doc_id = "NCBI_Disease:10556285"
doc_req = requests.get(f"{api_url}/documents/{doc_id}")
# Stop on an HTTP error status before the body is used as a document.
doc_req.raise_for_status()
doc_json = doc_req.json()
m = display_matches(doc_json["summary"], get_matches(doc_json, "NCBI_Disease_Train", "summary"), "NCBI Disease annotation")

In [None]:
# Summary-section spans of document NCBI_Disease:10556285 from the "scispacy_en_ner_bc5cdr_md" source.
doc_id = "NCBI_Disease:10556285"
doc_req = requests.get(f"{api_url}/documents/{doc_id}")
# Stop on an HTTP error status before the body is used as a document.
doc_req.raise_for_status()
doc_json = doc_req.json()
m = display_matches(doc_json["summary"], get_matches(doc_json, "scispacy_en_ner_bc5cdr_md", "summary"), "scispacy_en_ner_bc5cdr_md NER")

In [None]:
# Summary-section spans of document NCBI_Disease:10192393 from the "NCBI_Disease_Train" source.
doc_id = "NCBI_Disease:10192393"
doc_req = requests.get(f"{api_url}/documents/{doc_id}")
# Stop on an HTTP error status before the body is used as a document.
doc_req.raise_for_status()
doc_json = doc_req.json()
m = display_matches(doc_json["summary"], get_matches(doc_json, "NCBI_Disease_Train", "summary"), "NCBI Disease annotation")

In [None]:
# Summary-section spans of document NCBI_Disease:10192393 from the "scispacy_en_ner_bc5cdr_md" source.
doc_id = "NCBI_Disease:10192393"
doc_req = requests.get(f"{api_url}/documents/{doc_id}")
# Stop on an HTTP error status before the body is used as a document.
doc_req.raise_for_status()
doc_json = doc_req.json()
m = display_matches(doc_json["summary"], get_matches(doc_json, "scispacy_en_ner_bc5cdr_md", "summary"), "scispacy_en_ner_bc5cdr_md NER")