# Answers to Exercises

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
%reload_ext lab_black
import json

from fastai.tabular.all import *

import pandas as pd

import kglab
import rdflib
import requests
import helpers
import widgets

# Introduction and Setup for Sinopia's Knowledge Graph
## Exercise 1
Compare the total number of triples for the National Library of Medicine in each Sinopia environment: development, stage, and production.

In [None]:
# One (endpoint, display name) pair per Sinopia environment, all scoped to the
# NLM group; the order here matches the unpacking on the left-hand side below.
nlm_endpoints = [
    (
        "https://api.development.sinopia.io/resource?group=nlm",
        "NLM Sinopia Development",
    ),
    ("https://api.stage.sinopia.io/resource?group=nlm", "NLM Sinopia Stage"),
    ("https://api.sinopia.io/resource?group=nlm", "NLM Sinopia Production"),
]
dev_nlm, stage_nlm, prod_nlm = [
    helpers.create_kg(endpoint, name=label) for endpoint, label in nlm_endpoints
]

In [None]:
# Count the triples in each environment's graph up front, then report them
# right-aligned with thousands separators.
dev_triples = len(dev_nlm.rdf_graph())
stage_triples = len(stage_nlm.rdf_graph())
prod_triples = len(prod_nlm.rdf_graph())

print(
    f"""Number of Triples for NLM per Environment:
Development: {dev_triples:,}
      Stage: {stage_triples:,}
 Production: {prod_triples:,}"""
)

# Analysis and Visualization of Sinopia Graphs
## Exercise 1
Using any of the three Sinopia environments (development, stage, or production), find a BIBFRAME Work with a corresponding BIBFRAME Instance and then replicate the steps above to create a Sinopia BIBFRAME Knowledge Graph.

For this exercise, I am using this BIBFRAME Work from production, https://api.sinopia.io/resource/07d0f7e4-244d-40ed-8c83-537b495eee0b, which has an embedded BIBFRAME Instance as a blank node.

In [None]:
# Fetch the BIBFRAME Work resource from the Sinopia production API.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors here instead of as a confusing
# JSON/parse failure (or a silently empty graph) further down.
work_instance_result = requests.get(
    "https://api.sinopia.io/resource/07d0f7e4-244d-40ed-8c83-537b495eee0b",
    timeout=30,
)
work_instance_result.raise_for_status()

# Start from an empty rdflib graph and register the prefixes from helpers.NAMESPACES.
bf_graph = rdflib.Graph()
for ns, url in helpers.NAMESPACES.items():
    bf_graph.namespace_manager.bind(ns, url)

# The JSON-LD is read from the "data" key of the response body and re-serialized
# to a string, because it is passed to the parser through its `data` argument.
bf_graph.parse(
    data=json.dumps(work_instance_result.json().get("data")), format="json-ld"
)

In [None]:
# Serialize the graph as Turtle for inspection.
# rdflib < 6 returns bytes from serialize(); rdflib >= 6 returns str, on which
# .decode() would raise AttributeError. Decode only when bytes came back.
bf_turtle = bf_graph.serialize(format="turtle")
if isinstance(bf_turtle, bytes):
    bf_turtle = bf_turtle.decode()
print(bf_turtle)

In [None]:
# Wrap the rdflib graph in a kglab KnowledgeGraph so it can be measured.
bf_kg = kglab.KnowledgeGraph(
    name="BIBFRAME Work and Instance graph",
    import_graph=bf_graph,
)
measure = kglab.Measure()
measure.measure_graph(bf_kg)

# Report the edge and node totals with thousands separators.
print(f"Edges: {measure.get_edge_count():,}\n")
print(f"Nodes: {measure.get_node_count():,}\n")

# Natural Language Processing (NLP) with spaCy
## Exercise 1
Download and create an `nlp` pipeline using the *en_core_web_sm* model and see what entities are identified using your sample from the `stage_text_nodes` dataframe.

In [None]:
# Read the stage text-nodes dataframe back from its JSON file on disk
stage_text_nodes_path = "data/stage-text-nodes.json"
stage_text_nodes = pd.read_json(stage_text_nodes_path)

In [None]:
import importlib

import spacy
from spacy import displacy
from spacy.cli import download as spacy_download

SPACY_MODEL = "en_core_web_sm"

# The exercise asks to download the model as well as load it. Try loading
# first so an already-installed model is not downloaded again.
try:
    eng_nlp = spacy.load(SPACY_MODEL)
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    spacy_download(SPACY_MODEL)
    # Make the freshly installed package visible to the running kernel.
    importlib.invalidate_caches()
    eng_nlp = spacy.load(SPACY_MODEL)

In [None]:
# Show 10 randomly chosen rows to pick a candidate text from
stage_text_nodes.sample(n=10)

In [None]:
# Run the English pipeline over the summary of the row at position 10407
summary_10407 = stage_text_nodes.iloc[10407]["summary"]
doc_10407 = eng_nlp(summary_10407)

# Render the recognized entities inline in the notebook
displacy.render(doc_10407, jupyter=True, style="ent")

## Exercise 2
For the following Sinopia resources, create a graph and an NLP pipeline to identify any people and locations in the full text. The full text is available in the `data/5f8c765d-dc4f-45a0-b2d8-87fe29a38712.txt` file.

- BIBFRAME Work https://api.stage.sinopia.io/resource/5f8c765d-dc4f-45a0-b2d8-87fe29a38712 
- BIBFRAME Instance https://api.stage.sinopia.io/resource/2e041e16-1b4d-431f-b2d0-396f62270efa
- BIBFRAME Item https://api.stage.sinopia.io/resource/53e68480-f82b-4dea-9b9a-68fd5ceacb41

# HuggingFace Transformers
## Exercise 1
From `stage_text_nodes`, select a series of *title*, *label*, or *summary* values and compare the results of the [spaCy][SPACY] 'en_core_web_sm' NER model with those of the [Hugging Face][HUG] NER model.

[HUG]: https://huggingface.co/
[SPACY]: https://spacy.io/

In [None]:
from transformers import pipeline

# Build a named-entity-recognition pipeline; no model is named, so the
# library's default for the "ner" task is used.
hug_ner = pipeline(task="ner")

In [None]:
# Show the raw summary text that both NER models are run against
print(stage_text_nodes.iloc[10407]["summary"])

In [None]:
# Run the Hugging Face NER pipeline on the same summary given to spaCy
hug_summary_text = stage_text_nodes.iloc[10407]["summary"]
hug_ner_result = hug_ner(hug_summary_text)

In [None]:
# Display the raw output returned by the Hugging Face NER pipeline
hug_ner_result

In [None]:
# List each spaCy entity as "<label> <text>" for comparison with Hugging Face
for entity in doc_10407.ents:
    print(f"{entity.label_} {entity.text}")

The Hugging Face NER model found *Meg*, *Jo*, and *Beth* as persons, while spaCy found only *Jo* and *Beth*. Both Hugging Face and spaCy found *New England* as a location.

# FastAI with PyTorch
## Exercise 1
So far we have been using all of the RDF in Sinopia's stage environment to build and train a resource template classifier. Repeat the steps for Sinopia's production environment.

In [None]:
# Create an empty kglab knowledge graph, then load the JSON-LD stored in
# data/production.json into it.
prod_kg = kglab.KnowledgeGraph()
prod_kg.load_jsonld("data/production.json")

In [None]:
# Select every IRI subject that has a hasResourceTemplate statement, together
# with the template value it points to.
template_query = """
SELECT ?template ?url 
WHERE {
   ?url <http://sinopia.io/vocabulary/hasResourceTemplate> ?template .
   FILTER isIRI(?url)
} """

# Build one predicate row per resource. Results whose template value starts
# with "sinopia:template:resource" are resource templates themselves and are
# left out.
data = [
    helpers.predicate_row(result[1], prod_kg.rdf_graph())
    for result in prod_kg.query(template_query)
    if not str(result[0]).startswith("sinopia:template:resource")
]

# Missing values are replaced with 0.0.
prod_pred_df = pd.DataFrame(data).fillna(0.0)

In [None]:
# Preview the first rows of the production predicate dataframe
prod_pred_df.head()

In [None]:
# Drop the resource URI and hasResourceTemplate columns
prod_df_copy = prod_pred_df.drop(
    ["uri", "http://sinopia.io/vocabulary/hasResourceTemplate"], axis="columns"
)

# Keep only rows whose template value occurs more than once in the frame
shares_template = prod_df_copy.duplicated(subset=["template"], keep=False)
prod_df_clean = prod_df_copy.loc[shares_template]

splits = helpers.create_splits(prod_df_clean)

In [None]:
# Every column except the "template" target is passed as a continuous feature.
# Selecting by name, rather than popping position 0, keeps the target out of
# the features wherever "template" sits in the column order.
continous = [col for col in prod_df_clean.columns if col != "template"]

# Wrap the cleaned frame for fastai: "template" is the categorical target and
# the remaining columns are the continuous inputs, using the precomputed splits.
prod_to = TabularPandas(
    prod_df_clean,
    procs=[Categorify],
    cont_names=continous,
    y_names="template",
    y_block=CategoryBlock,
    splits=splits,
)

In [None]:
# Build the dataloaders from the production tabular object (bs=64)
prod_data_loader = prod_to.dataloaders(bs=64)

In [None]:
# Create a tabular learner over the production dataloaders, with accuracy as its metric
prod_learner = tabular_learner(prod_data_loader, metrics=accuracy)

In [None]:
# Train the production learner by calling fit_one_cycle with 5 (presumably the epoch count)
prod_learner.fit_one_cycle(5)