# Answers to Exercises

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
%reload_ext lab_black
import json

from fastai.tabular.all import *

import pandas as pd

import kglab
import rdflib
import requests
import helpers
import widgets

# Introduction and Setup for Sinopia's Knowledge Graph
## Exercise 1
Compare the total number of triples for National Library of Medicine in each Sinopia environment; development, stage, and production.

In [4]:
dev_nlm = helpers.create_kg(
    "https://api.development.sinopia.io/resource?group=nlm",
    name="NLM Sinopia Development",
)
stage_nlm = helpers.create_kg(
    "https://api.stage.sinopia.io/resource?group=nlm", name="NLM Sinopia Stage"
)
prod_nlm = helpers.create_kg(
    "https://api.sinopia.io/resource?group=nlm", name="NLM Sinopia Production"
)

0....100....2000....100....200....300....400..

In [9]:
print(
    f"""Number of Triples for NLM per Environment:
Development: {len(dev_nlm.rdf_graph()):,}
      Stage: {len(stage_nlm.rdf_graph()):,}
 Production: {len(prod_nlm.rdf_graph()):,}"""
)

Number of Triples for NLM per Environment:
Development: 0
      Stage: 7,969
 Production: 12,046


# Analysis and Visualization of Sinopia Graphs
## Exercise 1
Using any of the three Sinopia environments (development, stage, or production), find a BIBFRAME Work with a corresponding BIBFRAME Instance and then replicate the steps above to create a Sinopia BIBFRAME Knowledge Graph.

For this exercise, I am using this BIBFRAME work from production, https://api.sinopia.io/resource/07d0f7e4-244d-40ed-8c83-537b495eee0b, that has an embedded BIBFRAME Instance as a blank node.

In [16]:
work_instance_result = requests.get(
    "https://api.sinopia.io/resource/07d0f7e4-244d-40ed-8c83-537b495eee0b"
)
bf_graph = rdflib.Graph()
for ns, url in helpers.NAMESPACES.items():
    bf_graph.namespace_manager.bind(ns, url)
bf_graph.parse(
    data=json.dumps(work_instance_result.json().get("data")), format="json-ld"
)

<Graph identifier=N782f29d9e8744ac683d28d6325d72a5a (<class 'rdflib.graph.Graph'>)>

In [19]:
print(bf_graph.serialize(format="turtle").decode())

@prefix bf: <http://id.loc.gov/ontologies/bibframe/> .
@prefix bflc: <http://id.loc.gov/ontologies/bflc/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix sinopia: <http://sinopia.io/vocabulary/> .

<https://api.sinopia.io/resource/07d0f7e4-244d-40ed-8c83-537b495eee0b> a bf:Work ;
    bf:Work <http://share-vde.org/sharevde/rdfBibframe/Work/13390539-1> ;
    bf:classification [ a bf:ClassificationLcc ;
            bf:classificationPortion "B2430.R272"@eng,
                "HD4851"@eng ] ;
    bf:content <http://id.loc.gov/vocabulary/contentTypes/txt> ;
    bf:contribution [ a bf:Contribution ;
            bf:agent [ a bf:Person ;
                    rdf:value "http://id.loc.gov/authorities/names/n81053868" ] ;
            bf:role <http://id.loc.gov/vocabulary/relators/aut> ] ;
    bf:expressionOf [ a bf:Work ;
            bf:Work <http://share-vde.org/sharevde/rdfBibframe/Work/13390539> ] ;
    bf:hasInstance 

In [18]:
bf_kg = kglab.KnowledgeGraph(
    import_graph=bf_graph, name="BIBFRAME Work and Instance graph"
)
measure = kglab.Measure()
measure.measure_graph(bf_kg)

print("Edges: {:,}\n".format(measure.get_edge_count()))
print("Nodes: {:,}\n".format(measure.get_node_count()))

Edges: 101

Nodes: 51



# Natural Language Processing (NLP) with spaCy
## Exercise 1
Download and create a `nlp` pipeline using the *en_core_web_sm* model and see what entities are identified using your sample from the `stage_text_nodes` dataframe.

In [3]:
# Load the dataframe from disk
stage_text_nodes = pd.read_json("data/stage-text-nodes.json")

In [10]:
import spacy
from spacy import displacy

eng_nlp = spacy.load("en_core_web_sm")

In [16]:
stage_text_nodes.sample(10)

Unnamed: 0,title,url,label,summary
219,Driad,<https://api.sinopia.io/resource/1c2f62d5-177a-4fed-a76f-377e9d041b31>,,
11263,Shutter,<https://api.stage.sinopia.io/resource/3f97be15-bea9-4e48-94c1-aad61334805f#b1>,,
9447,Conchology haiku,<https://api.stage.sinopia.io/resource/834dac90-6fbd-48fe-8d38-255aa98ae3f8>,,
10088,"Current directions in ostracism, social exclusion, and rejection research",<https://api.stage.sinopia.io/resource/538f7f8a-b46e-4eee-bc08-a3afeaa0be0b>,,
4122,"Die Arbeitsgerichtsbarkeit Baden-Württemberg, 1946-2016",<https://api.stage.sinopia.io/resource/639106d1-9cb3-4a94-a123-b117a2bf68ec>,,
844,"Medieval art, architecture and archaeology at Canterbury",<https://api.stage.sinopia.io/resource/e71043b5-f91e-4f46-93ea-099ca3abcc2b>,,
5525,Cinders Gallery,<https://api.stage.sinopia.io/resource/6283558d-f3b5-4034-b763-eb2113e03e7f#b1>,,
7479,"Annonij monachi Benedictini diserti & veridici, quoru[n]damq[ue] aliorum venerabilium eiusdem professorio[n]is patrum, De regum procerumq[ue] Francorum origine gestisq[ue] clarissimis vsq[ue] ad Philippum Augustum libri quinq[ue] nunc primum impressi",<https://api.stage.sinopia.io/resource/21b25c96-425a-4a7c-b5e1-8175986d03e9>,,
10407,Little women. English (Meigs and Smith),<https://api.stage.sinopia.io/resource/bca2b606-b4b6-4a45-849e-4ed19e2212ba>,,"The adventures of Meg, Jo, Beth, and Amy as they grow into young women in mid-nineteenth-century New England."
10786,不思議の国のアリス,<https://api.stage.sinopia.io/resource/d6b74251-1532-477d-942c-92802c7cb2ba>,,


In [20]:
doc_10407 = eng_nlp(stage_text_nodes.iloc[10407].summary)

displacy.render(doc_10407, style="ent", jupyter=True)

## Exercise 2
For the following Sinopia Resources, create a graph, and a NLP pipeline to identify any people and locations in the full-text. The full-text is available in `data/5f8c765d-dc4f-45a0-b2d8-87fe29a38712.txt` file.

- BIBFRAME Work https://api.stage.sinopia.io/resource/5f8c765d-dc4f-45a0-b2d8-87fe29a38712 
- BIBFRAME Instance https://api.stage.sinopia.io/resource/2e041e16-1b4d-431f-b2d0-396f62270efa
- BIBFRAME Item https://api.stage.sinopia.io/resource/53e68480-f82b-4dea-9b9a-68fd5ceacb41

# HuggingFace Transformers
## Exercise 1
From the `stage_text_nodes`, select a series *title*, *label*, or *summary* values and compare the [spaCy][SPACY] 'en_core_web_sm' NER model with the [Huggingface][HUG] NER model results.

[HUG]: https://huggingface.co/
[SPACY]: https://spacy.io/

In [21]:
from transformers import pipeline

hug_ner = pipeline("ner")

In [28]:
print(stage_text_nodes.iloc[10407].summary)

The adventures of Meg, Jo, Beth, and Amy as they grow into young women in mid-nineteenth-century New England.


In [23]:
hug_ner_result = hug_ner(stage_text_nodes.iloc[10407].summary)

In [24]:
hug_ner_result

[{'entity': 'I-PER',
  'score': 0.9988243,
  'index': 4,
  'word': 'Meg',
  'start': 18,
  'end': 21},
 {'entity': 'I-PER',
  'score': 0.99841213,
  'index': 6,
  'word': 'Jo',
  'start': 23,
  'end': 25},
 {'entity': 'I-PER',
  'score': 0.9980937,
  'index': 8,
  'word': 'Beth',
  'start': 27,
  'end': 31},
 {'entity': 'I-PER',
  'score': 0.99858505,
  'index': 11,
  'word': 'Amy',
  'start': 37,
  'end': 40},
 {'entity': 'I-LOC',
  'score': 0.99251026,
  'index': 24,
  'word': 'New',
  'start': 97,
  'end': 100},
 {'entity': 'I-LOC',
  'score': 0.995323,
  'index': 25,
  'word': 'England',
  'start': 101,
  'end': 108}]

In [27]:
for ent in doc_10407.ents:
    print(ent.label_, ent.text)

ORG Meg
PERSON Jo
PERSON Beth
DATE mid-nineteenth-century
LOC New England


HuggingFace NER found *Meg*, *Jo*, and *Beth* as persons, spaCy only found *Jo* and *Beth*. Hugging Face and spaCy found *New England* as a location.

# FastAI with PyTorch
## Exercise 1
So far we have been using all of the RDF in Sinopia's stage environment on building and training a resource template classifier, repeat the steps for Sinopia production environment.

In [22]:
prod_kg = kglab.KnowledgeGraph()
prod_kg.load_jsonld("data/production.json")

<kglab.kglab.KnowledgeGraph at 0x7fdabc8d86d0>

In [47]:
data = []
for row in prod_kg.query(
    """
SELECT ?template ?url 
WHERE {
   ?url <http://sinopia.io/vocabulary/hasResourceTemplate> ?template .
   FILTER isIRI(?url)
} """
):
    # Skip if RDF resource is a Sinopia resource template
    if str(row[0]).startswith("sinopia:template:resource"):
        continue
    data.append(helpers.predicate_row(row[1], prod_kg.rdf_graph()))
prod_pred_df = pd.DataFrame(data)
prod_pred_df = prod_pred_df.fillna(0.0)

In [48]:
prod_pred_df.head()

Unnamed: 0,uri,http://www.w3.org/2002/07/owl#sameAs,template,http://sinopia.io/vocabulary/hasResourceTemplate,http://id.loc.gov/ontologies/bibframe/itemOf,http://id.loc.gov/ontologies/bibframe/adminMetadata,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://id.loc.gov/ontologies/bibframe/heldBy,http://id.loc.gov/ontologies/bibframe/title,http://id.loc.gov/ontologies/bibframe/contribution,...,http://rdaregistry.info/Elements/w/P10205,http://rdaregistry.info/Elements/e/P20205,http://rdaregistry.info/Elements/w/P10127,http://id.loc.gov/ontologies/bibframe/partNumber,http://id.loc.gov/ontologies/bibframe/partName,http://www.loc.gov/mads/rdf/v1#authoritativeLabel,http://rdaregistry.info/Elements/m/P30266,http://rdaregistry.info/Elements/w/P10206,http://rdaregistry.info/Elements/w/P10015,http://rdaregistry.info/Elements/m/P30121
0,https://api.sinopia.io/resource/9ba24a69-87b5-478b-a576-10f130b76c64,1.0,WAU:RT:BF2:Item,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,https://api.sinopia.io/resource/faeb4fb6-dbdf-405f-960d-40799129774e,0.0,Yale:RT:BF2:Monograph:SuperWork:CtY-BR,1.0,0.0,1.0,2.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,https://api.sinopia.io/resource/2624d812-1f4c-40a1-a93e-cd419ee5af14,0.0,Yale:RT:BF2:Monograph:Instance:CtY,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,https://api.sinopia.io/resource/326cdf4f-38dc-450c-8fb4-2156ce6a6b20,0.0,WAU:RT:RDA:Manifestation:monograph,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,https://api.sinopia.io/resource/e6e2204b-df84-4359-9d20-2c99e052bfd9,0.0,WAU:RT:RDA:Item:monograph,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
prod_df_copy = prod_pred_df.drop(
    columns=["uri", "http://sinopia.io/vocabulary/hasResourceTemplate"]
)
prod_df_clean = prod_df_copy[prod_df_copy.duplicated(subset=["template"], keep=False)]
splits = helpers.create_splits(prod_df_clean)

In [50]:
continous = [col for col in prod_df_clean.columns]
continous.pop(0)

prod_to = TabularPandas(
    prod_df_clean,
    procs=[Categorify],
    cont_names=continous,
    y_names="template",
    y_block=CategoryBlock,
    splits=splits,
)

In [51]:
prod_data_loader = prod_to.dataloaders(bs=64)

In [52]:
prod_learner = tabular_learner(prod_data_loader, metrics=accuracy)

In [53]:
prod_learner.fit_one_cycle(5)

epoch,train_loss,valid_loss,accuracy,time
0,3.517597,2.331611,0.698795,00:00
1,1.929397,1.113929,0.753681,00:00
2,1.143654,1.365955,0.776439,00:00
3,0.768098,1.448101,0.789826,00:00
4,0.593888,1.395507,0.793842,00:00
