# Sinopia Knowledge Graph

In [25]:
import json
from datetime import datetime
from typing import Dict, List, Optional

import pandas as pd

import kglab
import rdflib
import requests

In [2]:
namespaces = {
    "bf": "http://id.loc.gov/ontologies/bibframe/",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "sinopia": "http://sinopia.io/vocabulary/"
}



In [3]:
kg.describe_ns()

Unnamed: 0,prefix,namespace
0,dct,http://purl.org/dc/terms/
1,owl,http://www.w3.org/2002/07/owl#
2,prov,http://www.w3.org/ns/prov#
3,rdf,http://www.w3.org/1999/02/22-rdf-syntax-ns#
4,rdfs,http://www.w3.org/2000/01/rdf-schema#
5,schema,http://schema.org/
6,sh,http://www.w3.org/ns/shacl#
7,xsd,http://www.w3.org/2001/XMLSchema#
8,bf,http://id.loc.gov/ontologies/bibframe/
9,skos,http://www.w3.org/2004/02/skos/core#


In [6]:
LDP = rdflib.Namespace("http://www.w3.org/ns/ldp#")
SINOPIA = rdflib.Namespace("http://sinopia.io/vocabulary/")

def from_api(api_url: str) -> Dict:
    """Takes the new Sinopia API endpoint URI, extracts each resource and
    template, and returns a dictionary with two lists, a resources and a
    templates, and the total number of resources harvested from the api.

    @param api_url -- URI to Sinopia API endpoint
    """

    def add_resource(resource):
        if not 'data' in resource:
            print(f"\n{resource.get('uri')} missing data")
            return
        output["total"] += 1
        graph = rdflib.Graph()
        graph.namespace_manager.bind("sinopia", SINOPIA)
        jsonld = json.dumps(resource.pop("data")).encode()
        try:
            graph.parse(data=jsonld, format="json-ld")
        except Exception as error:
            print(f"Failed to parse {resource}\n{error}")
            return
        payload = {"graph": graph, "meta": resource}
        if "sinopia:template:resource" in resource.get("templateId"):
            output["templates"].append(payload)
        else:
            output["resources"].append(payload)

    output = {"resources": [], "templates": [], "total": 0}
    start = datetime.utcnow()
    print(f"Started harvest of resources at {start} for {api_url}")
    initial = requests.get(f"{api_url}/resource")
    print("0", end="")
    for row in initial.json().get("data"):
        add_resource(row)
    next_link = initial.json().get("links").get("next")
    while 1:
        result = requests.get(next_link)
        if result.status_code > 300:
            break
        payload = result.json()
        new_next = payload.get("links").get("next")
        if new_next == next_link or new_next is None:
            break
        for row in payload.get("data"):
            add_resource(row)
        next_link = new_next
        print(".", end="")
        if not output["total"] % 250:
            print(f"{output['total']}", end="")
    end = datetime.utcnow()
    print(f"\nFinished total time {(end-start).seconds / 60.}")
    return output

In [7]:
dev_rdf = from_api("https://api.stage.sinopia.io")

Started harvest of resources at 2021-05-12 23:27:45.028062 for https://api.stage.sinopia.io
0.........250..........500..........750..........1000..........1250..........1500..........1750..........2000..........2250..........2500..........2750..........3000..........3250..........3500..........3750..........4000..........4250..........4500..........4750

http://desktop.loc.gov/search?view=document&id=Infobasedcrmg0Dash0Dash0Dash247&hl=true&fq=allresources|true# does not look like a valid URI, trying to serialize this will break.


..........5000..........5250

ld4p:RT:bf2:2D graphic material:Item does not look like a valid URI, trying to serialize this will break.


..........5500....

urn:ld4p:qa:gettyaat:Objects__Object_Groupings and Systems does not look like a valid URI, trying to serialize this will break.


......5750..........6000
https://api.stage.sinopia.io/resource/e49c5f1d-5e62-4b45-b87f-5d0cf3e573e5 missing data

https://api.stage.sinopia.io/resource/3770137a-bed5-4a97-bd9a-fea4f3822dd7 missing data
..........
https://api.stage.sinopia.io/resource/28961949-72b2-4c94-b1f5-a7788f1ae1f0 missing data

https://api.stage.sinopia.io/resource/c3a1d5dd-a829-4ba7-8fbe-20490c018407 missing data

https://api.stage.sinopia.io/resource/4e80a183-4487-44fd-9bf8-8497c50d27f3 missing data
.
https://api.stage.sinopia.io/resource/16625687-0208-4ea5-b299-204d36180c45 missing data
..

https://api.stage.sinopia.io/resource/this is a test does not look like a valid URI, trying to serialize this will break.


............
https://api.stage.sinopia.io/resource/a6acbbea-1770-468b-904b-51cc4a3d7f27 missing data
....Failed to parse {'user': 'mcm104', 'group': 'washington', 'templateId': 'WAU:RT:BF2:Work', 'types': ['http://id.loc.gov/ontologies/bibframe/Work'], 'id': '0398ce54-ff15-4e9f-8948-c44bcc393798', 'uri': 'https://api.stage.sinopia.io/resource/0398ce54-ff15-4e9f-8948-c44bcc393798', 'timestamp': '2021-03-30T22:02:40.077Z'}
'@eng' is not a valid language tag!
............................................................................................................................................................
Finished total time 5.0


In [9]:
kg.load_rdf_text()

[0;31mType:[0m        dict
[0;31mString form:[0m {'resources': [{'graph': <Graph identifier=N494e317f7ffe41d18737c060c4bcbb29 (<class 'rdflib.grap <...> lit:RT:bf2:Monograph:Work:Un-nested', 'timestamp': '2021-05-03T17:59:10.972Z'}}], 'total': 10618}
[0;31mLength:[0m      3
[0;31mDocstring:[0m  
dict() -> new empty dictionary
dict(mapping) -> new dictionary initialized from a mapping object's
    (key, value) pairs
dict(iterable) -> new dictionary initialized as if via:
    d = {}
    for k, v in iterable:
        d[k] = v
dict(**kwargs) -> new dictionary initialized with the name=value pairs
    in the keyword argument list.  For example:  dict(one=1, two=2)


In [16]:
kg.rdf_graph()

<Graph identifier=N48a11d2f09c141ac9dab2324547ea42a (<class 'rdflib.graph.Graph'>)>

In [17]:
kglab.KnowledgeGraph?

[0;31mInit signature:[0m
[0mkglab[0m[0;34m.[0m[0mKnowledgeGraph[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mname[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'generic'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbase_uri[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlanguage[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'en'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muse_gpus[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mimport_graph[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mrdflib[0m[0;34m.[0m[0mgraph[0m[0;34m.[0m[0mConjunctiveGraph[0m[0;34m,[0m [0mrdflib[0m[0;34m.[0m[0mgraph[0m[0;34m.[0m[0mDataset[0m[0;34m,[0m [0mrdflib[0m[0;34m.[0m[0mgraph[0m[0;34m.[0m[0mGraph[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0m

In [18]:
stage = rdflib.ConjunctiveGraph()
for row in dev_rdf['resources']:
    stage += row.get('graph')

    
kg = kglab.KnowledgeGraph(
    name = "Sinopia Stage KG",
    base_uri = "https://api.stage.sinopia.io/resource/",
    namespaces = namespaces,
    import_graph=stage
)

In [19]:
measure = kglab.Measure()

In [20]:
measure.measure_graph(kg)
print("edges: {}\n".format(measure.get_edge_count()))
print("nodes: {}\n".format(measure.get_node_count()))

edges: 314046

nodes: 67649



In [31]:
works_query = """SELECT ?s ?p ?o 
WHERE {
   ?s ?p ?o .
}"""

In [32]:
work_instance = kg.visualize_query(works_query, notebook=True)

In [33]:
work_instance.force_atlas_2based()
work_instance.show('tmp.work_instance.html')

In [34]:
df = kg.query_as_df(works_query)

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 314046 entries, 0 to 314045
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   s       314046 non-null  object
 1   p       314046 non-null  object
 2   o       314046 non-null  object
dtypes: object(3)
memory usage: 7.2+ MB


In [37]:
subgraph = kglab.SubgraphTensor(kg)
pyvis_graph = subgraph.build_pyvis_graph(notebook=True)
pyvis_graph.force_atlas_2based()
pyvis_graph.show("tmp.fig03.html")

KeyboardInterrupt: 

In [39]:
second_kg = kglab.KnowledgeGraph(
    name = "Sinopia Stage Second KG",
    base_uri = "https://api.stage.sinopia.io/resource/",
    namespaces = namespaces,
    import_graph=dev_rdf['resources'][1]['graph']
)

In [40]:
subgraph = kglab.SubgraphTensor(second_kg)
pyvis_graph = subgraph.build_pyvis_graph(notebook=True)
pyvis_graph.force_atlas_2based()
pyvis_graph.show("tmp.fig03.html")