# Using RDF Classify

## Setup 

In [1]:
import json
import random
import rdflib

import pandas as pd
from datetime import datetime
from zipfile import ZipFile

## Extracting RDF Entity Graphs

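The `extract_graphs` function takes the path to the exported Sinopia entities zip file, skips empty entries and anything under an `ld4p_` directory (resource templates), parses the JSON-LD of each remaining entity into its own graph, and returns the list of populated graphs. Entries that are not valid JSON are reported and skipped.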

In [18]:
def extract_graphs(zip_filepath):
    """Parse each JSON-LD entity of a Sinopia export zip into its own graph.

    Zero-length archive members are skipped, as is every member whose second
    path component starts with ``ld4p_`` (treated as resource templates).
    Members that are not valid JSON are reported on stdout and skipped.

    Args:
        zip_filepath: Location of the exported Sinopia entities zip file;
            passed unchanged to ``ZipFile``.

    Returns:
        list: One ``rdflib.ConjunctiveGraph`` per successfully parsed member,
        in archive order.
    """
    graphs = []
    with ZipFile(zip_filepath) as zip_file:
        for zip_info in zip_file.infolist():
            if zip_info.file_size < 1:
                continue
            # Filter out all Resource Templates. Members stored at the archive
            # root have no second path component, so guard the index instead
            # of letting `split("/")[1]` raise IndexError.
            path_parts = zip_info.filename.split("/")
            if len(path_parts) > 1 and path_parts[1].startswith("ld4p_"):
                continue
            with zip_file.open(zip_info) as zip_extract:
                raw_rdf = zip_extract.read()
            graph = rdflib.ConjunctiveGraph()
            try:
                graph.parse(data=raw_rdf, format='json-ld')
            except json.JSONDecodeError:
                # Non-JSON members (e.g. log files) end up here.
                print(f"Failed to parse {zip_info.filename}")
                continue
            graphs.append(graph)
    return graphs

In [19]:
# Load every parseable entity from the export archive. Despite the singular
# name, `dev_graph` holds the list of graphs returned by `extract_graphs`.
dev_graph = extract_graphs("../data/input/sinopia_export_dev_all_2020-01-19T00_01_10.700Z.zip")

Failed to parse sinopia_export_all_2020-01-19T00:01:10.700Z/ucdavis_2020-01-19T00:01:12.515Z/complete.log
Failed to parse sinopia_export_all_2020-01-19T00:01:10.700Z/boulder_2020-01-19T00:01:11.088Z/complete.log
Failed to parse sinopia_export_all_2020-01-19T00:01:10.700Z/pcc_2020-01-19T00:01:10.894Z/complete.log
Failed to parse sinopia_export_all_2020-01-19T00:01:10.700Z/cornell_2020-01-19T00:01:12.963Z/complete.log
Failed to parse sinopia_export_all_2020-01-19T00:01:10.700Z/dlc_2020-01-19T00:01:12.788Z/complete.log
Failed to parse sinopia_export_all_2020-01-19T00:01:10.700Z/harvard_2020-01-19T00:01:14.013Z/complete.log
Failed to parse sinopia_export_all_2020-01-19T00:01:10.700Z/chicago_2020-01-19T00:01:12.732Z/complete.log
Failed to parse sinopia_export_all_2020-01-19T00:01:10.700Z/alberta_2020-01-19T00:01:11.428Z/complete.log
Failed to parse sinopia_export_all_2020-01-19T00:01:10.700Z/ucsd_2020-01-19T00:01:12.437Z/complete.log
Failed to parse sinopia_export_all_2020-01-19T00:01:10.70

In [40]:
# For every resource template, tally how often each predicate appears on the
# subjects that declare that template:
#   {template id: {predicate IRI: occurrence count}}
resource_templates = {}
for graph in dev_graph:
    rt_query = graph.query("""SELECT ?subj ?resource_template WHERE {
 ?subj <http://sinopia.io/vocabulary/hasResourceTemplate> ?resource_template
}""")
    for subject, template in rt_query:
        # One counter dict per template, created the first time it is seen.
        predicate_counts = resource_templates.setdefault(str(template), {})
        for predicate in graph.predicates(subject=subject):
            pred_key = str(predicate)
            predicate_counts[pred_key] = predicate_counts.get(pred_key, 0) + 1
    

In [42]:
# Wrap the predicate counts tallied for the 'ld4p:RT:bf2:WorkTitle' template
# in a Series; as the cell's last expression it is shown as the cell output.
pd.Series(resource_templates['ld4p:RT:bf2:WorkTitle'])

http://www.w3.org/1999/02/22-rdf-syntax-ns#type     17
http://id.loc.gov/ontologies/bibframe/mainTitle     17
http://sinopia.io/vocabulary/hasResourceTemplate    17
http://www.w3.org/ns/prov#wasGeneratedBy            18
http://id.loc.gov/ontologies/bibframe/partNumber     1
dtype: int64

In [14]:
# `extract_graphs` returns a list of per-entity graphs, and a list has no
# `.query` method, so run the SPARQL query against each graph and pool the
# rows. A single graph is still accepted for backward compatibility.
# NOTE: this rebinds `resource_templates`, replacing whatever that name held.
entity_graphs = dev_graph if isinstance(dev_graph, list) else [dev_graph]
resource_templates = [
    row
    for graph in entity_graphs
    for row in graph.query("""SELECT ?resource_template WHERE {
 ?subj <http://sinopia.io/vocabulary/hasResourceTemplate> ?resource_template
}""")
]

In [17]:
# Print each row held in `resource_templates`, one per line.
for row in resource_templates:
    print(row)

(rdflib.term.Literal('ld4p:RT:bf2:Note'),)
(rdflib.term.Literal('ld4p:RT:bf2:ReferenceInstance'),)
(rdflib.term.Literal('ld4p:RT:bf2:WorkTitle'),)
(rdflib.term.Literal('Yale:RT:BF2:Topic'),)
(rdflib.term.Literal('ld4p:RT:bf2:Identifiers:EAN'),)
(rdflib.term.Literal('ld4p:RT:bf2:RareMat:AAT'),)
(rdflib.term.Literal('ld4p:RT:bf2:Cartographic:Instance'),)
(rdflib.term.Literal('ld4p:RT:bf2:Identifiers:Copyright'),)
(rdflib.term.Literal('WAU:RT:RDA:Work:monograph'),)
(rdflib.term.Literal('WAU:RT:RDA:Item:monograph'),)
(rdflib.term.Literal('WAU:RT:RDA:Work:monograph'),)
(rdflib.term.Literal('sinopia:RT:demo:Note'),)
(rdflib.term.Literal('WAU:RT:RDA:Manifestation:monograph'),)
(rdflib.term.Literal('WAU:RT:RDA:Manifestation:monograph'),)
(rdflib.term.Literal('Yale:RT:BF2:AdminMetadata'),)
(rdflib.term.Literal('ld4p:RT:bf2:Monograph:Work:Un-nested'),)
(rdflib.term.Literal('lc:RT:bf2:Title:AbbrTitle'),)
(rdflib.term.Literal('WAU:RT:RDA:Manifestation:monograph'),)
(rdflib.term.Literal('ld4p:RT:bf

Original size 9512 triples, current triples are 9680

In [19]:
# Preview the subject and predicate of the first few hundred triples.
# `extract_graphs` returns a list of graphs, so iterating `dev_graph` directly
# would try to unpack whole graphs as (s, p, o). Walk the triples of every
# graph instead; a single graph is still accepted for backward compatibility.
entity_graphs = dev_graph if isinstance(dev_graph, list) else [dev_graph]
all_triples = (triple for graph in entity_graphs for triple in graph)

count = 0
for s, p, o in all_triples:
    print(s, p)
    # The check runs after the print, so the row that trips it is still shown.
    if count > 250:
        break
    count += 1

b0 http://www.w3.org/1999/02/22-rdf-syntax-ns#_5
https://trellis.development.sinopia.io/repository/yale/135a9f82-38ab-4b28-9879-a3dee13a78bc http://www.w3.org/1999/02/22-rdf-syntax-ns#type
b0 http://www.w3.org/1999/02/22-rdf-syntax-ns#_2
b1 http://www.w3.org/1999/02/22-rdf-syntax-ns#type
b1 http://www.w3.org/ns/prov#atTime
b0 http://www.w3.org/1999/02/22-rdf-syntax-ns#_1
https://trellis.development.sinopia.io/repository/yale/135a9f82-38ab-4b28-9879-a3dee13a78bc http://www.loc.gov/mads/rdf/v1#componentList
b0 http://www.w3.org/1999/02/22-rdf-syntax-ns#_4
b1 http://www.w3.org/ns/prov#wasAssociatedWith
https://trellis.development.sinopia.io/repository/yale/135a9f82-38ab-4b28-9879-a3dee13a78bc http://sinopia.io/vocabulary/hasResourceTemplate
https://trellis.development.sinopia.io/repository/yale/135a9f82-38ab-4b28-9879-a3dee13a78bc http://www.w3.org/ns/prov#wasGeneratedBy
b0 http://www.w3.org/1999/02/22-rdf-syntax-ns#_3
b0 http://www.w3.org/1999/02/22-rdf-syntax-ns#type
https://trellis.dev