In [43]:
import csv
import datetime
import sys

import pandas as pd
import rdflib
import requests

sys.path.append("../src")
import data_loader
import data_prep

In [2]:
# Harvest the production Sinopia API; later cells index the result by 'templates' and 'resources'.
production = data_loader.from_api("https://api.sinopia.io")

Started harvest of resources at 2020-10-13 16:20:07.085717 for https://api.sinopia.io
.........250..........500..........750..........1000..........1250..........1500..........1750..........2000..........2250..........2500..........2750..........3000..........3250..........3500.....
Finished at 2020-10-13 16:21:54.691369, total time 1.7833333333333334


In [3]:
# (number of templates, number of resources) in the harvest
tuple(len(production[kind]) for kind in ('templates', 'resources'))

(791, 2834)

## Resource Count by Template

In [7]:
# Map every harvested template id to an (initially empty) list that will collect
# the subjects of the resources built from that template.
template_totals = {template['meta']['id']: [] for template in production['templates']}

In [8]:
# Number of distinct template ids being tracked.
len(template_totals)

791

In [27]:
# Selects every (subject, template id) pair asserted via sinopia:hasResourceTemplate.
# NOTE(review): the doubled braces look like str.format() escapes, but this string is also
# handed to graph.query() unformatted elsewhere in the notebook — confirm how it is meant
# to be used before collapsing them.
sparql = """PREFIX sinopia: <http://sinopia.io/vocabulary/>

SELECT ?subject ?template
WHERE {{ ?subject sinopia:hasResourceTemplate ?template . }}"""

In [14]:
# Show the first harvested resource as Turtle.
# Graph.serialize() returns bytes on older rdflib releases and str on newer ones,
# so only decode when bytes actually come back.
first_resource_turtle = production['resources'][0]['graph'].serialize(format='turtle')
if isinstance(first_resource_turtle, bytes):
    first_resource_turtle = first_resource_turtle.decode()
print(first_resource_turtle)

@prefix ns1: <http://id.loc.gov/ontologies/bibframe/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix sinopia: <http://sinopia.io/vocabulary/> .

<https://api.sinopia.io/resource/d3ebee7d-d393-46b4-80dc-9ce720a15758> a ns1:Title ;
    ns1:mainTitle "my title"@en ;
    ns1:note [ a ns1:Note ;
            rdfs:label "dssdsdsds"@en ] ;
    ns1:partName "just something"@en ;
    ns1:partNumber "0202"@en ;
    sinopia:hasResourceTemplate "ld4p:RT:bf2:WorkTitle" .




In [28]:
# Try the template query against the first harvested resource graph.
result = production['resources'][0]['graph'].query(sparql)

In [29]:
# Inspect the raw variable bindings held by `result`.
result.bindings

[{rdflib.term.Variable('subject'): rdflib.term.URIRef('https://api.sinopia.io/resource/d3ebee7d-d393-46b4-80dc-9ce720a15758'), rdflib.term.Variable('template'): rdflib.term.Literal('ld4p:RT:bf2:WorkTitle')}]

In [23]:
# NOTE(review): stale cell. The current `sparql` string has no {template_id} placeholder,
# so this call ignores the argument; the recorded output below shows a different query
# (SELECT ?s ...) and appears to come from an earlier revision — consider removing.
sparql.format(template_id="ld4p:RT:bf2:WorkTitle")

'PREFIX sinopia: <http://sinopia.io/vocabulary/>\n\nSELECT ?s \nWHERE { ?s sinopia:hasResourceTemplate "ld4p:RT:bf2:WorkTitle" . }'

In [31]:
def extract_templates(graph):
    """Record which template each subject in ``graph`` was created from.

    Runs the module-level ``sparql`` query against ``graph`` and appends every
    matching subject (as a string) to ``template_totals[template_id]``.

    Raises:
        ValueError: if a referenced template id is not a key of ``template_totals``.
    """
    subject_var = rdflib.term.Variable('subject')
    template_var = rdflib.term.Variable('template')
    for binding in graph.query(sparql).bindings:
        subject = str(binding[subject_var])
        template_id = str(binding[template_var])
        if template_id not in template_totals:
            raise ValueError(f"{template_id} not in list")
        template_totals[template_id].append(subject)

In [32]:
# Tally every harvested resource under the template it references.
# The helper defined in this notebook is extract_templates(); the previous call to
# get_templates() raised NameError on a fresh kernel.
# Note: extract_templates() appends to template_totals, so re-running this cell
# without rebuilding template_totals counts each resource again.
for row in production['resources']:
    extract_templates(row.get('graph'))

In [33]:
def process_env(environment):
    """Run the template and authority extraction over every resource of a harvest.

    Args:
        environment: mapping with a 'resources' sequence; each entry is a mapping
            whose 'graph' value is passed to the extraction helpers.

    Side effects:
        Calls extract_templates() and extract_authority() for each resource, which
        append to the notebook's module-level tallies, and prints progress markers
        plus start/finish timestamps.
    """
    start = datetime.datetime.utcnow()
    print(f"Started process at {start}")
    # Tracked separately from the loop index so the summary line also works
    # for an empty harvest, where `i` would never be bound.
    processed = 0
    for i, row in enumerate(environment['resources']):
        graph = row.get('graph')
        extract_templates(graph)
        # The notebook only defines extract_authority (singular).
        extract_authority(graph)
        if not i % 10:
            print(".", end="")
        if not i % 100:
            print(f"{i}", end="")
        processed = i + 1
    end = datetime.datetime.utcnow()
    # Dividing the timedelta itself by 60 yields another timedelta, not minutes.
    elapsed_minutes = (end - start).total_seconds() / 60.
    print(f"Finished at {end}, total time {elapsed_minutes} minutes for {processed}")

In [41]:
# Write one "template,count" row per template that is used by at least one resource,
# echoing each row to the cell output.
# newline="" is required by the csv module so the writer controls line endings
# (otherwise blank rows appear on platforms that translate "\n").
with open("production-templates.csv", "w", newline="") as fo:
    templates_writer = csv.writer(fo)
    templates_writer.writerow(['template', 'count'])
    for key, value in template_totals.items():
        if value:
            templates_writer.writerow([key, len(value)])
            print(f"{key} {len(value)}")

UCSD:RT:BF2:NotatedMusic:Work:Un-nested 10
Yale:RT:BF2:Monograph:Instance:CtY-BR 104
WAU:RT:BF2:BFLC:AdminMetadata:Status 1
WAU:RT:RDA:Manifestation:monograph 107
ld4p:RT:bf2:WorkTitle 1
UAL:resourceTemplate:bf2:Monograph:Instance 2
sinopia:resourceTemplate:schema:Book 1
harvard:RT:bf2:RareMat:item:unnested 6
WAU:RT:RDA:Expression:eBook 4
ld4p:RT:bf2:RareMat:Item 1
WAU:RT:RDA:Expression:soundRecording 2
WAU:RT:RDA:Manifestation:graphic 2
UChicago:RT:bf2:Monograph:Work 32
ld4p:RT:bf2:Serial:Work:Un-nested 21
Yale:RT:BF2:Monograph:SuperWork:CtY-BR 103
CornellSinatra45:Instance 24
ld4p:RT:bf2:NotatedMusic:Instance:Un-nested 2
Yale:RT:BF2:Event:CtY 2
UCSD:RT:BF2:Monograph:Work:Un-nested 1
WAU:RT:RDA:Work:soundRecording 3
WAU:RT:RDA:Expression:etd 11
Yale:RT:BF2:Monograph:Instance:CtY 123
PMO:RT:bf2:SoundRecording:Work 1
Yale:RT:BF2:Monograph:Item-CtY:Nested:BoundWith 2
UMN:RT:bf2:Monograph:Instance:Un-nested 10
ld4p:RT:bf2:Serial:Instance:Un-nested 16
Yale:RT:BF2:Monograph:Work:CtY 124
WAU

## Number of Referenced Authorities in Templates

In [44]:
# Selects every (subject, authority) pair asserted via sinopia:hasAuthority.
authority_sparql = """PREFIX sinopia: <http://sinopia.io/vocabulary/>

SELECT ?subject ?authority
WHERE {{ ?subject sinopia:hasAuthority ?authority . }}"""

# Authority configuration published in the sinopia_editor repository.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() reports an HTTP failure here instead of as a JSON or
# KeyError while parsing an error page below.
existing_authorities_request = requests.get(
    'https://raw.githubusercontent.com/LD4P/sinopia_editor/master/static/authorityConfig.json',
    timeout=30,
)
existing_authorities_request.raise_for_status()

# authority URI -> subjects referencing it, for authorities listed in the config
existing_authorities = {
    row['uri']: [] for row in existing_authorities_request.json()
}
# authority URI -> subjects referencing it, for authorities absent from the config
depreciated_authorites = dict()

In [46]:
# Probe the authority query against a single resource graph (index 23);
# the recorded run returned no bindings for it.
query_result = production['resources'][23]['graph'].query(authority_sparql)
print(query_result.bindings)

[]


In [47]:
def extract_authority(graph):
    """Tally the authorities referenced in ``graph``.

    Runs the module-level ``authority_sparql`` query and, for each match, appends
    the subject (as a string) to ``existing_authorities[authority]`` when the
    authority is a known key there, otherwise to ``depreciated_authorites``
    (creating the entry on first sight).
    """
    authority_var = rdflib.term.Variable('authority')
    subject_var = rdflib.term.Variable('subject')
    for binding in graph.query(authority_sparql).bindings:
        authority = str(binding[authority_var])
        subject = str(binding[subject_var])
        if authority in existing_authorities:
            existing_authorities[authority].append(subject)
        else:
            depreciated_authorites.setdefault(authority, []).append(subject)
        

In [53]:
# Feed every harvested template graph through the authority tally.
for template in production['templates']:
    template_graph = template.get('graph')
    extract_authority(template_graph)

In [54]:
# Show only the configured authorities that were referenced at least once.
for authority, subjects in existing_authorities.items():
    if subjects:
        print(authority, len(subjects))

urn:ld4p:sinopia 2013
urn:ld4p:sinopia:bibframe:instance 11
urn:ld4p:sinopia:bibframe:work 10
urn:ld4p:sinopia:resourceTemplate 1
urn:ld4p:qa:cerl:person 4
urn:ld4p:qa:cerl:corporate 4
urn:ld4p:qa:cerl:imprint 6
urn:ld4p:qa:dbpedia 32
urn:discogs 1
urn:ld4p:qa:geonames 5
urn:ld4p:qa:geonames:place 1
urn:ld4p:qa:geonames:area_and_place 40
urn:ld4p:qa:gettyaat 11
urn:ld4p:qa:gettyaat:Materials 1
urn:ld4p:qa:gettyaat:Materials__Materials 2
urn:ld4p:qa:gettyaat:Objects 12
urn:ld4p:qa:gettyaat:Objects__Components 12
urn:ld4p:qa:gettyaat:Objects__Object_Genres 7
urn:ld4p:qa:gettyaat:Physical_Attributes__Conditions_and_Effects 1
urn:ld4p:qa:isni 21
urn:ld4p:qa:ligatus 3
urn:ld4p:qa:demographics 34
urn:ld4p:qa:performance 14
urn:ld4p:qa:genres 52
urn:ld4p:qa:genres:active 2
urn:ld4p:qa:names 291
urn:ld4p:qa:names:person 409
urn:ld4p:qa:names:organization 245
urn:ld4p:qa:names:family 29
urn:ld4p:qa:names:geographic 74
urn:ld4p:qa:names:conference 14
urn:ld4p:qa:subjects 141
urn:ld4p:qa:mesh:sub

In [58]:
# Write one "authority,count" row per configured authority that is referenced at
# least once, echoing each row to the cell output.
# newline="" is required by the csv module so the writer controls line endings.
with open("production-templates-authorities.csv", "w", newline="") as fo:
    authorities_writer = csv.writer(fo)
    authorities_writer.writerow(['authority', 'count'])
    for key, value in existing_authorities.items():
        if value:
            authorities_writer.writerow([key, len(value)])
            print(f"{key} {len(value)}")

urn:ld4p:sinopia 2013
urn:ld4p:sinopia:bibframe:instance 11
urn:ld4p:sinopia:bibframe:work 10
urn:ld4p:sinopia:resourceTemplate 1
urn:ld4p:qa:cerl:person 4
urn:ld4p:qa:cerl:corporate 4
urn:ld4p:qa:cerl:imprint 6
urn:ld4p:qa:dbpedia 32
urn:discogs 1
urn:ld4p:qa:geonames 5
urn:ld4p:qa:geonames:place 1
urn:ld4p:qa:geonames:area_and_place 40
urn:ld4p:qa:gettyaat 11
urn:ld4p:qa:gettyaat:Materials 1
urn:ld4p:qa:gettyaat:Materials__Materials 2
urn:ld4p:qa:gettyaat:Objects 12
urn:ld4p:qa:gettyaat:Objects__Components 12
urn:ld4p:qa:gettyaat:Objects__Object_Genres 7
urn:ld4p:qa:gettyaat:Physical_Attributes__Conditions_and_Effects 1
urn:ld4p:qa:isni 21
urn:ld4p:qa:ligatus 3
urn:ld4p:qa:demographics 34
urn:ld4p:qa:performance 14
urn:ld4p:qa:genres 52
urn:ld4p:qa:genres:active 2
urn:ld4p:qa:names 291
urn:ld4p:qa:names:person 409
urn:ld4p:qa:names:organization 245
urn:ld4p:qa:names:family 29
urn:ld4p:qa:names:geographic 74
urn:ld4p:qa:names:conference 14
urn:ld4p:qa:subjects 141
urn:ld4p:qa:mesh:sub

In [59]:
# Write one "authority,count" row per referenced authority that is missing from the
# editor's authority config, echoing each row to the cell output.
# newline="" is required by the csv module so the writer controls line endings.
with open("production-templates-depreciated-authorities.csv", "w", newline="") as fo:
    dep_authorities_writer = csv.writer(fo)
    dep_authorities_writer.writerow(['authority', 'count'])
    for key, value in depreciated_authorites.items():
        if value:
            dep_authorities_writer.writerow([key, len(value)])
            print(f"{key} {len(value)}")

file:///authorityConfig.json 1
file:///propertyAttribute.json 1
file:///propertyType.json 1
urn:ld4p:qa:subjects:person 5
http://mlvlp04.loc.gov:8230/resources/works 5
http://id.loc.gov/vocabulary/mbroadstd 1
http://id.loc.gov/vocabulary/organizations 3
http://id.loc.gov/vocabulary/descriptionConventions 1
http://id.loc.gov/vocabulary/languages 2
http://id.loc.gov/vocabulary/mcapturestorage 1
http://id.loc.gov/vocabulary/millus 2
http://id.loc.gov/vocabulary/contentTypes 2
http://id.loc.gov/vocabulary/mregencoding 1
http://id.loc.gov/vocabulary/geographicAreas 1
http://id.loc.gov/vocabulary/mrecmedium 1
http://id.loc.gov/vocabulary/mplayback 1
http://id.loc.gov/vocabulary/mstatus 4
http://id.loc.gov/vocabulary/subjectSchemes 2
https://id.loc.gov/authorities/performanceMediums 3
urn:ld4p:qa:subjects:place 1
urn:ld4p:qa:names:place 1
http://id.loc.gov/vocabulary/mproduction 1
http://id.loc.gov/vocabulary/maudience 1
http://id.loc.gov/vocabulary/classSchemes 1
http://id.loc.gov/vocabulary