# Sinopia Entity Resource Template Classification

In [4]:
from __future__ import absolute_import, division, print_function, unicode_literals

import datetime
import os
import rdflib
import numpy as np
import pandas as pd
import tensorflow as tf
SINOPIA_BASE_PATH = "/Users/jpnelson/2019/sinopia-data/2019/09/24"

## Setup
Create two graphs, one for testing and one for training

In [3]:
LDP = rdflib.Namespace('http://www.w3.org/ns/ldp#')
SINOPIA_TRAIN = rdflib.ConjunctiveGraph()
SINOPIA_TRAIN.namespace_manager.bind("ldp", LDP)
SINOPIA_TEST = rdflib.ConjunctiveGraph()
SINOPIA_TEST.namespace_manager.bind("ldp", LDP)
SINOPIA_TESTING_PATH = "/Users/jpnelson/2019/sinopia-data/2019/09/24/test/"
for filename in next(os.walk(SINOPIA_TESTING_PATH))[2]:
    SINOPIA_TEST.parse(os.path.join(SINOPIA_TESTING_PATH, filename), format='turtle')
SINOPIA_TRAIN_PATH = "/Users/jpnelson/2019/sinopia-data/2019/09/24/train/"
for filename in next(os.walk(SINOPIA_TRAIN_PATH))[2]:
    SINOPIA_TRAIN.parse(os.path.join(SINOPIA_TRAIN_PATH, filename), format='turtle')


In [4]:
print(f"Testing triples: {len(SINOPIA_TEST):,}, Training triples: {len(SINOPIA_TRAIN):,}")

Testing triples: 1,081, Training triples: 5,637


Origin 8/28 Testing triples: 446, Training triples: 2,602

In [4]:
single_graph = rdflib.ConjunctiveGraph()
single_graph.parse("/Users/jpnelson/2019/sinopia-data/2019/09/24/train/00002.ttl", format='turtle')
print(single_graph.serialize(format='turtle').decode())

@prefix acl: <http://www.w3.org/ns/auth/acl#> .
@prefix as: <https://www.w3.org/ns/activitystreams#> .
@prefix dc: <http://purl.org/dc/terms/> .
@prefix dc11: <http://purl.org/dc/elements/1.1/> .
@prefix geo: <http://www.w3.org/2003/01/geo/wgs84_pos#> .
@prefix ldp: <http://www.w3.org/ns/ldp#> .
@prefix memento: <http://mementoweb.org/ns#> .
@prefix ns1: <http://sinopia.io/vocabulary/> .
@prefix ns2: <http://id.loc.gov/ontologies/bibframe/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix schema: <http://schema.org/> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix time: <http://www.w3.org/2006/time#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<https://trellis.stage.sinopia.io/repository/ucdavis/f179f138-8d01-47ac-8c9d-0e9c4ff7b3dd> a ns2

In [5]:
BF = rdflib.Namespace("http://id.loc.gov/ontologies/bibframe/")

In [45]:
def rdf_series(graph, subject):
    index_data = { 'subject': str(subject) }
    for p,o in graph.predicate_objects(subject=subject):
        if str(p) in index_data:
            if isinstance(index_data[str(p)], list):
                index_data[str(p)].append(str(o))
            else:
                index_data[str(p)] = [index_data[str(p)], str(o)]
        else:
            index_data[str(p)] = str(o)
    return pd.Series(index_data)

def load_graph(graph):
    data_series = []
    # Loads triples into subject data series
    for subject in set(graph.subjects()):
        data_series.append(rdf_series(graph, subject))
    # For now returns a list, should create data frames for each rdf:type
    return pd.DataFrame(data_series)

In [33]:
data_frames = load_graph(single_graph)

In [46]:
def load_data(path):
    for filename in next(os.walk(path))[-1]:
        graph = rdflib.Graph()
        graph.parse(os.path.join(path, filename), format='turtle')
        yield load_graph(graph)
        

In [52]:
training_iterator = load_data("../../sinopia-data/2019/10/24/train/")
first_df = next(training_iterator)
second_df = next(training_iterator)



In [55]:
first_df

Unnamed: 0,subject,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/2000/01/rdf-schema#label,http://id.loc.gov/ontologies/bibframe/geographicCoverage,http://id.loc.gov/ontologies/bibframe/classification,http://id.loc.gov/ontologies/bibframe/content,http://id.loc.gov/ontologies/bibframe/illustrativeContent,http://id.loc.gov/ontologies/bibframe/hasInstance,http://id.loc.gov/ontologies/bibframe/genreForm,http://www.w3.org/ns/prov#wasGeneratedBy,...,http://id.loc.gov/ontologies/bibframe/supplementaryContent,http://id.loc.gov/ontologies/bibframe/title,http://sinopia.io/vocabulary/hasResourceTemplate,http://id.loc.gov/ontologies/bibframe/tableOfContents,http://www.w3.org/ns/prov#wasAssociatedWith,http://www.w3.org/ns/prov#atTime,http://id.loc.gov/ontologies/bibframe/mainTitle,http://id.loc.gov/ontologies/bibframe/classificationPortion,http://bibframe.org/ontologies/Contribution,http://bibframe.org/ontologies/agent
0,fcb0680250abd48dfb31e96d2d0d287edb2,http://id.loc.gov/ontologies/bibframe/Suppleme...,Errata slip laid in volume 1,,,,,,,,...,,,,,,,,,,
1,fcb0680250abd48dfb31e96d2d0d287edb4,http://id.loc.gov/ontologies/bibframe/TableOfC...,volumen I. Estructura productiva y financiera ...,,,,,,,,...,,,,,,,,,,
2,https://trellis.stage.sinopia.io/repository/st...,http://id.loc.gov/ontologies/bibframe/Work,,Panama,fcb0680250abd48dfb31e96d2d0d287edb3,http://id.loc.gov/vocabulary/contentTypes/txt,http://id.loc.gov/vocabulary/millus/ill,https://trellis.stage.sinopia.io/repository/st...,http://id.loc.gov/authorities/genreForms/gf201...,fcb0680250abd48dfb31e96d2d0d287edb6,...,fcb0680250abd48dfb31e96d2d0d287edb2,fcb0680250abd48dfb31e96d2d0d287edb5,ld4p:RT:bf2:Monograph:Work:Un-nested,fcb0680250abd48dfb31e96d2d0d287edb4,,,,,,
3,fcb0680250abd48dfb31e96d2d0d287edb6,"[http://www.w3.org/ns/prov#Activity, https://w...",,,,,,,,,...,,,,,https://cognito-idp.us-west-2.amazonaws.com/us...,2019-10-22T15:58:54.365000+00:00,,,,
4,fcb0680250abd48dfb31e96d2d0d287edb5,http://id.loc.gov/ontologies/bibframe/Title,,,,,,,,,...,,,,,,,VI censos nacionales econÃ³micos,,,
5,fcb0680250abd48dfb31e96d2d0d287edb3,http://id.loc.gov/ontologies/bibframe/Classifi...,,,,,,,,,...,,,,,,,,HC147,,
6,fcb0680250abd48dfb31e96d2d0d287edb1,http://bibframe.org/ontologies/Contribution,,,,,,,,,...,,,,,,,,,http://id.loc.gov/vocabulary/relators/isb,http://id.loc.gov/authorities/names/no2010024374


In [56]:
second_df

Unnamed: 0,subject,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://bibframe.org/ontologies/agent,http://bibframe.org/ontologies/Contribution,http://www.w3.org/ns/prov#wasAssociatedWith,http://www.w3.org/ns/prov#atTime,http://id.loc.gov/ontologies/bibframe/content,http://id.loc.gov/ontologies/bibframe/contribution,http://id.loc.gov/ontologies/bibframe/subject,http://sinopia.io/vocabulary/hasResourceTemplate,...,http://id.loc.gov/ontologies/bibframe/classification,http://id.loc.gov/ontologies/bibframe/tableOfContents,http://id.loc.gov/ontologies/bibframe/language,http://id.loc.gov/ontologies/bibframe/title,http://id.loc.gov/ontologies/bibframe/hasInstance,http://www.w3.org/ns/prov#wasGeneratedBy,http://www.w3.org/2000/01/rdf-schema#label,http://id.loc.gov/ontologies/bibframe/classificationPortion,http://id.loc.gov/ontologies/bibframe/mainTitle,http://id.loc.gov/ontologies/bibframe/itemPortion
0,f7d01aa348bb24134a9e8d881181bda25b2,http://bibframe.org/ontologies/Contribution,http://id.loc.gov/authorities/names/n85258986,http://id.loc.gov/vocabulary/relators/edc,,,,,,,...,,,,,,,,,,
1,f7d01aa348bb24134a9e8d881181bda25b7,"[http://www.w3.org/ns/prov#Activity, https://w...",,,https://cognito-idp.us-west-2.amazonaws.com/us...,2019-10-11T00:00:33.035000+00:00,,,,,...,,,,,,,,,,
2,https://trellis.stage.sinopia.io/repository/st...,http://id.loc.gov/ontologies/bibframe/Work,,,,,http://id.loc.gov/vocabulary/contentTypes/txt,"[f7d01aa348bb24134a9e8d881181bda25b6, f7d01aa3...",[http://id.loc.gov/authorities/names/no9302517...,ld4p:RT:bf2:Monograph:Work:Un-nested,...,"[f7d01aa348bb24134a9e8d881181bda25b1, f7d01aa3...",f7d01aa348bb24134a9e8d881181bda25b5,http://id.loc.gov/vocabulary/languages/eng,f7d01aa348bb24134a9e8d881181bda25b4,https://trellis.stage.sinopia.io/repository/st...,f7d01aa348bb24134a9e8d881181bda25b7,,,,
3,f7d01aa348bb24134a9e8d881181bda25b5,http://id.loc.gov/ontologies/bibframe/TableOfC...,,,,,,,,,...,,,,,,,Music in the OCLC WorldCata: a replication / t...,,,
4,f7d01aa348bb24134a9e8d881181bda25b1,http://id.loc.gov/ontologies/bibframe/Classifi...,,,,,,,,,...,,,,,,,,025.3,,
5,f7d01aa348bb24134a9e8d881181bda25b4,http://id.loc.gov/ontologies/bibframe/Title,,,,,,,,,...,,,,,,,,,Directions in music cataloging,
6,f7d01aa348bb24134a9e8d881181bda25b6,http://bibframe.org/ontologies/Contribution,http://id.loc.gov/authorities/names/n2011050771,http://id.loc.gov/vocabulary/relators/edc,,,,,,,...,,,,,,,,,,
7,f7d01aa348bb24134a9e8d881181bda25b3,http://id.loc.gov/ontologies/bibframe/Classifi...,,,,,,,,,...,,,,,,,,ML111,,.D43 2010
