# Loader script for Neptune


In [10]:
# (c) 2022 Vanderbilt University, except Sparqler class: (c) Steven J. Baskauf (same license)
# This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf
# Date: 2022-06-06

# ----------------
# Configuration
# ----------------
import requests
import json
from time import sleep
import datetime
import os
from os.path import exists
import pandas as pd
import yaml

# AWS Python SDK
import boto3
# import botocore  # not used

import csv
from pathlib import Path
import sys
import re # regex
import urllib.parse

# Global variables
loader_endpoint_url = 'https://triplestore1.cluster-cml0hq81gymg.us-east-1.neptune.amazonaws.com:8182/sparql'
#reader_endpoint_url = 'https://5j6diw4i0h.execute-api.us-east-1.amazonaws.com/sparql'
reader_endpoint_url = 'https://sparql.vanderbilt.edu/sparql'
local_upload_directory = '/Users/baskausj/triplestore_upload/'
s3_bucket_name = 'triplestore-upload'
utc_offset = '-06:00'

# See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3.html#uploads
# local_file_path = local_directory + local_filename
# s3_file_key = local_filename
# s3_file_key = s3_iiif_project_directory + '/' + subdirectory + '/' + local_filename

def update_upload_status(row_index, dataframe, field, filename, message):
    """Record a progress message in one cell of the table, then checkpoint the table to disk.

    The message is written into the cell at (row_index, field) and the whole DataFrame is
    rewritten to the CSV at filename (without the index column) on every call, so the latest
    status is still on disk if the script crashes.
    """
    target_cell = (row_index, field)
    dataframe.at[target_cell] = message
    dataframe.to_csv(filename, index=False)

def parse_filename(filename):
    """Separates and returns the root of a filename and the extension as a tuple.

    The split is made at the last period and the period itself is not included in either part.
    A name that contains no period is returned whole as the root, with an empty string as the
    extension.
    """
    file_name_root, separator, extension = filename.rpartition('.')
    if not separator:
        # No period found: rpartition puts the entire name in the last slot, so hand it back as the root
        return filename, ''
    return file_name_root, extension

def csv_read(path, **kwargs):
    """Reads a CSV file into a Pandas DataFrame whose cells are all strings; blank cells stay empty strings.

    Keyword argument:
    rows -- when given, only that many leading rows are returned (as an independent deep copy).
            Intended for testing. When omitted, the full table is returned.
    """
    table = pd.read_csv(path, dtype=str, na_filter=False)
    if 'rows' not in kwargs:
        return table
    row_limit = kwargs['rows']
    return table.head(row_limit).copy(deep=True)

class Sparqler:
    """Build SPARQL queries of various sorts

    Parameters
    -----------
    useragent : str
        Required if using the Wikidata Query Service, otherwise optional.
        Use the form: appname/v.v (URL; mailto:email@domain.com)
        See https://meta.wikimedia.org/wiki/User-Agent_policy
    endpoint: URL
        Defaults to Wikidata Query Service if not provided.
    method: str
        Possible values are "post" (default) or "get". Use "get" if read-only query endpoint.
        Must be "post" for update endpoint.
    sleep: float
        Number of seconds to wait between queries. Defaults to 0.1

    Attributes
    ----------
    response : str
        Raw text of the most recent HTTP response. Set by query() and update(), which makes it
        useful for diagnosing a call that returned None.

    Required modules:
    -------------
    import requests
    import datetime
    from time import sleep
    """
    def __init__(self, **kwargs):
        # attributes for all methods
        self.http_method = kwargs.get('method', 'post') # default to POST
        self.endpoint = kwargs.get('endpoint', 'https://query.wikidata.org/sparql') # default to Wikidata endpoint
        if 'useragent' in kwargs:
            self.useragent = kwargs['useragent']
        elif self.endpoint == 'https://query.wikidata.org/sparql':
            print('You must provide a value for the useragent argument when using the Wikidata Query Service.')
            print()
            raise KeyboardInterrupt # Use keyboard interrupt instead of sys.exit() because it works in Jupyter notebooks
        else:
            self.useragent = ''
        self.sleep = kwargs.get('sleep', 0.1) # default throttling of 0.1 seconds

        # Headers shared by every request; the Accept header is added per call by query() and update()
        self.requestheader = {}
        if self.useragent:
            self.requestheader['User-Agent'] = self.useragent

        if self.http_method == 'post':
            self.requestheader['Content-Type'] = 'application/x-www-form-urlencoded'

    def query(self, query_string, **kwargs):
        """Sends a SPARQL query to the endpoint.

        Parameters
        ----------
        form : str
            The SPARQL query form.
            Possible values are: "select" (default), "ask", "construct", and "describe".
        mediatype: str
            The response media type (MIME type) of the query results.
            Some possible values for "select" and "ask" are: "application/sparql-results+json" (default) and "application/sparql-results+xml".
            Some possible values for "construct" and "describe" are: "text/turtle" and "application/rdf+xml".
            "text/turtle" is the default for "construct" only. For "describe" the default is still
            "application/sparql-results+json", so pass mediatype explicitly when using that form.
            See https://docs.aws.amazon.com/neptune/latest/userguide/sparql-media-type-support.html#sparql-serialization-formats-neptune-output
            for response serializations supported by Neptune.
        verbose: bool
            Prints status when True. Defaults to False.
        default: list of str
            The graphs to be merged to form the default graph. List items must be URIs in string form.
            If omitted, no graphs will be specified and default graph composition will be controlled by FROM clauses
            in the query itself.
            See https://www.w3.org/TR/sparql11-query/#namedGraphs and https://www.w3.org/TR/sparql11-protocol/#dataset
            for details.
        named: list of str
            Graphs that may be specified by IRI in a query. List items must be URIs in string form.
            If omitted, named graphs will be specified by FROM NAMED clauses in the query itself.

        Returns
        -------
        If the form is "select" and mediatype is "application/sparql-results+json", a list of dictionaries
        containing the data (the "bindings" of the results).
        If the form is "ask" and mediatype is "application/sparql-results+json", a boolean is returned.
        If the mediatype is "application/sparql-results+json" and the response is not JSON, or is JSON that
        lacks the expected results (e.g. an error report from the endpoint), None is returned. The raw
        response text is available in the response attribute.
        For other forms and mediatypes, the raw output is returned.

        Notes
        -----
        To get UTF-8 text in the SPARQL queries to work properly, send URL-encoded text rather than raw text.
        That is done automatically by the requests module for GET. I guess it also does it for POST when the
        data are sent as a dict with the urlencoded header.
        See SPARQL 1.1 protocol notes at https://www.w3.org/TR/sparql11-protocol/#query-operation
        """
        query_form = kwargs.get('form', 'select') # default to SELECT query form
        if 'mediatype' in kwargs:
            media_type = kwargs['mediatype']
        elif query_form == 'construct':
            media_type = 'text/turtle'
        else:
            media_type = 'application/sparql-results+json' # default for SELECT and ASK query forms
        self.requestheader['Accept'] = media_type
        verbose = kwargs.get('verbose', False) # default to no printouts

        # Build the payload dictionary (query and graph data) to be sent to the endpoint
        payload = {'query' : query_string}
        if 'default' in kwargs:
            payload['default-graph-uri'] = kwargs['default']
        if 'named' in kwargs:
            payload['named-graph-uri'] = kwargs['named']

        if verbose:
            print('querying SPARQL endpoint')

        start_time = datetime.datetime.now()
        if self.http_method == 'post':
            response = requests.post(self.endpoint, data=payload, headers=self.requestheader)
        else:
            response = requests.get(self.endpoint, params=payload, headers=self.requestheader)
        elapsed_time = (datetime.datetime.now() - start_time).total_seconds()
        self.response = response.text
        sleep(self.sleep) # Throttle as a courtesy to avoid hitting the endpoint too fast.

        if verbose:
            print('done retrieving data in', int(elapsed_time), 's')

        # Graph-producing forms and non-JSON media types are handed back as raw text
        if query_form == 'construct' or query_form == 'describe':
            return response.text
        if media_type != 'application/sparql-results+json':
            return response.text

        try:
            data = response.json()
        except ValueError: # JSON decoding errors are all subclasses of ValueError
            return None # Returns no value if an error.

        try:
            if query_form == 'select':
                # Extract the values from the response JSON
                return data['results']['bindings']
            return data['boolean'] # True or False result from ASK query
        except (KeyError, TypeError):
            # Valid JSON without the expected keys, e.g. an error object sent by the endpoint
            return None

    def update(self, request_string, **kwargs):
        """Sends a SPARQL update to the endpoint.

        The request is always sent by HTTP POST, regardless of the method given to the constructor.

        Parameters
        ----------
        mediatype : str
            The response media type (MIME type) from the endpoint after the update.
            Default is "application/json"; probably no need to use anything different.
        verbose: bool
            Prints status when True. Defaults to False.
        default: list of str
            The graphs to be merged to form the default graph. List items must be URIs in string form.
            If omitted, no graphs will be specified and default graph composition will be controlled by USING
            clauses in the query itself.
            See https://www.w3.org/TR/sparql11-update/#deleteInsert
            and https://www.w3.org/TR/sparql11-protocol/#update-operation for details.
        named: list of str
            Graphs that may be specified by IRI in the graph pattern. List items must be URIs in string form.
            If omitted, named graphs will be specified by USING NAMED clauses in the query itself.

        Returns
        -------
        If mediatype is "application/json", the decoded JSON response, or None if the response could not
        be decoded as JSON (e.g. plain text). For any other mediatype, the raw response text.
        """
        media_type = kwargs.get('mediatype', 'application/json') # default response type after update
        self.requestheader['Accept'] = media_type
        verbose = kwargs.get('verbose', False) # default to no printouts

        # Build the payload dictionary (update request and graph data) to be sent to the endpoint
        payload = {'update' : request_string}
        if 'default' in kwargs:
            payload['using-graph-uri'] = kwargs['default']
        if 'named' in kwargs:
            payload['using-named-graph-uri'] = kwargs['named']

        if verbose:
            print('  beginning update')

        start_time = datetime.datetime.now()
        response = requests.post(self.endpoint, data=payload, headers=self.requestheader)
        elapsed_time = (datetime.datetime.now() - start_time).total_seconds()
        self.response = response.text
        sleep(self.sleep) # Throttle as a courtesy to avoid hitting the endpoint too fast.

        if verbose:
            print('  done updating data in', int(elapsed_time), 's')

        if media_type != 'application/json':
            return response.text
        try:
            return response.json()
        except ValueError: # JSON decoding errors are all subclasses of ValueError
            return None # Returns no value if an error converting to JSON (e.g. plain text)

    def load(self, file_location, graph_uri, **kwargs):
        """Loads an RDF document into a specified graph.

        Builds a SPARQL Update LOAD ... INTO GRAPH request and sends it with update().

        Parameters
        ----------
        s3 : str
            Name of an AWS S3 bucket containing the file; file_location is then the key of the file
            in that bucket. Omit to load from a generic URL given as file_location.
        verbose: bool
            Prints status when True. Defaults to False.

        Returns
        -------
        Whatever update() returns for the request.

        Notes
        -----
        The triplestore may or may not rely on receiving a correct Content-Type header with the file to
        determine the type of serialization. Blazegraph requires it, AWS Neptune does not and apparently
        interprets serialization based on the file extension.
        """
        s3 = kwargs.get('s3', '')
        verbose = kwargs.get('verbose', False) # default to no printouts

        if s3:
            source_url = 'https://' + s3 + '.s3.amazonaws.com/' + file_location
        else:
            source_url = file_location
        request_string = 'LOAD <' + source_url + '> INTO GRAPH <' + graph_uri + '>'

        if verbose:
            print('Loading file:', file_location, ' into graph: ', graph_uri)
        return self.update(request_string, verbose=verbose)

    def drop(self, graph_uri, **kwargs):
        """Drop a specified graph.

        Builds a SPARQL Update DROP GRAPH request and sends it with update().

        Parameters
        ----------
        verbose: bool
            Prints status when True. Defaults to False.

        Returns
        -------
        Whatever update() returns for the request.
        """
        verbose = kwargs.get('verbose', False) # default to no printouts

        request_string = 'DROP GRAPH <' + graph_uri + '>'

        if verbose:
            print('Deleting graph:', graph_uri)
        return self.update(request_string, verbose=verbose)


# Connect and test the SSH tunnel

In [None]:
# Don't run this cell until we figure out how to make this command complete. The cell never stops executing and
# that locks up the rest of the notebook. For now, run it in a separate terminal window (left open).

# The EC2 server acting as the SSH tunnel has the IP address: 35.173.230.91

# The following lines need to be added to the ~/.ssh/config file:

"""
host neptune
 ForwardAgent yes
 User ec2-user
 HostName 172.31.90.204
 IdentitiesOnly yes
 IdentityFile ~/NeptuneSSHtunnel.pem
 LocalForward 8182 triplestore1.cluster-cml0hq81gymg.us-east-1.neptune.amazonaws.com:8182
"""

# and the NeptuneSSHtunnel.pem file needs to be in the home directory
# NOTE: The HostName is the public IP address of the EC2 server. It will change if the server is restarted.
# You can get it from the AWS console.

# !!!!! As of 2022-09-12, this does not work. Probably need to use an actual SSH Python
# library.

# Start up SSH tunnel
#os.system('ssh neptune -N &') # start as a background process
#os.system('echo $! > process_id.txt')

# Does not yet work

The following line is here for testing, but in production will be added at the end of the script

In [None]:
# Read back the process ID of the SSH tunnel; process_id.txt is the file the (currently
# commented-out) tunnel start-up commands write it to.
with open ('process_id.txt', 'rt') as file_object:
    process_id_string = file_object.read()
print(process_id_string)    

In [None]:
# Close the SSH tunnel by force-killing the process whose ID was read from process_id.txt
os.system('kill -9 {}'.format(process_id_string))


In [9]:
# Run this to make sure SSH tunnel is working
# A working tunnel answers the /status request with a JSON report; any failure is printed instead of raised.
try:
    response = requests.get(loader_endpoint_url + '/status')
    print(response.json())
except Exception as e:
    # Some exceptions carry no args; fall back to the repr so the handler itself cannot raise IndexError
    print('error', e.args[0] if e.args else repr(e))
  

{'acceptedQueryCount': 627, 'runningQueryCount': 0, 'queries': []}


# Combined uploader script

Combines code blocks previously developed to run in a single loop

## Setup and load data

In [11]:
# When True, the response from each update in the load loop is printed
print_dump = False

# Instantiate objects for data transfer
s3 = boto3.client('s3')
sve = Sparqler(endpoint=reader_endpoint_url, method='get', sleep=0)
neptune = Sparqler(endpoint=loader_endpoint_url, sleep=0)

# ---------
# Load data
# ---------

# Read the prefix declarations that get prepended to the named graph metadata update
with open('prefixes.txt', 'rt', encoding='utf-8') as file_object:
    prefixes_text = file_object.read()

# The YAML file describes how columns of named_graphs.csv become triples; stop if it is missing
if not exists('named_graphs_config.yml'):
    print('Must have a named_graphs_config.yml file for this script to operate.')
    print()
    raise KeyboardInterrupt
with open('named_graphs_config.yml') as file_object:
    config_data = yaml.safe_load(file_object)

# Table of the named graphs to be updated
named_graphs_df = csv_read(local_upload_directory + 'named_graphs.csv')

# Table relating each graph name to the data files holding its serializations
graph_file_associations_df = csv_read(local_upload_directory + 'graph_file_associations.csv')

# Stop at the first listed file that is missing from the upload directory
for index, file in graph_file_associations_df.iterrows():
    if exists(local_upload_directory + file['filename']):
        continue
    print(file['filename'], 'does not exist in the directory:', local_upload_directory)
    print('Put it there, or adjust the graph_file_associations.csv file, and try again.')
    print()
    raise KeyboardInterrupt

#    else:
#        print(file['filename'], 'exists')


## !! Run the following cell ONLY ONCE when the triplestore is set up !!

To prevent you from running it accidentally, you must uncomment the next-to-last line for the cell to actually load the data.

In [None]:
# One-time generation of service description metadata
# Describes the graph of graphs itself, the SPARQL service, and the collection of graphs it makes available.

# Timestamp is local clock time with the fixed utc_offset string appended
dcterms_modified = datetime.datetime.now().isoformat() + utc_offset

request_string = '''prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
prefix xsd: <http://www.w3.org/2001/XMLSchema#>
prefix dc: <http://purl.org/dc/elements/1.1/>
prefix dcterms: <http://purl.org/dc/terms/>
prefix sd: <http://www.w3.org/ns/sparql-service-description#>
prefix tdwgutility: <http://rs.tdwg.org/dwc/terms/attributes/>

insert data {
graph <https://sparql.vanderbilt.edu/graphs> {
<https://sparql.vanderbilt.edu/graphs> dcterms:modified "''' + dcterms_modified + '''"^^xsd:dateTime;
    sd:name <https://sparql.vanderbilt.edu/graphs>;
    dc:publisher "Vanderbilt Heard Libraries";
    a sd:NamedGraph;
    tdwgutility:status "production".
<https://sparql.vanderbilt.edu/sparql#service> a sd:Service;
    sd:endpoint <https://sparql.vanderbilt.edu/sparql>;
    sd:availableGraphs <https://sparql.vanderbilt.edu/graphs#collection>.
<https://sparql.vanderbilt.edu/graphs#collection> a sd:GraphCollection;
    sd:namedGraph <https://sparql.vanderbilt.edu/graphs>.
}}
'''

#print(request_string)

# Stays None while the update is disabled, so the final print neither fails nor shows a stale result from another cell
data = None
#data = neptune.update(request_string, verbose=True) # Uncomment this line to make functional
print(json.dumps(data, indent=2))


## Load data and update metadata

In [12]:
# For every named graph listed in named_graphs.csv: drop the old graph and its metadata, upload
# and load each associated file, then write fresh metadata into the graph of graphs.

# Both status tables are rewritten to these CSV files after every step, so progress survives a crash
named_graphs_csv_path = local_upload_directory + 'named_graphs.csv'
graph_file_associations_csv_path = local_upload_directory + 'graph_file_associations.csv'

for index, graph in named_graphs_df.iterrows():
    named_graph_iri = graph['sd:name']
    print('Processing named graph:', named_graph_iri)
    
    # Slice the rows whose graph name matches the current named graph
    matching_files_df = graph_file_associations_df[graph_file_associations_df['sd:name']==named_graph_iri]

    # Drop the existing version of the graph
    update_upload_status(index, named_graphs_df, 'load_status', named_graphs_csv_path, 'dropping previous version')
    start_time = datetime.datetime.now()
    data = neptune.drop(named_graph_iri, verbose=True)
    elapsed_time = (datetime.datetime.now() - start_time).total_seconds()
    update_upload_status(index, named_graphs_df, 'load_status', named_graphs_csv_path, 'dropped previous version in ' + str(elapsed_time) + 's')
    if print_dump:
        print(json.dumps(data, indent=2))
        print()        

    # Delete old metadata about that graph
    # (every triple in the graph of graphs that has the named graph IRI as its subject)
    request_string = '''delete where {
graph <https://sparql.vanderbilt.edu/graphs> {\n'''
    request_string += '<' + named_graph_iri + '> ?o ?p.\n'
    request_string += '}}\n'    
    #print(request_string)
    
    print('Deleting previous graph metadata')
    update_upload_status(index, named_graphs_df, 'load_status', named_graphs_csv_path, 'deleting previous metadata')
    data = neptune.update(request_string, verbose=True)
    update_upload_status(index, named_graphs_df, 'load_status', named_graphs_csv_path, 'deleted previous metadata')
    if print_dump:
        print(json.dumps(data, indent=2))
        print()
        
    # Step through each file to be loaded into that graph
    for i, matching_file in matching_files_df.iterrows():
    
        # Upload local file data to S3 bucket (the S3 key is the bare file name)
        print('Uploading to s3:', matching_file['filename'])
        update_upload_status(i, graph_file_associations_df, 's3_upload_status', graph_file_associations_csv_path, 'upload initiated')
        start_time = datetime.datetime.now()
        s3.upload_file(local_upload_directory + matching_file['filename'], s3_bucket_name, matching_file['filename'])
        elapsed_time = (datetime.datetime.now() - start_time).total_seconds()
        update_upload_status(i, graph_file_associations_df, 's3_upload_status', graph_file_associations_csv_path, 'upload complete in ' + str(elapsed_time) + 's')
        print('  upload complete in', elapsed_time, 's.')
        
        # Load file from S3 bucket to triplestore
        update_upload_status(i, graph_file_associations_df, 'graph_load_status', graph_file_associations_csv_path, 'load initiated')
        start_time = datetime.datetime.now()
        data = neptune.load(matching_file['filename'], named_graph_iri, s3=s3_bucket_name, verbose=True)
        elapsed_time = (datetime.datetime.now() - start_time).total_seconds()
        update_upload_status(i, graph_file_associations_df, 'graph_load_status', graph_file_associations_csv_path, 'load complete in ' + str(elapsed_time) + 's')
        if print_dump:
            print(json.dumps(data, indent=2))
            print()

        # Insert the linking triple from the named graph to the uploaded triples using the sd:graph property
        # These triples may be considered a dcat:Dataset https://www.w3.org/TR/vocab-dcat-2/#Class:Dataset
        # or a void:Dataset https://www.w3.org/TR/void/#dataset or both. 
        # The range of sd:graph also implies that it's a "graph description", a sd:Graph .
        request_string = '''prefix sd: <http://www.w3.org/ns/sparql-service-description#>
insert data {
graph <https://sparql.vanderbilt.edu/graphs> {
<''' + named_graph_iri + '> sd:graph <' + matching_file['sd:graph'] + '''>.
}}'''
        #print(request_string)

        print('Inserting link from named graph to graph description')
        update_upload_status(index, named_graphs_df, 'load_status', named_graphs_csv_path, 'inserting graph description link')
        data = neptune.update(request_string, verbose=True)
        update_upload_status(index, named_graphs_df, 'load_status', named_graphs_csv_path, 'inserted graph description link')
        if print_dump:
            print(json.dumps(data, indent=2))
            print()

    # Add updated metadata about that graph
    # NOTE(review): the timestamp is local clock time with the fixed utc_offset string appended;
    # presumably that offset is wrong while daylight saving time is in effect -- confirm.
    dcterms_modified = datetime.datetime.now().isoformat() + utc_offset

    request_string = prefixes_text + '''insert data {
graph <https://sparql.vanderbilt.edu/graphs> {\n'''
    request_string += '<' + named_graph_iri + '> dcterms:modified "' + dcterms_modified + '"^^xsd:dateTime.\n'
    # Each configured column contributes one triple whose predicate is the column header itself
    for column in config_data:
        if graph[column['column_header']]: # skip if the column has an empty string value
            triple_pattern = '<' + named_graph_iri + '> ' + column['column_header'] + ' '
            if column['object_type'] == 'iri':
                triple_pattern += '<' + graph[column['column_header']] + '>'
            elif column['object_type'] == 'curie':
                triple_pattern += graph[column['column_header']]
            elif column['object_type'] == 'literal':
                triple_pattern += '"' + graph[column['column_header']] + '"'
                if 'datatype' in column:
                    triple_pattern += '^^' + column['datatype']
                if 'lang' in column:
                    triple_pattern += '@' + column['lang']
            triple_pattern += '.\n'
            request_string += triple_pattern
    request_string += '}}\n'    
    #print(request_string)

    print('Updating current graph metadata')
    update_upload_status(index, named_graphs_df, 'load_status', named_graphs_csv_path, 'uploading current metadata')
    data = neptune.update(request_string, verbose=True)
    update_upload_status(index, named_graphs_df, 'load_status', named_graphs_csv_path, 'uploaded current metadata')
    if print_dump:
        print(json.dumps(data, indent=2))
        print()

    # Update the modified time for the graph of graphs data 
    # The xsd prefix must be declared here because the request uses xsd:dateTime and this
    # request does not include prefixes_text.
    dcterms_modified = datetime.datetime.now().isoformat() + utc_offset
    request_string = '''prefix dcterms: <http://purl.org/dc/terms/>
prefix xsd: <http://www.w3.org/2001/XMLSchema#>
with <https://sparql.vanderbilt.edu/graphs> 
delete {
<https://sparql.vanderbilt.edu/graphs> dcterms:modified ?dateTime.
}
insert {
<https://sparql.vanderbilt.edu/graphs> dcterms:modified "''' + dcterms_modified + '''"^^xsd:dateTime.
}
where {
<https://sparql.vanderbilt.edu/graphs> dcterms:modified ?dateTime.
}'''
    #print(request_string)

    print('Updating modified time for graph of graphs')
    update_upload_status(index, named_graphs_df, 'load_status', named_graphs_csv_path, 'updating modified time')
    data = neptune.update(request_string, verbose=True)
    update_upload_status(index, named_graphs_df, 'load_status', named_graphs_csv_path, 'updated modified time')
    if print_dump:
        print(json.dumps(data, indent=2))
        print()

    # Insert the linking triple from the GraphCollection to the named graph
    request_string = '''prefix sd: <http://www.w3.org/ns/sparql-service-description#>
insert data {
graph <https://sparql.vanderbilt.edu/graphs> {\n'''
    request_string += '<https://sparql.vanderbilt.edu/graphs#collection> sd:namedGraph <' + named_graph_iri + '>.\n'
    request_string += '}}\n'    
    #print(request_string)

    print('Inserting link to named graph')
    update_upload_status(index, named_graphs_df, 'load_status', named_graphs_csv_path, 'inserting link to named graph')
    data = neptune.update(request_string, verbose=True)
    update_upload_status(index, named_graphs_df, 'load_status', named_graphs_csv_path, 'update complete')
    if print_dump:
        print(json.dumps(data, indent=2))
        print()
    
    print('Update complete for graph')
    print()

# Close the SSH tunnel

print('All graphs loaded')


Processing named graph: https://spear-prosop.org
Deleting graph: https://spear-prosop.org
  beginning update
  done updating data in 0 s
Deleting previous graph metadata
  beginning update
  done updating data in 0 s
Uploading to s3: allSPEAR.ttl
  upload complete in 7.029591 s.
Loading file: allSPEAR.ttl  into graph:  https://spear-prosop.org
  beginning update
  done updating data in 0 s
Inserting link from named graph to graph description
  beginning update
  done updating data in 0 s
Updating current graph metadata
  beginning update
  done updating data in 0 s
Updating modified time for graph of graphs
  beginning update
  done updating data in 0 s
Inserting link to named graph
  beginning update
  done updating data in 0 s
Update complete for graph

All graphs loaded


# Utilities

Not necessarily part of the workflow, but can be used to accomplish various tasks

## List graphs loaded in Neptune

In [13]:
# List the IRI of every graph that holds at least one triple, in alphabetical order
sve = Sparqler(endpoint=reader_endpoint_url, method='get', sleep=0)
query_string = '''select distinct ?graph where {
graph ?graph {?s ?o ?p.}
}
order by ?graph'''

data = sve.query(query_string)
#print(json.dumps(data, indent=2))

if data is None:
    # query() returns None when the response cannot be interpreted; show the raw text instead of
    # failing with a TypeError when trying to loop over None
    print('Query failed. Raw response from endpoint:')
    print(sve.response)
else:
    for graph in data:
        print(graph['graph']['value'])

http://AATOut_1Subjects
http://AATOut_2Terms
http://AATOut_AssociativeRels
http://AATOut_ContribRels
http://AATOut_Contribs
http://AATOut_HierarchicalRels
http://AATOut_LCSHAlignment
http://AATOut_Lang_sameAs
http://AATOut_Notations
http://AATOut_ObsoleteSubjects
http://AATOut_OrderedCollections
http://AATOut_ScopeNotes
http://AATOut_SemanticLinks
http://AATOut_SourceRels
http://AATOut_Sources
http://AATOut_WikidataCoref
http://architecturasinica.org/place/building
http://architecturasinica.org/place/image
http://architecturasinica.org/place/site
http://bioimages.vanderbilt.edu/images
http://bioimages.vanderbilt.edu/organisms
http://bioimages.vanderbilt.edu/people
http://bioimages.vanderbilt.edu/places
http://bioimages.vanderbilt.edu/rdf/stdview
http://bioimages.vanderbilt.edu/specimens
http://bioimages.vanderbilt.edu/vocabs
http://lod.vanderbilt.edu/apulian/scene
http://lod.vanderbilt.edu/vase
http://nomenclature_2022-02-02
http://rs.tdwg.org/
http://rs.tdwg.org/cv/status
http://syria

## Examine some triples in a graph

In [14]:
# Fetch up to five distinct triples from the chosen graph and print the result as JSON.
# Note that in this triple pattern ?o sits in the predicate position and ?p in the object position.
query_string = '''select distinct ?s ?o ?p where {
?s ?o ?p.
}
limit 5'''

# The graphs listed here are passed as `default`, i.e. merged to form the default graph for the query
from_graphs = ['https://spear-prosop.org']
#from_graphs = ['put graph IRI here']
sve = Sparqler(endpoint=reader_endpoint_url, method='get', sleep=0)
data = sve.query(query_string, default=from_graphs)
print(json.dumps(data, indent=2))


[]


## Dump triples from a graph to a CSV

Note: SPARQL Query results serializations supported are listed at https://docs.aws.amazon.com/neptune/latest/userguide/sparql-media-type-support.html#sparql-serialization-formats-neptune-output

In [None]:
# Retrieve every distinct triple in the chosen graph as CSV text and save it to graph_dump.csv.
# Note that in this triple pattern ?o sits in the predicate position and ?p in the object position.
query_string = '''select distinct ?s ?o ?p where {
?s ?o ?p.
}'''

from_graphs = ['https://sparql.vanderbilt.edu/graphs']
sve = Sparqler(endpoint=reader_endpoint_url, method='get', sleep=0)
# A mediatype other than the JSON default makes query() return the raw response text
data = sve.query(query_string, default=from_graphs, mediatype='text/csv')

# newline='' writes the endpoint's line endings unchanged; without it, platforms that translate
# '\n' on output would alter the line endings of the CSV text
with open('graph_dump.csv', 'wt', encoding='utf-8', newline='') as file_object:
    file_object.write(data)


## Drop a graph

Be careful! Both dropping and reloading large graphs can take a long time!

In [None]:
# Replace the placeholder with the IRI of the graph to be deleted before running this cell
graph_name = 'put graph IRI here'
neptune = Sparqler(endpoint=loader_endpoint_url, sleep=0)
# Sends a DROP GRAPH update for that IRI and prints whatever the endpoint returned
data = neptune.drop(graph_name, verbose=True)
print(json.dumps(data, indent=2))


# Obsolete development code

Left for historical reference


## Test of loading a triple twice

We needed to know whether, in Neptune, adding a triple that is already present in a graph creates a second copy of the triple or whether a single triple represents both the existing and the added one.

Answer: It does not create a second identical triple.

In [None]:
# Writes go through the loader endpoint; the check afterwards goes through the read-only endpoint
neptune = Sparqler(endpoint=loader_endpoint_url, sleep=0)
sve = Sparqler(endpoint=reader_endpoint_url, method='get', sleep=0)

# Insert the very same triple twice into the <https://test> graph
request_string = '''insert data { 
graph <https://test> {
<https://test.com/s> <https://test.com/p> <https://test.com/o> . 
<https://test.com/s> <https://test.com/p> <https://test.com/o> . 
    }
}'''
data = neptune.update(request_string, verbose=True)
print(json.dumps(data, indent=2))
print()

# Read back everything in the test graph to see how many copies of the triple were stored
query_string = '''select * 
from <https://test>
where {?s ?o ?p}
'''
data = sve.query(query_string)
print(json.dumps(data, indent=2))
print()


## Use AWS SDK to upload files to the public S3 bucket

In [None]:
# ----------------
# Upload RDF triples to S3 bucket
# ----------------

s3 = boto3.client('s3')

# The upload directory should contain only RDF serializations that are to be uploaded.
# Macs sometimes generate a hidden .DS_Store file; it is filtered out here instead of being
# removed inside a bare try/except, which would also have hidden unrelated errors.
file_list = [name for name in os.listdir(local_upload_directory) if name != '.DS_Store']

for file_name in file_list:
    # os.path.join gives the right path whether or not local_upload_directory ends with a slash.
    local_file_path = os.path.join(local_upload_directory, file_name)
    # The S3 object key is simply the local file name (no key prefix / subdirectory).
    s3_file_key = file_name

    print('Uploading to s3:', file_name)
    start_time = datetime.datetime.now()
    s3.upload_file(local_file_path, s3_bucket_name, s3_file_key)
    elapsed_time = (datetime.datetime.now() - start_time).total_seconds()
    print('Upload complete in', elapsed_time, 's.')


## Load files in S3 into triplestore

For reference, Neptune's conformance with SPARQL is described at https://docs.aws.amazon.com/neptune/latest/userguide/feature-sparql-compliance.html . Most specifically, triples are always associated with a graph, even if one is not explicitly specified. See https://docs.aws.amazon.com/neptune/latest/userguide/feature-sparql-compliance.html#sparql-default-graph . The fallback URI `http://aws.amazon.com/neptune/vocab/v01/DefaultNamedGraph` is used.

Setup and load data

In [None]:
# SPARQL client pointed at the Neptune loader endpoint
neptune = Sparqler(endpoint=loader_endpoint_url, sleep=0)

# Text of the prefix declarations that gets prepended to update requests
with open('prefixes.txt', 'rt', encoding='utf-8') as file_object:
    prefixes_text = file_object.read()

# The YAML configuration describing the named graph CSV columns is mandatory.
# Raising KeyboardInterrupt halts the notebook run when it is missing.
if not exists('named_graphs_config.yml'):
    print('Must have a named_graphs_config.yml file for this script to operate.')
    print()
    raise KeyboardInterrupt
with open('named_graphs_config.yml') as file_object:
    config_data = yaml.safe_load(file_object)

# Table of the named graphs to be updated
named_graphs_df = csv_read('named_graphs.csv')

# Table relating graph names to the data files that contain their serializations
graph_file_associations_df = csv_read('graph_file_associations.csv')

# Stop before loading anything if one of the listed data files is absent from the upload directory
for index, file in graph_file_associations_df.iterrows():
    if exists(local_upload_directory + file['filename']):
        continue
    print(file['filename'], 'does not exist in the directory:', local_upload_directory)
    print('Put it there, or adjust the graph_file_associations.csv file, and try again.')
    print()
    raise KeyboardInterrupt


Load the triples from the S3 bucket into the triplestore

In [None]:
# Load each listed file from the S3 bucket into the graph named in the same row,
# printing the response of every load as it completes.
for index, file in graph_file_associations_df.iterrows():
    filename, graph_name = file['filename'], file['sd:name']
    data = neptune.load(filename, graph_name, s3=s3_bucket_name, verbose=True)
    print(json.dumps(data, indent=2))


## Update the metadata about graphs to reflect the changes

If there are old triples in the graphs metadata associated with the named graphs being updated, they are deleted from the <https://sparql.vanderbilt.edu/graphs> graph. See https://www.w3.org/TR/sparql11-update/#deleteWhere for reference

In [None]:
# Note that attempting to delete triples that don't exist or from a graph that doesn't exist has no effect
# and generates no errors. 

"""Generates a request string of the form:
with <https://sparql.vanderbilt.edu/graphs>
delete {
?graph ?predicate ?object.
}
where {
values ?graph {
<http://syriaca.org/bibl#graph>

...

}
?graph ?predicate ?object.
}
"""

# NOTE: prefixes not needed since the subject IRIs are unabbreviated.

# Each named graph IRI is supplied as its own VALUES row so that every subject is matched
# independently. Listing all subjects as triple patterns inside a single "delete where" block
# (each written as "<iri> ?o ?p.") makes the patterns share their variables: the block then only
# matches predicate/object pairs that ALL subjects have in common, and matches nothing at all if
# any one subject has no triples, so old metadata would be left behind when more than one graph
# is listed.
graph_iri_rows = ''.join('<' + graph_iri + '>\n' for graph_iri in named_graphs_df['sd:name'])

request_string = (
    'with <https://sparql.vanderbilt.edu/graphs>\n'
    'delete {\n'
    '?graph ?predicate ?object.\n'
    '}\n'
    'where {\n'
    'values ?graph {\n'
    + graph_iri_rows
    + '}\n'
    '?graph ?predicate ?object.\n'
    '}\n'
)
print(request_string)

data = neptune.update(request_string, verbose=True)
print(json.dumps(data, indent=2))


Generate the new metadata for the named graphs in the service description and load it into the <https://sparql.vanderbilt.edu/graphs> graph. See https://www.w3.org/TR/sparql11-update/#insertData for reference. Note: unlike the examples at https://www.w3.org/TR/sparql11-service-description/#example-turtle and https://www.w3.org/TR/void/#sparql-sd , we don't use a blank node for the sd:NamedGraph instance. This makes it easier to manage linking it to the graph collection.

In [None]:
"""Generates a request string of the form:

prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
prefix xsd: <http://www.w3.org/2001/XMLSchema#>
prefix sd: <http://www.w3.org/ns/sparql-service-description#>
prefix dc: <http://purl.org/dc/elements/1.1/>
prefix dcterms: <http://purl.org/dc/terms/>
prefix tdwgutility: <http://rs.tdwg.org/dwc/terms/attributes/>
insert data {
graph <https://sparql.vanderbilt.edu/graphs> {
<http://syriaca.org/bibl#graph> dcterms:modified "2022-06-06T14:40:05.430286-06:00"^^xsd:dateTime.
<http://syriaca.org/bibl#graph> sd:name <http://syriaca.org/bibl#graph>.
<http://syriaca.org/bibl#graph> sd:graph <http://syriaca.org/bibl>.
<http://syriaca.org/bibl#graph> dcterms:issued "2018-03-06"^^xsd:date.
<http://syriaca.org/bibl#graph> dc:publisher "syriaca.org".
<http://syriaca.org/bibl#graph> rdf:type sd:NamedGraph.
<http://syriaca.org/bibl#graph> dcterms:isPartOf <http://syriaca.org/>.
<http://syriaca.org/bibl#graph> tdwgutility:status "production".

...

}}
"""
def _escape_literal(text):
    """Escape a string so it can be placed between double quotes in a SPARQL literal."""
    # Backslashes must be handled first so the escapes added afterwards are not doubled.
    return (text.replace('\\', '\\\\')
                .replace('"', '\\"')
                .replace('\n', '\\n')
                .replace('\r', '\\r'))


def _object_term(value, column):
    """Serialize one table cell as the object of a triple.

    value -- the cell content (a string)
    column -- the configuration entry for the cell's column; 'object_type' must be
    'iri', 'curie', or 'literal', and literals may carry 'datatype' and/or 'lang' entries.

    Raises ValueError for any other object_type, since the triple would otherwise be written
    without an object and make the whole update request malformed.
    """
    object_type = column['object_type']
    if object_type == 'iri':
        return '<' + value + '>'
    if object_type == 'curie':
        return value
    if object_type == 'literal':
        term = '"' + _escape_literal(value) + '"'
        if 'datatype' in column:
            term += '^^' + column['datatype']
        if 'lang' in column:
            term += '@' + column['lang']
        return term
    raise ValueError('Unrecognized object_type "' + str(object_type) + '" for column ' + str(column['column_header']))


# Timestamp applied to every named graph updated in this run (local clock time plus the configured offset)
dcterms_modified = datetime.datetime.now().isoformat() + utc_offset

# Collect the triples in a list and join once at the end
triple_lines = []
for index, graph in named_graphs_df.iterrows():
    named_graph_iri = graph['sd:name']
    subject = '<' + named_graph_iri + '> '
    triple_lines.append(subject + 'dcterms:modified "' + dcterms_modified + '"^^xsd:dateTime.\n')
    # One triple per configured column; the column header doubles as the (abbreviated) predicate
    for column in config_data:
        header = column['column_header']
        triple_lines.append(subject + header + ' ' + _object_term(graph[header], column) + '.\n')

request_string = (
    prefixes_text
    + 'insert data {\ngraph <https://sparql.vanderbilt.edu/graphs> {\n'
    + ''.join(triple_lines)
    + '}}\n'
)
print(request_string)

data = neptune.update(request_string, verbose=True)
print(json.dumps(data, indent=2))


The named graphs need to be linked to the GraphCollection if they aren't linked already, and the modified dateTime for the https://sparql.vanderbilt.edu/graphs graph needs to be updated. The dateTime update uses the WITH clause so that the delete and the insert are carried out on that graph in a single DELETE/INSERT operation (the delete is applied before the insert). See https://www.w3.org/TR/sparql11-update/#deleteInsert for reference.

In [None]:
# Update the modified time for the graph of graphs data 
dcterms_modified = datetime.datetime.now().isoformat() + utc_offset

# The xsd: prefix must be declared because the inserted literal is typed as xsd:dateTime.
# The old value is matched inside OPTIONAL so that the new timestamp is still inserted when the
# graph has no dcterms:modified triple yet (with a plain pattern, no match means no insert).
request_string = '''prefix dcterms: <http://purl.org/dc/terms/>
prefix xsd: <http://www.w3.org/2001/XMLSchema#>

with <https://sparql.vanderbilt.edu/graphs> 
delete {
<https://sparql.vanderbilt.edu/graphs> dcterms:modified ?dateTime.
}
insert {
<https://sparql.vanderbilt.edu/graphs> dcterms:modified "''' + dcterms_modified + '''"^^xsd:dateTime.
}
where {
optional {<https://sparql.vanderbilt.edu/graphs> dcterms:modified ?dateTime.}
}'''

print(request_string)

data = neptune.update(request_string, verbose=True)
print(json.dumps(data, indent=2))
print()

# The same linking triples (GraphCollection sd:namedGraph <named graph>) are used by both the
# delete and the insert below, so build them once.
linking_triples = ''.join(
    '<https://sparql.vanderbilt.edu/graphs#collection> sd:namedGraph <' + graph_iri + '>.\n'
    for graph_iri in named_graphs_df['sd:name']
)

# Delete the links from the GraphCollection to the named graphs so that the triples won't get duplicated when added
# NOTE: this may not be necessary since it doesn't appear that duplicates get generated if we don't delete. 
# But it takes negligible time and doesn't hurt anything.

# NOTE 2022-06-07: Did additional testing (see code cell above) and it doesn't add the triple again. So this code
# isn't needed and will be removed from the main script.

# "delete data" is used because the triples contain no variables: it removes whichever of them
# exist. A "delete where" block with several ground triples only deletes when ALL of them are
# present, so it would do nothing if any one link were missing.
request_string = '''prefix sd: <http://www.w3.org/ns/sparql-service-description#>
delete data {
graph <https://sparql.vanderbilt.edu/graphs> {\n'''
request_string += linking_triples
request_string += '}}\n'
print(request_string)

data = neptune.update(request_string, verbose=True)
print(json.dumps(data, indent=2))
print()

# Now insert the linking triples
request_string = '''prefix sd: <http://www.w3.org/ns/sparql-service-description#>
insert data {
graph <https://sparql.vanderbilt.edu/graphs> {\n'''
request_string += linking_triples
request_string += '}}\n'
print(request_string)

data = neptune.update(request_string, verbose=True)
print(json.dumps(data, indent=2))
print()


## DO NOT RUN this cell! It will delete everything in the triplestore

Actually, I've left it with an error so that it will fail if it's accidentally run.

In [None]:
# POST SPARQL Update DROP ALL command
# Warning! This deletes all triples for all graphs !!!!
#
# NOTE: the notebook text for this cell says it was deliberately left with an error so that it
# fails if it is run by accident -- do not "fix" it. Two things here look like that safeguard
# (TODO confirm which one was intended):
#   * update_request_header_dictionary is not defined anywhere in this cell.
#   * '/sparql' is appended to loader_endpoint_url, whose configured value already ends in '/sparql'.

query_string = 'DROP ALL'
start_time = datetime.datetime.now()
# Sends the raw update text as the POST body; the response is expected to be JSON.
response = requests.post(loader_endpoint_url + '/sparql', data=query_string.encode('utf-8'), headers=update_request_header_dictionary)
#print(response.text) # uncomment to view the raw response, e.g. if you are getting an error
data = response.json()
print(json.dumps(data, indent = 2))

# Report the wall-clock time the request took (the printed label says "load" although this is a drop).
elapsed_time = (datetime.datetime.now() - start_time).total_seconds()
print('time to load:', int(elapsed_time), 's')

## Code to test the Neptune-specific loader

There did not seem to be any real benefit to using this loader with respect to speed. It also does not have any mechanism for specifying the graph into which triples should be loaded, so the only way to load them into a specific graph is to use the n-quads format. See following cell to convert from n-triples to n-quads.

In [None]:
# POST loader command

loader_request_header_dictionary = {
    'Accept': 'application/json',
    'Content-Type': 'application/json'
}

rdf_format = 'nquads'
#rdf_format = 'ntriples'

# The configured loader_endpoint_url ends with the SPARQL path ('/sparql'), but the bulk loader
# is reached at '/loader' on the same host, so strip the SPARQL path if it is present.
if loader_endpoint_url.endswith('/sparql'):
    neptune_base_url = loader_endpoint_url[:-len('/sparql')]
else:
    neptune_base_url = loader_endpoint_url

# Build the request body with json.dumps so that the values are always quoted/escaped correctly.
# NOTE: s3_file_key is whatever value the S3 upload cell left behind (the last file it uploaded),
# so that cell has to be run first.
load_request = {
    'source': 's3://' + s3_bucket_name + '/' + s3_file_key,
    'format': rdf_format,
    'iamRoleArn': 'arn:aws:iam::555751041262:role/neptuneloadfroms3',
    'region': 'us-east-1',
    'failOnError': 'FALSE',
    'parallelism': 'MEDIUM',
    'updateSingleCardinalityProperties': 'FALSE',
    'queueRequest': 'TRUE'
}
data = json.dumps(load_request, indent=2)

start_time = datetime.datetime.now()

# Send request to load
response = requests.post(neptune_base_url + '/loader', data=data.encode('utf-8'), headers=loader_request_header_dictionary)
data = response.json()
print(json.dumps(data, indent = 2))
load_id = data['payload']['loadId']

# Check status of load once per second.
# Keep polling only while the job is still waiting or running; any other status (completed,
# failed, cancelled, S3 errors, ...) is final. Testing for just LOAD_COMPLETED / LOAD_FAILED
# would loop forever on the other final statuses.
pending_statuses = {'LOAD_NOT_STARTED', 'LOAD_IN_QUEUE', 'LOAD_IN_PROGRESS'}
while True:
    response = requests.get(neptune_base_url + '/loader/' + load_id)
    data = response.json()
    #print(json.dumps(data, indent = 2))
    status = data['payload']['overallStatus']['status']
    print(status)
    if status not in pending_statuses:
        break
    sleep(1)
elapsed_time = (datetime.datetime.now() - start_time).total_seconds()
print('time to load:', int(elapsed_time), 's')

## Converting n-triples to n-quads to specify graph


In [None]:
# NOTE(review): local_directory is not among the global variables set in the configuration cell
# at the top of the notebook (which defines local_upload_directory) -- presumably it has to be
# set by hand before running this cell; confirm.

# Leave out the hidden .DS_Store file that Macs sometimes generate. Filtering (rather than
# calling list.remove) does not fail when the file is absent.
file_list = [name for name in os.listdir(local_directory) if name != '.DS_Store']

# Note: only files with the .nt extension are converted, and they are assumed to be n-triples.
# Skipping everything else matters on a re-run: an existing .nq file would otherwise be opened
# for writing under its own name while being read, which would empty it.
for file_name in file_list:
    file_name_root, extension = parse_filename(file_name)
    if extension != 'nt':
        print('skipping (not an .nt file):', file_name)
        continue
    print('converting:', file_name)
    output_filename = file_name_root + '.nq'
    # The graph IRI is derived from the file name (without extension) and closes each quad.
    graph_string = ' <http://' + file_name_root + '> .'

    # Context managers guarantee both files are closed even if the conversion fails part way.
    with open(local_directory + file_name, 'rt', encoding='utf-8') as input_file_object, \
            open(local_directory + output_filename, 'wt', encoding='utf-8') as output_file_object:
        for line in input_file_object:
            line_text = line.strip() # remove trailing newline
            if not line_text:
                # A blank line has no triple; appending the graph to it would create an invalid quad.
                continue
            line_text = line_text[:-1] + graph_string # remove period at end.
            print(line_text, file=output_file_object)

print('done')