In [None]:
# ----------------
# Configuration
# ----------------

import json
import requests
import csv
from pathlib import Path
from time import sleep
import sys
import re # regex
import datetime
import os
import pandas as pd
import urllib.parse

# AWS Python SDK
import boto3
import botocore

# Base URL of the Neptune endpoint. Later cells append '/status', '/sparql'
# and '/loader' to it.
loader_endpoint_url = 'https://triplestore1.cluster-cml0hq81gymg.us-east-1.neptune.amazonaws.com:8182'

# Directory holding the RDF files, and the single file used by the cells that
# operate on one file only.
local_directory = '/Users/baskausj/triplestore_upload/'
#local_filename = 'nomenclature_2022-02-02.jsonld'
local_filename = 'AATOut_2Terms.nq'

s3_bucket_name = 'triplestore-upload'

# See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3.html#uploads
# The object key is the bare file name, i.e. the file sits at the bucket root.
s3_file_key = local_filename
# s3_file_key = s3_iiif_project_directory + '/' + subdirectory + '/' + local_filename
local_file_path = ''.join([local_directory, local_filename])

# Named graph targeted by the single-graph cells.
graph_iri = 'http://aatterms'

# Headers sent with every SPARQL Update request.
update_request_header_dictionary = dict([
    ('Accept', 'application/json'),
    ('Content-Type', 'application/sparql-update'),
])

def parse_filename(filename):
    """Split a file name into its root and its extension.

    The split happens at the LAST dot, so 'a.b.nq' yields ('a.b', 'nq').

    Parameters
    ----------
    filename : str
        File name (not a full path) to split.

    Returns
    -------
    tuple of (str, str)
        (file_name_root, extension). The extension does not include the dot.
        A name that contains no dot at all is returned whole as the root with
        an empty extension.
    """
    file_name_root, separator, extension = filename.rpartition('.')
    if not separator:
        # No dot in the name: treat the whole name as the root. Splitting on
        # '.' here would return an empty root and the name as the "extension",
        # which in turn produces an empty graph name downstream.
        return filename, ''
    return file_name_root, extension
                            

In [None]:
# ----------------
# Upload RDF triples to s3 bucket
# ----------------
# NOTE: assuming they are n-triples, change extension if not.

s3 = boto3.client('s3')

# Keep regular files only and leave out the macOS '.DS_Store' file. Filtering
# (rather than list.remove) does not raise when '.DS_Store' is absent.
file_list = [
    name for name in os.listdir(local_directory)
    if name != '.DS_Store' and os.path.isfile(os.path.join(local_directory, name))
]

for file_name in file_list:
    local_file_path = os.path.join(local_directory, file_name)

    print('Uploading to s3:', file_name)
    # Each object is keyed by its own file name. Using one fixed key for every
    # file would make each upload overwrite the previous one in the bucket.
    # The configured s3_file_key is deliberately left untouched so cells that
    # read it afterwards still see the configured value.
    s3.upload_file(local_file_path, s3_bucket_name, file_name)
    print('Upload complete')


In [None]:
# Start up SSH tunnel in the background.
#
# `ssh -N` runs no remote command and stays connected until it is killed, and
# os.system() waits for the command to exit, so using os.system() here keeps
# the cell executing forever. Popen starts the process and returns at once.
import subprocess

ssh_tunnel_process = subprocess.Popen(['ssh', 'neptune', '-N'])

# To close the tunnel later, run: ssh_tunnel_process.terminate()
# NOTE: re-running this cell starts an additional ssh process.

In [None]:
# Run this to make sure SSH tunnel is working
try:
    # Without a timeout the request can wait indefinitely when the endpoint is
    # unreachable, which is exactly the situation this cell is meant to detect.
    response = requests.get(loader_endpoint_url + '/status', timeout=10)
    print(response.json())
except Exception as e:
    # Print the exception itself: e.args can be empty, in which case e.args[0]
    # would raise IndexError from inside the handler and hide the real error.
    print('error', e)
  

In [None]:
# POST SPARQL Update LOAD command
# NOTE: assuming they are n-triples, change extension if not.
#
# For every file in local_directory, asks Neptune to LOAD the S3 object with
# the same name into a named graph 'http://<file name root>'.
# s3_file_key and graph_iri are left set to the values for the last file.

# Keep regular files only and leave out the macOS '.DS_Store' file. Filtering
# (rather than list.remove) does not raise when '.DS_Store' is absent.
file_list = [
    name for name in os.listdir(local_directory)
    if name != '.DS_Store' and os.path.isfile(os.path.join(local_directory, name))
]

total_start_time = datetime.datetime.now()
for file_name in file_list:
    file_name_root = parse_filename(file_name)[0]
    s3_file_key = file_name
    graph_iri = 'http://' + file_name_root

    print('Loading into Neptune:', file_name)

    # Percent-encode the key so that file names containing spaces or other
    # reserved characters still give a valid IRI inside the <...> brackets.
    # '/' is kept as-is, so keys with a prefix path continue to work.
    source_url = 'https://' + s3_bucket_name + '.s3.amazonaws.com/' + urllib.parse.quote(s3_file_key)
    query_string = 'LOAD <' + source_url + '> INTO GRAPH <' + graph_iri + '>'
    start_time = datetime.datetime.now()
    response = requests.post(loader_endpoint_url + '/sparql', data=query_string.encode('utf-8'), headers=update_request_header_dictionary)
    try:
        data = response.json()
        print(json.dumps(data, indent = 2))
    except ValueError:
        # The body was not JSON (e.g. an HTML/plain-text error page). Show the
        # raw response and move on instead of aborting the remaining files.
        print('HTTP status:', response.status_code)
        print(response.text)

    elapsed_time = (datetime.datetime.now() - start_time).total_seconds()
    print('time to load:', int(elapsed_time), 's')
    print()

total_elapsed_time = (datetime.datetime.now() - total_start_time).total_seconds()
print(total_elapsed_time)
print('done')

In [None]:
# POST SPARQL Update DROP GRAPH command
# Removes the named graph currently held in graph_iri and reports how long
# the request took.

query_string = ''.join(['DROP GRAPH <', graph_iri, '>'])
start_time = datetime.datetime.now()
response = requests.post(
    loader_endpoint_url + '/sparql',
    data=bytes(query_string, 'utf-8'),
    headers=update_request_header_dictionary,
)
#print(response.text) # uncomment to view the raw response, e.g. if you are getting an error
data = response.json()
print(json.dumps(data, indent=2))

time_spent = datetime.datetime.now() - start_time
elapsed_time = time_spent.total_seconds()
print('time to delete:', int(elapsed_time), 's')

In [None]:
# POST SPARQL Update DROP ALL command
# Warning! This deletes all triples for all graphs !!!!

query_string = 'DROP ALL'
start_time = datetime.datetime.now()
response = requests.post(loader_endpoint_url + '/sparql', data=query_string.encode('utf-8'), headers=update_request_header_dictionary)
#print(response.text) # uncomment to view the raw response, e.g. if you are getting an error
data = response.json()
print(json.dumps(data, indent = 2))

elapsed_time = (datetime.datetime.now() - start_time).total_seconds()
# This cell deletes data, so report it as a delete (matches the DROP GRAPH cell).
print('time to delete:', int(elapsed_time), 's')

In [None]:
# POST loader command
# Submits a bulk-load job for s3://<s3_bucket_name>/<s3_file_key> to the
# Neptune loader endpoint, then polls the job until it stops running.

loader_request_header_dictionary = {
        'Accept' : 'application/json',
        'Content-Type': 'application/json'
    }

rdf_format = 'nquads'
#rdf_format = 'ntriples'

# Build the body with json.dumps so that quotes or backslashes in the bucket
# name or key are escaped properly instead of producing invalid JSON.
loader_request_body = json.dumps({
    'source': 's3://' + s3_bucket_name + '/' + s3_file_key,
    'format': rdf_format,
    'iamRoleArn': 'arn:aws:iam::555751041262:role/neptuneloadfroms3',
    'region': 'us-east-1',
    'failOnError': 'FALSE',
    'parallelism': 'MEDIUM',
    'updateSingleCardinalityProperties': 'FALSE',
    'queueRequest': 'TRUE'
}, indent=2)

start_time = datetime.datetime.now()

# Send request to load
response = requests.post(loader_endpoint_url + '/loader', data=loader_request_body.encode('utf-8'), headers=loader_request_header_dictionary)
data = response.json()
print(json.dumps(data, indent = 2))
load_id = data['payload']['loadId']

# Statuses that mean the job has not finished yet. Every other status
# (completed, failed, cancelled, S3 errors, ...) is treated as final, so the
# loop cannot spin forever on a terminal status it does not know by name.
pending_statuses = ('LOAD_NOT_STARTED', 'LOAD_IN_QUEUE', 'LOAD_IN_PROGRESS')

# Check status of load once per second
completed = False
while not completed:
    response = requests.get(loader_endpoint_url + '/loader/' + load_id)
    data = response.json()
    #print(json.dumps(data, indent = 2))
    load_status = data['payload']['overallStatus']['status']
    print(load_status)
    if load_status in pending_statuses:
        sleep(1)
    else:
        # Finished: leave at once so the wait does not inflate the timing.
        completed = True
elapsed_time = (datetime.datetime.now() - start_time).total_seconds()
print('time to load:', int(elapsed_time), 's')

# Converting n-triples to n-quads to specify graph


In [None]:
# Keep regular files only and leave out the macOS '.DS_Store' file. Filtering
# (rather than list.remove) does not raise when '.DS_Store' is absent.
file_list = [
    name for name in os.listdir(local_directory)
    if name != '.DS_Store' and os.path.isfile(os.path.join(local_directory, name))
]

# Note: assumes all files are n-triples serialization with .nt file extensions.
# Each '<root>.nt' is rewritten as '<root>.nq' in the same directory, with the
# graph IRI 'http://<root>' appended to every statement.
for file_name in file_list:
    file_name_root = parse_filename(file_name)[0]
    output_filename = file_name_root + '.nq'
    if output_filename == file_name:
        # Already an .nq file (e.g. left over from an earlier run). Opening it
        # for writing would truncate it before it could be read.
        print('skipping:', file_name)
        continue

    print('converting:', file_name)
    graph_string = ' <http://' + file_name_root + '> .'

    # The input is opened first so a missing/unreadable input does not leave
    # an empty output file behind; 'with' closes both files even on error.
    with open(local_directory + file_name, 'rt', encoding='utf-8') as input_file_object, \
         open(local_directory + output_filename, 'wt', encoding='utf-8') as output_file_object:
        for line in input_file_object:
            line_text = line.strip() # remove trailing newline
            if not line_text:
                # Blank line: there is no statement to extend.
                continue
            if line_text.startswith('#'):
                # Comment line: copy through unchanged.
                print(line_text, file=output_file_object)
                continue
            line_text = line_text[:-1] + graph_string # remove period at end.
            print(line_text, file=output_file_object)

print('done')