In [None]:
# ----------------
# Configuration
# ----------------

import csv
import datetime
import json
import os
import re # regex
import subprocess
import sys
import urllib.parse
from pathlib import Path
from time import sleep

import pandas as pd
import requests

# AWS Python SDK
import boto3
import botocore

# Base URL of the Neptune cluster; request paths such as '/status',
# '/sparql' and '/loader' are appended to it by the cells below.
loader_endpoint_url = 'https://triplestore1.cluster-cml0hq81gymg.us-east-1.neptune.amazonaws.com:8182'

# Name of the bucket that receives the uploaded RDF file
s3_bucket_name = 'triplestore-upload'

# Name of the local RDF file to upload (swap the comment to pick the other file)
#local_filename = 'nomenclature_2022-02-02.jsonld'
local_filename = 'AATOut_Contribs.nt'

# See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3.html#uploads
# Full local path of the file, and the key it is stored under in the bucket
local_file_path = f'/Users/baskausj/triplestore_upload/{local_filename}'
s3_file_key = local_filename
# s3_file_key = s3_iiif_project_directory + '/' + subdirectory + '/' + local_filename

# IRI of the named graph used by the LOAD and DROP GRAPH cells
graph_iri = 'http://test'

# HTTP headers sent with every SPARQL Update request
update_request_header_dictionary = dict([
    ('Accept', 'application/json'),
    ('Content-Type', 'application/sparql-update'),
])


In [None]:
# ----------------
# Copy the local RDF file into the s3 bucket
# ----------------

# The client is kept in `s3` so it can be reused after this cell has run.
s3 = boto3.client('s3')
print(f'Uploading to s3: {local_filename}')
s3.upload_file(
    Filename=local_file_path,
    Bucket=s3_bucket_name,
    Key=s3_file_key,
)
print('Upload complete')


In [None]:
# Start up SSH tunnel
#
# `ssh -N` runs no remote command and stays alive until it is killed, so running it
# with os.system() blocks the cell forever and locks up the rest of the notebook.
# Launching it with subprocess.Popen returns immediately and leaves ssh running as a
# background child process. The handle is kept in `ssh_tunnel`, so the tunnel can be
# closed later with ssh_tunnel.terminate().
ssh_tunnel = subprocess.Popen(
    ['ssh', 'neptune', '-N'],
    stdin=subprocess.DEVNULL  # ssh must not wait on the notebook's stdin
)

# Give ssh a moment to fail fast (unknown host alias, refused connection, bad key, ...)
sleep(2)
if ssh_tunnel.poll() is None:
    print('SSH tunnel running, pid', ssh_tunnel.pid)
else:
    print('SSH tunnel exited immediately with return code', ssh_tunnel.returncode)

In [None]:
# Run this to make sure SSH tunnel is working
#
# A timeout is set so the cell reports an error instead of hanging when the tunnel
# is down. Only network errors (RequestException) and a non-JSON body (ValueError)
# are caught; anything else is a programming error and should surface normally.
try:
    response = requests.get(loader_endpoint_url + '/status', timeout=10)
    print(response.json())
except (requests.exceptions.RequestException, ValueError) as e:
    # Print the exception itself: e.args can be empty, so e.args[0] is not safe
    print('error', e)
  

In [None]:
# POST SPARQL Update LOAD command
#
# Asks the endpoint to load the uploaded s3 object into the named graph graph_iri.
# The object key is percent-encoded so that keys containing spaces or other
# reserved characters still produce a valid IRI. quote() leaves '/' untouched, so
# keys with sub-directories keep working and plain keys are unchanged.

source_url = 'https://' + s3_bucket_name + '.s3.amazonaws.com/' + urllib.parse.quote(s3_file_key)
query_string = 'LOAD <' + source_url + '> INTO GRAPH <' + graph_iri + '>'
response = requests.post(loader_endpoint_url + '/sparql', data=query_string.encode('utf-8'), headers=update_request_header_dictionary)
#print(response.text) # uncomment to view the raw response, e.g. if you are getting an error
data = response.json()
print(json.dumps(data, indent = 2))


In [None]:
# Send a SPARQL Update DROP GRAPH request for the named graph graph_iri

query_string = 'DROP GRAPH <{}>'.format(graph_iri)
response = requests.post(
    loader_endpoint_url + '/sparql',
    data=query_string.encode('utf-8'),
    headers=update_request_header_dictionary,
)
#print(response.text) # uncomment to view the raw response, e.g. if you are getting an error
data = response.json()
print(json.dumps(data, indent=2))


In [None]:
# Send a SPARQL Update DROP ALL request
# Warning! Unlike DROP GRAPH, this is not limited to graph_iri: DROP ALL removes
# every graph in the store (the default graph and all named graphs) !!!!

query_string = 'DROP ALL'
response = requests.post(
    loader_endpoint_url + '/sparql',
    data=query_string.encode('utf-8'),
    headers=update_request_header_dictionary,
)
#print(response.text) # uncomment to view the raw response, e.g. if you are getting an error
data = response.json()
print(json.dumps(data, indent=2))


In [None]:
# POST loader command
#
# Asks the Neptune bulk loader to ingest the uploaded file from s3, then polls the
# load status once per second until the load either completes or fails.
# After the cell runs, `completed` tells whether the load succeeded and `data`
# holds the last status response.

loader_request_header_dictionary = {
        'Accept' : 'application/json',
        'Content-Type': 'application/json'
    }

# rdf_format = 'nquads'
rdf_format = 'ntriples'

# Build the request body with json.dumps rather than string concatenation so that
# the bucket name, key and format are always escaped into valid JSON.
loader_request_body = json.dumps({
    'source': 's3://' + s3_bucket_name + '/' + s3_file_key,
    'format': rdf_format,
    'iamRoleArn': 'arn:aws:iam::555751041262:role/neptuneloadfroms3',
    'region': 'us-east-1',
    'failOnError': 'FALSE',
    'parallelism': 'MEDIUM',
    'updateSingleCardinalityProperties': 'FALSE',
    'queueRequest': 'TRUE'
})

# Overall statuses after which the load will never reach LOAD_COMPLETED
# (taken from the Neptune loader Get-Status documentation). Any status not listed
# here, and not LOAD_COMPLETED, keeps the loop polling.
failed_statuses = {
    'LOAD_CANCELLED_BY_USER',
    'LOAD_CANCELLED_DUE_TO_ERRORS',
    'LOAD_UNEXPECTED_ERROR',
    'LOAD_FAILED',
    'LOAD_S3_READ_ERROR',
    'LOAD_S3_ACCESS_DENIED_ERROR',
    'LOAD_DATA_DEADLOCK',
    'LOAD_DATA_FAILED_DUE_TO_FEED_MODIFIED_OR_DELETED',
    'LOAD_FAILED_BECAUSE_DEPENDENCY_NOT_SATISFIED',
    'LOAD_FAILED_INVALID_REQUEST'
}

start_time = datetime.datetime.now()

# Send request to load
response = requests.post(loader_endpoint_url + '/loader', data=loader_request_body.encode('utf-8'), headers=loader_request_header_dictionary)
data = response.json()
print(json.dumps(data, indent = 2))
load_id = data['payload']['loadId']

# Check status of load once per second
completed = False
while True:
    response = requests.get(loader_endpoint_url + '/loader/' + load_id)
    data = response.json()
    #print(json.dumps(data, indent = 2))
    status = data['payload']['overallStatus']['status']
    print(status)
    if status == 'LOAD_COMPLETED':
        completed = True
        break
    if status in failed_statuses:
        # Stop polling: previously a failed load kept this loop running forever
        break
    sleep(1)
elapsed_time = (datetime.datetime.now() - start_time).total_seconds()
if completed:
    print('time to load:', int(elapsed_time), 's')
else:
    print('load failed with status', status, 'after', int(elapsed_time), 's')
    # Show the full status response so the cause of the failure is visible
    print(json.dumps(data, indent = 2))