# Preliminaries

To use the Neptune SPARQL endpoint, open the SSH tunnel using

```
ssh neptune-demo -N
```

which reads the `~/.ssh/config` file containing:

```
host neptune-demo
 ForwardAgent yes
 User ec2-user # when using Amazon Linux
 HostName <your-ec2-address>
 IdentitiesOnly yes
 IdentityFile ~/.ssh/<your-ec2-key-file>.pem
 LocalForward 8182 <cluster-endpoint-neptune>:8182
 ```
 
 This allows access to the server through port 8182. 
 
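NOTE: At least on my computer, the Terminal window does not return to a prompt after executing this command. That is expected: the `-N` option tells `ssh` not to run a remote command, so the session just stays open to forward the port. The tunnel does work, since I get a response when I issue this command in a different Terminal window: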
 
```
curl https://triplestore1.cluster-cml0hq81gymg.us-east-1.neptune.amazonaws.com:8182/status
```

NOTE: The dump of `AATOut_WikidataCoref.nt` from Getty was missing the period at the end of each triple. I had to add it before the file would load into Neptune.

## Configuration

In [None]:
import requests
import json
import csv
from time import sleep

# SPARQL endpoints: the Neptune triplestore and the public Wikidata Query Service
endpoint = 'https://triplestore1.cluster-cml0hq81gymg.us-east-1.neptune.amazonaws.com:8182/sparql'
wdqs_endpoint = 'https://query.wikidata.org/sparql'

# Identifies this script to the servers; sent as the User-Agent header
user_agent_header = 'thesaurus_crosswalk/0.1 (https://github.com/HeardLibrary/linked-data/; mailto:steve.baskauf@vanderbilt.edu)'

# HTTP headers sent with every SPARQL request.
# The query is posted as a form field, hence the form-urlencoded content type
# (use 'application/sparql-query' instead when posting the raw query text as the body).
sparql_request_header = {
    'Accept': 'application/json',
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': user_agent_header,
}

# Low level functions

def send_sparql_query(query_string, endpoint):
    """Send a SPARQL query to an endpoint and return the result bindings.

    The query is POSTed as the ``query`` form field, using the module-level
    ``sparql_request_header`` for the HTTP headers.

    Args:
        query_string: Text of the SPARQL query.
        endpoint: URL of the SPARQL endpoint.

    Returns:
        The list stored under ``results`` -> ``bindings`` in the JSON response.
        If the response body is not JSON, or is JSON without that structure
        (e.g. an error message), the raw response text is printed and an
        empty list is returned so that callers iterating over the results
        simply find no matches.
    """
    response = requests.post(endpoint, data={'query': query_string}, headers=sparql_request_header)
    #print(response.text) # uncomment to view the raw response, e.g. if you are getting an error

    try:
        data = response.json()
    except ValueError:
        # JSON decoding errors raised by requests are ValueError subclasses.
        # Show the body (typically an error page) instead of failing later on an undefined name.
        print(response.text)
        return []

    # Extract the values from the response JSON
    try:
        return data['results']['bindings']
    except (KeyError, TypeError):
        # Well-formed JSON that is not a SPARQL result set, e.g. a JSON error report
        print(response.text)
        return []

def extract_local_name(iri):
    """Return the local name of an IRI, i.e. the text after its last '/'.

    For example, the Wikidata IRI http://www.wikidata.org/entity/Q6386232
    yields 'Q6386232'. A string without any '/' is returned unchanged.
    """
    _, _, local_name = iri.rpartition('/')
    return local_name

# Write list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Save a list of dictionaries as a UTF-8 encoded CSV file.

    table: iterable of dictionaries, one per output row.
    filename: path of the CSV file to create (overwritten if it exists).
    fieldnames: list of dictionary keys giving the column headers and their order.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)


# Scripts

## Find out what graphs are in the triplestore

In [None]:
# Ask the triplestore for the IRI of every named graph that holds at least one triple
query_string = '''select distinct ?graph where {
graph ?graph {?s ?o ?p.}
}'''

# Print one graph IRI per line
for binding in send_sparql_query(query_string, endpoint):
    print(binding['graph']['value'])

## Query the Nomenclature thesaurus

In [None]:
# Load the nouns to look up; the file holds one noun per line
with open('noun_list.txt', 'rt', encoding='utf-8') as file_object:
    nouns_text = file_object.read()

# Trim surrounding whitespace and skip blank lines (e.g. the empty string left by a
# trailing newline), since each noun is later matched exactly against thesaurus labels.
descriptive_nouns = [line.strip() for line in nouns_text.split('\n') if line.strip()]

# Quick sanity check of what was loaded
print(len(descriptive_nouns))
print(descriptive_nouns[:10])

# Small hard-coded list for testing without the file:
#descriptive_nouns = ['plate', 'burner', 'bowl', 'mug', 'jar', 'vase', 'figure', 'ferrule', 'amulet', 'dish', 'coin', 'cup', 'buckle']


In [None]:
# Build one record per noun holding the matching Nomenclature ID and preferred label,
# plus any Getty AAT and Wikidata IDs that Nomenclature lists as exact matches.
thesauri_ids = []

for noun in descriptive_nouns:
    id_dict = {'noun': noun}

    test_label = noun.title() # Nomenclature labels have the first letters of their labels capitalized
    print(test_label)

    # Escape backslashes and double quotes so the label remains a valid SPARQL string literal
    label_literal = test_label.replace('\\', '\\\\').replace('"', '\\"')

    # Match the label against either the preferred or an alternate English label
    query_string = '''prefix skos: <http://www.w3.org/2004/02/skos/core#>
    select distinct ?iri ?otherConcept ?prefLabel
    from <http://nomenclature_2022-02-02>
    where {
    {?iri skos:prefLabel "''' + label_literal + '''"@en.}
    union
    {?iri skos:altLabel "''' + label_literal + '''"@en.}
    optional {
    ?iri skos:prefLabel ?prefLabel.
    filter(lang(?prefLabel)="en")
    }
    optional {?iri skos:exactMatch ?otherConcept.}
    }
    '''

    #print(query_string)

    results = send_sparql_query(query_string, endpoint)
    # When there are several result rows, values from later rows overwrite earlier ones
    for result in results:
        iri = result['iri']['value']
        print(iri)
        id_dict['nomenclature'] = extract_local_name(iri)

        # ?prefLabel and ?otherConcept come from OPTIONAL clauses, so either may be unbound
        if 'prefLabel' in result:
            id_dict['n_pref'] = result['prefLabel']['value']
        if 'otherConcept' in result:
            other = result['otherConcept']['value']
            if 'http://vocab.getty.edu/aat/' in other:
                print(other)
                id_dict['getty'] = extract_local_name(other)
            elif 'http://www.wikidata.org/entity/' in other:
                print(other)
                id_dict['wikidata'] = extract_local_name(other)

    # Every record must carry all four keys, with empty strings for anything not found
    for key in ('nomenclature', 'n_pref', 'getty', 'wikidata'):
        id_dict.setdefault(key, '')
    print()
    thesauri_ids.append(id_dict)

#print(json.dumps(thesauri_ids, indent = 2))

print('done')

In [None]:
# Fill in missing Getty AAT IDs by matching each noun against AAT labels.
# Whenever an AAT concept is found, also record its equivalent Wikidata concept if one is given.
for record in thesauri_ids:
    # Nothing to do when the AAT ID is already known from the Nomenclature search
    if record['getty'] != '':
        continue

    noun = record['noun']
    print(noun)
    query_string = '''prefix skos: <http://www.w3.org/2004/02/skos/core#>
        prefix skosxl: <http://www.w3.org/2008/05/skos-xl#>
        select distinct ?iri ?otherConcept 
        from <http://AATOut_2Terms>
        from <http://AATOut_WikidataCoref>
        where {
        {?iri skosxl:prefLabel ?labelObject.}
        union
        {?iri skosxl:altLabel ?labelObject.}
        ?labelObject skosxl:literalForm "''' + noun +'''"@en.
        optional {?iri skos:exactMatch ?otherConcept.}
        }'''

    #print(query_string)

    results = send_sparql_query(query_string, endpoint)
    #print(json.dumps(results, indent = 2))

    # With several result rows, values from later rows overwrite earlier ones
    for binding in results:
        getty_id = extract_local_name(binding['iri']['value'])
        print(getty_id)
        record['getty'] = getty_id

        # The exact-match concept is optional in the query, so it may be absent
        if 'otherConcept' not in binding:
            continue
        other = binding['otherConcept']['value']
        if 'http://www.wikidata.org/entity/' in other:
            print(other)
            record['wikidata'] = extract_local_name(other)

    if not results:
        print('no match')
    print()

print('done')

In [None]:
def _last_bound_pref_label(results):
    """Return the ?prefLabel value of the last result row that binds it, or '' if none does."""
    label = ''
    for result in results:
        # ?prefLabel sits in an OPTIONAL clause, so a row may come back without it
        if 'prefLabel' in result:
            label = result['prefLabel']['value']
    return label


# Add the English preferred labels from AAT ('g_pref') and Wikidata ('w_pref') to each
# record, then save the complete crosswalk table as a CSV file.
for record in thesauri_ids:
    print(record['noun'])

    # Start from empty labels so that both keys always exist, whatever the lookups return
    record['g_pref'] = ''
    record['w_pref'] = ''

    # Do the AAT label lookup only if we know the Getty ID
    if record['getty'] != '':
        query_string = '''prefix skos: <http://www.w3.org/2004/02/skos/core#>
        prefix skosxl: <http://www.w3.org/2008/05/skos-xl#>
        select distinct ?prefLabel 
        from <http://AATOut_2Terms>
        where {

        optional {
        <http://vocab.getty.edu/aat/''' + record['getty'] +'''> skosxl:prefLabel ?labelObject.
        ?labelObject skosxl:literalForm ?prefLabel.
        filter(lang(?prefLabel)="en")
        }

        }'''

        #print(query_string)

        results = send_sparql_query(query_string, endpoint)
        #print(json.dumps(results, indent = 2))
        record['g_pref'] = _last_bound_pref_label(results)

    # Do the Wikidata label lookup only if we know the Q ID
    if record['wikidata'] != '':
        query_string = '''prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        select distinct ?prefLabel 
        where {

        optional {
        <http://www.wikidata.org/entity/''' + record['wikidata'] +'''> rdfs:label ?prefLabel.
        filter(lang(?prefLabel)="en")
        }

        }'''

        #print(query_string)

        # Labels for Wikidata items come from the Wikidata Query Service, not the local triplestore
        results = send_sparql_query(query_string, wdqs_endpoint)
        #print(json.dumps(results, indent = 2))
        record['w_pref'] = _last_bound_pref_label(results)

    # Short pause between records, presumably to avoid overloading the endpoints
    sleep(0.2)

#print(json.dumps(thesauri_ids, indent = 2))

write_dicts_to_csv(thesauri_ids, 'thesauri_ids.csv', ['noun', 'nomenclature', 'n_pref', 'wikidata', 'w_pref', 'getty', 'g_pref'])
print('done')