# Script to reclassify prints into prints, posters, and maybe artists' books

Import libraries and define functions

In [None]:
import pandas as pd
import requests
import datetime
import time
import json

def csv_read(path: str, **kwargs) -> pd.DataFrame:
    """Read a CSV file into a Pandas DataFrame in which every cell is a string.

    Blank cells are kept as empty strings instead of being converted to NaN.

    Keyword argument:
    rows -- optional number of leading rows to return (useful when testing).
        When supplied, a deep copy of just those rows is returned; when omitted,
        the complete table is returned.
    """
    table = pd.read_csv(path, dtype=str, na_filter=False)

    # No row limit requested: hand back the full table as loaded.
    if 'rows' not in kwargs:
        return table

    # Copy the slice so the caller gets a table independent of the full one.
    leading_rows = table.head(kwargs['rows'])
    return leading_rows.copy(deep=True)

class Sparqler:
    """Build SPARQL queries of various sorts

    Parameters
    -----------
    method: str
        Possible values are "post" (default) or "get". Use "get" if read-only query endpoint.
        Must be "post" for update endpoint. Any value other than "post" is sent as a GET request.
    endpoint: URL
        Defaults to Wikidata Query Service if not provided.
    useragent : str
        Required if using the Wikidata Query Service, otherwise optional.
        Use the form: appname/v.v (URL; mailto:email@domain.com)
        See https://meta.wikimedia.org/wiki/User-Agent_policy
    session: requests.Session
        If provided, the session will be used for all queries. Note: required for the Commons Query Service.
        If not provided, a generic requests method (get or post) will be used.
        NOTE: Currently only implemented for the .query() method since I don't have any way to test the methods that write.
    sleep: float
        Number of seconds to wait between queries. Defaults to 0.1

    Attributes
    ----------
    requestheader: dict
        HTTP headers sent with every request. The Accept header is overwritten on each call to .query().
    response: str
        Raw text of the most recent response. Only set after .query() has been called.

    Raises
    ------
    KeyboardInterrupt
        If no useragent is given while the endpoint is the Wikidata Query Service.

    Required modules:
    -------------
    requests, datetime, time
    """
    def __init__(self, method='post', endpoint='https://query.wikidata.org/sparql', useragent=None, session=None, sleep=0.1):
        # attributes for all methods
        self.http_method = method
        self.endpoint = endpoint
        if useragent is None and self.endpoint == 'https://query.wikidata.org/sparql':
            print('You must provide a value for the useragent argument when using the Wikidata Query Service.')
            print()
            raise KeyboardInterrupt # Use keyboard interrupt instead of sys.exit() because it works in Jupyter notebooks
        self.session = session
        self.sleep = sleep

        self.requestheader = {}
        if useragent:
            self.requestheader['User-Agent'] = useragent

        # POST sends the query as a URL-encoded form body, so the content type must be declared.
        if self.http_method == 'post':
            self.requestheader['Content-Type'] = 'application/x-www-form-urlencoded'

    def query(self, query_string, form='select', verbose=False, **kwargs):
        """Sends a SPARQL query to the endpoint.

        Parameters
        ----------
        query_string : str
            The text of the SPARQL query.
        form : str
            The SPARQL query form.
            Possible values are: "select" (default), "ask", "construct", and "describe".
        mediatype: str
            The response media type (MIME type) of the query results.
            Some possible values for "select" and "ask" are: "application/sparql-results+json" (default) and "application/sparql-results+xml".
            Some possible values for "construct" and "describe" are: "text/turtle" (default) and "application/rdf+xml".
            See https://docs.aws.amazon.com/neptune/latest/userguide/sparql-media-type-support.html#sparql-serialization-formats-neptune-output
            for response serializations supported by Neptune.
        verbose: bool
            Prints status when True. Defaults to False.
        default: list of str
            The graphs to be merged to form the default graph. List items must be URIs in string form.
            If omitted, no graphs will be specified and default graph composition will be controlled by FROM clauses
            in the query itself.
            See https://www.w3.org/TR/sparql11-query/#namedGraphs and https://www.w3.org/TR/sparql11-protocol/#dataset
            for details.
        named: list of str
            Graphs that may be specified by IRI in a query. List items must be URIs in string form.
            If omitted, named graphs will be specified by FROM NAMED clauses in the query itself.

        Returns
        -------
        If the form is "select" and mediatype is "application/sparql-results+json", a list of dictionaries
        containing the data (the "bindings" of the response).
        If the form is "ask" (or anything other than "select", "construct", or "describe") and mediatype is
        "application/sparql-results+json", the boolean from the response is returned.
        If the mediatype is "application/sparql-results+json" and the response body is not valid JSON or does
        not have the expected structure, None is returned. The raw text is still available as self.response.
        For other forms and mediatypes, the raw output is returned.

        Notes
        -----
        To get UTF-8 text in the SPARQL queries to work properly, send URL-encoded text rather than raw text.
        That is done automatically by the requests module for GET. I guess it also does it for POST when the
        data are sent as a dict with the urlencoded header.
        See SPARQL 1.1 protocol notes at https://www.w3.org/TR/sparql11-protocol/#query-operation
        """
        query_form = form
        returns_graph = query_form in ('construct', 'describe')
        if 'mediatype' in kwargs:
            media_type = kwargs['mediatype']
        elif returns_graph:
            media_type = 'text/turtle'
        else:
            media_type = 'application/sparql-results+json' # default for SELECT and ASK query forms
        self.requestheader['Accept'] = media_type

        # Build the payload dictionary (query and graph data) to be sent to the endpoint
        payload = {'query': query_string}
        if 'default' in kwargs:
            payload['default-graph-uri'] = kwargs['default']
        if 'named' in kwargs:
            payload['named-graph-uri'] = kwargs['named']

        if verbose:
            print('querying SPARQL endpoint')

        # The session object is called with the same get()/post() arguments as the requests module,
        # so one code path serves both.
        http_client = requests if self.session is None else self.session

        start_time = datetime.datetime.now()
        if self.http_method == 'post':
            response = http_client.post(self.endpoint, data=payload, headers=self.requestheader)
        else:
            response = http_client.get(self.endpoint, params=payload, headers=self.requestheader)
        elapsed_time = (datetime.datetime.now() - start_time).total_seconds()
        self.response = response.text
        time.sleep(self.sleep) # Throttle as a courtesy to avoid hitting the endpoint too fast.

        if verbose:
            print('done retrieving data in', int(elapsed_time), 's')

        # Graph results and non-JSON serializations are handed back unparsed.
        if returns_graph or media_type != 'application/sparql-results+json':
            return response.text

        try:
            data = response.json()
        except ValueError:
            # JSON decoding errors are ValueError subclasses. Catching only those lets
            # KeyboardInterrupt and unrelated bugs propagate instead of being hidden.
            return None

        try:
            if query_form == 'select':
                # Extract the values from the response JSON
                return data['results']['bindings']
            return data['boolean'] # True or False result from ASK query
        except (KeyError, TypeError):
            # Valid JSON without the expected keys (e.g. an error document) counts as an error.
            return None



In [None]:
# Find the items to reclassify: those with the given collection (P195) and creator (P170) values that are
# currently an instance of (P31) Q478798, together with the IRI of the P31 statement that must be deleted.
# The ps:P31 triple pins ?statementIri to the statement whose value is Q478798. Without it, every P31
# statement of an item is returned, so an item with several P31 values could have the wrong statement
# queued for deletion.
query_string = """select distinct ?qid ?statementIri where {
  ?qid wdt:P195 wd:Q18563658.
  ?qid wdt:P170 wd:Q2605345.
  ?qid wdt:P31 wd:Q478798.
  ?qid p:P31 ?statementIri.
  ?statementIri ps:P31 wd:Q478798.
  }"""

user_agent = 'TestAgent/0.1 (mailto:steve.baskauf@vanderbilt.edu)'
wdqs = Sparqler(useragent=user_agent)
data = wdqs.query(query_string)

# Sparqler.query() is documented to return None on error; stop here with a clear message
# rather than failing later with an obscure TypeError when iterating.
if data is None:
    raise RuntimeError('The SPARQL query did not return usable JSON results; see wdqs.response for the raw text.')

#print(json.dumps(data, indent=2))

# Create a dictionary of the Q numbers and statement UUIDs
statement_dict = {}

# Create a list of the Q numbers
qid_list = []

entity_namespace = 'http://www.wikidata.org/entity/'
statement_namespace = 'http://www.wikidata.org/entity/statement/'

for item in data:
    # extract the Q number from the item's URI
    qid = item['qid']['value'].replace(entity_namespace, '')

    # extract the statement UUID from the statement IRI
    statement_id = item['statementIri']['value'].replace(statement_namespace, '')
    # remove the Q number from the statement UUID before the first dash
    statement_id = statement_id.partition('-')[2]

    # Keep qid_list and statement_dict in step: one entry per item, even if the item
    # somehow carries more than one matching statement.
    if qid in statement_dict:
        print('WARNING: ' + qid + ' has more than one matching P31 statement; ignoring ' + statement_id)
        continue

    statement_dict[qid] = statement_id
    qid_list.append(qid)

print(qid_list)
print(statement_dict)


Load data

In [None]:
# Location of the existing metadata table. inventory_number is the same as accession_number.
path_to_vanderbot_metadata = '../../gallery_works/works_multiprop.csv'

# Read the table and index its rows by Q ID, keeping qid available as an ordinary column too.
vanderbot_metadata = csv_read(path_to_vanderbot_metadata).set_index('qid', drop=False)


Extract the instance_of_uuid identifiers for the prints that need to be reclassified so that they can be deleted.

In [None]:
# Collect error messages and deletion rows in lists and assemble them once after the loop.
# DataFrame.append() was removed in pandas 2.0, and appending row by row copies the whole
# table on every iteration.
error_lines = []
deletion_rows = []

# For every item returned by the query, check that it is present in the vanderbot_metadata table.
# If it is, record its qid and statement UUID for deletion and reset its instance_of columns so that
# a new statement is written; if it is not, report the problem and move on.
for qid in qid_list:
    if qid not in vanderbot_metadata.index:
        message = 'ERROR: ' + qid + ' not found in vanderbot_metadata'
        print(message)
        error_lines.append(message)
        continue

    deletion_rows.append({'qid': qid, 'instance_of_uuid': statement_dict[qid]})

    # Change the value from "image" (Q478798) to "print" (Q11060274)
    vanderbot_metadata.at[qid, 'instance_of'] = 'Q11060274'
    # Change the value of the instance_of_uuid column to the empty string
    vanderbot_metadata.at[qid, 'instance_of_uuid'] = ''
    # Change the value of the instance_of_ref1_hash column to the empty string
    vanderbot_metadata.at[qid, 'instance_of_ref1_hash'] = ''

# Dataframe of the qid and instance_of_uuid values for statements to be deleted.
# Passing columns explicitly keeps the header even when there is nothing to delete.
statements_to_delete = pd.DataFrame(deletion_rows, columns=['qid', 'instance_of_uuid'])

# One message per line, each terminated by a newline.
error_messages = ''.join(line + '\n' for line in error_lines)

# Write the statements_to_delete dataframe to a CSV file
statements_to_delete.to_csv('deletions.csv', index=False)

# Write the updated vanderbot_metadata dataframe to a CSV file
vanderbot_metadata.to_csv(path_to_vanderbot_metadata, index=False)

# Write the error messages to a text file
with open('error_messages.txt', 'w', encoding='utf-8') as f:
    f.write(error_messages)

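After running this step, use vanderdeletebot.py to delete the old P31 statements listed in deletions.csv.
Then run vanderbot.py to create the replacement P31 statements (the cell above sets the new value to "print", Q11060274) from the updated works_multiprop.csv.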