In [None]:
# code in this block from the template.ipynb notebook

import requests
import json
import csv
import sys # Read CLI arguments
import datetime
#import os
#from time import sleep
#from pathlib import Path

# Use the following code for a stand-alone script if you want to pass in a value (e.g. file path) when running
# the script from the command line. If no arguments are passed, the "else" value will be used.

# Exactly one CLI argument means the caller supplied the file path (sys.argv[0] is the script name);
# any other argument count falls back to the default file name.
file_path = sys.argv[1] if len(sys.argv) == 2 else 'file.csv'


# ----------------
# File IO
# ----------------

# Many functions operate on a list of dictionaries, where each item in the list represents a spreadsheet row
# and each column is identified by a dictionary item whose key is the column header in the spreadsheet.
# The first two functions read and write from files into this data structure.

# Read from a CSV file into a list of dictionaries
def read_dicts_from_csv(filename):
    """Return the rows of a UTF-8 CSV file as a list of dictionaries keyed by column header."""
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        # The reader is lazy, so it must be fully consumed before the file closes
        rows = list(csv.DictReader(csv_file))
    return rows

# Write a list of dictionaries to a CSV file
# The fieldnames object is a list of strings whose items are the keys in the row dictionaries that are chosen
# to be the columns in the output spreadsheet. The order in the list determines the order of the columns.
def write_dicts_to_csv(table, filename, fieldnames):
    """Write a list of row dictionaries to a UTF-8 CSV file.

    table: iterable of dictionaries, one per output row.
    filename: path of the file to create or overwrite.
    fieldnames: list of dictionary keys to use as columns, in output order.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)


# If configuration or other data are stored in a file as JSON, this function loads them into a Python data structure

# Load JSON file data into a Python data structure
def load_json_into_data_struct(path):
    """Parse the UTF-8 JSON file at `path` and return the resulting Python data structure."""
    with open(path, 'rt', encoding='utf-8') as json_file:
        parsed = json.load(json_file)
    # uncomment the following line to view the data
    # print(json.dumps(parsed, indent = 2))
    return parsed


# ----------------
# Code for interacting with a Wikibase query interface (SPARQL endpoint). Typically, it's the Wikidata Query Service
# ----------------

# URL that send_sparql_query() posts its queries to
endpoint = 'https://query.wikidata.org/sparql'
# Value sent as the Accept header, i.e. the response format requested from the endpoint
accept_media_type = 'application/json'
# Value sent as the User-Agent header. Replace it with your own user agent header string
user_agent_header = 'VanderBot/1.6.1 (https://github.com/HeardLibrary/linked-data/tree/master/vanderbot; mailto:steve.baskauf@vanderbilt.edu)'

# The following code generates a request header dictionary suitable for sending to a SPARQL endpoint.
# If the query is SELECT, use the JSON media type above. For CONSTRUCT queries, use text/turtle to get RDF/Turtle.
# Best to send a user-agent header because some Wikimedia servers don't like unidentified clients
def generate_header_dictionary(accept_media_type,user_agent_header):
    """Return the HTTP request headers for posting a SPARQL query.

    accept_media_type: value for the Accept header (the response format wanted).
    user_agent_header: value for the User-Agent header identifying this client.
    """
    return {
        'Accept': accept_media_type,
        'Content-Type': 'application/sparql-query',
        'User-Agent': user_agent_header,
    }

# Build the request header once from the settings above; it is passed as the second argument
# of send_sparql_query() each time a query is sent.
request_header = generate_header_dictionary(accept_media_type,user_agent_header)
# The query string passed to send_sparql_query() must be a valid SPARQL query.

# Sends a query to the query service endpoint. 
def send_sparql_query(query_string, request_header, timeout=120):
    """Send a SPARQL query to the query service endpoint and return the result bindings.

    query_string: a valid SPARQL query; it is UTF-8 encoded and sent as the POST body.
    request_header: dictionary of HTTP headers, e.g. from generate_header_dictionary().
    timeout: seconds to wait for the server before giving up. The default is generous so that
        long-running queries are not cut off, while a stalled connection still fails eventually.

    Returns the list found at ['results']['bindings'] in the JSON response.
    Raises requests.HTTPError if the endpoint answers with an error status, and the usual
    requests exceptions for connection problems or timeouts.
    """
    # You can delete the two print statements if the queries are short. However, for large/long queries,
    # it's good to let the user know what's going on.
    print('querying SPARQL endpoint to acquire item metadata')
    response = requests.post(
        endpoint,
        data=query_string.encode('utf-8'),
        headers=request_header,
        timeout=timeout,
    )
    #print(response.text) # uncomment to view the raw response, e.g. if you are getting an error

    # Report HTTP errors (bad query, throttling, server timeout) with their status code rather than
    # letting them surface as an opaque JSON parsing failure below.
    response.raise_for_status()
    data = response.json()

    # Extract the values from the response JSON
    results = data['results']['bindings']

    print('done retrieving data')
    #print(json.dumps(results, indent=2))
    return results

# ----------------
# Utility code
# ----------------

# Generate the current UTC xsd:date
def generate_utc_date():
    """Return the current UTC date as an xsd:date string, e.g. '2019-12-05'."""
    # datetime.utcnow() is deprecated since Python 3.12, so ask for a timezone-aware UTC time instead
    utc_now = datetime.datetime.now(datetime.timezone.utc)
    return utc_now.date().isoformat()

# Extracts the local name part of an IRI, e.g. a qNumber from a Wikidata IRI
def extract_local_name(iri):
    """Return whatever follows the last '/' in `iri` (the whole string if it contains no '/').

    Example: 'http://www.wikidata.org/entity/Q6386232' gives 'Q6386232'.
    """
    return iri.rsplit('/', 1)[-1]

# Extracts the UUID and qId from a statement IRI and returns them as a tuple
def extract_statement_uuid(iri):
    """Split a statement IRI into its UUID and Q ID, returned as the tuple (uuid, qid).

    The expected pattern is
    http://www.wikidata.org/entity/statement/Q7552806-8B88E0CA-BCC8-49D5-9AC2-F1755464F1A2
    """
    # The statement identifier is the sixth '/'-separated piece of the IRI
    statement_id = iri.split('/')[5]
    hyphen_parts = statement_id.split('-')
    # Parts 1 through 5 make up the UUID; part 0 is the Q ID
    uuid = '-'.join(hyphen_parts[position] for position in range(1, 6))
    return uuid, hyphen_parts[0]

# To sort a list of dictionaries by a particular dictionary key's values, define the following function
# then invoke the sort using the code that follows

# function to use in sort
def sort_funct(row):
    """Sort key for a row dictionary: its 'count' value converted to an integer."""
    count_value = row['count']
    return int(count_value)

# Example invocation, kept as an inert string literal so that it does not execute.
# NOTE: despite the remark inside the string, sort_funct orders rows by their integer 'count' value,
# not by a filename.
'''
output_list.sort(key = sort_funct) # sort by the filename field
'''

# Emits a blank line when this cell runs
print()

In [None]:
def create_values_list_from_file(path):
    """Build the body of a SPARQL VALUES list from the 'qid' column of a CSV file.

    Each row contributes one line of the form 'wd:<qid>'; lines are separated by newlines
    with no trailing newline. An empty file yields an empty string.
    """
    rows = read_dicts_from_csv(path)
    return '\n'.join('wd:' + row['qid'] for row in rows)

def create_id_values_list(list):
    """Build the body of a SPARQL VALUES list from the 'iri' value of each dictionary.

    Each record contributes one line with its IRI wrapped in angle brackets; lines are
    separated by newlines with no trailing newline. An empty input yields an empty string.
    (The parameter name shadows the builtin `list`; it is kept for compatibility with callers.)
    """
    return '\n'.join('<' + entry['iri'] + '>' for entry in list)

def build_full_query(property, screen):
    """Build a query that counts items per value (or per property) and fetches English labels.

    property: a property ID such as 'P31' whose values are counted, or '' to count which
        properties the screened items use. (The name shadows the builtin `property`; it is
        kept for compatibility with callers.)
    screen: restricts which ?qid items are considered. If it starts with '?qid' it is inserted
        as a graph pattern; otherwise it is used as the contents of a VALUES ?qid list.

    Returns the query string, with results ordered by descending count.
    """
    has_property = property != ''
    screen_is_pattern = screen[0:4] == '?qid'

    segments = ['select distinct ?entity ?label ?count where {']

    # With a property, the label belongs to the value item itself; without one, ?entity is a
    # direct-claim predicate and the label comes from the generic property linked to it.
    if has_property:
        segments.append('\n    ?entity rdfs:label ?label.')
    else:
        segments.append(
            '\n    ?genProp wikibase:directClaim ?entity.'
            '\n    ?genProp rdfs:label ?label.'
        )

    segments.append(
        "\n    filter(lang(?label) = 'en')"
        '\n    {'
        '\n    select distinct ?entity (count(distinct ?qid) as ?count) where'
        '\n        {'
    )

    # Screening of Q IDs is done either by a graph pattern or by a list
    if screen_is_pattern:
        segments.append('\n        ' + screen)
    else:
        segments.append(
            '\n        VALUES ?qid'
            '\n        {\n' + screen + '\n        }'
        )

    # Either look up the values of the given property or see which properties were used
    if has_property:
        segments.append('\n        ?qid wdt:' + property + ' ?entity.')
    else:
        segments.append('\n        ?qid ?entity ?value.')

    segments.append(
        '\n        }'
        '\n        group by ?entity'
        '\n    }'
        '\n}'
        '\norder by desc(?count)'
    )
    return ''.join(segments)

def build_id_query(property, screen):
    """Build a query that counts distinct items per value (or per property), without labels.

    property: a property ID such as 'P5008' whose values are counted, or '' to count which
        properties the screened items use. (The name shadows the builtin `property`; it is
        kept for compatibility with callers.)
    screen: restricts which ?qid items are considered. If it starts with '?qid' it is inserted
        as a graph pattern; otherwise it is used as the contents of a VALUES ?qid list.

    Returns the query string, grouped by ?entity.
    """
    segments = [
        'select distinct ?entity (count(distinct ?qid) as ?count) where'
        '\n    {'
    ]

    # Screening of Q IDs is done either by a graph pattern or by a list
    if screen[0:4] == '?qid':
        segments.append('\n    ' + screen)
    else:
        segments.append(
            '\n    VALUES ?qid'
            '\n        {\n' + screen + '\n        }'
        )

    # Without a property, ?entity is the property entity whose direct claim was used
    if property == '':
        segments.append(
            '\n    ?qid ?truthy ?value.'
            '\n    ?entity wikibase:directClaim ?truthy.'
        )
    else:
        segments.append('\n    ?qid wdt:' + property + ' ?entity.')

    segments.append(
        '\n    }'
        '\n    group by ?entity'
    )
    return ''.join(segments)

def build_label_query(screen):
    """Build a query that retrieves the English label of each listed entity.

    screen: contents of the VALUES ?entity list, e.g. one '<iri>' per line as produced by
        create_id_values_list().

    Returns the query string selecting ?entity and ?label.
    """
    # The label pattern and closing brace are always appended. The earlier version guarded them
    # with `if property != ''`, which referred to the builtin `property` (this function has no
    # such parameter): always true by accident, and an unterminated query had it ever been false.
    return (
        'select distinct ?entity ?label where {'
        '\n    VALUES ?entity'
        '\n        {\n' + screen + '\n        }'
        '\n    ?entity rdfs:label ?label.'
        "\n    filter(lang(?label) = 'en')"
        '\n    }'
    )


In [None]:
# Alternative: screen the items with a graph pattern and count which properties they use
#query_string = build_id_query('', '?qid wdt:P195 wd:Q18563658.') # items in the Fine Arts gallery

# Screen the items with the Q IDs listed in the 'qid' column of the CSV file, then build a query
# that counts the distinct items for each value of property P5008.
filename = 'edit_a_thon.csv'
values = create_values_list_from_file(filename)
query_string = build_id_query('P5008', values)
print(query_string)


In [None]:
# Run the counting query built in the previous cell
results = send_sparql_query(query_string, request_header)
#print(json.dumps(results, indent=2))

# Reduce each result binding to the entity IRI and the count reported for it
interim_results = []
for result in results:
    iri = result['entity']['value']
    count = result['count']['value']
    interim_results.append({'iri': iri, 'count': count})
    
# Build a second query that looks up the English labels of the entity IRIs found above
values = create_id_values_list(interim_results)
query_string = build_label_query(values)
print(query_string)

In [None]:
# Run the label query built in the previous cell
results = send_sparql_query(query_string, request_header)
#print(json.dumps(results, indent=2))

# Index the labels by entity IRI so each entity is matched with one dictionary lookup instead of a
# scan over every result. setdefault keeps the first label seen for an IRI.
labels_by_iri = {}
for result in results:
    labels_by_iri.setdefault(result['entity']['value'], result['label']['value'])

# Combine each counted entity with its label. An entity that has no label in the results is kept
# with an empty label, so every output row has all three keys; previously such an entity produced
# an empty row that broke the later sort on 'count'.
output_list = []
for interim_result in interim_results:
    entity_iri = interim_result['iri']
    output_list.append({
        'id': extract_local_name(entity_iri),
        'label': labels_by_iri.get(entity_iri, ''),
        'count': interim_result['count'],
    })


In [None]:
# Order the summary rows from highest to lowest count (sort_funct converts each 'count' to an integer)
output_list.sort(key = sort_funct, reverse=True)
print(json.dumps(output_list, indent=2))

In [None]:
# Save the summary rows to a CSV file; fieldnames sets both the columns written and their order
filename = 'summary.csv'
fieldnames = ['id', 'label', 'count']
write_dicts_to_csv(output_list, filename, fieldnames)
print('done')