In [71]:
import json
import requests
import csv
from pathlib import Path
from time import sleep
import sys
import uuid

sparqlSleep = 0.25 # delay time between calls to SPARQL endpoint

# -----------------------------------------------------------------
# function definitions

def retrieveCredentials(path):
    with open(path, 'rt') as fileObject:
        lineList = fileObject.read().split('\n')
    endpointUrl = lineList[0].split('=')[1]
    username = lineList[1].split('=')[1]
    password = lineList[2].split('=')[1]
    userAgent = lineList[3].split('=')[1]
    credentials = [endpointUrl, username, password, userAgent]
    return credentials

def getLoginToken(apiUrl):    
    parameters = {
        'action':'query',
        'meta':'tokens',
        'type':'login',
        'format':'json'
    }
    r = session.get(url=apiUrl, params=parameters)
    data = r.json()
    return data['query']['tokens']['logintoken']

def logIn(apiUrl, token, username, password):
    parameters = {
        'action':'login',
        'lgname':username,
        'lgpassword':password,
        'lgtoken':token,
        'format':'json'
    }
    r = session.post(apiUrl, data=parameters)
    data = r.json()
    return data

def getCsrfToken(apiUrl):
    parameters = {
        "action": "query",
        "meta": "tokens",
        "format": "json"
    }
    r = session.get(url=apiUrl, params=parameters)
    data = r.json()
    return data["query"]["tokens"]["csrftoken"]

# read a CSV into a list of dictionaries
def readDict(filename):
    fileObject = open(filename, 'r', newline='', encoding='utf-8')
    dictObject = csv.DictReader(fileObject)
    array = []
    for row in dictObject:
        array.append(row)
    fileObject.close()
    return array

# write the data to a file
def writeToFile(tableFileName, fieldnames, tableData):
    with open(tableFileName, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for writeRowNumber in range(0, len(tableData)):
            writer.writerow(tableData[writeRowNumber])

# gunction to get local name from an IRI
def extractFromIri(iri, numberPieces):
    # with pattern like http://www.wikidata.org/entity/Q6386232 there are 5 pieces with qId as number 4
    pieces = iri.split('/')
    return pieces[numberPieces]

# search for any of the "label" types: label, alias, description
def searchLabelsDescriptionsAtWikidata(qIds, labelType, language):
    # configuration settings
    endpointUrl = 'https://query.wikidata.org/sparql'
    acceptMediaType = 'application/json'
    userAgentHeader = 'VanderBot/1.3 (https://github.com/HeardLibrary/linked-data/tree/master/vanderbot; mailto:steve.baskauf@vanderbilt.edu)'
    requestHeaderDictionary = {
    'Content-Type': 'application/sparql-query',
    'Accept' : acceptMediaType,
    'User-Agent': userAgentHeader
    }

    # create a string for all of the Wikidata item IDs to be used as subjects in the query
    alternatives = ''
    for qId in qIds:
        alternatives += 'wd:' + qId + '\n'
        
    if labelType == 'label':
        predicate = 'rdfs:label'
    elif labelType == 'alias':
        predicate = 'skos:altLabel'
    elif labelType == 'description':
        predicate = 'schema:description'
    else:
        predicate = 'rdfs:label'        
        
    # create a string for the query
    query = 'select distinct ?id ?string '
    query += '''where {
  VALUES ?id
{
''' + alternatives + '''}
  ?id '''+ predicate + ''' ?string.
  filter(lang(?string)="''' + language + '''")
  }'''
    #print(query)

    returnValue = []
    # r = requests.get(endpointUrl, params={'query' : query}, headers=requestHeaderDictionary)
    r = requests.post(endpointUrl, data=query, headers=requestHeaderDictionary)
    data = r.json()
    results = data['results']['bindings']
    for result in results:
        # remove wd: 'http://www.wikidata.org/entity/'
        qNumber = extractFromIri(result['id']['value'], 4)
        string = result['string']['value']
        resultsDict = {'qId': qNumber, 'string': string}
        returnValue.append(resultsDict)

    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparqlSleep)
    
    return returnValue

# Function to convert times to the format required by Wikidata
def convertDates(rowData, dateColumnNameRoot):
    error = False
    # Only do something in the case where there is a date. Missing values should be skipped.
    if rowData[dateColumnNameRoot + '_val'] != '':
        # Assume that if the precision column is empty that the dates need to be converted
        if rowData[dateColumnNameRoot + '_prec'] == '':
            #print(dateColumnNameRoot, rowData[dateColumnNameRoot + '_val'])

            # set these two to default to the existing values
            # precisionNumber = int(rowData[dateColumnNameRoot + '_prec']) # not necessary since conditional on value of ''
            timeString = rowData[dateColumnNameRoot + '_val']

            value = rowData[dateColumnNameRoot + '_val']
            # date is YYYY-MM-DD
            if len(value) == 10:
                timeString = value + 'T00:00:00Z'
                precisionNumber = 11 # precision to days
            # date is YYYY-MM
            elif len(value) == 7:
                timeString = value + '-00T00:00:00Z'
                precisionNumber = 10 # precision to months
            # date is YYYY
            elif len(value) == 4:
                timeString = value + '-00-00T00:00:00Z'
                precisionNumber = 9 # precision to years
            # date is xsd:dateTime and doesn't need adjustment
            elif len(value) == 20:
                timeString = value
                precisionNumber = 11 # assume precision to days since Wikibase doesn't support greater resolution than that
            # date form unknown, don't adjust
            else:
                print('Warning: date for ' + dateColumnNameRoot + '_val:', rowData[dateColumnNameRoot + '_val'], 'does not conform to any standard format! Check manually.')
                error = True
            # assign the changed values back to the dict
            rowData[dateColumnNameRoot + '_val'] = timeString
            rowData[dateColumnNameRoot + '_prec'] = precisionNumber
        else:
            # a pre-existing precisionNumber must be an integer when written to the API
            rowData[dateColumnNameRoot + '_prec'] = int(rowData[dateColumnNameRoot + '_prec'])

        # If there is no UUID in the _nodeId column, generate one
        if rowData[dateColumnNameRoot + '_nodeId'] == '':
            rowData[dateColumnNameRoot + '_nodeId'] = str(uuid.uuid4())


    return rowData, error

'''
# Function to create reference value for times
def createTimeReferenceValue(value):
    # date is YYYY-MM-DD
    if len(value) == 10:
        timeString = '+' + value + 'T00:00:00Z'
        precisionNumber = 11 # precision to days
    # date is YYYY-MM
    elif len(value) == 7:
        timeString = '+' + value + '-00T00:00:00Z'
        precisionNumber = 10 # precision to months
    # date is YYYY
    elif len(value) == 4:
        timeString = '+' + value + '-00-00T00:00:00Z'
        precisionNumber = 9 # precision to years
    # date form unknown, don't adjust
    else:
        # 2020-07-15 note: Previously, the leading + was included with the the table value.
        # However, in order for the csv2rdf schema to be valid, the + must not be included in the tabled value. So it is added here.
        #timeString = value
        timeString = '+' + value
        precisionNumber = 11 # assume precision to days
        
    # Q1985727 is the Gregorian calendar
    # 2020-07-15 note: Previously, the leading + was included with the the table value.
    # However, in order for the csv2rdf schema to be valid, the + must not be included in the tabled value. So it is added here.
    dateDict = {
            'time': '+' + timeString,
            'timezone': 0,
            'before': 0,
            'after': 0,
            'precision': precisionNumber,
            'calendarmodel': "http://www.wikidata.org/entity/Q1985727"
            }
    return dateDict
'''

# Find the column with the UUID for the statement
def findPropertyUuid(propertyId, columns):
    statementUuidColumn = '' # start value as empty string in case no UUID column
    for column in columns:
        if not('suppressOutput' in column):
            # find the valueUrl in the column for which the value of the statement has the prop version of the property as its propertyUrl
            if 'prop/' + propertyId in column['propertyUrl']:
                temp = column['valueUrl'].partition('-{')[2]
                statementUuidColumn = temp.partition('}')[0] # in the event of two columns with the same property ID, the last one is used
                #print(statementUuidColumn)
    
    # Give a warning if there isn't any UUID column for the property
    if statementUuidColumn == '':
        print('Warning: No UUID column for property ' + propertyId)
    return statementUuidColumn

# Each property can have zero to many references. This function searches the column headers to find all of
# the columns that are references for a particulary property used in statements
def findReferencesForProperty(statementUuidColumn, columns):
    # build up a list of dictionaries about references to associate with the property
    referenceList = []

    # Step through the columns looking for references associated with the property
    for column in columns:
        if not('suppressOutput' in column):
            # check if the aboutUrl for the column has the statement subject UUID column as the about value and that the propertyUrl value is wasDerivedFrom
            if ('prov:wasDerivedFrom' in column['propertyUrl']) and (statementUuidColumn in column['aboutUrl']):
                temp = column['valueUrl'].partition('{')[2]
                refHashColumn = temp.partition('}')[0]
                #print(refHashColumn)

                # These are the lists that will accumulate data about each property of the reference
                refPropList = [] # P ID for the property
                refValueColumnList = [] # column header string for the reference property's value
                refEntityOrLiteral = [] # values: entity or literal, determined by presence of a valueUrl key for the column
                refTypeList = [] # the datatype of the property's value: url, time, or string
                refValueTypeList = [] # the specific type of a string: time or string
                # The kind of value in the column (dateTime, string) can be retrieved directly from the column 'datatype' value
                
                # Now step throught the columns looking for each of the properties that are associated with the reference
                for propColumn in columns:
                    if not('suppressOutput' in propColumn):
                        # Find the columns that have the refHash column name in the aboutUrl
                        if refHashColumn in propColumn['aboutUrl']:
                            
                            # Determine whether the value of the reference is a value node (e.g. dates) or a direct value
                            valueString = propColumn['propertyUrl'].partition('prop/reference/')[2]
                            if "value" in valueString: # e.g. value/P813
                                # The property IRI namespace for references with value nodes is http://www.wikidata.org/prop/reference/value/
                                refPropList.append(valueString.partition('value/')[2])
                                # The column title will be something like employer_ref1_retrieved_nodeId, 
                                # so get the root of the string to the left of "_nodeId"
                                refValueColumnList.append(propColumn['titles'].partition('_nodeId')[0])

                                # Find out what kind of value node it is. Currently supported is date; future: globe coordinate value and quantities
                                for testColumn in columns:
                                    try:
                                        if propColumn['titles'] in testColumn['aboutUrl']:
                                            if 'timeValue' in testColumn['propertyUrl']: # value is a date
                                                refEntityOrLiteral.append('value')
                                                refTypeList.append('time')
                                                refValueTypeList.append('time')
                                            elif 'geoLatitude' in testColumn['propertyUrl']: # value is a globe coordinate value
                                                pass
                                            elif 'quantityAmount' in testColumn['propertyUrl']: # value is a quantity
                                                pass
                                            else:
                                                continue
                                    except:
                                        pass
                            else: # e.g. P854
                                # The property IRI namespace for references with direct values is http://www.wikidata.org/prop/reference/
                                refPropList.append(valueString)
                                # Just use the whole column title
                                refValueColumnList.append(propColumn['titles'])

                                if 'valueUrl' in propColumn:
                                    # URIs are detected when there is a valueUrl whose value has a first character of "{"
                                    if propColumn['valueUrl'][0] == '{':
                                        refEntityOrLiteral.append('literal')
                                        refTypeList.append('url')
                                        refValueTypeList.append('string')
                                    else:
                                        refEntityOrLiteral.append('entity')
                                        refTypeList.append('wikibase-item')
                                        refValueTypeList.append('wikibase-entityid')
                                else:
                                    refTypeList.append('string')
                                    refValueTypeList.append('string')
                
                # After all of the properties have been found and their data have been added to the lists, 
                # insert the lists into the reference list as values in a dictionary
                referenceList.append({'refHashColumn': refHashColumn, 'refPropList': refPropList, 'refValueColumnList': refValueColumnList, 'refEntityOrLiteral': refEntityOrLiteral, 'refTypeList': refTypeList, 'refValueTypeList': refValueTypeList})
        
    # After every column has been searched for references associated with the property, return the reference list
    #print('References: ', json.dumps(referenceList, indent=2))
    return referenceList


# Each property can have zero to many qualifiers. This function searches the column headers to find all of
# the columns that are qualifiers for a particulary property
def findQualifiersForProperty(statementUuidColumn, columns):

    # These are the lists that will accumulate data about each qualifier
    qualPropList = [] # P ID for the property
    qualValueColumnList = [] # column header string for the reference property's value
    qualEntityOrLiteral = [] # values: entity or literal, determined by presence of a valueUrl key for the column
    qualTypeList = [] # the datatype of the qualifier's value: url, time, or string
    qualValueTypeList = [] # the specific type of a string: time or string
    # The kind of value in the column (dateTime, string) can be retrieved directly from the column 'datatype' value

    for column in columns:
        if not('suppressOutput' in column):
            # find the column that has the statement UUID in the about
            # and the property is a qualifier property
            if (statementUuidColumn in column['aboutUrl']) and ('qualifier' in column['propertyUrl']):
                # Determine whether the value of the qualifier is a value node (e.g. dates) or a direct value
                valueString = column['propertyUrl'].partition('prop/qualifier/')[2]
                if "value" in valueString: # e.g. value/P580
                    # The property IRI namespace for qualifiers with value nodes is http://www.wikidata.org/prop/qualifier/value/
                    qualPropList.append(valueString.partition('value/')[2])
                    # The column title will be something like employer_startDate_nodeId, 
                    # so get the root of the string to the left of "_nodeId"
                    qualValueColumnList.append(column['titles'].partition('_nodeId')[0])

                    # Find out what kind of value node it is. Currently supported is date; future: globe coordinate value and quantities
                    for testColumn in columns:
                        try:
                            if column['titles'] in testColumn['aboutUrl']:
                                if 'timeValue' in testColumn['propertyUrl']: # value is a date
                                    qualEntityOrLiteral.append('value')
                                    qualTypeList.append('time')
                                    qualValueTypeList.append('time')
                                elif 'geoLatitude' in testColumn['propertyUrl']: # value is a globe coordinate value
                                    pass
                                elif 'quantityAmount' in testColumn['propertyUrl']: # value is a quantity
                                    pass
                                else:
                                    continue
                        except:
                            pass
                else: # e.g. P1545
                    # The property IRI namespace for qualifiers with direct values is http://www.wikidata.org/prop/qualifier/
                    qualPropList.append(valueString)
                    # Just use the whole column title
                    qualValueColumnList.append(column['titles'])

                    # determine whether the qualifier is an entity/URI or string
                    if 'valueUrl' in column:
                        # URIs are detected when there is a valueUrl whose value has a first character of "{"
                        if column['valueUrl'][0] == '{':
                            qualEntityOrLiteral.append('literal')
                            qualTypeList.append('url')
                            qualValueTypeList.append('string')
                        else:
                            qualEntityOrLiteral.append('entity')
                            qualTypeList.append('wikibase-item')
                            qualValueTypeList.append('wikibase-entityid')
                    else:
                        qualTypeList.append('string')
                        qualValueTypeList.append('string')

    # After all of the qualifier columns are found for the property, create a dictionary to pass back
    qualifierDictionary = {'qualPropList': qualPropList, 'qualValueColumnList': qualValueColumnList, "qualEntityOrLiteral": qualEntityOrLiteral, 'qualTypeList': qualTypeList, 'qualValueTypeList': qualValueTypeList}
    #print('Qualifiers: ', json.dumps(qualifierDictionary, indent=2))
    return(qualifierDictionary)

# The form of snaks is the same for references and qualifiers, so they can be generated systematically
# Although the variable names include "ref", they apply the same to the analagous "qual" variables.
def generateSnaks(snakDictionary, require_references, refValue, refPropNumber, refPropList, refValueColumnList, refValueTypeList, refTypeList, refEntityOrLiteral):
    if not(refValue):  # evaluates both empty strings for direct values or empty dict for node-valued values
        if require_references: # Do not write the record if it's missing a reference.
            print('Reference value missing! Cannot write the record.')
            sys.exit()
    else:
        if refEntityOrLiteral[refPropNumber] == 'value':
            # Currently time is the only kind of value node handled
            if refTypeList[refPropNumber] == 'time':
                snakDictionary[refPropList[refPropNumber]] = [
                    {
                    'snaktype': 'value',
                    'property': refPropList[refPropNumber],
                    'datavalue':{
                        'value': {
                            'time': '+' + refValue['timeValue'],
                            'timezone': 0,
                            'before': 0,
                            'after': 0,
                            'precision': refValue['timePrecision'],
                            'calendarmodel': "http://www.wikidata.org/entity/Q1985727"
                            },
                        'type': 'time'
                        },
                    'datatype': 'time'
                    }
                ]

            # In the future handle other types here
            else:
                pass

        elif refEntityOrLiteral[refPropNumber] == 'entity':
            # case where the value is an entity
            snakDictionary[refPropList[refPropNumber]] = [
                {
                'snaktype': 'value',
                'property': refPropList[refPropNumber],
                'datavalue': {
                    'value': {
                        'id': refValue
                        },
                    'type': 'wikibase-entityid'
                    },
                'datatype': 'wikibase-item'
                }
            ]
        else:
            # case where value is a string of some kind
            snakDictionary[refPropList[refPropNumber]] = [
                {
                'snaktype': 'value',
                'property': refPropList[refPropNumber],
                'datavalue': {
                    'value': refValue,
                    'type': refValueTypeList[refPropNumber]
                },
                'datatype': refTypeList[refPropNumber]
                }
            ]
    return snakDictionary

# If there are references for a statement, return a reference list
def createReferences(referenceListForProperty, rowData):
    referenceListToReturn = []
    for referenceDict in referenceListForProperty:
        refPropList = referenceDict['refPropList']
        refValueColumnList = referenceDict['refValueColumnList']
        refValueTypeList = referenceDict['refValueTypeList']
        refTypeList = referenceDict['refTypeList']
        refEntityOrLiteral = referenceDict['refEntityOrLiteral']

        snakDictionary = {}
        for refPropNumber in range(0, len(refPropList)):
            if refEntityOrLiteral[refPropNumber] == 'value':
                # value nodes with no nodeId should be considered to have no value
                if rowData[refValueColumnList[refPropNumber] + '_nodeId'] == '':
                    refValue = {}
                else:
                    # currently time is the only supported node-valued type
                    if refTypeList[refPropNumber] == 'time':
                        refValue = {'timeValue': rowData[refValueColumnList[refPropNumber] + '_val'], 'timePrecision': rowData[refValueColumnList[refPropNumber] + '_prec']}
                    # other node-valued types will be handled here
                    else:
                        pass
            else:
                refValue = rowData[refValueColumnList[refPropNumber]]
            snakDictionary = generateSnaks(snakDictionary, require_references, refValue, refPropNumber, refPropList, refValueColumnList, refValueTypeList, refTypeList, refEntityOrLiteral)
        if snakDictionary != {}: # If any references were added, create the outer dict and add to list
            outerSnakDictionary = {
                'snaks': snakDictionary
            }
            referenceListToReturn.append(outerSnakDictionary)
    return referenceListToReturn


# NOTE: this differs from the createReferences function in that it returns
# a dictionary of snaks for a single reference, NOT a list for many references
def createReferenceSnak(referenceDict, rowData):
    refPropList = referenceDict['refPropList']
    refValueColumnList = referenceDict['refValueColumnList']
    refValueTypeList = referenceDict['refValueTypeList']
    refTypeList = referenceDict['refTypeList']
    refEntityOrLiteral = referenceDict['refEntityOrLiteral']
    
    snakDictionary = {}
    for refPropNumber in range(0, len(refPropList)):
        if refEntityOrLiteral[refPropNumber] == 'value':
            # value nodes with no nodeId should be considered to have no value
            if rowData[refValueColumnList[refPropNumber] + '_nodeId'] == '':
                refValue = {}
            else:
                # currently time is the only supported node-valued type
                if refTypeList[refPropNumber] == 'time':
                    refValue = {'timeValue': rowData[refValueColumnList[refPropNumber] + '_val'], 'timePrecision': rowData[refValueColumnList[refPropNumber] + '_prec']}
                # other node-valued types will be handled here
                else:
                    pass
        else:
            refValue = rowData[refValueColumnList[refPropNumber]]
        snakDictionary = generateSnaks(snakDictionary, require_references, refValue, refPropNumber, refPropList, refValueColumnList, refValueTypeList, refTypeList, refEntityOrLiteral)
    #print(json.dumps(snakDictionary, indent = 2))
    return snakDictionary


# If there are qualifiers for a statement, return a qualifiers dictionary
def createQualifiers(qualifierDictionaryForProperty, rowData):
    qualPropList = qualifierDictionaryForProperty['qualPropList']
    qualValueColumnList = qualifierDictionaryForProperty['qualValueColumnList']
    qualTypeList = qualifierDictionaryForProperty['qualTypeList']
    qualValueTypeList = qualifierDictionaryForProperty['qualValueTypeList']
    qualEntityOrLiteral = qualifierDictionaryForProperty['qualEntityOrLiteral']
    snakDictionary = {}
    for qualPropNumber in range(0, len(qualPropList)):
        if qualEntityOrLiteral[qualPropNumber] == 'value':
            # value nodes with no nodeId should be considered to have no value
            if rowData[qualValueColumnList[qualPropNumber] + '_nodeId'] == '':
                qualValue = {}
            else:
                # currently time is the only supported node-valued type
                if qualTypeList[qualPropNumber] == 'time':
                    qualValue = {'timeValue': rowData[qualValueColumnList[qualPropNumber] + '_val'], 'timePrecision': rowData[qualValueColumnList[qualPropNumber] + '_prec']}
                # other node-valued types will be handled here
                else:
                    pass
        else:
            qualValue = rowData[qualValueColumnList[qualPropNumber]]
        snakDictionary = generateSnaks(snakDictionary, require_qualifiers, qualValue, qualPropNumber, qualPropList, qualValueColumnList, qualValueTypeList, qualTypeList, qualEntityOrLiteral)
    return snakDictionary


# This function attempts to post and handles maxlag errors
def attemptPost(apiUrl, parameters):
    maxRetries = 10
    baseDelay = 5 # Wikidata recommends a delay of at least 5 seconds
    delayLimit = 300
    retry = 0
    # maximum number of times to retry lagged server = maxRetries
    while retry <= maxRetries:
        if retry > 0:
            print('retry:', retry)
        r = session.post(apiUrl, data = parameters)
        data = r.json()
        try:
            # check if response is a maxlag error
            # see https://www.mediawiki.org/wiki/Manual:Maxlag_parameter
            if data['error']['code'] == 'maxlag':
                print('Lag of ', data['error']['lag'], ' seconds.')
                # recommended delay is basically useless
                # recommendedDelay = int(r.headers['Retry-After'])
                #if recommendedDelay < 5:
                    # recommendation is to wait at least 5 seconds if server is lagged
                #    recommendedDelay = 5
                recommendedDelay = baseDelay*2**retry # double the delay with each retry 
                if recommendedDelay > delayLimit:
                    recommendedDelay = delayLimit
                if retry != maxRetries:
                    print('Waiting ', recommendedDelay , ' seconds.')
                    print()
                    sleep(recommendedDelay)
                retry += 1

                # after this, go out of if and try code blocks
            else:
                # an error code is returned, but it's not maxlag
                return data
        except:
            # if the response doesn't have an error key, it was successful, so return
            return data
        # here's where execution goes after the delay
    # here's where execution goes after maxRetries tries
    print('Failed after ' + str(maxRetries) + ' retries.')
    exit() # just abort the script


In [72]:
# ----------------------------------------------------------------
# authentication

# This is the format of the wikibase_credentials.txt file. Username and password
# are for a bot that you've created.  Save file in your home directory.
# Set your own User-Agent header. Do not use the one listed here
# See https://meta.wikimedia.org/wiki/User-Agent_policy
'''
endpointUrl=https://test.wikidata.org
username=User@bot
password=465jli90dslhgoiuhsaoi9s0sj5ki3lo
userAgentHeader=YourBot/0.1 (someuser@university.edu)
'''

# default API resource URL when a Wikibase/Wikidata instance is installed.
resourceUrl = '/w/api.php'

home = str(Path.home()) # gets path to home directory; supposed to work for Win and Mac
credentialsFilename = 'wikibase_credentials.txt'
credentialsPath = home + '/' + credentialsFilename
credentials = retrieveCredentials(credentialsPath)
endpointUrl = credentials[0] + resourceUrl
user = credentials[1]
pwd = credentials[2]
userAgentHeader = credentials[3]

# Instantiate session outside of any function so that it's globally accessible.
session = requests.Session()
# Set default User-Agent header so you don't have to send it with every request
session.headers.update({'User-Agent': userAgentHeader})


loginToken = getLoginToken(endpointUrl)
data = logIn(endpointUrl, loginToken, user, pwd)
csrfToken = getCsrfToken(endpointUrl)

# -------------------------------------------
# Beginning of script to process the tables

# There are options to require values for every mapped reference column or every mapped qualifier column.
# By default, these are turned off, but they can be turned on by changing these flags:
require_references = False
require_qualifiers = False

# Set the value of the maxlag parameter to back off when the server is lagged
# see https://www.mediawiki.org/wiki/Manual:Maxlag_parameter
# The recommended value is 5 seconds.
# To not use maxlang, set the value to 0
# To test the maxlag handler code, set maxlag to a very low number like .1
maxlag = 5

# This is the schema that maps the CSV column to Wikidata properties
with open('csv-metadata.json', 'rt', encoding='utf-8') as fileObject:
    text = fileObject.read()
metadata = json.loads(text)

tables = metadata['tables']

table = tables[0]

In [73]:
tableFileName = table['url']
print('File name: ', tableFileName)
tableData = readDict(tableFileName)

# we are opening the file as a csv.reader object as the easy way to get the header row as a list
fileObject = open(tableFileName, 'r', newline='', encoding='utf-8')
readerObject = csv.reader(fileObject)
for row in readerObject:
    fieldnames = row
    break # we only nead the header row, so break after the first loop
fileObject.close()

columns = table['tableSchema']['columns']

subjectWikidataIdName = ''
# assume each row is primarily about an entity
# step through the columns until there is an aboutUrl for an entity
for column in columns:
    # check only columns that have an aboutUrl key
    if 'aboutUrl' in column:
        # the value ouf the aboutUrl must be an entity
        if 'entity/{' in column['aboutUrl']:
            # extract the column name of the subject resource from the URI template
            temp = column['aboutUrl'].partition('{')[2]
            subjectWikidataIdName = temp.partition('}')[0]
            # don't worry about repeatedly replacing subjectWikidataIdName as long as the row is only about one entity            
#print(subjectWikidataIdName)

# make lists of the columns for each kind of property
labelColumnList = []
labelLanguageList = []
aliasColumnList = []
aliasLanguageList = []
descriptionColumnList = []
descriptionLanguageList = []
propertiesColumnList = []
propertiesUuidColumnList = []
propertiesEntityOrLiteral = [] # determines whether value of property is an "entity" (i.e. item) or "literal" (which includes strings, dates, and URLs that aren't actually literals)
propertiesIdList = []
propertiesTypeList = [] # the 'datatype' given to a mainsnak. Currently supported types are: "wikibase-item", "url", "time", or "string"
propertiesValueTypeList = [] # the 'type' given to values of 'datavalue' in the mainsnak. Can be "wikibase-entityid", "string" or "time" 
propertiesReferencesList = []
propertiesQualifiersList = []

# step through all of the columns and sort their headers into the appropriate list

# find the column whose name matches the URI template for the aboutUrl (only one)
for column in columns:
    if column['name'] == subjectWikidataIdName:
        subjectWikidataIdColumnHeader = column['titles']
        print('Subject column: ', subjectWikidataIdColumnHeader)

# create a list of the entities that have Wikidata qIDs
qIds = []
for entity in tableData:
    if entity[subjectWikidataIdColumnHeader] != '':
        qIds.append(entity[subjectWikidataIdColumnHeader])

existingLabels = [] # a list to hold lists of labels in various languages
existingDescriptions = [] # a list to hold lists of descriptions in various languages
existingAliases = [] # a list to hold lists of lists of aliases in various languages
for column in columns:

    # special handling for alias column
    # In order to allow for multiple aliases to be listed as a JSON string, the alias column is handled idiosyncratically and
    # not as with the labels and description columns. It must me named exactly "alias" and have output suppressed.
    # This hack allows aliases to be processed by the script, but also to allow a csv2rdf to serialize the CSV data as valid RDF.
    # However, it limits aliases to a single language.
    if 'suppressOutput' in column:
        # find columns that contain aliases and ignor any others with suppressOutput
        # GUI calls it "Also known as"; RDF as skos:altLabel
        if column['name'] == 'alias':
            altLabelColumnHeader = column['titles']
            altLabelLanguage = column['lang']
            print('Alternate label column: ', altLabelColumnHeader, ', language: ', altLabelLanguage)
            aliasColumnList.append(altLabelColumnHeader)
            aliasLanguageList.append(altLabelLanguage)

            # retrieve the aliases in that language that already exist in Wikidata and match them with table rows
            languageAliases = []
            aliasesAtWikidata = searchLabelsDescriptionsAtWikidata(qIds, 'alias', altLabelLanguage)
            for entityIndex in range(0, len(tableData)):
                personAliasList = []
                if tableData[entityIndex][subjectWikidataIdColumnHeader] != '':  # don't look for the label at Wikidata if the item doesn't yet exist
                    for wikiLabel in aliasesAtWikidata:
                        if tableData[entityIndex][subjectWikidataIdColumnHeader] == wikiLabel['qId']:
                            personAliasList.append(wikiLabel['string'])
                # if not found, the personAliasList list will remain empty
                languageAliases.append(personAliasList)

            # add all of the found aliases for that language to the list of aliases in various languages
            existingAliases.append(languageAliases)
    # handle all other non-suppressed columns.
    else:

        # find the columns (if any) that provide labels
        if column['propertyUrl'] == 'rdfs:label':
            labelColumnHeader = column['titles']
            labelLanguage = column['lang']
            print('Label column: ', labelColumnHeader, ', language: ', labelLanguage)
            labelColumnList.append(labelColumnHeader)
            labelLanguageList.append(labelLanguage)

            # retrieve the labels in that language that already exist in Wikidata and match them with table rows
            tempLabels = []
            labelsAtWikidata = searchLabelsDescriptionsAtWikidata(qIds, 'label', labelLanguage)
            for entityIndex in range(0, len(tableData)):
                found = False
                if tableData[entityIndex][subjectWikidataIdColumnHeader] != '':  # don't look for the label at Wikidata if the item doesn't yet exist
                    for wikiLabel in labelsAtWikidata:
                        if tableData[entityIndex][subjectWikidataIdColumnHeader] == wikiLabel['qId']:
                            found = True
                            tempLabels.append(wikiLabel['string'])
                            break # stop looking if there is a match
                if not found:
                    tempLabels.append('')

            # add all of the found labels for that language to the list of labels in various languages
            existingLabels.append(tempLabels)

        # find columns that contain descriptions
        # Note: if descriptions exist for a language, they will be overwritten
        elif column['propertyUrl'] == 'schema:description':
            descriptionColumnHeader = column['titles']
            descriptionLanguage = column['lang']
            print('Description column: ', descriptionColumnHeader, ', language: ', descriptionLanguage)
            descriptionColumnList.append(descriptionColumnHeader)
            descriptionLanguageList.append(descriptionLanguage)

            # retrieve the descriptions in that language that already exist in Wikidata and match them with table rows
            tempLabels = []
            descriptionsAtWikidata = searchLabelsDescriptionsAtWikidata(qIds, 'description', descriptionLanguage)
            for entityIndex in range(0, len(tableData)):
                found = False
                if tableData[entityIndex][subjectWikidataIdColumnHeader] != '':  # don't look for the label at Wikidata if the item doesn't yet exist
                    for wikiDescription in descriptionsAtWikidata:
                        if tableData[entityIndex][subjectWikidataIdColumnHeader] == wikiDescription['qId']:
                            found = True
                            tempLabels.append(wikiDescription['string'])
                            break # stop looking if there is a match
                if not found:
                    tempLabels.append('')

            # add all of the found labels for that language to the list of labels in various languages
            existingDescriptions.append(tempLabels)

        # find columns that contain properties with entity values, literal values that are URLs, or value node values
        elif 'valueUrl' in column:
            # only add columns that have "statement" properties
            if 'prop/statement/' in column['propertyUrl']:
                if 'prop/statement/value/' in column['propertyUrl']: # value is a value node (e.g. date or geo coordinates)
                    found = True
                    propColumnHeader = column['titles'].partition('_nodeId')[0] # save only the root of the column name for value nodes
                    propertyId = column['propertyUrl'].partition('prop/statement/value/')[2]
                    propertiesColumnList.append(propColumnHeader)
                    propertiesIdList.append(propertyId)
                    propertiesEntityOrLiteral.append('value')
                    # Find out what kind of value node it is. Currently supported is date; future: globe coordinate value and quantities
                    for testColumn in columns:
                        try:
                            if column['titles'] in testColumn['aboutUrl']:
                                if 'timeValue' in testColumn['propertyUrl']: # value is a date
                                    propKind = 'time'
                                    propertiesTypeList.append('time')
                                    propertiesValueTypeList.append('time')
                                elif 'geoLatitude' in testColumn['propertyUrl']: # value is a globe coordinate value
                                    propKind = 'geocoordinates'
                                elif 'quantityAmount' in testColumn['propertyUrl']: # value is a quantity
                                    propKind = 'quantity'
                                else:
                                    continue
                                print('Property column: ', propColumnHeader, ', Property ID: ', propertyId, ' Value type: ', propKind)
                        except:
                            pass

                else:
                    propColumnHeader = column['titles']
                    propertyId = column['propertyUrl'].partition('prop/statement/')[2]
                    propertiesColumnList.append(propColumnHeader)
                    propertiesIdList.append(propertyId)

                    # URLs are detected when there is a valueUrl whose value has a first character of "{"
                    if column['valueUrl'][0] == '{':
                        propertiesEntityOrLiteral.append('literal')
                        propertiesTypeList.append('url')
                        propertiesValueTypeList.append('string')
                        print('Property column: ', propColumnHeader, ', Property ID: ', propertyId, ' Value type: url')
                    # Otherwise having a valueUrl indicates that it's an item
                    else:
                        propertiesEntityOrLiteral.append('entity')
                        propertiesTypeList.append('wikibase-item')
                        propertiesValueTypeList.append('wikibase-entityid')
                        print('Property column: ', propColumnHeader, ', Property ID: ', propertyId, ' Value type: item')

                propertyUuidColumn = findPropertyUuid(propertyId, columns)
                propertiesUuidColumnList.append(propertyUuidColumn)
                propertiesReferencesList.append(findReferencesForProperty(propertyUuidColumn, columns))
                propertiesQualifiersList.append(findQualifiersForProperty(propertyUuidColumn, columns))
                print()

        # remaining columns should have properties with literal values
        else:
            # only add columns that have "statement" properties
            if 'prop/statement/' in column['propertyUrl']:
                propColumnHeader = column['titles']
                propertyId = column['propertyUrl'].partition('prop/statement/')[2]
                print('Property column: ', propColumnHeader, ', Property ID: ', propertyId, ' Value type: string')
                propertiesColumnList.append(propColumnHeader)
                propertiesIdList.append(propertyId)

                propertiesEntityOrLiteral.append('literal')
                propertiesTypeList.append('string')
                propertiesValueTypeList.append('string')

                propertyUuidColumn = findPropertyUuid(propertyId, columns)
                propertiesUuidColumnList.append(propertyUuidColumn)
                propertiesReferencesList.append(findReferencesForProperty(propertyUuidColumn, columns))
                propertiesQualifiersList.append(findQualifiersForProperty(propertyUuidColumn, columns))
                print()
print()


File name:  bluffton.csv
Subject column:  wikidataId
Label column:  labelEn , language:  en
Alternate label column:  alias , language:  en
Description column:  description , language:  en
Property column:  orcid , Property ID:  P496  Value type: string

Property column:  employer , Property ID:  P108  Value type: item

Property column:  affiliation , Property ID:  P1416  Value type: item

Property column:  instanceOf , Property ID:  P31  Value type: item

Property column:  sexOrGenderQId , Property ID:  P21  Value type: item




In [74]:
# If there are dates in the table that are not in the format Wikibase requires, they will be converted here
print('converting dates')

# Figure out the column name roots for column sets that are dates
dateColumnNameList = []
if len(propertiesColumnList) > 0:
    for propertyNumber in range(0, len(propertiesColumnList)):
        if propertiesTypeList[propertyNumber] == 'time':
            #print('property with date:', propertiesColumnList[propertyNumber])
            dateColumnNameList.append(propertiesColumnList[propertyNumber])

        if len(propertiesReferencesList[propertyNumber]) != 0:
            for qualPropNumber in range(0, len(propertiesQualifiersList[propertyNumber]['qualPropList'])):
                if propertiesQualifiersList[propertyNumber]['qualTypeList'][qualPropNumber] == 'time':
                    #print('qualifier property with date:', propertiesQualifiersList[propertyNumber]['qualValueColumnList'][qualPropNumber])
                    dateColumnNameList.append(propertiesQualifiersList[propertyNumber]['qualValueColumnList'][qualPropNumber])

        if len(propertiesReferencesList[propertyNumber]) != 0:
            for referenceNumber in range(0, len(propertiesReferencesList[propertyNumber])):
                for refPropNumber in range(0, len(propertiesReferencesList[propertyNumber][referenceNumber]['refPropList'])):
                    if propertiesReferencesList[propertyNumber][referenceNumber]['refTypeList'][refPropNumber] == 'time':
                        #print('reference property with date:', propertiesReferencesList[propertyNumber][referenceNumber]['refValueColumnList'][refPropNumber])
                        dateColumnNameList.append(propertiesReferencesList[propertyNumber][referenceNumber]['refValueColumnList'][refPropNumber])
#print(dateColumnNameList)

errorFlag = False
for rowNumber in range(0, len(tableData)):
    #print('row: ' + str(rowNumber))
    #print(tableData[rowNumber])
    for dateColumnName in dateColumnNameList:
        tableData[rowNumber], error = convertDates(tableData[rowNumber], dateColumnName)
        if error:
            errorFlag = True
    #print()        
    #print(tableData[rowNumber])
    #print()

# Write the file with the converted dates in case the script crashes
writeToFile(tableFileName, fieldnames, tableData)

# If any of the date formats in the table were bad, don't try to write to the API
if errorFlag:
    sys.exit('Fix incorrectly formatted dates in file and restart')
print()


converting dates



In [75]:
# process each row of the table for item writing
print('Writing items')
print('--------------------------')
print()

rowNumber = 0

status_message = 'processing row: ' + str(rowNumber)
if len(labelColumnList) > 0: # skip printing a label if there aren't any
    status_message += '  Label: ' + tableData[rowNumber][labelColumnList[0]] # include the first label available
if tableData[rowNumber][subjectWikidataIdColumnHeader] != '': # only list existing record IDs
    status_message += '  qID: ' + tableData[rowNumber][subjectWikidataIdColumnHeader]
else:
    status_message += '  new record'
print(status_message)

# build the parameter string to be posted to the API
parameterDictionary = {
    'action': 'wbeditentity',
    'format':'json',
    'token': csrfToken
    }

if tableData[rowNumber][subjectWikidataIdColumnHeader] == '':
    newItem = True
    parameterDictionary['new'] = 'item'
else:
    newItem = False
    parameterDictionary['id'] = tableData[rowNumber][subjectWikidataIdColumnHeader]

# begin constructing the string for the "data" value by creating a data structure to be turned into JSON
# the examples are from https://www.wikidata.org/w/api.php?action=help&modules=wbeditentity
dataStructure = {}

if len(labelColumnList) > 0:
    # here's what we need to construct for labels:
    # data={"labels":{"de":{"language":"de","value":"de-value"},"en":{"language":"en","value":"en-value"}}}
    labelDict = {}
    for languageNumber in range(0, len(labelColumnList)):
        valueString = tableData[rowNumber][labelColumnList[languageNumber]]
        # if there is a new record with no Q ID...
        if newItem:
            # add the label in the table for that language to the label dictionary
            labelDict[labelLanguageList[languageNumber]] = {
                'language': labelLanguageList[languageNumber],
                'value': valueString
                }
        else:
            # not a new record, check if the value in the table is different from what's currently in Wikidata
            if valueString != existingLabels[languageNumber][rowNumber]:
                # if they are different check to make sure the table value isn't empty
                if valueString != '':
                    print('Changing label ', existingLabels[languageNumber][rowNumber], ' to ', valueString)
                    # add the label in the table for that language to the label dictionary
                    labelDict[labelLanguageList[languageNumber]] = {
                        'language': labelLanguageList[languageNumber],
                        'value': valueString
                        }
    if labelDict != {}:
        dataStructure['labels'] = labelDict

# the alias column contains a list. If the table has more aliases than currently in Wikidata, then update
if len(aliasColumnList) > 0:
    # no example, but follow the same pattern as labels
    aliasDict = {}
    # step through each language that has aliases
    for aliasColumnNumber in range(0, len(aliasColumnList)):
        valueList = json.loads(tableData[rowNumber][aliasColumnList[aliasColumnNumber]])
        # don't do anything if there are no alias values for that person
        if valueList != []:
            # perform an unordered comparison between the aliases currently in Wikidata and
            # the aliases in the CSV for that person. Don't do anything if they are the same.
            # NOTE: this is actually redundant with the > test that follows, but I'm leaving it here to remember
            # how to do an unordered comparison.  The > test might be replaced with something more sophisticated later
            if set(valueList) != set(existingAliases[languageNumber][rowNumber]):
                # only make a change if there are more aliases in the spreadsheet than currently in Wikidata
                if len(valueList) > len(existingAliases[languageNumber][rowNumber]):
                    print('')
                    # see https://www.mediawiki.org/wiki/Wikibase/DataModel/JSON#Labels,_Descriptions_and_Aliases
                    # for structure of aliases in JSON
                    aliasLangList = []
                    for aliasValue in valueList:
                        temp = {
                        'language': aliasLanguageList[aliasColumnNumber],
                        'value': aliasValue
                        }
                        aliasLangList.append(temp)
                    aliasDict[aliasLanguageList[aliasColumnNumber]] = aliasLangList
    if aliasDict != {}:
        dataStructure['aliases'] = aliasDict

if len(descriptionColumnList) > 0:
    # here's what we need to construct for descriptions:
    # data={"descriptions":{"nb":{"language":"nb","value":"nb-Description-Here"}}}
    descriptionDict = {}
    for languageNumber in range(0, len(descriptionColumnList)):
        valueString = tableData[rowNumber][descriptionColumnList[languageNumber]]
        # if there is a new record with no Q ID...
        if newItem:
            # add the description in the table for that language to the description dictionary
            descriptionDict[descriptionLanguageList[languageNumber]] = {
                'language': descriptionLanguageList[languageNumber],
                'value': valueString
                }
        else:
            # not a new record, check if the value in the table is different from what's currently in Wikidata
            if valueString != existingDescriptions[languageNumber][rowNumber]:
                # if they are different check to make sure the table value isn't empty
                if valueString != '':
                    print('Changing description ', existingDescriptions[languageNumber][rowNumber], ' to ', valueString)
                    # add the description in the table for that language to the description dictionary
                    descriptionDict[descriptionLanguageList[languageNumber]] = {
                        'language': descriptionLanguageList[languageNumber],
                        'value': valueString
                        }
    if descriptionDict != {}:
        dataStructure['descriptions'] = descriptionDict

# handle claims
if len(propertiesColumnList) > 0:
    claimsList = []

    # here's what we need to construct for literal valued properties:
    # data={"claims":[{"mainsnak":{"snaktype":"value","property":"P56","datavalue":{"value":"ExampleString","type":"string"}},"type":"statement","rank":"normal"}]}
    for propertyNumber in range(0, len(propertiesColumnList)):
        propertyId = propertiesIdList[propertyNumber]
        statementUuidColumn = propertiesUuidColumnList[propertyNumber]
        # If there is already a UUID, then don't write that property to the API
        if tableData[rowNumber][statementUuidColumn] != '':
            continue  # skip the rest of this iteration and go onto the next property

        # The columns whose properties have value node contain only the column name root, so must be handled differently
        if propertiesEntityOrLiteral[propertyNumber] == 'value':
            valueString = tableData[rowNumber][propertiesColumnList[propertyNumber] + '_val']
            if valueString == '':
                continue  # skip the rest of this iteration and go onto the next property
            # Currently time is the only kind of value node supported
            if propertiesTypeList[propertyNumber] == 'time':
                snakDict = {
                    'mainsnak': {
                        'snaktype': 'value',
                        'property': propertiesIdList[propertyNumber],
                        'datavalue':{
                            'value': {
                                'time': '+' + valueString,
                                'timezone': 0,
                                'before': 0,
                                'after': 0,
                                'precision': tableData[rowNumber][propertiesColumnList[propertyNumber] + '_prec'],
                                'calendarmodel': "http://www.wikidata.org/entity/Q1985727"
                                },
                            'type': 'time'
                            },
                        'datatype': 'time'
                        },
                    'type': 'statement',
                    'rank': 'normal'
                    }
            # If globe coordinate value or quantities become supported, they will be handled here.
            else:
                pass

        # For other property columns, the column name is stored directly in the propertiesColumnList
        else:
            valueString = tableData[rowNumber][propertiesColumnList[propertyNumber]]
            if valueString == '':
                continue  # skip the rest of this iteration and go onto the next property
            if propertiesEntityOrLiteral[propertyNumber] == 'literal':
                    snakDict = {
                    'mainsnak': {
                        'snaktype': 'value',
                        'property': propertiesIdList[propertyNumber],
                        'datavalue':{
                            'value': valueString,
                            'type': propertiesValueTypeList[propertyNumber]
                            },
                        'datatype': propertiesTypeList[propertyNumber]
                        },
                    'type': 'statement',
                    'rank': 'normal'
                    }

            elif propertiesEntityOrLiteral[propertyNumber] == 'entity':
                snakDict = {
                    'mainsnak': {
                        'snaktype': 'value',
                        'property': propertiesIdList[propertyNumber],
                        'datatype': 'wikibase-item',
                        'datavalue': {
                            'value': {
                                'id': valueString
                                },
                            'type': 'wikibase-entityid'
                            }
                        },
                    'type': 'statement',
                    'rank': 'normal'
                    }
            else:
                print('This should not happen')

        # Look for references and qualifiers for all properties whose values are being written
        if len(propertiesReferencesList[propertyNumber]) != 0:  # skip references if there aren't any
            references = createReferences(propertiesReferencesList[propertyNumber], tableData[rowNumber])
            if references != []: # check to avoid setting references for an empty reference list
                snakDict['references'] = references
        if len(propertiesQualifiersList[propertyNumber]['qualPropList']) != 0:
            qualifiers = createQualifiers(propertiesQualifiersList[propertyNumber], tableData[rowNumber])
            if qualifiers != {}: # check for situation where no qualifier statements were made for that record
                snakDict['qualifiers'] = qualifiers

        claimsList.append(snakDict)

    if claimsList != []:
        dataStructure['claims'] = claimsList

# The data value has to be turned into a JSON string
parameterDictionary['data'] = json.dumps(dataStructure)
print(json.dumps(dataStructure, indent = 2))
#print(parameterDictionary)


Writing items
--------------------------

processing row: 0  Label: Stanley R. Clemens  new record

{
  "labels": {
    "en": {
      "language": "en",
      "value": "Stanley R. Clemens"
    }
  },
  "aliases": {
    "en": [
      {
        "language": "en",
        "value": "Stan Clemens"
      }
    ]
  },
  "descriptions": {
    "en": {
      "language": "en",
      "value": "mathematician and educator"
    }
  },
  "claims": [
    {
      "mainsnak": {
        "snaktype": "value",
        "property": "P108",
        "datatype": "wikibase-item",
        "datavalue": {
          "value": {
            "id": "Q886141"
          },
          "type": "wikibase-entityid"
        }
      },
      "type": "statement",
      "rank": "normal",
      "references": [
        {
          "snaks": {
            "P854": [
              {
                "snaktype": "value",
                "property": "P854",
                "datavalue": {
                  "value": "https://www.bluffton.edu/cat

In [76]:
if maxlag > 0:
    parameterDictionary['maxlag'] = maxlag
responseData = attemptPost(endpointUrl, parameterDictionary)
responseDataSafe = responseData
#print('Write confirmation: ', responseData)
print()
print(json.dumps(responseData, indent=2))


{
  "entity": {
    "labels": {
      "en": {
        "language": "en",
        "value": "Stanley R. Clemens"
      }
    },
    "descriptions": {
      "en": {
        "language": "en",
        "value": "mathematician and educator"
      }
    },
    "aliases": {
      "en": [
        {
          "language": "en",
          "value": "Stan Clemens"
        }
      ]
    },
    "sitelinks": {},
    "claims": {
      "P108": [
        {
          "mainsnak": {
            "snaktype": "value",
            "property": "P108",
            "hash": "de571f9cebebaf7805f4c6836e9eb027b8d3f57f",
            "datavalue": {
              "value": {
                "entity-type": "item",
                "numeric-id": 886141,
                "id": "Q886141"
              },
              "type": "wikibase-entityid"
            },
            "datatype": "wikibase-item"
          },
          "type": "statement",
          "id": "Q101242960$B9EEE4CC-791F-4E53-9542-9671811179CE",
          "rank": "no

In [40]:
responseData = responseDataSafe

In [28]:
responseDataJson = '''{
  "entity": {
    "labels": {
      "en": {
        "language": "en",
        "value": "Michael David Edmiston"
      }
    },
    "descriptions": {
      "en": {
        "language": "en",
        "value": "physicist and educator"
      }
    },
    "aliases": {
      "en": [
        {
          "language": "en",
          "value": "Mike Edmiston"
        }
      ]
    },
    "sitelinks": {},
    "claims": {
      "P108": [
        {
          "mainsnak": {
            "snaktype": "value",
            "property": "P108",
            "hash": "de571f9cebebaf7805f4c6836e9eb027b8d3f57f",
            "datavalue": {
              "value": {
                "entity-type": "item",
                "numeric-id": 886141,
                "id": "Q886141"
              },
              "type": "wikibase-entityid"
            },
            "datatype": "wikibase-item"
          },
          "type": "statement",
          "id": "Q101242220$86C233F9-7E48-406D-9950-18C13D35FB7E",
          "rank": "normal",
          "references": [
            {
              "hash": "2c1963b96bde00545c55c48774c2aa8d09c47a97",
              "snaks": {
                "P854": [
                  {
                    "snaktype": "value",
                    "property": "P854",
                    "hash": "d15eebebef5e272314bb43d3e5d6a894ef1dc135",
                    "datavalue": {
                      "value": "https://www.bluffton.edu/catalog/officers/faculty.aspx",
                      "type": "string"
                    },
                    "datatype": "url"
                  }
                ],
                "P813": [
                  {
                    "snaktype": "value",
                    "property": "P813",
                    "hash": "bc92f3b0b3a4ac82c78818993bf9fe814aa6699b",
                    "datavalue": {
                      "value": {
                        "time": "+2020-11-06T00:00:00Z",
                        "timezone": 0,
                        "before": 0,
                        "after": 0,
                        "precision": 11,
                        "calendarmodel": "http://www.wikidata.org/entity/Q1985727"
                      },
                      "type": "time"
                    },
                    "datatype": "time"
                  }
                ]
              },
              "snaks-order": [
                "P854",
                "P813"
              ]
            }
          ]
        }
      ],
      "P31": [
        {
          "mainsnak": {
            "snaktype": "value",
            "property": "P31",
            "hash": "ad7d38a03cdd40cdc373de0dc4e7b7fcbccb31d9",
            "datavalue": {
              "value": {
                "entity-type": "item",
                "numeric-id": 5,
                "id": "Q5"
              },
              "type": "wikibase-entityid"
            },
            "datatype": "wikibase-item"
          },
          "type": "statement",
          "id": "Q101242220$7448739D-84D9-4241-90C1-53707122452D",
          "rank": "normal"
        }
      ],
      "P21": [
        {
          "mainsnak": {
            "snaktype": "value",
            "property": "P21",
            "hash": "85ad4b1c7348f7a5aac521135040d74e91fb5939",
            "datavalue": {
              "value": {
                "entity-type": "item",
                "numeric-id": 6581097,
                "id": "Q6581097"
              },
              "type": "wikibase-entityid"
            },
            "datatype": "wikibase-item"
          },
          "type": "statement",
          "id": "Q101242220$7149DDA2-CCE5-4DF9-AF11-B3EF192004DA",
          "rank": "normal"
        }
      ]
    },
    "id": "Q101242220",
    "type": "item",
    "lastrevid": 1303433739
  },
  "success": 1
}'''
responseData = json.loads(responseDataJson)

In [77]:
if newItem:
    # extract the entity Q number from the response JSON
    tableData[rowNumber][subjectWikidataIdColumnHeader] = responseData['entity']['id']

# fill into the table the values of newly created claims and references
for statementIndex in range(0, len(propertiesIdList)):
    print("csv statement number: ", statementIndex)
    referencesForStatement = propertiesReferencesList[statementIndex]
    #print(tableData[rowNumber][propertiesColumnList[statementIndex]])

    # need to find out if the value is empty. Value-node values must have their nodeId's checked. Otherwise, just check whether the cell is empty.
    if propertiesEntityOrLiteral[statementIndex] =='value':
        if tableData[rowNumber][propertiesColumnList[statementIndex] + '_nodeId'] == '':
            value = False
        else:
            value = True
    else:
        if tableData[rowNumber][propertiesColumnList[statementIndex]] == '':
            value = False
        else:
            value = True
    # only add the claim if the UUID cell for that row is empty AND there is a value for the property
    if tableData[rowNumber][propertiesUuidColumnList[statementIndex]] =='' and value:
        count = 0
        statementFound = False
        # If there are multiple values for a property, this will loop through more than one statement
        for statement in responseData['entity']['claims'][propertiesIdList[statementIndex]]:
            print()
            print(statement)

            # does the value in the cell equal the mainsnak value of the claim?
            # it's necessary to check this because there could be other previous claims for that property (i.e. multiple values)
            if propertiesEntityOrLiteral[statementIndex] == 'literal':
                statementFound = tableData[rowNumber][propertiesColumnList[statementIndex]] == statement['mainsnak']['datavalue']['value']
            elif propertiesEntityOrLiteral[statementIndex] == 'entity':
                statementFound = tableData[rowNumber][propertiesColumnList[statementIndex]] == statement['mainsnak']['datavalue']['value']['id']
            elif propertiesEntityOrLiteral[statementIndex] == 'value':
                if propertiesTypeList[statementIndex] == 'time':
                    # need to handle negative dates (BCE)
                    if tableData[rowNumber][propertiesColumnList[statementIndex] + '_val'][0] == '-':
                        # make comparison with the leading minus present
                        statementFound = tableData[rowNumber][propertiesColumnList[statementIndex] + '_val'] == statement['mainsnak']['datavalue']['value']['time']
                    else:
                        # must add leading plus (not stored in the table) to match the non-standard plus included by Wikibase
                        statementFound = ('+' + tableData[rowNumber][propertiesColumnList[statementIndex] + '_val']) == statement['mainsnak']['datavalue']['value']['time']
                else: # in the future, when other node value types are supported, the code here will need to be expanded to cover the other types
                    pass
            else:
                pass
            if statementFound:
                count += 1
                if count > 1:
                    # I don't think this should actually happen, since if there were already at least one statement with this value,
                    # it would have already been downloaded in the processing prior to running this script.
                    print('Warning: duplicate statement ', tableData[rowNumber][subjectWikidataIdColumnHeader], ' ', propertiesIdList[statementIndex], ' ', tableData[rowNumber][propertiesColumnList[statementIndex]])
                tableData[rowNumber][propertiesUuidColumnList[statementIndex]] = statement['id'].split('$')[1]  # just keep the UUID part after the dollar sign

                # Search for each reference type (set of reference properties) that's being tracked for a particular property's statements
                for tableReference in referencesForStatement: # loop will not be executed when length of referenceForStatement = 0 (no references tracked for this property)
                    # Check for an exact match of reference properties and their values (since we're looking for reference for a statement that was written)
                    # Step through each reference that came back for the statement we are interested in
                    for responseReference in statement['references']: # "outer loop"
                        print()
                        print("responseReference", responseReference)
                        # Perform a screening process on each returned reference by stepping through each property associated with a refernce type
                        # and trying to match it. If the path to the value doesn't exist, there will be an exception and that reference 
                        # can be ignored. Only if the values for all of the reference properties match will the hash be recorded.
                        referenceMatch = True
                        print('table reference ref prop list:', tableReference['refPropList'])
                        for referencePropertyIndex in range(0, len(tableReference['refPropList'])): # "inner loop" to check each property in the reference
                            print('reference property index:', referencePropertyIndex)
                            try:
                                # First try to see if the values in the response JSON for the property match
                                if tableReference['refEntityOrLiteral'][referencePropertyIndex] == 'value':
                                    print("here 1")
                                    # The values for times are buried a layer deeper in the JSON than other types.
                                    if tableReference['refTypeList'][referencePropertyIndex] == 'time':
                                        print("here 2")
                                        # need to handle negative dates (BCE)
                                        if tableData[rowNumber][tableReference['refValueColumnList'][referencePropertyIndex] + '_val'][0] == '-':
                                            print("here 3")
                                            # make comparison with the leading minus present
                                            if responseReference['snaks'][tableReference['refPropList'][referencePropertyIndex]][0]['datavalue']['value']['time'] != tableData[rowNumber][tableReference['refValueColumnList'][referencePropertyIndex] + '_val']:
                                                referenceMatch = False
                                                print("Kill #1")
                                                break # kill the inner loop because this value doesn't match
                                        else:
                                            print("here 4")
                                            # must add leading plus (not stored in the table) to match the non-standard plus included by Wikibase
                                            # Note that this assumes the first value for a particular reference property. It appears to be unusual for there to be more than one.
                                            if responseReference['snaks'][tableReference['refPropList'][referencePropertyIndex]][0]['datavalue']['value']['time'] != '+' + tableData[rowNumber][tableReference['refValueColumnList'][referencePropertyIndex] + '_val']:
                                                referenceMatch = False
                                                print("Kill #2")
                                                break # kill the inner loop because this value doesn't match
                                    else: # here is where node-valued types other than time will be handled
                                        pass
                                else: # Values for types other than node-valued have direct literal values of 'value'
                                    if responseReference['snaks'][tableReference['refPropList'][referencePropertyIndex]][0]['datavalue']['value'] != tableData[rowNumber][tableReference['refValueColumnList'][referencePropertyIndex]]:
                                        referenceMatch = False
                                        print("Kill #3")
                                        break # kill the inner loop because this value doesn't match
                                # So far, so good -- the value for this property matches
                            except:
                                # An exception occured because the JSON "path" to the value didn't match. So this isn't the right property
                                referenceMatch = False
                                print("Kill #4")
                                break # kill the inner loop because the property doesn't match

                            # OK, we got all the way through on this property with it and its value matching, so referenceMatch will still be True
                            # The inner loop can continue on to the next property to see if it and its value match.

                        # If we got to this point, the inner loop completed withoug being killed. referenceMatch should still be True
                        # So this is a match to the reference that we wrote and we need to grab the reference hash
                        print('adding the reference hash to the table', responseReference['hash'])
                        tableData[rowNumber][tableReference['refHashColumn']] = responseReference['hash']
                        # It is not necessary to continue on with the next iteration of the outer loop since we found the reference we wanted.
                        # So we can kill the outer loop with the value of referenceMatch being True
                        break

                    # At this point, the outer loop is finished. Either a response reference has matched or all response references have been checked.
                    # Since this check only happens for newly written statements, referenceMatch should always be True since the exact reference was written.
                    # But better give an error message if for some reason no reference matched.
                    if referenceMatch == False:
                        print('No reference in the response JSON matched with the reference for statement:', tableData[rowNumber][subjectWikidataIdColumnHeader], ' ', propertiesIdList[statementIndex], ' ', tableData[rowNumber][propertiesColumnList[statementIndex]])
                        print('Reference  ', tableReference)

                    # The script will now move on to checking the next reference in the table.

        # Print this error message only if there is not match to any of the values after looping through all of the matching properties
        # This should never happen because this code is only executed when the statement doesn't have a UUID (i.e. not previously written)
        if count == 0:
            print('did not find', tableData[rowNumber][propertiesColumnList[statementIndex]])



csv statement number:  0
csv statement number:  1

{'mainsnak': {'snaktype': 'value', 'property': 'P108', 'hash': 'de571f9cebebaf7805f4c6836e9eb027b8d3f57f', 'datavalue': {'value': {'entity-type': 'item', 'numeric-id': 886141, 'id': 'Q886141'}, 'type': 'wikibase-entityid'}, 'datatype': 'wikibase-item'}, 'type': 'statement', 'id': 'Q101242960$B9EEE4CC-791F-4E53-9542-9671811179CE', 'rank': 'normal', 'references': [{'hash': '2c1963b96bde00545c55c48774c2aa8d09c47a97', 'snaks': {'P854': [{'snaktype': 'value', 'property': 'P854', 'hash': 'd15eebebef5e272314bb43d3e5d6a894ef1dc135', 'datavalue': {'value': 'https://www.bluffton.edu/catalog/officers/faculty.aspx', 'type': 'string'}, 'datatype': 'url'}], 'P813': [{'snaktype': 'value', 'property': 'P813', 'hash': 'bc92f3b0b3a4ac82c78818993bf9fe814aa6699b', 'datavalue': {'value': {'time': '+2020-11-06T00:00:00Z', 'timezone': 0, 'before': 0, 'after': 0, 'precision': 11, 'calendarmodel': 'http://www.wikidata.org/entity/Q1985727'}, 'type': 'time'}, 'd

In [70]:
print(tableData[rowNumber][tableReference['refValueColumnList'][referencePropertyIndex] + '_val'])
print(propertiesColumnList)
print(propertiesColumnList[1])
print(tableData[0][propertiesColumnList[1]][0])
print(tableData[0][propertiesColumnList[1] + '_val'][0])

2020-11-06T00:00:00Z
['orcid', 'employer', 'affiliation', 'instanceOf', 'sexOrGenderQId']
employer
Q


KeyError: 'employer_val'

In [31]:
# Replace the table with a new one containing any new IDs
# Note: I'm writing after every line so that if the script crashes, no data will be lost
writeToFile(tableFileName, fieldnames, tableData)
#with open(tableFileName, 'w', newline='', encoding='utf-8') as csvfile:
#    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#    writer.writeheader()
#    for rowNumber in range(0, len(tableData)):
#        try:
#            writer.writerow(tableData[rowNumber])
#        except:
#            print('ERROR row:', rowNumber, '  ', tableData[rowNumber])
#            print()

# The limit for bots without a bot flag seems to be 50 writes per minute. That's 1.2 s between writes.
# To be safe and avoid getting blocked, use 1.25 s.
sleep(1.25)


In [None]:
# process each row of the table for references of existing claims
print('Writing references of existing claims')
print('--------------------------')
print()
for rowNumber in range(0, len(tableData)):
    print('processing row ', rowNumber, 'id:', tableData[rowNumber][subjectWikidataIdColumnHeader])

    for propertyNumber in range(0, len(propertiesColumnList)):
        propertyId = propertiesIdList[propertyNumber]
        statementUuidColumn = propertiesUuidColumnList[propertyNumber]     
        # We are only interested in writing references for statements that already have UUIDs
        if tableData[rowNumber][statementUuidColumn] != '':
            if len(propertiesReferencesList[propertyNumber]) != 0:  # skip that claim if it doesn't have references

                for reference in propertiesReferencesList[propertyNumber]:
                    if tableData[rowNumber][reference['refHashColumn']] == '': # process only new references
                        # in this script, the createReferences function returns a snak dictionary, not a list
                        referencesDict = createReferenceSnak(reference, tableData[rowNumber])
                        if referencesDict == {}: # Check for the case where no references were specified for this record
                            #print('no data to write')
                            #print()
                            pass
                        else:
                            # print(json.dumps(referencesDict, indent=2))
                            # build the parameter string to be posted to the API
                            parameterDictionary = {
                                'action': 'wbsetreference',
                                'statement': tableData[rowNumber][subjectWikidataIdColumnHeader] + "$" + tableData[rowNumber][statementUuidColumn],
                                'format':'json',
                                'token': csrfToken,
                                'snaks': json.dumps(referencesDict)
                                }
                            if maxlag > 0:
                                parameterDictionary['maxlag'] = maxlag
                            # print(json.dumps(parameterDictionary, indent = 2))

                            # print('ref:', reference['refValueColumnList'])
                            responseData = attemptPost(endpointUrl, parameterDictionary)
                            print('Write confirmation: ', responseData)
                            print()

                            tableData[rowNumber][reference['refHashColumn']] = responseData['reference']['hash']

                            # Replace the table with a new one containing any new IDs
                            # Note: I'm writing after every line so that if the script crashes, no data will be lost
                            writeToFile(tableFileName, fieldnames, tableData)

                            #with open(tableFileName, 'w', newline='', encoding='utf-8') as csvfile:
                            #    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                            #    writer.writeheader()
                            #    for writeRowNumber in range(0, len(tableData)):
                            #        writer.writerow(tableData[writeRowNumber])

                            # The limit for bots without a bot flag seems to be 50 writes per minute. That's 1.2 s between writes.
                            # To be safe and avoid getting blocked, use 1.25 s.
                            sleep(1.25)
print()
print('done')
