# VanderBot

The scripts in this notebook are part of the development of VanderBot, a system to write information about Vanderbilt University researchers and their works to Wikidata.  

This code is freely available under a CC0 license. Steve Baskauf 2019-12-16

VanderBot 0.8 is under development and subject to continual change. At this point, it's too new to have any stable releases.  

For more information, see [this page](https://github.com/HeardLibrary/linked-data/tree/master/publications).  

# Common Code

This code block includes import statements, function definitions, and declarations of variables that are common to the rest of the script. It needs to be run once before the other code blocks.

In [None]:
import requests   # best library to manage HTTP transactions
from bs4 import BeautifulSoup # web-scraping library
import json
from time import sleep
import csv
import math
from fuzzywuzzy import fuzz # fuzzy logic matching
from fuzzywuzzy import process
import xml.etree.ElementTree as et # library to traverse XML tree
import urllib
import datetime

# Processing-round configuration.
# deptShortName selects the entry of deptSettings to use and is the prefix of the
# department-specific working files generated by the script.
deptShortName = 'physics'
testEmployer = 'Vanderbilt University' # employer label to test against the Wikidata employer property
employerQId = 'Q29052' # Vanderbilt University
deathDateLimit = '2000' # anyone who died before this date is assumed not to be a match

# NOTE: eventually need to test against all affiliations in cases of faculty with multiple appointments

# ***********************************
# NOTE: the script fails if there is a current item in Wikidata that has the same values for
# both label and description.
# A check needs to be run for this !!!
# ***********************************

# Per-department settings, keyed by the department short name.
# The default labels and descriptions can come either from a table column or from a constant:
#   'source': 'column'   -> 'value' is the column header to read from
#   'source': 'constant' -> 'value' is the literal string to assign
deptSettings = {
    'bsci': {
        'categories': [
            'primary-training-faculty',
            'research-and-teaching-faculty',
            'secondary-faculty',
            'postdoc-fellows',
            'emeriti',
        ],
        'baseUrl': 'https://as.vanderbilt.edu/biosci/people/index.php?group=',
        'departmentSearchString': 'Biological Sciences',
        'departmentQId': 'Q78041310',
        'testAuthorAffiliation': 'Biological Sciences Vanderbilt',
        'labels': {'source': 'column', 'value': 'name'},
        'descriptions': {'source': 'constant', 'value': 'biology researcher'},
    },
    'physics': {
        'categories': [
            'faculty',
            'emeritus-faculty',
            'academic-research-staff',
        ],
        'baseUrl': 'https://as.vanderbilt.edu/physics/people/index.php?group=',
        'departmentSearchString': 'Physics Astronomy',
        'departmentQId': 'Q78779260',
        'testAuthorAffiliation': 'Physics Astronomy Vanderbilt',
        'labels': {'source': 'column', 'value': 'name'},
        'descriptions': {'source': 'constant', 'value': 'physics/astronomy researcher'},
    },
}

wikidataEndpointUrl = 'https://query.wikidata.org/sparql'

# Degree strings to look for in scraped names ('string') and the normalized form to record ('value').
degreeList = [
    {'string': 'Ph.D.', 'value': 'Ph.D.'},
    {'string': 'PhD', 'value': 'Ph.D.'},
    {'string': 'D.Phil.', 'value': 'D.Phil.'},
    {'string': 'J.D.', 'value': 'J.D.'},
]

# NCBI identification requirements:
# tool name and email address should be sent with all requests
# see https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
emailAddress = 'steve.baskauf@vanderbilt.edu' # put your email address here
toolName = 'VanderBot' # give your application a name here

# builds the request-header dictionary to hand to a requests GET call
def generateHeaderDictionary(acceptMediaType):
    """Return HTTP request headers for the given Accept media type.

    The dictionary carries the caller's media type in 'Accept' and a fixed
    VanderBot identification string in 'User-Agent'.
    """
    return {
        'Accept': acceptMediaType,
        'User-Agent': 'VanderBot/0.8 (https://github.com/HeardLibrary/linked-data/tree/master/publications; mailto:steve.baskauf@vanderbilt.edu)',
    }

# write a list of lists to a CSV file
def writeCsv(fileName, array):
    """Write `array` (an iterable of row sequences) to `fileName` as UTF-8 CSV.

    Any existing file is overwritten. The file is opened in a `with` block so
    that it is closed even if a row cannot be serialized.
    """
    # newline='' lets the csv module control line endings itself
    with open(fileName, 'w', newline='', encoding='utf-8') as fileObject:
        csv.writer(fileObject).writerows(array)

# write a list of dictionaries to a CSV file
def writeDictsToCsv(table, filename, fieldnames):
    """Write `table` (an iterable of dictionaries) to `filename` as UTF-8 CSV.

    `fieldnames` gives the column headers and their order; a header row is
    written first. The encoding is set explicitly to UTF-8 so the output can be
    read back by readDict, which also opens files as UTF-8, regardless of the
    platform's default encoding.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as csvFileObject:
        writer = csv.DictWriter(csvFileObject, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(table)

# read from a CSV file into a list of dictionaries
def readDict(filename):
    """Read the UTF-8 CSV file `filename` and return its rows as a list of dictionaries.

    The first row of the file supplies the dictionary keys. The file is opened
    in a `with` block so that it is closed even if parsing fails part-way.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as fileObject:
        return list(csv.DictReader(fileObject))

# pulls the Q ID out of a Wikidata entity IRI
def extractQNumber(iri):
    """Return the fifth slash-separated piece of `iri`.

    For an IRI shaped like http://www.wikidata.org/entity/Q6386232 that piece
    is the Q ID (Q6386232).
    """
    return iri.split('/')[4]

# pulls one slash-separated piece out of an IRI; numberPieces is the zero-based index of the piece
def extractFromIri(iri, numberPieces):
    """Return piece number `numberPieces` of `iri` after splitting it on '/'.

    Example: http://www.wikidata.org/entity/Q6386232 splits into five pieces,
    and the Q ID is piece 4.
    """
    return iri.split('/')[numberPieces]

# see https://www.wikidata.org/wiki/Property:P21 for values
def decodeSexOrGender(code):
    """Translate a short sex-or-gender code into the Q ID used as the P21 value.

    The code is matched case-insensitively against 'm', 'f', 'i', 'tf' and
    'tm'; any other code yields an empty string.
    """
    qIdByCode = {
        'm': 'Q6581097',
        'f': 'Q6581072',
        'i': 'Q1097630',
        'tf': 'Q1052281',
        'tm': 'Q2449503',
    }
    return qIdByCode.get(code.lower(), '')

# query for a single variable that's an item named 'item'
# returns a list of results
def searchWikidataForQIdByOrcid(orcid):
    """Return the Q IDs of all Wikidata items whose ORCID iD (P496) equals `orcid`.

    The query is built from the `orcid` argument itself rather than from the
    module-level employee table, so the function gives the right answer no
    matter what state the calling loop is in.

    If the endpoint's response cannot be interpreted as SPARQL JSON results,
    the returned list instead holds a single element: the raw response text.
    """
    query = '''
select distinct ?item where {
  ?item wdt:P496 "''' + orcid + '''".
  }
'''
    acceptMediaType = 'application/json'
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers = generateHeaderDictionary(acceptMediaType))
    try:
        data = r.json()
        results = [extractQNumber(statement['item']['value']) for statement in data['results']['bindings']]
    except (ValueError, KeyError, IndexError, TypeError):
        # not parseable as results JSON (e.g. an error page); pass the text back for inspection
        results = [r.text]
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return results

# returns a list of employer strings for the item with Wikidata ID qId; P108 is employer
def searchWikidataEmployer(qId):
    """Return the English labels of the employers (P108) of the Wikidata item `qId`.

    The list is empty when the item has no employer statements. If the
    endpoint's response cannot be interpreted as SPARQL JSON results, the list
    instead holds a single element: the raw response text.

    The request headers are generated here; the function does not rely on a
    module-level `requestHeaderDictionary` being defined.
    """
    query = '''select distinct ?employer where {
        wd:'''+ qId + ''' wdt:P108 ?employerId.
        ?employerId rdfs:label ?employer.
        FILTER(lang(?employer) = 'en')
      }'''
    #print(query)
    acceptMediaType = 'application/json'
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=generateHeaderDictionary(acceptMediaType))
    try:
        data = r.json()
        resultsList = [statement['employer']['value'] for statement in data['results']['bindings']]
    except (ValueError, KeyError, TypeError):
        # not parseable as results JSON (e.g. an error page); pass the text back for inspection
        resultsList = [r.text]
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return resultsList

# returns a list of values of the property propertyId for the item with Wikidata ID qId
def searchWikidataSingleProperty(qId, propertyId, valueType):
    """Return the values of property `propertyId` for the Wikidata item `qId`.

    When `valueType` is 'item' each value IRI is reduced to its Q ID; for any
    other `valueType` the value is returned as received. The list is empty when
    the item has no such statements. If the endpoint's response cannot be
    interpreted as SPARQL JSON results, the list instead holds a single
    element: the raw response text.

    The request headers are generated here; the function does not rely on a
    module-level `requestHeaderDictionary` being defined.
    """
    query = '''select distinct ?object where {
        wd:'''+ qId + ''' wdt:''' + propertyId + ''' ?object.
      }'''
    #print(query)
    acceptMediaType = 'application/json'
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=generateHeaderDictionary(acceptMediaType))
    try:
        data = r.json()
        resultsList = []
        for statement in data['results']['bindings']:
            resultValue = statement['object']['value']
            if valueType == 'item':
                resultValue = extractQNumber(resultValue)
            resultsList.append(resultValue)
    except (ValueError, KeyError, IndexError, TypeError):
        # not parseable as results JSON (e.g. an error page); pass the text back for inspection
        resultsList = [r.text]
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return resultsList

def searchOrcidAtWikidata(qIds):
    """Look up the ORCID (P496) statements, and any references on them, for the items in `qIds`.

    Returns a list of dictionaries with the keys 'qId', 'statementUuid',
    'orcid', 'referenceHash', 'referenceProperty' and 'referenceValue'. The
    three reference keys hold empty strings when the corresponding variable is
    not bound in a result row.
    """
    # one "wd:Qxxx" line per item for the VALUES clause
    valuesLines = ''.join('wd:' + qId + '\n' for qId in qIds)

    # NOTE: the MINUS clause removes properties in the http://www.wikidata.org/prop/reference/value/ namespace
    # leaving only those in the http://www.wikidata.org/prop/reference/ namespace (i.e. the direct literal values)
    query = '''
select distinct ?id ?statement ?orcid ?reference ?refProp ?refVal where {
  VALUES ?id
{
''' + valuesLines + '''}
  ?id p:P496 ?statement.
  ?statement ps:P496 ?orcid.
  optional {
        ?statement prov:wasDerivedFrom ?reference.
        ?reference ?refProp ?refVal.
        MINUS {
            ?reference ?refProp ?refVal. 
            FILTER(contains(str(?refProp), "value/"))
        }
    }
  }'''
    #print(query)
    response = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=generateHeaderDictionary('application/json'))
    bindings = response.json()['results']['bindings']

    # ********** NOTE: need to deal with case where there are more than one reference per statement
    # This will result in several statements with the same qNumber, orcid, and referenceHash
    rows = []
    for binding in bindings:
        # entity IRI is 'http://www.wikidata.org/entity/' + Q ID
        qNumber = extractFromIri(binding['id']['value'], 4)
        # statement IRI is 'http://www.wikidata.org/entity/statement/' + Q ID + '-' + UUID;
        # keep only what follows the Q ID prefix
        statementLocalName = extractFromIri(binding['statement']['value'], 5)
        statementUuid = statementLocalName.partition(qNumber + '-')[2]

        # reference IRI is 'http://www.wikidata.org/reference/' + hash
        referenceHash = extractFromIri(binding['reference']['value'], 4) if 'reference' in binding else ''
        # reference property IRI is 'http://www.wikidata.org/prop/reference/' + property ID
        referenceProperty = extractFromIri(binding['refProp']['value'], 5) if 'refProp' in binding else ''
        referenceValue = binding['refVal']['value'] if 'refVal' in binding else ''
        # a "retrieved" (P813) date comes down as 2019-12-05T00:00:00Z, but the API wants just 2019-12-05;
        # values of any other property are left alone
        if referenceProperty == 'P813':
            referenceValue = referenceValue.split('T')[0]

        rows.append({
            'qId': qNumber,
            'statementUuid': statementUuid,
            'orcid': binding['orcid']['value'],
            'referenceHash': referenceHash,
            'referenceProperty': referenceProperty,
            'referenceValue': referenceValue,
        })
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return rows

def searchHumansAtWikidata(qIds):
    """Find the "instance of human" (P31 = Q5) statements for the items in `qIds`.

    Returns a list of dictionaries with the keys 'qId' and 'statementUuid', one
    per matching statement.
    """
    # one "wd:Qxxx" line per item for the VALUES clause
    valuesLines = ''.join('wd:' + qId + '\n' for qId in qIds)

    # NOTE: instanceOf human is one of the statements that Wikidata does not care about references for
    # So we will ignore them here
    query = '''
select distinct ?id ?statement where {
  VALUES ?id
{
''' + valuesLines + '''}
  ?id p:P31 ?statement.
  ?statement ps:P31 wd:Q5.
  }'''
    #print(query)
    response = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=generateHeaderDictionary('application/json'))

    rows = []
    for binding in response.json()['results']['bindings']:
        # entity IRI is 'http://www.wikidata.org/entity/' + Q ID
        qNumber = extractFromIri(binding['id']['value'], 4)
        # statement IRI is 'http://www.wikidata.org/entity/statement/' + Q ID + '-' + UUID;
        # keep only what follows the Q ID prefix
        statementLocalName = extractFromIri(binding['statement']['value'], 5)
        rows.append({'qId': qNumber, 'statementUuid': statementLocalName.partition(qNumber + '-')[2]})
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return rows

def checkOrcid(orcid):
    """Dereference an ORCID iD and return today's UTC date as a Wikidata-style time string.

    A GET request is made to https://orcid.org/ + `orcid`. If the response
    status is 200, the function returns the current UTC date in the form
    +2019-12-05T00:00:00Z. For any other status it prints the status code and
    returns an empty string, so the function never fails at the return
    statement with an unassigned result.

    The response body is not used and is therefore not parsed; an error page
    that is not JSON cannot make the check fail.
    """
    namespace = 'https://orcid.org/'
    endpointUrl = namespace + orcid
    acceptMediaType = 'application/ld+json'
    r = requests.get(endpointUrl, headers=generateHeaderDictionary(acceptMediaType))
    code = r.status_code
    #print(r.text)
    if code != 200:
        print('Attempt to dereference ORCID resulted in HTTP response code ', code)
        wholeDateZ = ''
    else:
        print('Successfully retrieved')
        dateZ = datetime.datetime.now(datetime.timezone.utc).date().isoformat() # form 2019-12-05
        wholeDateZ = '+' + dateZ + 'T00:00:00Z' # form +2019-12-05T00:00:00Z as provided by Wikidata
    # delay a quarter second to avoid hitting the API too rapidly
    sleep(0.25)
    return wholeDateZ

# if the value passed is '' then the value will be retrieved.  Otherwise, the value is used to screen.
def searchStatementAtWikidata(qIds, prop, value, refPropList):
    """Find statements with property `prop` on the items in `qIds`, with optional reference values.

    qIds: Q IDs of the subject items; each is inserted into the VALUES clause as wd:<qId>.
    prop: property ID; used in the query as both p:<prop> and ps:<prop>.
    value: '' to retrieve whatever value each statement has; otherwise the query only
        matches statements whose value is wd:<value>.
    refPropList: reference property IDs (each used as pr:<id>); may be empty, in which
        case no reference information is requested.

    Returns a list of dictionaries, one per result row. Each has 'qId' and
    'statementUuid'; 'statementValue' is present only when `value` is ''; and
    'referenceHash' plus 'referenceValues' (a list parallel to `refPropList`,
    with '' for unbound values) are present only when `refPropList` is not empty.
    """
    # create a string for all of the Wikidata item IDs to be used as subjects in the query
    alternatives = ''
    for qId in qIds:
        alternatives += 'wd:' + qId + '\n'

    # create a string for the query
    # the SELECT list is assembled first: ?id ?statement [?statementValue] [?reference ?refVal0 ?refVal1 ...]
    query = 'select distinct ?id ?statement '
    # if no value was specified, find the value
    if value == '':
        query += '?statementValue '
    if len(refPropList) != 0:
        query += '?reference '
    # one ?refValN variable per requested reference property, numbered by its position in refPropList
    for refPropIndex in range(0, len(refPropList)):
        query += '?refVal' + str(refPropIndex) + ' '
    query += '''where {
  VALUES ?id
{
''' + alternatives + '''}
  ?id p:'''+ prop + ''' ?statement.
  ?statement ps:'''+ prop

    # finish the ps: triple either with a variable (retrieve the value) or a fixed item (screen by value)
    if value == '':
        query += ' ?statementValue.'
    else:
        query += ' wd:' + value + '.'

    # the reference block is optional so that statements without references are still returned
    if len(refPropList) != 0:
        query += '''
  optional {
    ?statement prov:wasDerivedFrom ?reference.'''
        for refPropIndex in range(0, len(refPropList)):
            query +='''
    ?reference pr:''' + refPropList[refPropIndex] + ''' ?refVal''' + str(refPropIndex) + '''.'''
        query +='''
        }'''
    query +='''
  }'''
    #print(query)

    returnValue = []
    acceptMediaType = 'application/json'
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=generateHeaderDictionary(acceptMediaType))
    data = r.json()
    results = data['results']['bindings']
    # ********** NOTE: need to deal with case where there are more than one reference per result
    # This will result in several results with the same qNumber, orcid, and referenceHash
    for result in results:
        # remove wd: 'http://www.wikidata.org/entity/'
        qNumber = extractFromIri(result['id']['value'], 4)
        # remove wds: 'http://www.wikidata.org/entity/statement/'
        noDomain = extractFromIri(result['statement']['value'], 5)
        # need to remove the qNumber that's appended in front of the UUID
        statementUuid = noDomain.partition(qNumber + '-')[2]
        # if no value was specified, get the value that was found in the search
        if value == '':
            statementValue = result['statementValue']['value']
        if len(refPropList) != 0:
            if 'reference' in result:
                # remove wdref: 'http://www.wikidata.org/reference/'
                referenceHash = extractFromIri(result['reference']['value'], 4)
            else:
                referenceHash = ''
            # collect one entry per requested reference property, in refPropList order
            referenceValues = []
            for refPropIndex in range(0, len(refPropList)):
                if 'refVal' + str(refPropIndex) in result:
                    refVal = result['refVal' + str(refPropIndex)]['value']
                    # if it's a date, it comes down as 2019-12-05T00:00:00Z, but the API wants just the date: 2019-12-05
                    # NOTE: the trimming below is disabled, so date values are returned exactly as received
                    #if referenceProperty == 'P813': # the likely property is "retrieved"; just leave it if it's another property
                    #    referenceValue = referenceValue.split('T')[0]
                else:
                    refVal = ''
                referenceValues.append(refVal)
        resultsDict = {'qId': qNumber, 'statementUuid': statementUuid}
        # if no value was specified, get the value that was found in the search
        if value == '':
            resultsDict['statementValue'] = statementValue
        if len(refPropList) != 0:
            resultsDict['referenceHash'] = referenceHash
            resultsDict['referenceValues'] = referenceValues
        returnValue.append(resultsDict)

    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)

    return returnValue

# search for any of the "label" types: label, alias, description
def searchLabelsDescriptionsAtWikidata(qIds, labelType, language):
    """Retrieve labels, aliases or descriptions in one language for the items in `qIds`.

    `labelType` is 'label', 'alias' or 'description'; any other value is
    treated as 'label'. Returns a list of dictionaries with the keys 'qId' and
    'string', one per result row.
    """
    predicateByLabelType = {
        'label': 'rdfs:label',
        'alias': 'skos:altLabel',
        'description': 'schema:description',
    }
    predicate = predicateByLabelType.get(labelType, 'rdfs:label')

    # one "wd:Qxxx" line per item for the VALUES clause
    valuesLines = ''.join('wd:' + qId + '\n' for qId in qIds)

    query = '''select distinct ?id ?string where {
  VALUES ?id
{
''' + valuesLines + '''}
  ?id ''' + predicate + ''' ?string.
  filter(lang(?string)="''' + language + '''")
  }'''
    #print(query)

    response = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=generateHeaderDictionary('application/json'))
    # entity IRI is 'http://www.wikidata.org/entity/' + Q ID, so the Q ID is piece 4
    returnValue = [
        {'qId': extractFromIri(binding['id']['value'], 4), 'string': binding['string']['value']}
        for binding in response.json()['results']['bindings']
    ]

    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)

    return returnValue

# Query ORCID for Vanderbilt University people

Script developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/orcid/orcid-get-json.ipynb

Retrieves results 100 at a time, then processes them by extracting desired information.  NOTE: takes hours to run.

Saves the employment results in `orcid_data.csv` and the alternative names in a second file, `orcid_other_names.csv`.

In [None]:
# Harvest ORCID records of people who list Vanderbilt University as an affiliation.
# Two tables are built (each starts with its header row) and written to CSV at the end:
#   table         - one row per Vanderbilt employment entry
#   otherNameList - one row per alternative name
table = [['orcid', 'givenNames', 'familyName', 'startDate', 'endDate', 'department', 'organization']]
otherNameList = [['orcid', 'altName']]


def formatOrcidDate(dateDict):
    """Build 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD' from an ORCID start-date/end-date structure.

    Returns '' when the structure or its year is empty. The month is appended
    only when present, and the day only when the month is present.
    """
    if not dateDict or not dateDict['year']:
        return ''
    dateString = dateDict['year']['value']
    if dateDict['month']:
        dateString += '-' + dateDict['month']['value']
        if dateDict['day']:
            dateString += '-' + dateDict['day']['value']
    return dateString


# use the API to search for people associated with Vanderbilt University
# First search is for only one record, just to get the number of hits found
searchUri = 'https://pub.orcid.org/v2.0/search/?q=affiliation-org-name:"Vanderbilt+University"&start=1&rows=1'
acceptMediaType = 'application/json'
# the same headers (including the identifying User-Agent) are sent with every request in this cell
requestHeaders = generateHeaderDictionary(acceptMediaType)
response = requests.get(searchUri, headers = requestHeaders)
data = response.json()
numberResults = data["num-found"]
numberPages = numberResults // 100
remainder = numberResults - 100*numberPages  # not used below; kept for inspection

for pageCount in range(0, numberPages+1):  # the remainder will be caught when pageCount = numberPages
    print('page: ', pageCount)
    searchUri = 'https://pub.orcid.org/v2.0/search/?q=affiliation-org-name:"Vanderbilt+University"&start='+str(pageCount*100+1)
    response = requests.get(searchUri, headers = requestHeaders)
    print(response.url)
    data = response.json()
    # treat a missing/null result list (e.g. a page past the last hit) as an empty page
    orcidsDictsList = data['result'] or []

    for orcidDict in orcidsDictsList:
        # dereference the ORCID IRI to get the full record
        response = requests.get(orcidDict['orcid-identifier']['uri'], headers = requestHeaders)
        data = response.json()
        # pause after every record request, including those skipped below, to avoid hitting the API too rapidly
        sleep(.25)

        orcidId = data['orcid-identifier']['path']
        # if there isn't a name, then go on to the next ORCID
        if not data['person']['name']:
            continue
        if data['person']['name']['given-names']:
            givenNames = data['person']['name']['given-names']['value']
        else:
            givenNames = ''
        if data['person']['name']['family-name']:
            familyName = data['person']['name']['family-name']['value']
        else:
            familyName = ''

        # tolerate an empty other-names structure
        otherNamesData = data['person']['other-names']
        otherNames = otherNamesData['other-name'] if otherNamesData else []
        for otherName in otherNames:
            otherNameList.append([orcidId, otherName['content']])

        affiliations = data['activities-summary']['employments']['affiliation-group']
        for affiliation in affiliations:
            for summary in affiliation['summaries']:
                employment = summary['employment-summary']
                startDate = formatOrcidDate(employment['start-date'])
                endDate = formatOrcidDate(employment['end-date'])
                # if there is no value for department, set it to empty string
                department = employment['department-name'] or ''
                # reset for every employment so that a record without an organization
                # cannot inherit the organization of the previous one
                organization = ''
                if employment['organization']:
                    organization = employment['organization']['name']
                if 'Vanderbilt University' in organization:
                    print(orcidId, givenNames, familyName, startDate, endDate, department, organization)
                    table.append([orcidId, givenNames, familyName, startDate, endDate, department, organization])

print()
print('Done')
fileName = 'orcid_data.csv'
writeCsv(fileName, table)
fileName = 'orcid_other_names.csv'
writeCsv(fileName, otherNameList)

# Scrape departmental website

Script developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/scrape-bsci.ipynb


In [None]:
# Scrape the people pages of the department selected by deptShortName.
# One page is fetched per category; each person is recorded once, under the first
# category in which they appear. The result is written to <deptShortName>-employees.csv.
outputTable = [['name', 'degree', 'role', 'category']]
categories = deptSettings[deptShortName]['categories']

acceptMediaType = 'text/html'
for category in categories:
    url = deptSettings[deptShortName]['baseUrl'] + category
    response = requests.get(url, headers = generateHeaderDictionary(acceptMediaType))
    soupObject = BeautifulSoup(response.text,features="html5lib")

    # get the first table from the page
    tableObjectsList = soupObject.find_all('table')
    if not tableObjectsList:
        print('WARNING: no table found at ' + url)
        continue
    tableObject = tableObjectsList[0]

    # get the rows from the table
    for rowObject in tableObject.find_all('tr'):
        # get the cells from each row
        cellObjectsList = rowObject.find_all('td')
        # picture is in cell 0, name and title is in cell 1;
        # rows without a second cell (e.g. header or spacer rows) carry no person
        if len(cellObjectsList) < 2:
            continue
        nameCell = cellObjectsList[1]
        # the name part is bolded
        strongObjectsList = nameCell('strong')
        if not strongObjectsList:
            continue
        name = strongObjectsList[0].text

        # separate degrees from names
        # This must happen BEFORE the duplicate check: names are stored without the degree,
        # so comparing the raw text would never match a person who has a degree listed.
        degree = ''
        for testDegree in degreeList:
            if testDegree['string'] in name:
                name = name.partition(', ' + testDegree['string'])[0]
                # correct any malformed strings
                degree = testDegree['value']

        # check to see if the name has already been added to the list (some depts put people on two category lists)
        # not worrying about the header row, which shouldn't match any name
        if any(person[0] == name for person in outputTable):
            continue

        try:
            # process the roles text
            dirtyText = str(nameCell)
            # get rid of trailing td tag
            nameCellText = dirtyText.split('</td>')[0]
            # the first line holds the name; each later line is one role
            cellLines = nameCellText.split('<br/>')
            roles = []
            for cellLine in cellLines[1:]:
                if ' of ' in cellLine:
                    pieces = cellLine.split(' of ')
                    roleDict = {'title': pieces[0], 'department': pieces[1]}
                elif ' in ' in cellLine:
                    pieces = cellLine.split(' in ')
                    roleDict = {'title': pieces[0], 'department': pieces[1]}
                else:
                    roleDict = {'title': cellLine, 'department': ''}
                # move a trailing ", Emeritus" from the department to the front of the title
                if ', Emeritus' in roleDict['department']:
                    roleDict['department'] = roleDict['department'].split(', Emeritus')[0]
                    roleDict['title'] = 'Emeritus ' + roleDict['title']
                roles.append(roleDict)
            rolesJson = json.dumps(roles)
        except Exception as error:
            # keep the person even if the roles can't be parsed, but say so
            print('WARNING: could not parse roles for ' + name + ': ' + str(error))
            rolesJson = ''
        outputTable.append([name, degree, rolesJson, category])

fileName = deptShortName + '-employees.csv'
writeCsv(fileName, outputTable)
print('done')

# Match departmental people with ORCID results

Script developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/orcid/match_bsci_orcid.ipynb


In [None]:
# Attach an ORCID iD to each scraped employee by fuzzy-matching names against the ORCID download.
filename = deptShortName + '-employees.csv'
employees = readDict(filename)

filename = 'orcid_data.csv'
orcidData = readDict(filename)

testRatio = 90            # minimum token-set score for a name match
departmentTestRatio = 90  # department score must exceed this to count as a match
for employeeIndex, employee in enumerate(employees):
    for row in orcidData:
        name = row['givenNames'] + ' ' + row['familyName']
        # the set ratio seems to do the best job of matching
        nameScore = fuzz.token_set_ratio(name, employee['name'])
        if nameScore < testRatio:
            continue

        output = str(nameScore) + ' ' + name + ' / ' + employee['name']
        if row['department'] == '':
            output += " WARNING: no department given in ORCID"
        else:
            # secondary test: how well does the ORCID department match this department's search string?
            # The comparison is repeated once per role listed for the employee.
            # NOTE(review): the role entries themselves are not used in the comparison - confirm this is intended.
            roles = json.loads(employee['role'])
            departmentMatch = False
            for _ in roles:
                departmentScore = fuzz.token_set_ratio(deptSettings[deptShortName]['departmentSearchString'], row['department'])
                if departmentScore > departmentTestRatio:
                    departmentMatch = True
                    output += ' ' + str(departmentScore) + ' ' + row['department']
            if not departmentMatch:
                output += ' WARNING: ' + row['department'] + ' less than ' + str(departmentTestRatio) + '% match to any dept.'
        print(output)
        # We only care about the first good match to an ORCID record, kill the loop after that
        employee['orcid'] = row['orcid']
        break
    else:
        # no ORCID row reached the name threshold
        employee['orcid'] = ''

filename = deptShortName + '-employees-with-orcid.csv'
fieldnames = ['name', 'degree', 'category', 'orcid', 'role']
writeDictsToCsv(employees, filename, fieldnames)

print('Done')

# Download Vanderbilt people data from Wikidata

Developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/wikidata/download-vanderbilt-people.py

Performs a SPARQL query to find people in Wikidata that are employed at Vanderbilt.  Returns name, description, start date, end date, and ORCID if it has them.  The output is written to vanderbilt_wikidata.csv.

In [None]:
# Find every person with an employer (P108) statement for the configured employer (employerQId)
# and pull the optional label, description, start/end date qualifiers and ORCID iD.
query = '''select distinct  ?person ?name ?orcid ?startDate ?endDate ?description where {
  ?person p:P108 ?statement.
  ?statement ps:P108  wd:''' + employerQId + '''.
  optional{
    ?person rdfs:label ?name.
    FILTER(lang(?name)="en")
    }
  optional{?statement pq:P580 ?startDate.}
  optional{?statement pq:P582 ?endDate.}
  optional{?person wdt:P496 ?orcid.}
  optional{
    ?person schema:description ?description.
    FILTER(lang(?description)="en")
          }
  }'''

# The endpoint defaults to returning XML, so the Accept: header is required.
# The shared header builder is used so the request also carries the identifying User-Agent.
r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=generateHeaderDictionary('application/json'))

data = r.json()
print(json.dumps(data,indent = 2))

table = [['wikidataIri', 'name', 'description', 'startDate', 'endDate', 'orcid']]
# these variables are optional in the query, so a result row may lack any of them;
# the order here is the column order after wikidataIri
optionalColumns = ['name', 'description', 'startDate', 'endDate', 'orcid']
items = data['results']['bindings']
for item in items:
    row = [item['person']['value']]
    for column in optionalColumns:
        row.append(item[column]['value'] if column in item else '')
    table.append(row)

fileName = 'vanderbilt_wikidata.csv'
writeCsv(fileName, table)

# Download Vanderbilt people's altLabels from Wikidata

Developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/wikidata/download-vanderbilt-people-altlabels.py

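Similar to the previous query, except that for each person employed at Vanderbilt it downloads the English altLabels (aliases).  The output is written to vanderbilt_wikidata_altlabels.csv.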

In [None]:
# Find the English aliases (skos:altLabel) of every item that has an employer (P108) statement
# whose value is the configured employer. The Q ID comes from employerQId (set in the common
# code) instead of being repeated here as a literal.
query = '''select distinct  ?person ?altLabel where {
  ?person p:P108 ?statement.
  ?statement ps:P108  wd:''' + employerQId + '''.
  ?person skos:altLabel ?altLabel.
  FILTER(lang(?altLabel)="en")
}'''

# The endpoint defaults to returning XML, so the Accept: header is required
r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers={'Accept' : 'application/json'})

data = r.json()
print(json.dumps(data,indent = 2))

# one output row per (person, alias) pair
table = [['wikidataIri', 'altLabel']]
items = data['results']['bindings']
for item in items:
    wikidataIri = item['person']['value']
    # ?altLabel is required by the query, so it should always be present; default to an
    # empty string anyway rather than fail on an unexpected result
    if 'altLabel' in item:
        altLabel = item['altLabel']['value']
    else:
        altLabel = ''
    table.append([wikidataIri, altLabel])

fileName = 'vanderbilt_wikidata_altlabels.csv'
writeCsv(fileName, table)


# Match employees to Wikidata

Script developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/wikidata/match_bsci_wikidata.ipynb

Attempts to match records of people Wikidata knows to work at Vanderbilt with departmental people by matching their ORCIDs, then name strings. If there isn't a match with the downloaded Wikidata records, for employees with ORCIDs, the script attempts to find them in Wikidata by directly doing a SPARQL search for their ORCID.

In [None]:
# Input 1: the department's employees, with an 'orcid' column (empty string when unknown)
filename = deptShortName + '-employees-with-orcid.csv'
employees = readDict(filename)

# Input 2: the people downloaded from Wikidata as having the employer
filename = 'vanderbilt_wikidata.csv'
wikidataData = readDict(filename)

# minimum fuzz.token_set_ratio score for two name strings to be treated as a name match
testRatio = 90
departmentTestRatio = 90

for employeeIndex in range(0, len(employees)):
    # 0=unmatched
    # 1=matched with ORCID in both sources
    # 2=ORCID in BSCI but name match to Wikidata (no ORCID)
    # 3=no ORCID in BSCI but name match to Wikidata (with ORCID); could happen if affiliation isn't matched in ORCID
    # 4=no ORCID in BSCI but name match to Wikidata (no ORCID)
    # 5=ORCID in BSCI and found via SPARQL ORCID search (likely non-VU affiliated in Wikidata)
    # 6=ORCID in BSCI and found via SPARQL name search (non-VU affiliated without ORCID)
    # 7=no name match
    # 8=ORCID in BSCI, error in SPARQL ORCID search
    # 9=no ORCID in BSCI, error in SPARQL name search
    # 10=affiliation match in article
    # 11=match by human choice after looking at entity data

    employee = employees[employeeIndex]  # same dict object, so changes are made to the table itself
    matchStatus = 0
    for row in wikidataData:
        # We know the employee has an ORCID, so try to match it
        if employee['orcid'] != '':
            # There's a match, hooray!
            if employee['orcid'] == row['orcid']:
                print('orcid match: ', row['name'] + ' ' + row['orcid'])
                matchStatus = 1
                employee['wikidataId'] = extractQNumber(row['wikidataIri'])
                # An ORCID match is definitive. Stop here so that a later row that merely has
                # a similar name cannot overwrite it with a status 2 match.
                break
            # No ORCID match - see if the name matches. Only rows without an ORCID are
            # considered (status 2 is defined as "no ORCID" in Wikidata): a row carrying a
            # different ORCID belongs to a different person, however similar the name is.
            elif row['orcid'] == '':
                setRatio = fuzz.token_set_ratio(row['name'], employee['name'])
                if setRatio >= testRatio:
                    print('name match: ', str(setRatio) + ' ' + row['name'] + ' / ' + employee['name'] + ' ORCID:' + employee['orcid'])
                    matchStatus = 2
                    employee['wikidataId'] = extractQNumber(row['wikidataIri'])
        # As far as we know, the employee doesn't have an ORCID, so try to match the name
        else:
            setRatio = fuzz.token_set_ratio(row['name'], employee['name'])
            # We get a name match
            if setRatio >= testRatio:
                # For some reason, Wikidata has the ORCID, so grab it
                # NOTE: from here on the employee has an ORCID, so the remaining rows are
                # tested in the ORCID branch above
                if row['orcid'] != '':
                    print('name match: ', str(setRatio) + ' ' + row['name'] + ' / ' + employee['name'] + ' ORCID:' + row['orcid'])
                    employee['orcid'] = row['orcid']
                    matchStatus = 3
                # Wikidata doesn't have an ORCID
                else:
                    print('name match: ', str(setRatio) + ' ' + row['name'] + ' / ' + employee['name'] + ' WD description: ' + row['description'])
                    matchStatus = 4
                employee['wikidataId'] = extractQNumber(row['wikidataIri'])
    # If matchStatus is still 0 here, we've gone all the way through the downloaded records
    # without finding a match

    # Do a last ditch attempt to try to find the person in Wikidata by doing a SPARQL search for their ORCID
    if matchStatus == 0:
        if employee['orcid'] != '':
            results = searchWikidataForQIdByOrcid(employee['orcid'])
            if len(results) > 0:
                print('SPARQL ORCID search: ', employee['name'], results)
                if len(results) == 1:
                    # if the search fails and returns an error message
                    # (a result longer than 15 characters is taken to be a message, not a Q ID)
                    if len(results[0]) > 15:
                        matchStatus = 8
                        print('Error message in ORCID search')
                    else:
                        matchStatus = 5
                        employee['wikidataId'] = results[0]
                else:
                    # the status stays 0 (unmatched) in this case
                    print('ERROR: multiple results for same ORCID')
    # after every possible matching method, record the matchStatus
    employee['wikidataStatus'] = str(matchStatus)
#print(employees)

filename = deptShortName + '-employees-with-wikidata.csv'
fieldnames = ['wikidataId', 'name', 'degree', 'category', 'orcid', 'wikidataStatus', 'role']
writeDictsToCsv(employees, filename, fieldnames)

print('done')


# Crosscheck people against publications

Developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/crosscheck-publications.ipynb

Checks possible Wikidata records against publications in CrossRef and PubMed to see if the author metadata will disambiguate the Wikidata record.


In [None]:
# Build the HTTP request header dictionary once; the SPARQL search functions defined in this
# cell pass it to requests.get(). generateHeaderDictionary is defined elsewhere in the notebook
# (presumably it sets the Accept header to the given media type - confirm there).
acceptMediaType = 'application/json'
requestHeaderDictionary = generateHeaderDictionary(acceptMediaType)

def generateNameAlternatives(name):
    """Generate the different ways a person's name might be written.

    Periods are removed and the name is split on whitespace. The last piece is treated as
    the surname, the first piece as the given name, and any pieces in between as middle
    names. For 'John Q. Public' the variants are: 'John Q Public', 'John Q. Public',
    'John Public', 'J Public', 'J. Public', 'J Q Public', 'J. Q. Public' and 'JQ Public'.

    NOTE: currently doesn't handle ", Jr.", "III", etc.

    Parameters:
        name: the full name as a single string

    Returns:
        A list of unique name strings, in the order they were generated. A name with a
        single piece yields only that piece; an empty or whitespace-only name yields an
        empty list.
    """
    # get rid of periods; split() with no argument also discards leading, trailing and
    # repeated whitespace, so no empty pieces are created
    pieces = name.replace('.', '').split()

    if len(pieces) == 0:
        return []
    # with a single piece there is no separate first and last name to combine
    if len(pieces) == 1:
        return [pieces[0]]

    firstName = pieces[0]
    lastName = pieces[-1]
    firstInitial = firstName[0]
    middleInitials = [piece[0] for piece in pieces[1:-1]]
    middleInitialsWithPeriods = [initial + '.' for initial in middleInitials]

    alternatives = [
        # full name
        ' '.join(pieces),
        # first and last name with (middle) initials
        ' '.join([firstName] + middleInitials + [lastName]),
        # first and last name with (middle) initials and periods
        ' '.join([firstName] + middleInitialsWithPeriods + [lastName]),
        # first and last name only
        firstName + ' ' + lastName,
        # first initial and last name only
        firstInitial + ' ' + lastName,
        # first initial with period and last name only
        firstInitial + '. ' + lastName,
        # all name initials with last name
        ' '.join([firstInitial] + middleInitials + [lastName]),
        # all name initials with periods with last name
        ' '.join([firstInitial + '.'] + middleInitialsWithPeriods + [lastName]),
        # all name initials concatenated with last name
        firstInitial + ''.join(middleInitials) + ' ' + lastName,
    ]

    # remove duplicates; dict.fromkeys() keeps the first occurrence of each string, so
    # the order of the result is the same on every run
    return list(dict.fromkeys(alternatives))

def searchNameAtWikidata(name):
    """Search Wikidata for items whose English label or alias is a variant of a name.

    The variants come from generateNameAlternatives(). Each is tested as an English-tagged
    literal against rdfs:label and skos:altLabel.

    Parameters:
        name: the person's full name

    Returns:
        A list with one dictionary per match, with keys 'qId' (the item identifier from
        extractQNumber) and 'name' (the item's English label). The list is empty if
        nothing matched. If the response can't be processed, the list holds the single
        dictionary {'error': <response text>}.
    """
    nameList = generateNameAlternatives(name)
    alternatives = ''
    for alternative in nameList:
        # Escape backslashes and double quotes so that a name containing them (for example
        # a quoted nickname) can't terminate the SPARQL string literal and break the query
        escapedAlternative = alternative.replace('\\', '\\\\').replace('"', '\\"')
        alternatives += '"' + escapedAlternative + '"@en\n'
    query = '''
select distinct ?item ?label where {
  VALUES ?value
  {
  ''' + alternatives + '''}
?item rdfs:label|skos:altLabel ?value.
?item rdfs:label ?label.
FILTER(lang(?label)='en')
  }
'''
    #print(query)
    #print('searching for ', name)
    results = []
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=requestHeaderDictionary)
    try:
        data = r.json()
        statements = data['results']['bindings']
        for statement in statements:
            wikidataIri = statement['item']['value']
            # a separate variable is used for the label so that the name parameter isn't overwritten
            if 'label' in statement:
                label = statement['label']['value']
            else:
                label = ''
            qNumber = extractQNumber(wikidataIri)
            results.append({'qId': qNumber, 'name': label})
    # Exception rather than a bare except, so that KeyboardInterrupt and SystemExit
    # are not swallowed
    except Exception:
        results = [{'error': r.text}]
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return results

# returns a dictionary of various descriptors of the item with Wikidata ID qId
# P106 is occupation, schema:description is filtered to be the English description
def searchWikidataDescription(qId):
    """Retrieve the English description, ORCID and occupations of a Wikidata item.

    Parameters:
        qId: the identifier of the item (inserted after the wd: prefix in the query)

    Returns:
        A dictionary with keys 'description' (string, empty if none), 'orcid' (string,
        empty if none) and 'occupation' (list of English occupation labels, possibly
        empty). The dictionary is empty if the query returned no results. If the response
        can't be processed, the dictionary is {'error': <response text>}.
    """
    resultsDict = {}
    query = '''select distinct ?description ?orcid ?occupation where {
        optional {
            wd:'''+ qId + ''' schema:description ?description.
            FILTER(lang(?description) = 'en')
            }
        optional {
            wd:'''+ qId + ''' wdt:P106 ?occupationId.
            ?occupationId rdfs:label ?occupation.
            FILTER(lang(?occupation) = 'en')
            }
        optional {wd:'''+ qId + ''' wdt:P496 ?orcid.}
      }'''
    #print(query)
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=requestHeaderDictionary)
    try:
        data = r.json()
        statements = data['results']['bindings']
        if len(statements) > 0: # if no results, the dictionary remains empty
            firstStatement = statements[0]

            # Only a single description per language is allowed, so there should only be one description
            if 'description' in firstStatement:
                resultsDict['description'] = firstStatement['description']['value']
            else:
                resultsDict['description'] = ''

            # Only a single ORCID is allowed, so there should only be one orcid value
            if 'orcid' in firstStatement:
                resultsDict['orcid'] = firstStatement['orcid']['value']
            else:
                resultsDict['orcid'] = ''

            # if there are multiple statements, that's because there is more than one occupation
            resultsDict['occupation'] = [
                statement['occupation']['value']
                for statement in statements
                if 'occupation' in statement
            ]
    # Exception rather than a bare except, so that KeyboardInterrupt and SystemExit
    # are not swallowed
    except Exception:
        resultsDict = {'error': r.text}
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return resultsDict

# returns a list of results of articles by person with Wikidata ID qId
def searchWikidataArticle(qId):
    """Retrieve the works in Wikidata that list an item as their author.

    Parameters:
        qId: the identifier of the author's item (inserted after the wd: prefix in the query)

    Returns:
        A list with one dictionary per result, with keys 'title' (English label of the
        work), 'pmid' (PubMed ID) and 'doi'; a value that Wikidata doesn't have is an
        empty string. The list is empty if there are no works. If the response can't be
        processed, the list holds a single string: the response text.
    """
    resultsList = []
    # P50 is "author"; P698 is the PubMed ID of the article; P356 is the DOI of the article
    query = '''select distinct ?title ?doi ?pmid where {
      ?article wdt:P50 wd:''' + qId + '''.
      optional {
          ?article rdfs:label ?title.
          FILTER(lang(?title) = 'en')
          }
      optional {?article wdt:P698 ?pmid.}
      optional {?article wdt:P356 ?doi.}
      }'''
    #print(query)
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=requestHeaderDictionary)
    try:
        data = r.json()
        statements = data['results']['bindings']
        for statement in statements:
            # every variable is optional in the query, so default each one to an empty string
            article = {}
            for variable in ['title', 'pmid', 'doi']:
                if variable in statement:
                    article[variable] = statement[variable]['value']
                else:
                    article[variable] = ''
            resultsList.append(article)
    # Exception rather than a bare except, so that KeyboardInterrupt and SystemExit
    # are not swallowed
    except Exception:
        # NOTE: unlike the other search functions, the error is reported as a bare string
        # rather than as a dictionary; kept as is so that the return shape doesn't change
        resultsList = [r.text]
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return resultsList

def retrievePubMedData(pmid):
    """Retrieve the author metadata of an article from PubMed.

    The record is fetched from the NCBI efetch service, identifying the caller with the
    toolName and emailAddress values set elsewhere in the notebook.

    Parameters:
        pmid: the PubMed ID of the article

    Returns:
        A list with one dictionary per Author element in the response, with keys
        'affiliation' (text of the author's first affiliation), 'surname', 'forename' and
        'orcid'. A value that is missing from the record is an empty string. The list is
        empty if the response isn't well-formed XML.
    """
    fetchUrl = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    paramDict = {
        'tool': toolName,
        'email': emailAddress,
        'db': 'pubmed',
         #'retmode': 'xml',
        'rettype': 'abstract',
        'id': pmid
    }
    response = requests.get(fetchUrl, params=paramDict)
    #print(response.url)
    pubData = response.text  # the response text is XML
    #print(pubData)  # uncomment this line to see the XML

    affiliations = []
    # process the returned XML, see https://docs.python.org/3/library/xml.etree.elementtree.html
    try:
        authorElements = et.fromstring(pubData).findall('.//Author')
    except et.ParseError:
        # e.g. an error page was returned instead of a record; report it and return no
        # authors rather than crashing the whole run
        print('Could not parse PubMed response for PMID', pmid)
        authorElements = []

    for authorElement in authorElements:
        # findtext() returns the default when the element is missing and an empty string
        # when the element is present but has no text
        affiliation = authorElement.findtext('./AffiliationInfo/Affiliation', default='')
        lastName = authorElement.findtext('./LastName', default='')
        foreName = authorElement.findtext('./ForeName', default='')

        # use the first identifier that is labeled as an ORCID, if there is one
        orcid = ''
        for idField in authorElement.findall('./Identifier'):
            if idField.get('Source') == 'ORCID':
                orcid = idField.text or ''
                break

        #print(lastName)
        #print(affiliation)
        affiliations.append({'affiliation': affiliation, 'surname': lastName, 'forename': foreName, 'orcid': orcid})
    #print()

    # See https://www.ncbi.nlm.nih.gov/books/NBK25497/ for usage guidelines.
    # An API key is required for more than 3 requests per second.
    sleep(0.5) # wait half a second before hitting the API again to avoid getting blocked
    return affiliations

def retrieveCrossRefDoi(doi):
    """Retrieve the author metadata of a work from the CrossRef API.

    Parameters:
        doi: the DOI of the work

    Returns:
        A list with one dictionary per author, with keys 'orcid', 'givenName' and
        'familyName' (strings, empty if CrossRef doesn't give them) and 'affiliation'
        (list of affiliation name strings, possibly empty). The list is empty if the
        work has no author data or if the response can't be processed.
    """
    authorList = []
    crossRefEndpointUrl = 'https://api.crossref.org/works/'
    encodedDoi = urllib.parse.quote(doi)
    searchUrl = crossRefEndpointUrl + encodedDoi
    acceptMediaType = 'application/json'
    response = requests.get(searchUrl, headers=generateHeaderDictionary(acceptMediaType))
    try:
        data = response.json()
        #print(json.dumps(data, indent = 2))
        authors = data['message'].get('author', [])
    except Exception:
        # The response wasn't the expected JSON (for example, the DOI isn't known to CrossRef).
        # Previously this branch used the variable data, which doesn't exist when response.json()
        # fails, so the function raised a NameError instead of reporting the problem.
        print('Error retrieving CrossRef data for DOI ' + doi + ': ' + response.text[:200])
        return authorList

    for author in authors:
        authorDict = {}
        authorDict['orcid'] = author.get('ORCID', '')
        authorDict['givenName'] = author.get('given', '')
        authorDict['familyName'] = author.get('family', '')
        # if there aren't any affiliations (or none has a name), the list will be empty
        authorDict['affiliation'] = [
            affiliation['name']
            for affiliation in author.get('affiliation', [])
            if 'name' in affiliation
        ]
        authorList.append(authorDict)
    return authorList

# ***** BODY OF SEARCH
# For each employee that is still unmatched (wikidataStatus 0), search Wikidata for items whose
# label or alias is a variant of the employee's name. Each candidate is screened (is it a human?
# did the person die long ago? does its ORCID conflict?), then the authors of the candidate's
# articles are checked in PubMed and CrossRef for a matching ORCID or affiliation. If no
# candidate is matched automatically, the user is asked to choose one.
filename = deptShortName + '-employees-with-wikidata.csv'
bsciEmployees = readDict(filename)

for employeeIndex in range(0, len(bsciEmployees)):
#for employeeIndex in range(0, 1): # just do one person for testing
    # perform search only for people who weren't already matched
    if bsciEmployees[employeeIndex]['wikidataStatus'] == '0':
        matchStatus = 0
        print('--------------------------')
        results = searchNameAtWikidata(bsciEmployees[employeeIndex]['name'])
        if len(results) == 0:
            print('No Wikidata name match: ', bsciEmployees[employeeIndex]['name'])
            matchStatus = 7
            print()
        elif len(results) == 1 and 'error' in results[0]:
            # Discontinue processing this person only. Status 9 is recorded below and the
            # loop goes on to the next employee (a break here would end the whole run
            # without ever recording the status).
            print('SPARQL name search: ', bsciEmployees[employeeIndex]['name'])
            matchStatus = 9
            print('Error message in name search:', results[0]['error'])
        else:
            print('SPARQL name search: ', bsciEmployees[employeeIndex]['name'])

            qIds = []
            nameVariants = []
            potentialOrcid = []
            for result in results:
                qIds.append(result['qId'])
                nameVariants.append(result['name'])
            
            testAuthor = bsciEmployees[employeeIndex]['name']
            testOrcid = bsciEmployees[employeeIndex]['orcid']

            if testOrcid == '':
                print('(no ORCID)')
            else:
                print('ORCID: ', testOrcid)
            print()
            
            foundMatch = False # start the flag with the person not being matched
            for qIdIndex in range(0, len(qIds)):
                # potentialOrcid gets one entry for every candidate that is displayed, so it
                # stays aligned with qIds for the candidates examined so far
                potentialOrcid.append('') # default to no ORCID found for that person
                print()
                print(qIdIndex, 'Wikidata ID: ', qIds[qIdIndex], ' Name variant: ', nameVariants[qIdIndex], ' ', 'https://www.wikidata.org/wiki/' + qIds[qIdIndex])
                wdClassList = searchWikidataSingleProperty(qIds[qIdIndex], 'P31', 'item')
                # if there is a class property, check if it's a human
                if len(wdClassList) != 0:
                    # if it's not a human
                    if wdClassList[0] != 'Q5':
                        print('This item is not a human!')
                        # reject this candidate only; the remaining candidates still need to be checked
                        continue
                        
                # check for a death date
                deathDateList = searchWikidataSingleProperty(qIds[qIdIndex], 'P570', 'string')
                if len(deathDateList) == 0:
                    print('No death date given.')
                else:
                    deathDate = deathDateList[0][0:10] # all dates are converted to xsd:dateTime and will have a y-m-d date
                    if deathDate < deathDateLimit:
                        # if the person died a long time ago, don't retrieve other stuff
                        print('This person died in ', deathDate)
                        # reject this candidate only; the remaining candidates still need to be checked
                        continue
                    else:
                        # if the person died recently, we still might be interested in them so keep going
                        print('This person died in ', deathDate)

                descriptors = searchWikidataDescription(qIds[qIdIndex])
                employers = searchWikidataEmployer(qIds[qIdIndex])
                #print(descriptors)
                if 'error' in descriptors:
                    # the error dictionary has none of the keys used below
                    print('Error message in description search:', descriptors['error'])
                elif descriptors != {}:
                    if descriptors['description'] != '':
                        print('description: ', descriptors['description'])
                    for occupation in descriptors['occupation']:
                        print('occupation: ', occupation)
                    for employer in employers:
                        print('employer: ', employer)
                    if descriptors['orcid'] != '':
                        if testOrcid == '':
                            # **** NOTE: if the person has an ORCID, it may be possible to find articles via ORCID
                            # that aren't linked in Wikidata. Not sure if this happens often enough to handle it
                            print('ORCID: ', descriptors['orcid'])
                            potentialOrcid[qIdIndex] = descriptors['orcid']
                        else:
                            # This should always be true if the SPARQL query for ORCID was already done
                            if testOrcid != descriptors['orcid']:
                                print('*** NOT the same person; ORCID ' + descriptors['orcid'] + ' does not match.')
                                # don't look up references for this candidate since it's definitely
                                # not a match, but do go on to the remaining candidates
                                continue
                            else:
                                print('*** An ORCID match! How did it get missed in the earlier SPARQL query?')
                                # stop looking at candidates; since foundMatch isn't set, the user
                                # is asked below to confirm the choice
                                break
                else:
                    print('No description or occupation given.')

                result = searchWikidataArticle(qIds[qIdIndex])
                if len(result) == 1 and isinstance(result[0], str):
                    # searchWikidataArticle reports a failed query as a one-item list holding the response text
                    print('Error message in article search:', result[0])
                elif len(result) == 0:
                    print('No articles authored by that person')
                else:
                    articleCount = 0
                    for article in result:
                        print()
                        print('Checking article: ', article['title'])
                        if article['pmid'] == '':
                            print('No PubMed ID')
                        else:
                            print('Checking authors in PubMed article: ', article['pmid'])
                            pubMedAuthors = retrievePubMedData(article['pmid'])
                            #print(pubMedAuthors)
                            for author in pubMedAuthors:
                                nameTestRatio = fuzz.token_set_ratio(author['surname'], testAuthor)
                                #print(nameTestRatio, author['surname'])
                                if nameTestRatio >= 90:
                                    # An ORCID iD is 19 characters long, so taking the last 19 characters
                                    # leaves a bare ID unchanged and strips the prefix if the article
                                    # metadata gives the ORCID as a full URI
                                    articleOrcid = (author['orcid'] or '')[-19:]
                                    # if the PubMed metadata gives an ORCID for the matched person, record it unless 
                                    # the ORCID has already been gotten from the Wikidata record
                                    if articleOrcid != '':
                                        if testOrcid == '':
                                            print('ORCID from article: ', articleOrcid)
                                            if potentialOrcid[qIdIndex] == '':
                                                potentialOrcid[qIdIndex] = articleOrcid
                                        else:
                                            if testOrcid != articleOrcid:
                                                print('*** NOT the same person; ORCID ' + articleOrcid + ' does not match.')
                                                break # don't continue the loop (look up authors) since it's definitely not a match
                                            else:
                                                print('*** An ORCID match!')
                                                foundMatch = True
                                                matchStatus = 6
                                                break # don't continue the loop (look up authors) since it's an ORCID match

                                    if author['affiliation'] != '': 
                                        setRatio = fuzz.token_set_ratio(deptSettings[deptShortName]['testAuthorAffiliation'], author['affiliation'])
                                        print('Affiliation test: ', setRatio, author['affiliation'])
                                        if setRatio >= 90:
                                            foundMatch = True
                                            matchStatus = 10
                                            break # don't continue the loop (look up authors) since it's an affiliation match
                                    else:
                                        break # give up on this article because no affiliation string
                        # Don't look up the DOI if it's already found a match with PubMed
                        if foundMatch:
                            break # stop checking articles after a PubMed one has matched
                        else:
                            if article['doi'] == '':
                                print('No DOI')
                            else:
                                print('Checking authors in DOI article: ', article['doi'])
                                doiAuthors = retrieveCrossRefDoi(article['doi'])
                                for author in doiAuthors:
                                    nameTestRatio = fuzz.token_set_ratio(author['familyName'], testAuthor)
                                    #print(nameTestRatio, author['familyName'])
                                    if nameTestRatio >= 90:
                                        # DOI records the entire ORCID URI, not just the ID number
                                        # so pull the last 19 characters from the string. The bare ID
                                        # must be used in the comparison too, otherwise the employee's
                                        # (bare) ORCID can never equal the one from the article.
                                        articleOrcid = author['orcid'][-19:]
                                        if articleOrcid != '':
                                            if testOrcid == '':
                                                print('ORCID from article: ', articleOrcid)
                                                # only add the ORCID from article if there isn't already one,
                                                # for example, one gotten from the Wikidata record itself
                                                if potentialOrcid[qIdIndex] == '':
                                                    potentialOrcid[qIdIndex] = articleOrcid
                                            else:
                                                if testOrcid != articleOrcid:
                                                    print('*** NOT the same person; ORCID ' + articleOrcid + ' does not match.')
                                                    break # don't continue the loop (look up authors) since it's definitely not a match
                                                else:
                                                    print('*** An ORCID match!')
                                                    foundMatch = True
                                                    matchStatus = 6
                                                    break # don't continue the loop (look up authors) since it's an ORCID match


                                        if len(author['affiliation']) > 0:
                                            for affiliation in author['affiliation']:
                                                setRatio = fuzz.token_set_ratio(deptSettings[deptShortName]['testAuthorAffiliation'], affiliation)
                                                print('Affiliation test: ', setRatio, affiliation)
                                                if setRatio >= 90:
                                                    foundMatch = True
                                                    matchStatus = 10
                                                    break # stop checking this author's affiliations
                                            if foundMatch:
                                                break # don't continue the loop (look up authors) since it's an affiliation match
                                        else:
                                            break # give up on this article because no affiliation string
                            if foundMatch:
                                break # stop checking articles after a DOI one has matched
                        articleCount += 1
                        if articleCount > 10:
                            checkMore = input('There are more than 10 articles. Press Enter to skip the rest or enter anything to get the rest.')
                            if checkMore == '':
                                break
                    if foundMatch:
                        print('***', qIds[qIdIndex], ' is a match.')
                        print()
                        bsciEmployees[employeeIndex]['wikidataId'] = qIds[qIdIndex]
                        # write a discovered ORCID only if the person didn't already have one
                        # (potentialOrcid is only filled in for people without an ORCID, so an
                        # unconditional assignment would erase the ORCID of everyone else)
                        if (potentialOrcid[qIdIndex] != '') and (bsciEmployees[employeeIndex]['orcid'] == ''):
                            bsciEmployees[employeeIndex]['orcid'] = potentialOrcid[qIdIndex]
                        break # quit checking Q IDs since the person was matched
                    else:
                        print('No match found.')
                print('Employee: ', bsciEmployees[employeeIndex]['name'], ' vs. name variant: ', nameVariants[qIdIndex])
                print()
            if not foundMatch:
                # Keep asking until the entry is empty or is the number of one of the candidates
                # displayed above (there is one potentialOrcid entry per displayed candidate)
                while True:
                    choiceString = input('Enter the number of the matched entity, or press Enter/return if none match: ')
                    if choiceString == '':
                        matchStatus = 7
                        break
                    try:
                        choice = int(choiceString)
                    except ValueError:
                        choice = -1
                    if 0 <= choice < len(potentialOrcid):
                        matchStatus = 11
                        bsciEmployees[employeeIndex]['wikidataId'] = qIds[choice]
                        # write a discovered ORCID only if the person didn't already have one
                        if (potentialOrcid[choice] != '') and (bsciEmployees[employeeIndex]['orcid'] == ''):
                            bsciEmployees[employeeIndex]['orcid'] = potentialOrcid[choice]
                        break
                    print('Invalid entry; enter one of the candidate numbers shown above.')
                print()
                
        # record the final match status
        bsciEmployees[employeeIndex]['wikidataStatus'] = str(matchStatus)
    
    # write the file after each person is checked in case the user crashes the script
    filename = deptShortName + '-employees-curated.csv'
    fieldnames = ['wikidataId', 'name', 'degree', 'category', 'orcid', 'wikidataStatus', 'role']
    writeDictsToCsv(bsciEmployees, filename, fieldnames)

print()
print('Done')


# Download various statements and references, then generate write file

NOTE: between the previous step and this one, one can add a gender/sex column to the table that will be processed if it exists.  Column header: 'gender'.  Allowed values (from Wikidata): m=male, f=female, i=intersex, tf=transgender female, tm=transgender male

In [None]:
# Read in the curated employee table written by the crosscheck step
filename = deptShortName + '-employees-curated.csv'
employees = readDict(filename)

# Q IDs of only those employees that have been linked to a Wikidata item
qIds = [employee['wikidataId'] for employee in employees if employee['wikidataId'] != '']

# Retrieve the ORCID statements that those items already have in Wikidata.
#   prop:     P496 is the ORCID iD property
#   value:    left empty, so the search retrieves the value instead of testing for one
#   refProps: P813 ("retrieved") is the reference property to fetch
prop = 'P496'
value = ''
refProps = ['P813']
wikidataOrcidData = searchStatementAtWikidata(qIds, prop, value, refProps)
#print(json.dumps(wikidataOrcidData, indent=2))

# match people who have ORCIDs with ORCID data downloaded from Wikidata
for employeeIndex in range(0, len(employees)):
    matched = False
    for wikidataOrcidDataIndex in range(0, len(wikidataOrcidData)):
        if wikidataOrcidData[wikidataOrcidDataIndex]['qId'] == employees[employeeIndex]['wikidataId']:
            matched = True
            if employees[employeeIndex]['orcid'] != wikidataOrcidData[wikidataOrcidDataIndex]['statementValue']:
                print('Non-matching ORCID for ', employees[employeeIndex]['name'])
            # if there is a match, record whatever data was retrieved
            else:
                employees[employeeIndex]['orcidStatementUuid'] = wikidataOrcidData[wikidataOrcidDataIndex]['statementUuid']
                employees[employeeIndex]['orcidReferenceHash'] = wikidataOrcidData[wikidataOrcidDataIndex]['referenceHash']
                # if there is no referenceHash then try to dereference the ORCID
                if employees[employeeIndex]['orcidReferenceHash']== '':
                    # if there is a match, check whether the ORCID record can be retrieved
                    print('Checking ORCID for Wikidata matched: ', employees[employeeIndex]['name'])
                    # returned value is the current date if successful; empty string if not
                    employees[employeeIndex]['orcidReferenceValue'] = checkOrcid(employees[employeeIndex]['orcid'])
                # if there is an existing reference, record the value for the first reference property (only one ref property)
                else:
                    print('Already an ORCID reference for: ', employees[employeeIndex]['name'])
                    # need to add the + in front of dateTime, which is needed by the API for upload
                    employees[employeeIndex]['orcidReferenceValue'] = '+' + wikidataOrcidData[wikidataOrcidDataIndex]['referenceValues'][0]
            # stop checking at the first match.
            break
    # if the person doesn't match with those whose ORCIDs came back from the query...
    if not matched:
        # check for access if they have an ORCID (not present in Wikidata)
        if employees[employeeIndex]['orcid'] != '':
            print('Checking ORCID for unmatched: ', employees[employeeIndex]['name'])
            # the function returns the current date (to use as the retrieved date) if the ORCID is found, otherwise empty string
            employees[employeeIndex]['orcidReferenceValue'] = checkOrcid(employees[employeeIndex]['orcid'])

# get data already in Wikidata about people employed at Vanderbilt
prop = 'P108' # employer
refProps = ['P854', 'P813'] # source URL, retrieved
wikidataEmployerData = searchStatementAtWikidata(qIds, prop, employerQId, refProps)
#print(json.dumps(wikidataEmployerData, indent=2))

# The "retrieved" date used for newly generated references is today's date in UTC.
# It is computed once, before the loop, so that every row written in this run gets the same date.
# datetime.now(timezone.utc) replaces the deprecated datetime.utcnow().
dateZ = datetime.datetime.now(datetime.timezone.utc).date().isoformat() # form 2019-12-05
wholeDateZ = '+' + dateZ + 'T00:00:00Z' # form +2019-12-05T00:00:00Z as provided by Wikidata
baseUrl = deptSettings[deptShortName]['baseUrl']

# match people with employment data downloaded from Wikidata
for employee in employees:
    # the source URL that this script would use as the reference for this person
    ourReferenceUrl = baseUrl + employee['category']
    matchedStatement = False
    for statement in wikidataEmployerData:
        if statement['qId'] != employee['wikidataId']:
            continue
        matchedStatement = True
        employee['employerStatementUuid'] = statement['statementUuid']
        employee['employerReferenceHash'] = statement['referenceHash']
        # if there is a referenceHash then record the values for the two reference properties,
        # read in the same order as refProps: [0] is P854 (source URL), [1] is P813 (retrieved)
        if statement['referenceHash'] != '':
            employee['employerReferenceSourceUrl'] = statement['referenceValues'][0]
            # need to add the + in front of dateTime, which is needed by the API for upload
            employee['employerReferenceRetrieved'] = '+' + statement['referenceValues'][1]
            # Stop checking if there is an exact match to our reference URL. Otherwise keep looping:
            # each later result for this person overwrites the reference data recorded so far,
            # so a different existing reference is kept only if ours is never found.
            if statement['referenceValues'][0] == ourReferenceUrl:
                break

    # everyone is assigned the employerQId as a value because either they showed up in the SPARQL search for employerQId
    # or we are making a statement that they work for employerQId.
    employee['employer'] = employerQId
    if not matchedStatement:  # only generate the metadata if there isn't already a statement
        employee['employerReferenceSourceUrl'] = ourReferenceUrl
        employee['employerReferenceRetrieved'] = wholeDateZ
    # ******************
    # NOTE: the end result here is that nothing is done if the "employer [university name]" statement (e.g. P108 Q29052) exists.
    # If the statement has a different reference than the one we use, it's recorded.
    # ******************

# *** This follows the same pattern as the employer processing, modified for affiliation

# get data already in Wikidata about people affiliated with the department
prop = 'P1416' # affiliation
refProps = ['P854', 'P813'] # source URL, retrieved
# NOTE: the variable name is carried over from the employer processing and now holds affiliation data.
# The name is kept unchanged so that anything reading it later sees the same value as before.
wikidataEmployerData = searchStatementAtWikidata(qIds, prop, deptSettings[deptShortName]['departmentQId'], refProps)
#print(json.dumps(wikidataEmployerData, indent=2))

# The "retrieved" date used for newly generated references is today's date in UTC.
# It is computed once, before the loop, so that every row written in this run gets the same date.
# datetime.now(timezone.utc) replaces the deprecated datetime.utcnow().
dateZ = datetime.datetime.now(datetime.timezone.utc).date().isoformat() # form 2019-12-05
wholeDateZ = '+' + dateZ + 'T00:00:00Z' # form +2019-12-05T00:00:00Z as provided by Wikidata
baseUrl = deptSettings[deptShortName]['baseUrl']
departmentQId = deptSettings[deptShortName]['departmentQId']

# match people with affiliation data downloaded from Wikidata
for employee in employees:
    # the source URL that this script would use as the reference for this person
    ourReferenceUrl = baseUrl + employee['category']
    matchedStatement = False
    for statement in wikidataEmployerData:
        if statement['qId'] != employee['wikidataId']:
            continue
        matchedStatement = True
        employee['affiliationStatementUuid'] = statement['statementUuid']
        employee['affiliationReferenceHash'] = statement['referenceHash']
        # if there is a referenceHash then record the values for the two reference properties,
        # read in the same order as refProps: [0] is P854 (source URL), [1] is P813 (retrieved)
        if statement['referenceHash'] != '':
            employee['affiliationReferenceSourceUrl'] = statement['referenceValues'][0]
            # need to add the + in front of dateTime, which is needed by the API for upload
            employee['affiliationReferenceRetrieved'] = '+' + statement['referenceValues'][1]
            # Stop checking if there is an exact match to our reference URL. Otherwise keep looping:
            # each later result for this person overwrites the reference data recorded so far,
            # so a different existing reference is kept only if ours is never found.
            if statement['referenceValues'][0] == ourReferenceUrl:
                break

    # everyone is assigned the department as a value because either they showed up in the SPARQL search
    # or we are making a statement that they are affiliated with the department.
    employee['affiliation'] = departmentQId
    if not matchedStatement:  # only generate the metadata if there isn't already a statement
        employee['affiliationReferenceSourceUrl'] = ourReferenceUrl
        employee['affiliationReferenceRetrieved'] = wholeDateZ

# Retrieve the existing "instance of human" statements for the items.
prop = 'P31' # instance of
value = 'Q5' # human
refProps = [] # no ref property needed
wikidataHumanData = searchStatementAtWikidata(qIds, prop, value, refProps)

# Index the downloaded results by item; if an item appears more than once, the last result is kept.
humanResultByQId = {result['qId']: result for result in wikidataHumanData}

# Record the statement ID for people who are already asserted to be human.
for person in employees:
    existingResult = humanResultByQId.get(person['wikidataId'])
    if existingResult is not None:
        person['instanceOfUuid'] = existingResult['statementUuid']
    # whether or not a statement was found, everybody gets the value 'human'
    person['instanceOf'] = 'Q5'

# Adapted from the "instance of human" processing.

# Retrieve whatever sex or gender statements Wikidata already has for the researchers.
prop = 'P21' # sex or gender
value = '' # don't provide a value so that it will return whatever value it finds
refProps = [] # no ref property needed
# the variable name is carried over from the "instance of" processing and is reused here
wikidataHumanData = searchStatementAtWikidata(qIds, prop, value, refProps)

# Record the statement ID and value for people who already have a sex/gender assertion;
# for everyone else fall back to the optional 'gender' column of the table.
# NOTE: Wikidata doesn't seem to care a lot about references for this property and we don't really have one anyway
for person in employees:
    existingStatements = [result for result in wikidataHumanData if result['qId'] == person['wikidataId']]
    for result in existingStatements:
        person['sexOrGenderUuid'] = result['statementUuid']
        # The Wikidata value wins over the 'gender' column of the table.
        # Per the original notes, extractFromIri() strips the namespace from the qId.
        person['sexOrGenderQId'] = extractFromIri(result['statementValue'], 4)
    if existingStatements:
        continue
    # Not in Wikidata: decode the table's 'gender' value if the column exists, otherwise leave it empty.
    person['sexOrGenderQId'] = decodeSexOrGender(person['gender']) if 'gender' in person else ''

# Retrieve the English labels that the employees' items already have in Wikidata.
labelType = 'label'
language = 'en'
wikidataLabels = searchLabelsDescriptionsAtWikidata(qIds, labelType, language)

# Index the downloaded labels by item; if an item appears more than once, the last result is kept.
labelResultByQId = {result['qId']: result for result in wikidataLabels}

# Give every person a label: the existing Wikidata one if there is one, otherwise the department default.
for person in employees:
    existingLabel = labelResultByQId.get(person['wikidataId'])
    if existingLabel is not None:
        person['labelEn'] = existingLabel['string']
        continue
    labelSettings = deptSettings[deptShortName]['labels']
    if labelSettings['source'] == 'column':
        # 'value' names the table column that holds the default label
        person['labelEn'] = person[labelSettings['value']]
    else:
        # 'value' is itself the constant default label
        person['labelEn'] = labelSettings['value']

# Retrieve the English descriptions that the employees' items already have in Wikidata.
labelType = 'description'
language = 'en'
wikidataDescriptions = searchLabelsDescriptionsAtWikidata(qIds, labelType, language)

# Give every person a description: the existing Wikidata one if there is one, otherwise the department default.
for person in employees:
    existingDescriptions = [
        result['string']
        for result in wikidataDescriptions
        if result['qId'] == person['wikidataId']
    ]
    if existingDescriptions:
        # when several results match, the last one is the one kept
        person['description'] = existingDescriptions[-1]
        continue
    descriptionSettings = deptSettings[deptShortName]['descriptions']
    if descriptionSettings['source'] == 'column':
        # 'value' names the table column that holds the default description
        person['description'] = person[descriptionSettings['value']]
    else:
        # 'value' is itself the constant default description
        person['description'] = descriptionSettings['value']

# Collect the aliases that already exist at Wikidata for each employee.
# A person can have several aliases, so they are gathered into a list and
# stored in the table as a JSON-encoded string.
# The writing script can handle multiple languages, but only English is handled here.

labelType = 'alias'
language = 'en'
aliasesAtWikidata = searchLabelsDescriptionsAtWikidata(qIds, labelType, language)
for person in employees:
    itemId = person['wikidataId']
    if itemId != '':
        existingAliases = [result['string'] for result in aliasesAtWikidata if result['qId'] == itemId]
    else:
        # the item doesn't exist yet, so there is nothing to look for at Wikidata
        existingAliases = []
    # an empty list is stored when no aliases were found
    person['alias'] = json.dumps(existingAliases)

# Save the assembled table; the column order of the output file follows this list.
filename = deptShortName + '-employees-to-write.csv'
fieldnames = [
    # identity, label, aliases and description
    'wikidataId', 'name', 'labelEn', 'alias', 'description',
    # ORCID statement and its reference
    'orcidStatementUuid', 'orcid', 'orcidReferenceHash', 'orcidReferenceValue',
    # employer statement and its reference
    'employerStatementUuid', 'employer', 'employerReferenceHash',
    'employerReferenceSourceUrl', 'employerReferenceRetrieved',
    # affiliation statement and its reference
    'affiliationStatementUuid', 'affiliation', 'affiliationReferenceHash',
    'affiliationReferenceSourceUrl', 'affiliationReferenceRetrieved',
    # instance of, sex or gender
    'instanceOfUuid', 'instanceOf', 'sexOrGenderUuid', 'sexOrGenderQId', 'gender',
    # columns carried over from the curated table
    'degree', 'category', 'wikidataStatus', 'role'
]
writeDictsToCsv(employees, filename, fieldnames)

print()
print('Done')