# Query ORCID for Vanderbilt University people

Script developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/orcid/orcid-get-json.ipynb

Retrieves results 100 at a time, then processes them by extracting desired information.  NOTE: takes hours to run.

Saves results in a file and the alternative names in a second file.

In [None]:
import requests
import json
from time import sleep
import csv
import math

# function to write results to a file
def writeCsv(fileName, array):
    """Write a table to a CSV file, replacing the file if it already exists.

    fileName: path of the UTF-8 encoded CSV file to create.
    array: iterable of rows; each row is an iterable of cell values.
    """
    # The context manager closes (and flushes) the file even when writing a
    # row raises, so a failed write never leaks the open handle.
    with open(fileName, 'w', newline='', encoding='utf-8') as fileObject:
        csv.writer(fileObject).writerows(array)
    
table = [['orcid', 'givenNames', 'familyName', 'startDate', 'endDate', 'department', 'organization']]
otherNameList = [['orcid', 'altName']]

# Build a hyphen-separated partial date (year, year-month, or year-month-day)
# from an ORCID date object. Returns '' when the date or its year is missing.
def formatOrcidDate(dateDict):
    if not dateDict or not dateDict['year']:
        return ''
    dateString = dateDict['year']['value']
    # the day is only appended when the month is also present
    if dateDict['month']:
        dateString += '-' + dateDict['month']['value']
        if dateDict['day']:
            dateString += '-' + dateDict['day']['value']
    return dateString

# use the API to search for people associated with Vanderbilt University
# First search is for only one record, just to get the number of hits found
searchUri = 'https://pub.orcid.org/v2.0/search/?q=affiliation-org-name:"Vanderbilt+University"&start=1&rows=1'
response = requests.get(searchUri, headers={'Accept' : 'application/json'})
data = response.json()
numberResults = data["num-found"]
numberPages = math.floor(numberResults/100)
remainder = numberResults - 100*numberPages

for pageCount in range(0, numberPages+1):  # the remainder will be caught when pageCount = numberPages
    print('page: ', pageCount)
    searchUri = 'https://pub.orcid.org/v2.0/search/?q=affiliation-org-name:"Vanderbilt+University"&start='+str(pageCount*100+1)
    response = requests.get(searchUri, headers={'Accept' : 'application/json'})
    print(response.url)
    data = response.json()
    # The final page is empty when the hit count is an exact multiple of 100;
    # treat a missing/null result list as "no records" instead of crashing.
    orcidsDictsList = data['result'] or []

    # extract the identifier strings from the data structure
    orcids = [
        {'id': orcidDict['orcid-identifier']['path'], 'iri': orcidDict['orcid-identifier']['uri']}
        for orcidDict in orcidsDictsList
    ]

    for orcid in orcids:
        response = requests.get(orcid['iri'], headers={'Accept' : 'application/json'})
        data = response.json()
        # Pause right after every record request so that records skipped below
        # are rate-limited as well.
        sleep(.25)

        orcidId = data['orcid-identifier']['path']
        # if there isn't a name, then go on to the next ORCID
        personName = data['person']['name']
        if not personName:
            continue
        givenNames = personName['given-names']['value'] if personName['given-names'] else ''
        familyName = personName['family-name']['value'] if personName['family-name'] else ''

        for otherName in data['person']['other-names']['other-name']:
            otherNameList.append([orcidId, otherName['content']])

        affiliations = data['activities-summary']['employments']['affiliation-group']
        for affiliation in affiliations:
            for summary in affiliation['summaries']:
                employment = summary['employment-summary']
                startDate = formatOrcidDate(employment['start-date'])
                endDate = formatOrcidDate(employment['end-date'])
                # if there is no value for department, set it to empty string
                department = employment['department-name'] or ''
                # Reset for every employment: otherwise an employment without an
                # organization would reuse the previous record's organization
                # (or fail with a NameError on the very first record).
                organization = ''
                if employment['organization']:
                    organization = employment['organization']['name'] or ''
                if 'Vanderbilt University' in organization:
                    print(orcidId, givenNames, familyName, startDate, endDate, department, organization)
                    table.append([orcidId, givenNames, familyName, startDate, endDate, department, organization])

print()
print('Done')
fileName = 'orcid_data.csv'
writeCsv(fileName, table)
fileName = 'orcid_other_names.csv'
writeCsv(fileName, otherNameList)

# Scrape departmental website

script developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/scrape-bsci.ipynb

Note: currently the script is idiosyncratic to https://as.vanderbilt.edu/biosci/people/index.php?group= patterned pages in the Vanderbilt BSCI department website.

In [None]:
import requests   # best library to manage HTTP transactions
from bs4 import BeautifulSoup # web-scraping library
import csv
import json

def writeCsv(fileName, array):
    """Write a table to a CSV file, replacing the file if it already exists.

    fileName: path of the UTF-8 encoded CSV file to create.
    array: iterable of rows; each row is an iterable of cell values.
    """
    # Using a context manager guarantees the handle is closed and the rows
    # are flushed even if one of the rows cannot be written.
    with open(fileName, 'w', newline='', encoding='utf-8') as fileObject:
        writerObject = csv.writer(fileObject)
        writerObject.writerows(array)

# HTTP headers sent with every page request
acceptMediaType = 'text/html'
userAgentHeader = 'BaskaufScraper/0.1 (steve.baskauf@vanderbilt.edu)'
requestHeaderDictionary = {
    'Accept' : acceptMediaType,
    'User-Agent': userAgentHeader
}

outputTable = [['name', 'degree', 'role', 'category']]
categories = ['primary-training-faculty', 'research-and-teaching-faculty', 'secondary-faculty', 'postdoc-fellows', 'emeriti']
# 'string' is the text looked for in the name; 'value' is the normalized degree recorded
degreeList = [
    {'string': 'Ph.D.', 'value': 'Ph.D.'},
    {'string': 'PhD', 'value': 'Ph.D.'},
    {'string': 'D.Phil.', 'value': 'D.Phil.'},
    {'string': 'J.D.', 'value': 'J.D.'}
     ]

for category in categories:
    url = 'https://as.vanderbilt.edu/biosci/people/index.php?group=' + category
    response = requests.get(url, headers = requestHeaderDictionary)
    soupObject = BeautifulSoup(response.text,features="html5lib")

    # get the first table from the page
    tableObject = soupObject.find_all('table')[0]

    # get the rows from the table
    for rowObject in tableObject.find_all('tr'):
        # get the cells from each row
        cellObjectsList = rowObject.find_all('td')
        # picture is in cell 0, name and title is in cell 1; rows that lack
        # the second cell (e.g. header or spacer rows) describe no person
        if len(cellObjectsList) < 2:
            continue
        nameCell = cellObjectsList[1]
        # the name part is bolded; skip cells that carry no bolded name
        boldObjectsList = nameCell('strong')
        if not boldObjectsList:
            continue
        name = boldObjectsList[0].text

        # separate degrees from names
        degree = ''
        for testDegree in degreeList:
            if testDegree['string'] in name:
                name = name.partition(', ' + testDegree['string'])[0]
                # correct any malformed strings
                degree = testDegree['value']

        try:
            # process the roles text
            dirtyText = str(nameCell)
            # get rid of trailing td tag
            nameCellText = dirtyText.split('</td>')[0]
            cellLines = nameCellText.split('<br/>')
            roles = []
            # line 0 holds the name; every following line is one role
            for cellLine in cellLines[1:]:
                # partition() keeps everything after the FIRST separator, so a
                # department that itself contains " of " is not truncated
                if ' of ' in cellLine:
                    title, _, department = cellLine.partition(' of ')
                elif ' in ' in cellLine:
                    title, _, department = cellLine.partition(' in ')
                else:
                    title, department = cellLine, ''
                if ', Emeritus' in department:
                    department = department.split(', Emeritus')[0]
                    title = 'Emeritus ' + title
                roles.append({'title': title, 'department': department})
            rolesJson = json.dumps(roles)

        except (TypeError, ValueError):
            # the roles could not be serialized; record the person without roles
            rolesJson = ''
        outputTable.append([name, degree, rolesJson, category])

fileName = 'bsci-employees.csv'
writeCsv(fileName, outputTable)
print('done')

# Match BSCI people with ORCID results

Script developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/orcid/match_bsci_orcid.ipynb

Adds data to the bsci-employees.csv file and outputs as bsci-employees-with-orcid.csv file.

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import csv
import json

def readDict(filename):
    """Read a UTF-8 CSV file with a header row into a list of dictionaries.

    filename: path of the CSV file to read.
    Returns one dictionary per data row, keyed by the column headers.
    """
    # The context manager closes the file even if parsing a row raises.
    with open(filename, 'r', newline='', encoding='utf-8') as fileObject:
        return list(csv.DictReader(fileObject))

filename = 'bsci-employees.csv'
bsciEmployees = readDict(filename)

filename = 'orcid_data.csv'
orcidData = readDict(filename)

# minimum fuzzy-match scores (0-100) for a name match and a department match
testRatio = 90
departmentTestRatio = 90
for employee in bsciEmployees:
    # stays empty unless an ORCID record matches the employee's name
    employee['orcid'] = ''
    for row in orcidData:
        name = row['givenNames'] + ' ' + row['familyName']

        # the set ratio seems to do the best job of matching
        setRatio = fuzz.token_set_ratio(name, employee['name'])
        if setRatio < testRatio:
            continue

        output = str(setRatio) + ' ' + name + ' / ' + employee['name']
        if row['department'] == '':
            output += " WARNING: no department given in ORCID"
        else:
            # carry out a secondary test to see if any of the departments listed in the BSCI page
            # are a good match to the department given in the ORCID record

            # expand the role JSON into a list of dictionaries; the role
            # column may be empty, which is not valid JSON
            try:
                roles = json.loads(employee['role']) if employee['role'] else []
            except ValueError:
                roles = []
            departmentMatch = False
            for role in roles:
                departmentRatio = fuzz.token_set_ratio(role['department'], row['department'])
                # >= so that the "less than" warning below is accurate
                if departmentRatio >= departmentTestRatio:
                    departmentMatch = True
                    output += ' ' + str(departmentRatio) + ' ' + row['department']
            if not departmentMatch:
                output += ' WARNING: ' + row['department'] + ' less than ' + str(departmentTestRatio) + '% match to any dept.'
        print(output)
        employee['orcid'] = row['orcid']
        # We only care about the first good match to an ORCID record, kill the loop after that
        break

filename = 'bsci-employees-with-orcid.csv'
# write as UTF-8 to match the encoding the input files were read with;
# the platform default encoding may not be able to represent every name
with open(filename, 'w', newline='', encoding='utf-8') as csvFileObject:
    fieldnames = ['name', 'degree', 'category', 'orcid', 'role']
    writer = csv.DictWriter(csvFileObject, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(bsciEmployees)

# Download Vanderbilt people data from Wikidata

Developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/wikidata/download-vanderbilt-people.py

Performs a SPARQL query to find people in Wikidata that are employed at Vanderbilt.  Returns name, description, start date, end date, and ORCID if it has them.  Output to vanderbilt_wikidata.csv

In [None]:
import requests   # best library to manage HTTP transactions
import json
import csv

# function to write results to a file
def writeCsv(fileName, array):
    """Write a table to a CSV file, replacing the file if it already exists.

    fileName: path of the UTF-8 encoded CSV file to create.
    array: iterable of rows; each row is an iterable of cell values.
    """
    # "with" closes the file on every exit path, including a failed row write
    with open(fileName, 'w', newline='', encoding='utf-8') as fileObject:
        csv.writer(fileObject).writerows(array)

endpointUrl = 'https://query.wikidata.org/sparql'
# P108 is "employer" and Q29052 is the employer searched for; the start date,
# end date, ORCID, English label and English description are all optional
query = '''select distinct  ?person ?name ?orcid ?startDate ?endDate ?description where {
  ?person p:P108 ?statement.
  ?statement ps:P108  wd:Q29052.
  optional{
    ?person rdfs:label ?name.
    FILTER(lang(?name)="en")
    }
  optional{?statement pq:P580 ?startDate.}
  optional{?statement pq:P582 ?endDate.}
  optional{?person wdt:P496 ?orcid.}
  optional{
    ?person schema:description ?description.
    FILTER(lang(?description)="en")
          }
  }'''

# The endpoint defaults to returning XML, so the Accept: header is required
r = requests.get(endpointUrl, params={'query' : query}, headers={'Accept' : 'application/json'})

data = r.json()
print(json.dumps(data,indent = 2))

# the optional query variables, in the order of their output columns
optionalVariables = ['name', 'description', 'startDate', 'endDate', 'orcid']
table = [['wikidataIri'] + optionalVariables]
items = data['results']['bindings']
for item in items:
    outputRow = [item['person']['value']]
    # a variable that was not bound in this result becomes an empty cell
    for variable in optionalVariables:
        outputRow.append(item[variable]['value'] if variable in item else '')
    table.append(outputRow)

fileName = 'vanderbilt_wikidata.csv'
writeCsv(fileName, table)

# Download Vanderbilt people's altLabels from Wikidata

Developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/wikidata/download-vanderbilt-people-altlabels.py

Similar to previous query, except when there is a match, it downloads the altLabels.

In [None]:
import requests   # best library to manage HTTP transactions
import json
import csv

# function to write results to a file
def writeCsv(fileName, array):
    """Write a table to a CSV file, replacing the file if it already exists.

    fileName: path of the UTF-8 encoded CSV file to create.
    array: iterable of rows; each row is an iterable of cell values.
    """
    # the file is closed automatically, even when a row cannot be written
    with open(fileName, 'w', newline='', encoding='utf-8') as fileObject:
        writerObject = csv.writer(fileObject)
        writerObject.writerows(array)

endpointUrl = 'https://query.wikidata.org/sparql'
# same employer pattern as the people query, but returning the English altLabels
query = '''select distinct  ?person ?altLabel where {
  ?person p:P108 ?statement.
  ?statement ps:P108  wd:Q29052.
  ?person skos:altLabel ?altLabel.
  FILTER(lang(?altLabel)="en")
}'''

# The endpoint defaults to returning XML, so the Accept: header is required
r = requests.get(endpointUrl, params={'query' : query}, headers={'Accept' : 'application/json'})

data = r.json()
print(json.dumps(data,indent = 2))

# header row followed by one row per (person, altLabel) result
table = [['wikidataIri', 'altLabel']]
items = data['results']['bindings']
table.extend(
    [item['person']['value'], item['altLabel']['value'] if 'altLabel' in item else '']
    for item in items
)

fileName = 'vanderbilt_wikidata_altlabels.csv'
writeCsv(fileName, table)


# Match BSCI people to Wikidata

Script developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/wikidata/match_bsci_wikidata.ipynb

Attempts to match records of people Wikidata knows to work at Vanderbilt with departmental people by matching their ORCIDs, then name strings.

Finally, it attempts to do a SPARQL query on name string variants.

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import requests
import csv
import json
from time import sleep

def readDict(filename):
    """Read a UTF-8 CSV file with a header row into a list of dictionaries.

    filename: path of the CSV file to read.
    Returns one dictionary per data row, keyed by the column headers.
    """
    # "with" guarantees the file is closed even if reading a row fails
    with open(filename, 'r', newline='', encoding='utf-8') as fileObject:
        dictObject = csv.DictReader(fileObject)
        return list(dictObject)

def extractQNumber(iri):
    """Return the fifth slash-separated segment of iri.

    For IRIs patterned like http://www.wikidata.org/entity/Q6386232 that
    segment is the Q identifier (here Q6386232).
    """
    return iri.split('/')[4]

# query for a single variable that's an item named 'item'
# returns a list of results
def searchWikidataForQId(query):
    """Run a SPARQL query at the Wikidata endpoint and return the Q IDs bound to ?item.

    query: SPARQL query text whose results bind a variable named 'item'.
    Returns a list of Q identifier strings, one per result. If the response
    cannot be interpreted, returns a one-item list holding the raw response
    text instead.
    """
    endpointUrl = 'https://query.wikidata.org/sparql'
    requestHeaderDictionary = {
        'Accept' : 'application/json',
        'User-Agent': 'BaskaufScraper/0.1 (steve.baskauf@vanderbilt.edu)'
    }
    r = requests.get(endpointUrl, params={'query' : query}, headers=requestHeaderDictionary)
    try:
        statements = r.json()['results']['bindings']
        results = [extractQNumber(statement['item']['value']) for statement in statements]
    # Only failures to parse or navigate the response are handled here, so that
    # KeyboardInterrupt and SystemExit are no longer swallowed.
    except (ValueError, KeyError, IndexError, TypeError):
        results = [r.text]
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return results

def generateNameAlternatives(name):
    """Generate the distinct spelling variants of a personal name.

    Periods are removed, then the name is split on whitespace. For a name of
    two or more words the variants combine the full first name or its initial,
    middle names reduced to initials (with and without periods), and the
    unabbreviated last word.

    name: the name string, e.g. 'Steven J. Baskauf'.
    Returns a list of unique variants in a fixed, repeatable order. A
    single-word name yields just that word; a blank name yields an empty list.
    """
    # get rid of periods, then split on any run of whitespace so that double,
    # leading, or trailing spaces cannot create empty name pieces
    pieces = name.replace('.', '').split()
    if not pieces:
        return []
    if len(pieces) == 1:
        # a single word has no separate first and last parts to abbreviate
        return pieces

    # generate initials for all names
    initials = [piece[0] for piece in pieces]
    firstName = pieces[0]
    lastName = pieces[-1]
    middleInitials = initials[1:-1]
    leadingInitials = initials[:-1]

    alternatives = [
        # full name
        ' '.join(pieces),
        # first and last name with initials
        ' '.join([firstName] + middleInitials + [lastName]),
        # first and last name with initials and periods
        ' '.join([firstName] + [initial + '.' for initial in middleInitials] + [lastName]),
        # first and last name only
        firstName + ' ' + lastName,
        # first initial and last name only
        initials[0] + ' ' + lastName,
        # first initial with period and last name only
        initials[0] + '. ' + lastName,
        # all name initials with last name
        ' '.join(leadingInitials + [lastName]),
        # all name initials with periods with last name
        ' '.join([initial + '.' for initial in leadingInitials] + [lastName]),
        # all name initials concatenated with last name
        ''.join(leadingInitials) + ' ' + lastName,
    ]

    # remove duplicates while keeping first-seen order, so the result (and any
    # query built from it) is the same on every run
    return list(dict.fromkeys(alternatives))

def searchNameAtWikidata(name):
    """Search Wikidata for items whose English label or alias is a variant of name.

    name: the person's name; its variants come from generateNameAlternatives().
    Returns whatever searchWikidataForQId() returns for the generated query.
    """
    alternatives = ''
    for alternative in generateNameAlternatives(name):
        # Escape backslashes and double quotes so that a name containing them
        # cannot terminate the SPARQL string literal and corrupt the query.
        escapedAlternative = alternative.replace('\\', '\\\\').replace('"', '\\"')
        alternatives += '"' + escapedAlternative + '"@en\n'
    query = '''
select distinct ?item where {
  VALUES ?value
  {
  ''' + alternatives + '''}
?item rdfs:label|skos:altLabel ?value.
  }
'''
    return searchWikidataForQId(query)

filename = 'bsci-employees-with-orcid.csv'
bsciEmployees = readDict(filename)

filename = 'vanderbilt_wikidata.csv'
wikidataData = readDict(filename)

# minimum fuzzy-match score (0-100) for a name match
testRatio = 90
departmentTestRatio = 90

for employee in bsciEmployees:
    # 0=unmatched
    # 1=matched with ORCID in both sources
    # 2=ORCID in BSCI but name match to Wikidata (no ORCID)
    # 3=no ORCID in BSCI but name match to Wikidata (with ORCID); could happen if affiliation isn't matched in ORCID
    # 4=no ORCID in BSCI but name match to Wikidata (no ORCID)
    # 5=ORCID in BSCI and found via SPARQL ORCID search (likely non-VU affiliated in Wikidata)
    # 6=ORCID in BSCI and found via SPARQL name search (non-VU affiliated without ORCID)
    # 7=no ORCID in BSCI, no name match
    # 8=ORCID in BSCI, error in SPARQL ORCID search
    # 9=no ORCID in BSCI, error in SPARQL name search
    matchStatus = 0
    for row in wikidataData:
        # We know the employee has an ORCID, so try to match it
        if employee['orcid'] != '':
            # There's a match, hooray!
            if employee['orcid'] == row['orcid']:
                print('orcid match: ', row['name'] + ' ' + row['orcid'])
                matchStatus = 1
                employee['wikidataId'] = extractQNumber(row['wikidataIri'])
                # An ORCID match is definitive: stop here so that a later row
                # that merely has a similar name cannot overwrite it.
                break
            # No ORCID match - see if the name matches
            else:
                setRatio = fuzz.token_set_ratio(row['name'], employee['name'])
                if setRatio >= testRatio:
                    print('name match: ', str(setRatio) + ' ' + row['name'] + ' / ' + employee['name'] + ' BSCI:' + employee['orcid'])
                    matchStatus = 2
                    employee['wikidataId'] = extractQNumber(row['wikidataIri'])
        # As far as we know, the employee doesn't have an ORCID, so try to match the name
        else:
            setRatio = fuzz.token_set_ratio(row['name'], employee['name'])
            # We get a name match
            if setRatio >= testRatio:
                # For some reason, Wikidata has the ORCID, so grab it
                # NOTE(review): once the ORCID is adopted here, later rows take the
                # ORCID branch above, so a second row for the same person turns
                # status 3 into status 1 - confirm whether that is intended.
                if row['orcid'] != '':
                    print('name match: ', str(setRatio) + ' ' + row['name'] + ' / ' + employee['name'] + ' ORCID:' + row['orcid'])
                    employee['orcid'] = row['orcid']
                    matchStatus = 3
                # Wikidata doesn't have an ORCID
                else:
                    print('name match: ', str(setRatio) + ' ' + row['name'] + ' / ' + employee['name'] + ' WD description: ' + row['description'])
                    matchStatus = 4
                employee['wikidataId'] = extractQNumber(row['wikidataIri'])

    # We've gone all the way through the Wikidata rows without finding a match.
    # Do a last ditch attempt to try to find the person in Wikidata by doing a SPARQL search for their ORCID
    if matchStatus == 0 and employee['orcid'] != '':
        query = '''
select distinct ?item where {
  ?item wdt:P496 "''' + employee['orcid'] + '''".
  }
'''
        results = searchWikidataForQId(query)
        if len(results) > 0:
            print('SPARQL ORCID search: ', employee['name'], results)
            if len(results) == 1:
                # a failed search returns the error message text, which is
                # longer than any Q identifier
                if len(results[0]) > 15:
                    matchStatus = 8
                    print('Error message in ORCID search')
                else:
                    matchStatus = 5
                    employee['wikidataId'] = results[0]
            else:
                print('ERROR: multiple results for same ORCID')

    # Record the outcome; previously the status was computed but never stored,
    # so the wikidataStatus column of the output was always empty.
    employee.setdefault('wikidataId', '')
    employee['wikidataStatus'] = str(matchStatus)
print('done')

filename = 'bsci-employees-with-wikidata.csv'
# write as UTF-8 to match the encoding the input files were read with
with open(filename, 'w', newline='', encoding='utf-8') as csvFileObject:
    fieldnames = ['wikidataId', 'name', 'degree', 'category', 'orcid', 'wikidataStatus', 'role']
    writer = csv.DictWriter(csvFileObject, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(bsciEmployees)

# Crosscheck people against publications

Developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/crosscheck-publications.ipynb

Checks possible Wikidata records against publications in CrossRef and PubMed to see if the author metadata will disambiguate the Wikidata record.

Note: this needs to be integrated with the previous script.

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import requests
import csv
import json
import xml.etree.ElementTree as et
from time import sleep
import urllib

# every request should identify the calling tool and give a contact email address
# see https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
toolName = 'testApiScript' # give your application a name here
emailAddress = 'steve.baskauf@vanderbilt.edu' # put your email address here

# endpoint and HTTP headers used for the SPARQL and CrossRef requests below
acceptMediaType = 'application/json'
userAgentHeader = 'BaskaufScraper/0.1 (mailto:steve.baskauf@vanderbilt.edu)'
requestHeaderDictionary = dict([
    ('Accept', acceptMediaType),
    ('User-Agent', userAgentHeader),
])
wikidataEndpointUrl = 'https://query.wikidata.org/sparql'

# returns a dictionary of various descriptors of the item with Wikidata ID qId
# P106 is occupation, schema:description is filtered to be the English description
def searchWikidataDescription(qId):
    """Look up the English description, ORCID, and occupations of a Wikidata item.

    qId: Wikidata Q identifier of the item, e.g. 'Q6386232'.
    Returns a dictionary with keys 'description' (string), 'orcid' (string)
    and 'occupation' (list of strings); missing values are '' or an empty
    list. The dictionary is empty when the query returns no results, and is
    {'error': <response text>} when the response cannot be interpreted.
    """
    resultsDict = {}
    query = '''select distinct ?description ?orcid ?occupation where {
        optional {
            wd:'''+ qId + ''' schema:description ?description.
            FILTER(lang(?description) = 'en')
            }
        optional {
            wd:'''+ qId + ''' wdt:P106 ?occupationId.
            ?occupationId rdfs:label ?occupation.
            FILTER(lang(?occupation) = 'en')            
            }
        optional {wd:'''+ qId + ''' wdt:P496 ?orcid.}
      }'''
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=requestHeaderDictionary)
    try:
        statements = r.json()['results']['bindings']
        if len(statements) > 0: # if no results, the dictionary remains empty
            firstStatement = statements[0]
            # Only a single description per language is allowed, so there should only be one description
            resultsDict['description'] = firstStatement['description']['value'] if 'description' in firstStatement else ''

            # Only a single ORCID is allowed, so there should only be one orcid value
            resultsDict['orcid'] = firstStatement['orcid']['value'] if 'orcid' in firstStatement else ''

            # if there are multiple statements, that's because there are more than one occupation
            resultsDict['occupation'] = [
                statement['occupation']['value']
                for statement in statements
                if 'occupation' in statement
            ]
    # Only failures to parse or navigate the response are handled, so that
    # KeyboardInterrupt and SystemExit are no longer swallowed.
    except (ValueError, KeyError, TypeError):
        resultsDict = {'error': r.text}
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return resultsDict

# returns a list of employer strings for the item with Wikidata ID qId; P108 is employer
def searchWikidataEmployer(qId):
    """Look up the English labels of the employers of a Wikidata item.

    qId: Wikidata Q identifier of the item, e.g. 'Q6386232'.
    Returns a list of employer label strings (empty when there are none). If
    the response cannot be interpreted, returns a one-item list holding the
    raw response text instead.
    """
    query = '''select distinct ?employer where {
        wd:'''+ qId + ''' wdt:P108 ?employerId.
        ?employerId rdfs:label ?employer.
        FILTER(lang(?employer) = 'en')
      }'''
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=requestHeaderDictionary)
    try:
        statements = r.json()['results']['bindings']
        # if no results, the list remains empty
        resultsList = [statement['employer']['value'] for statement in statements]
    # Only failures to parse or navigate the response are handled, so that
    # KeyboardInterrupt and SystemExit are no longer swallowed.
    except (ValueError, KeyError, TypeError):
        resultsList = [r.text]
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return resultsList

# returns a list of results of articles by person with Wikidata ID qId
def searchWikidataArticle(qId):
    """Look up the articles that list a Wikidata item as their author.

    qId: Wikidata Q identifier of the author, e.g. 'Q6386232'.
    Returns a list of dictionaries with keys 'title', 'pmid' and 'doi'; a
    value is '' when Wikidata does not have it. Each title found is also
    printed. If the response cannot be interpreted, returns a one-item list
    holding the raw response text instead.
    """
    resultsList = []
    # P50 is "author"; P698 is the PubMed ID of the article; P356 is the DOI of the article
    query = '''select distinct ?title ?doi ?pmid where {
      ?article wdt:P50 wd:''' + qId + '''.
      optional {
          ?article rdfs:label ?title.
          FILTER(lang(?title) = 'en')
          }
      optional {?article wdt:P698 ?pmid.}
      optional {?article wdt:P356 ?doi.}
      }'''
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=requestHeaderDictionary)
    try:
        statements = r.json()['results']['bindings']
        for statement in statements:
            if 'title' in statement:
                title = statement['title']['value']
                print('title=',title)
            else:
                title = ''
            pmid = statement['pmid']['value'] if 'pmid' in statement else ''
            doi = statement['doi']['value'] if 'doi' in statement else ''
            resultsList.append({'title': title, 'pmid': pmid, 'doi': doi})
    # Only failures to parse or navigate the response are handled, so that
    # KeyboardInterrupt and SystemExit are no longer swallowed.
    except (ValueError, KeyError, TypeError):
        resultsList = [r.text]
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return resultsList

def retrievePubMedData(pmid):
    """Fetch a PubMed record and return the name and affiliation of each author.

    pmid: PubMed ID of the article.
    Prints the article title, then returns a list with one dictionary per
    <Author> element, with keys 'affiliation', 'surname' and 'forename'; a
    value is '' when the corresponding XML element is absent.
    """
    # Text of the first element matching path under parent, or '' when there is
    # no such element. Compare with None explicitly: an Element without
    # children is falsy even though it exists.
    def elementText(parent, path):
        element = parent.find(path)
        return element.text if element is not None else ''

    fetchUrl = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    paramDict = {
        'tool': toolName,
        'email': emailAddress,
        'db': 'pubmed',
        'rettype': 'abstract',
        'id': pmid
    }
    response = requests.get(fetchUrl, params=paramDict)
    pubData = response.text  # the response text is XML

    # process the returned XML, see https://docs.python.org/3/library/xml.etree.elementtree.html
    root = et.fromstring(pubData)
    title = elementText(root, './/ArticleTitle')
    try:
        print(title)
    except UnicodeEncodeError:
        # the console cannot display some character of the title
        print('')

    affiliations = []
    for name in root.findall('.//Author'):
        affiliations.append({
            'affiliation': elementText(name, './AffiliationInfo/Affiliation'),
            'surname': elementText(name, './LastName'),
            'forename': elementText(name, './ForeName')
        })

    # See https://www.ncbi.nlm.nih.gov/books/NBK25497/ for usage guidelines.
    # An API key is required for more than 3 requests per second.
    sleep(0.5) # wait half a second before hitting the API again to avoid getting blocked
    return affiliations

def retrieveCrossRefDoi(doi):
    """Look up a DOI at the CrossRef API and return the work's authors.

    Parameters:
        doi: the DOI string; it is URL-quoted and appended to the CrossRef
            ``works/`` endpoint URL.

    Returns:
        On success, a list with one dict per author having the keys 'orcid',
        'givenName', 'familyName' (each '' when absent) and 'affiliation'
        (a list of affiliation names, empty when none are given).  The list
        is empty when the record has no 'author' entry.

        On failure, a single-item list carrying the problem payload: the raw
        response text when the body is not JSON, or the decoded JSON when it
        does not have the expected structure.
    """
    crossRefEndpointUrl = 'https://api.crossref.org/works/'
    encodedDoi = urllib.parse.quote(doi)
    searchUrl = crossRefEndpointUrl + encodedDoi
    response = requests.get(searchUrl, headers=requestHeaderDictionary)

    try:
        data = response.json()
    except ValueError:
        # The body was not JSON, so there is no decoded data to hand back;
        # return the raw text instead of referencing an unbound variable.
        return [response.text]
    #print(json.dumps(data, indent = 2))

    authorList = []
    try:
        message = data['message']
        authors = message['author'] if 'author' in message else []
        for author in authors:
            authorDict = {
                'orcid': author.get('ORCID', ''),
                'givenName': author.get('given', ''),
                'familyName': author.get('family', ''),
                # if there aren't any affiliations, the list will remain empty
                'affiliation': [affiliation['name'] for affiliation in author.get('affiliation', [])],
            }
            authorList.append(authorDict)
    except (KeyError, TypeError, AttributeError):
        # unexpected structure: return the decoded response for inspection
        return [data]
    return authorList

Body of search

In [None]:
# Work-in-progress driver cell: for every employee record, try a Wikidata name
# search and record a status code, then run a trial identity check of the
# hard-coded candidate Wikidata IDs against the hard-coded test author.
filename = 'bsci-employees-with-wikidata.csv'
bsciEmployees = readDict(filename)

for employeeIndex, employee in enumerate(bsciEmployees):

    # ************ left off here
    # try a name search as a last resort
    # NOTE: There is a significant number of cases where the person has an ORCID and is in Wikidata
    # but we don't know their ORCID because their affiliation didn't get into Wikidata or ORCID
    # These cases will need to be checked manually against publications to make sure they are
    # the right people.
    results = searchNameAtWikidata(employee['name'])
    if len(results) > 0:
        print('SPARQL name search: ', employee['name'], results)
        if len(results) == 1:
            # check for error message in results
            # (a single result longer than 15 characters is treated as an error message)
            if len(results[0]) > 15:
                matchStatus = 9
                print('Error message in name search')
            else:
                matchStatus = 6
        # NOTE(review): matchStatus is not assigned when the search returns more
        # than one result, so the assignment below fails on the first employee
        # or reuses the previous employee's value -- decide on a status code
        # for the multiple-hit case.
    else:
        print('No Wikidata match: ', employee['name'])
        matchStatus = 7
    employee['wikidataStatus'] = str(matchStatus)
    # ************************

    # Hard-coded trial values for the identity check below.
    #qIds = ["Q21503132", "Q45530486", "Q45579795", "Q45579952", "Q45580596", "Q45631936", "Q56480357", "Q57416670", "Q57550074", "Q59553435", "Q70150244"]
    #qIds = ["Q16910840", "Q64091655", "Q66741850", "Q67221376"]
    qIds = ["Q64091698"]
    testString = 'Biological Sciences Vanderbilt'
    testEmployer = 'Vanderbilt University'
    #testAuthor = 'Peng Xu'
    #testAuthor = 'Thomas Clements'
    testAuthor = 'Larisa DeSantis'
    #testOrcid = '0000-0001-7103-3692'
    testOrcid = ''

    print('Checking identities for ', testAuthor)
    if testOrcid == '':
        print('(no ORCID)')
    else:
        print('ORCID: ', testOrcid)
    print()
    for qIdIndex, qId in enumerate(qIds):
        print(qIdIndex, 'Wikidata ID: ', qId)
        descriptors = searchWikidataDescription(qId)
        employers = searchWikidataEmployer(qId)
        #print(descriptors)
        if descriptors != {}:
            if descriptors['description'] != '':
                print('description: ', descriptors['description'])
            for occupation in descriptors['occupation']:
                print('occupation: ', occupation)
            for employer in employers:
                print('employer: ', employer)
            if descriptors['orcid'] != '':
                if testOrcid == '':
                    # **** NOTE: if the person has an ORCID, it may be possible to find articles via ORCID
                    # that aren't linked in Wikidata. Not sure if this happens often enough to handle it
                    print('ORCID: ', descriptors['orcid'])
                else:
                    # This should always be true if the SPARQL query for ORCID was already done
                    if testOrcid != descriptors['orcid']:
                        print('*** NOT the same person; ORCID ' + descriptors['orcid'] + ' does not match.')
                        # don't continue the loop (look up references) since it's definitely not a match
                        # NOTE(review): this also skips any remaining candidate IDs -- confirm intended
                        break
                    else:
                        print('*** An ORCID match! How did it get missed in the earlier SPARQL query?')
                        break
        else:
            print('No description or occupation given.')

        result = searchWikidataArticle(qId)
        if len(result) == 0:
            print('No articles authored by that person')
        else:
            foundMatch = False
            for article in result:
                print('Checking article: ', article['title'])
                if article['pmid'] == '':
                    print('No PubMed ID')
                else:
                    print('Checking authors in PubMed article: ', article['pmid'])
                    pubMedAuthors = retrievePubMedData(article['pmid'])
                    for author in pubMedAuthors:
                        # compare the author's surname with the full test name
                        nameTestRatio = fuzz.token_set_ratio(author['surname'], testAuthor)
                        print(nameTestRatio, author['surname'])
                        if nameTestRatio >= 90:
                            if author['affiliation'] != '':
                                setRatio = fuzz.token_set_ratio(testString, author['affiliation'])
                                print('Affiliation test: ', setRatio, author['affiliation'])
                                if setRatio >= 90:
                                    foundMatch = True
                            else:
                                break # give up on this article because no affiliation string
                # Don't look up the DOI if it's already found a match with PubMed
                if foundMatch:
                    break # stop checking articles after one has matched
                else:
                    if article['doi'] == '':
                        print('No DOI')
                    else:
                        print('Checking authors in DOI article: ', article['doi'])
                        doiAuthors = retrieveCrossRefDoi(article['doi'])
                        for author in doiAuthors:
                            nameTestRatio = fuzz.token_set_ratio(author['familyName'], testAuthor)
                            print(nameTestRatio, author['familyName'])
                            if nameTestRatio >= 90:
                                if author['orcid'] != '':
                                    if testOrcid == '':
                                        print('ORCID from article: ', author['orcid'])
                                    else:
                                        if testOrcid != author['orcid']:
                                            print('*** NOT the same person; ORCID ' + author['orcid'] + ' does not match.')
                                            break # stop checking this article's authors since it's definitely not a match
                                        else:
                                            print('*** An ORCID match!')
                                            foundMatch = True
                                            break

                                if len(author['affiliation']) > 0:
                                    for affiliation in author['affiliation']:
                                        setRatio = fuzz.token_set_ratio(testString, affiliation)
                                        print('Affiliation test: ', setRatio, affiliation)
                                        if setRatio >= 90:
                                            foundMatch = True
                                #else:
                                #    break # give up on this article because no affiliation string
            if foundMatch:
                print('***', qId, ' has a match.')
            else:
                print('No match found.')
        print()