In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import requests
import csv
import json
from time import sleep

def readDict(filename):
    """Read a CSV file that has a header row into a list of row dictionaries.

    Args:
        filename: path of the CSV file; it is decoded as UTF-8.

    Returns:
        A list with one dictionary per data row, keyed by the column headers.
    """
    # The context manager closes the file even if parsing raises part-way through.
    with open(filename, 'r', newline='', encoding='utf-8') as fileObject:
        return list(csv.DictReader(fileObject))

def extractQNumber(iri):
    """Return the fifth '/'-separated segment of an IRI.

    For an entity IRI such as http://www.wikidata.org/entity/Q6386232 the
    segments are 'http:', '', 'www.wikidata.org', 'entity', 'Q6386232', so the
    value returned is the Q identifier. Raises IndexError when the IRI has
    fewer than five segments.
    """
    return iri.split('/')[4]

# query for a single variable that's an item named 'item'
# returns a list of results
def searchWikidataForQId(query):
    """Run a SPARQL query at the Wikidata Query Service and return the Q IDs found.

    Args:
        query: SPARQL SELECT query whose result variable is named ``item`` and
            is bound to entity IRIs.

    Returns:
        A list of Q identifiers, one per result binding. If the response cannot
        be parsed as the expected JSON structure, a single-element list holding
        the raw response text is returned instead.
    """
    endpointUrl = 'https://query.wikidata.org/sparql'
    requestHeaderDictionary = {
        'Accept': 'application/json',
        'User-Agent': 'BaskaufScraper/0.1 (steve.baskauf@vanderbilt.edu)',
    }
    r = requests.get(endpointUrl, params={'query': query}, headers=requestHeaderDictionary)
    try:
        statements = r.json()['results']['bindings']
        results = [extractQNumber(statement['item']['value']) for statement in statements]
    # Only parse/shape problems trigger the fallback: a non-JSON body (ValueError),
    # missing keys, or values of an unexpected type. A bare except would also have
    # swallowed KeyboardInterrupt and SystemExit.
    except (ValueError, KeyError, IndexError, TypeError, AttributeError):
        results = [r.text]
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return results

def generateNameAlternatives(name):
    """Generate spelling variants of a personal name for label matching.

    Periods are removed and the name is split on whitespace. The first piece is
    treated as the given name, the last piece as the surname, and anything in
    between as middle names. Variants combine full pieces and initials, with
    and without periods.

    Args:
        name: the name as a single string, e.g. 'John Q. Adams'.

    Returns:
        A list of distinct variants in the order they were generated. A blank
        name gives an empty list; a single-word name gives just that word,
        because the first/last-name combinations are meaningless for it.
    """
    # split() with no argument collapses runs of whitespace and ignores leading
    # and trailing blanks, so stray spaces cannot create empty name pieces
    pieces = name.replace('.', '').split()
    if not pieces:
        return []
    if len(pieces) == 1:
        return [pieces[0]]

    first = pieces[0]
    last = pieces[-1]
    initials = [piece[0] for piece in pieces]
    middleInitials = initials[1:-1]
    # initials of every piece except the surname
    leadingInitials = initials[:-1]

    alternatives = [
        # full name
        ' '.join(pieces),
        # first and last name with middle initials
        ' '.join([first] + middleInitials + [last]),
        # first and last name with middle initials and periods
        ' '.join([first] + [initial + '.' for initial in middleInitials] + [last]),
        # first and last name only
        first + ' ' + last,
        # first initial and last name only
        initials[0] + ' ' + last,
        # first initial with period and last name only
        initials[0] + '. ' + last,
        # all name initials with last name
        ' '.join(leadingInitials + [last]),
        # all name initials with periods with last name
        ' '.join([initial + '.' for initial in leadingInitials] + [last]),
        # all name initials concatenated with last name
        ''.join(leadingInitials) + ' ' + last,
    ]

    # remove duplicates while keeping a stable, first-seen order
    return list(dict.fromkeys(alternatives))

def searchNameAtWikidata(name):
    """Search Wikidata for items whose English label or alias matches a name.

    Every variant produced by generateNameAlternatives() is placed in a SPARQL
    VALUES block as an English-tagged literal and matched against rdfs:label
    and skos:altLabel.

    Args:
        name: the person's name as a single string.

    Returns:
        Whatever searchWikidataForQId() returns for the generated query.
    """
    nameList = generateNameAlternatives(name)
    alternatives = ''
    for alternative in nameList:
        # Escape backslashes first, then double quotes, so a name containing
        # either character still forms a valid SPARQL string literal.
        escaped = alternative.replace('\\', '\\\\').replace('"', '\\"')
        alternatives += '"' + escaped + '"@en\n'
    query = '''
select distinct ?item where {
  VALUES ?value
  {
  ''' + alternatives + '''}
?item rdfs:label|skos:altLabel ?value.
  }
'''
    #print(query)
    print('searching for ', name)
    results = searchWikidataForQId(query)
    return results

In [None]:
# Load the BSCI employee records: one dictionary per CSV row, keyed by column header.
filename = 'bsci-employees-with-orcid.csv'
bsciEmployees = readDict(filename)

In [None]:
# Load the Wikidata records to match against: one dictionary per CSV row, keyed by column header.
filename = 'vanderbilt_wikidata.csv'
wikidataData = readDict(filename)

In [None]:
# Minimum fuzz.token_set_ratio score for two names to be treated as the same person
testRatio = 90
departmentTestRatio = 90  # not referenced in this cell

# Match every BSCI employee against the downloaded Wikidata records, falling back
# to live SPARQL searches when nothing matches. The outcome is stored on each
# employee record as 'wikidataStatus' (and 'wikidataId' when an item was chosen).
#
# Status codes:
# 0=unmatched (also left in place when a SPARQL search returns nothing for an ORCID
#   or returns more than one candidate)
# 1=matched with ORCID in both sources
# 2=ORCID in BSCI but name match to Wikidata (no ORCID match)
# 3=no ORCID in BSCI but name match to Wikidata (with ORCID); could happen if affiliation isn't matched in ORCID
# 4=no ORCID in BSCI but name match to Wikidata (no ORCID)
# 5=ORCID in BSCI and found via SPARQL ORCID search (likely non-VU affiliated in Wikidata)
# 6=no ORCID in BSCI, single item found via SPARQL name search (needs manual checking)
# 7=no ORCID in BSCI, no name match
# 8=ORCID in BSCI, error in SPARQL ORCID search
# 9=no ORCID in BSCI, error in SPARQL name search
for employee in bsciEmployees:
    matchStatus = 0
    for row in wikidataData:
        # We know the employee has an ORCID, so try to match it
        if employee['orcid'] != '':
            # There's a match, hooray!
            if employee['orcid'] == row['orcid']:
                print('orcid match: ', row['name'] + ' ' + row['orcid'])
                matchStatus = 1
                employee['wikidataId'] = extractQNumber(row['wikidataIri'])
            # No ORCID match - see if the name matches
            else:
                setRatio = fuzz.token_set_ratio(row['name'], employee['name'])
                if setRatio >= testRatio:
                    print('name match: ', str(setRatio) + ' ' + row['name'] + ' / ' + employee['name'] + ' BSCI:' + employee['orcid'])
                    matchStatus = 2
                    employee['wikidataId'] = extractQNumber(row['wikidataIri'])
        # As far as we know, the employee doesn't have an ORCID, so try to match the name
        else:
            setRatio = fuzz.token_set_ratio(row['name'], employee['name'])
            # We get a name match
            if setRatio >= testRatio:
                # For some reason, Wikidata has the ORCID, so grab it
                # NOTE(review): once the ORCID is copied here, later rows in this
                # loop take the "has ORCID" branch above - confirm that is intended.
                if row['orcid'] != '':
                    print('name match: ', str(setRatio) + ' ' + row['name'] + ' / ' + employee['name'] + ' ORCID:' + row['orcid'])
                    employee['orcid'] = row['orcid']
                    matchStatus = 3
                # Wikidata doesn't have an ORCID
                else:
                    print('name match: ', str(setRatio) + ' ' + row['name'] + ' / ' + employee['name'] + ' WD description: ' + row['description'])
                    matchStatus = 4
                employee['wikidataId'] = extractQNumber(row['wikidataIri'])

    # We've gone all the way through the Wikidata records without finding a match.
    # Do a last ditch attempt to try to find the person in Wikidata by doing a SPARQL search for their ORCID
    if matchStatus == 0:
        if employee['orcid'] != '':
            query = '''
select distinct ?item where {
  ?item wdt:P496 "''' + employee['orcid'] + '''".
  }
'''
            results = searchWikidataForQId(query)
            if len(results) > 0:
                print('SPARQL ORCID search: ', employee['name'], results)
                if len(results) == 1:
                    # A failed search returns the raw error text as its only result;
                    # anything longer than 15 characters cannot be a Q ID.
                    if len(results[0]) > 15:
                        matchStatus = 8
                        print('Error message in ORCID search')
                    else:
                        matchStatus = 5
                        employee['wikidataId'] = results[0]
                else:
                    print('ERROR: multiple results for same ORCID')
        # try a name search as a last resort
        # NOTE: There is a significant number of cases where the person has an ORCID and is in Wikidata
        # but we don't know their ORCID because their affiliation didn't get into Wikidata or ORCID
        # These cases will need to be checked manually against publications to make sure they are
        # the right people.
        else:
            results = searchNameAtWikidata(employee['name'])
            if len(results) > 0:
                print('SPARQL name search: ', employee['name'], results)
                if len(results) == 1:
                    # check for error message in results
                    if len(results[0]) > 15:
                        matchStatus = 9
                        print('Error message in name search')
                    else:
                        matchStatus = 6
            else:
                print('No Wikidata match: ', employee['name'])
                matchStatus = 7

    # Record the outcome for every employee. Previously this sat inside the
    # "matchStatus == 0" branch, so statuses 1-4 were never written to the record.
    employee['wikidataStatus'] = str(matchStatus)

print('done')
print(bsciEmployees)

In [None]:
# Write the employee records, with their Wikidata match results, to a new CSV file.
filename = 'bsci-employees-with-wikidata.csv'
# Write as UTF-8, the same encoding the input CSVs are read with, so names with
# non-ASCII characters survive regardless of the platform's default encoding.
with open(filename, 'w', newline='', encoding='utf-8') as csvFileObject:
    fieldnames = ['wikidataId', 'name', 'degree', 'category', 'orcid', 'wikidataStatus', 'role']
    writer = csv.DictWriter(csvFileObject, fieldnames=fieldnames)
    writer.writeheader()
    # Keys missing from a record (e.g. no 'wikidataId') are written as empty cells.
    writer.writerows(bsciEmployees)
