In [56]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import requests
import csv
import json
from time import sleep

def readDict(filename):
    """Read a UTF-8 CSV file and return its rows as a list of dictionaries.

    Each row is keyed by the column names taken from the file's header line
    (standard ``csv.DictReader`` behaviour).

    Parameters:
        filename: path of the CSV file to read.

    Returns:
        A list with one dictionary per data row, in file order.
    """
    # The context manager guarantees the file is closed even when reading
    # fails part-way through (e.g. on a decoding error).
    with open(filename, 'r', newline='', encoding='utf-8') as fileObject:
        return list(csv.DictReader(fileObject))

def extractQNumber(iri):
    """Return the Q identifier at the end of a Wikidata entity IRI.

    The IRI is expected to look like
    http://www.wikidata.org/entity/Q6386232, so the identifier is the fifth
    '/'-separated segment (index 4). An IRI with fewer segments raises
    IndexError.
    """
    qNumberPosition = 4
    segments = iri.split('/')
    return segments[qNumberPosition]

# query for a single variable that's an item named 'item'
# returns a list of results
def searchWikidataForQId(query):
    """Run a SPARQL query against the Wikidata endpoint and return Q IDs.

    Parameters:
        query: SPARQL query text that selects a single variable called
            ``item`` whose values are Wikidata entity IRIs.

    Returns:
        A list of Q identifiers (one per result binding). If the response
        cannot be parsed as the expected JSON structure, the list instead
        holds a single element: the raw response text, so callers can
        detect and report the error.
    """
    endpointUrl = 'https://query.wikidata.org/sparql'
    requestHeaderDictionary = {
        'Accept': 'application/json',
        'User-Agent': 'BaskaufScraper/0.1 (steve.baskauf@vanderbilt.edu)',
    }
    r = requests.get(endpointUrl, params={'query': query}, headers=requestHeaderDictionary)
    try:
        data = r.json()
        results = [
            extractQNumber(statement['item']['value'])
            for statement in data['results']['bindings']
        ]
    # Only catch failures caused by a malformed or unexpected response body;
    # a bare except would also swallow KeyboardInterrupt and SystemExit.
    except (ValueError, KeyError, IndexError, TypeError, AttributeError):
        results = [r.text]
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return results

def searchNameAtWikidata(name):
    """Search Wikidata for items whose English label or alias equals ``name``.

    Parameters:
        name: the exact name string to look for.

    Returns:
        Whatever ``searchWikidataForQId`` returns for the generated query
        (a list of results).
    """
    # The name is embedded in a double-quoted SPARQL string literal, so
    # backslashes, double quotes and line breaks must be escaped; otherwise
    # such a name would produce a malformed (or altered) query.
    escapedName = (
        name.replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )
    query = '''
select distinct ?item where {
  ?item rdfs:label|skos:altLabel "''' + escapedName + '''"@en.
  }
'''
    return searchWikidataForQId(query)

In [7]:
# Load the BSCI employee table; each CSV row becomes one dictionary.
# The matching cell below reads the 'name' and 'orcid' columns of these rows.
filename = 'bsci-employees-with-orcid.csv'
bsciEmployees = readDict(filename)

In [8]:
# Load the previously saved Wikidata records; each CSV row becomes one dictionary.
# The matching cell below reads the 'name', 'orcid', 'description' and
# 'wikidataIri' columns of these rows.
filename = 'vanderbilt_wikidata.csv'
wikidataData = readDict(filename)

In [61]:
# Minimum fuzzywuzzy token_set_ratio score (0-100) for two names to count as a match
testRatio = 90
departmentTestRatio = 90

# Try to link every BSCI employee to a Wikidata item: first against the
# downloaded Wikidata rows (by ORCID, then by fuzzy name match), and if that
# fails, by querying the Wikidata SPARQL endpoint. The outcome is stored on
# each employee dictionary under 'wikidataId' (when found) and
# 'wikidataStatus' (always).
for employee in bsciEmployees:
    # 0=unmatched
    # 1=matched with ORCID in both sources
    # 2=ORCID in BSCI but name match to Wikidata (no ORCID)
    # 3=no ORCID in BSCI but name match to Wikidata (with ORCID); could happen if affiliation isn't matched in ORCID
    # 4=no ORCID in BSCI but name match to Wikidata (no ORCID)
    # 5=ORCID in BSCI and found via SPARQL ORCID search (likely non-VU affiliated in Wikidata)
    # 6=ORCID in BSCI and found via SPARQL name search (non-VU affiliated without ORCID)
    # 7=no ORCID in BSCI, no name match
    # 8=ORCID in BSCI, error in SPARQL ORCID search
    # 9=no ORCID in BSCI, error in SPARQL name search
    matchStatus = 0
    for row in wikidataData:
        # We know the employee has an ORCID, so try to match it
        if employee['orcid'] != '':
            # There's a match, hooray!
            if employee['orcid'] == row['orcid']:
                print('orcid match: ', row['name'] + ' ' + row['orcid'])
                matchStatus = 1
                employee['wikidataId'] = extractQNumber(row['wikidataIri'])
                # An exact ORCID match is authoritative: stop scanning so a
                # later, weaker fuzzy name match cannot overwrite it.
                break
            # No ORCID match - see if the name matches
            setRatio = fuzz.token_set_ratio(row['name'], employee['name'])
            if setRatio >= testRatio:
                print('name match: ', str(setRatio) + ' ' + row['name'] + ' / ' + employee['name'] + ' BSCI:' + employee['orcid'])
                matchStatus = 2
                employee['wikidataId'] = extractQNumber(row['wikidataIri'])
        # As far as we know, the employee doesn't have an ORCID, so try to match the name
        else:
            setRatio = fuzz.token_set_ratio(row['name'], employee['name'])
            # We get a name match
            if setRatio >= testRatio:
                # For some reason, Wikidata has the ORCID, so grab it
                if row['orcid'] != '':
                    print('name match: ', str(setRatio) + ' ' + row['name'] + ' / ' + employee['name'] + ' ORCID:' + row['orcid'])
                    employee['orcid'] = row['orcid']
                    matchStatus = 3
                # Wikidata doesn't have an ORCID
                else:
                    print('name match: ', str(setRatio) + ' ' + row['name'] + ' / ' + employee['name'] + ' WD description: ' + row['description'])
                    matchStatus = 4
                employee['wikidataId'] = extractQNumber(row['wikidataIri'])

    # We've gone all the way through the downloaded Wikidata rows without finding a match.
    # Do a last ditch attempt to try to find the person in Wikidata by doing a SPARQL search for their ORCID
    if matchStatus == 0:
        if employee['orcid'] != '':
            query = '''
select distinct ?item where {
  ?item wdt:P496 "''' + employee['orcid'] + '''".
  }
'''
            results = searchWikidataForQId(query)
            if len(results) > 0:
                print('SPARQL ORCID search: ', employee['name'], results)
                if len(results) == 1:
                    # if the search fails it returns an error message, which is
                    # much longer than any Q identifier
                    if len(results[0]) > 15:
                        matchStatus = 8
                        print('Error message in ORCID search')
                    else:
                        matchStatus = 5
                        employee['wikidataId'] = results[0]
                else:
                    print('ERROR: multiple results for same ORCID')
        # try a name search as a last resort
        # NOTE: There is a significant number of cases where the person has an ORCID and is in Wikidata
        # but we don't know their ORCID because their affiliation didn't get into Wikidata or ORCID
        # These cases will need to be checked manually against publications to make sure they are
        # the right people.
        else:
            results = searchNameAtWikidata(employee['name'])
            if len(results) > 0:
                print('SPARQL name search: ', employee['name'], results)
                if len(results) == 1:
                    # check for error message in results
                    if len(results[0]) > 15:
                        matchStatus = 9
                        print('Error message in name search')
                    else:
                        matchStatus = 6
            else:
                print('No Wikidata match: ', employee['name'])
                matchStatus = 7

    # Record the status for every employee, including those matched against
    # the downloaded rows (statuses 1-4), not only those that needed SPARQL.
    employee['wikidataStatus'] = str(matchStatus)

print('done')
print(bsciEmployees)

SPARQL name search:  Patrick Abbot ['Q63512955']
SPARQL name search:  Seth Bordenstein ['Q45943775']
name match:  100 Kendal Broadie / Kendal S. Broadie WD description: American biologist
No Wikidata match:  Tony Capra
name match:  100 Kenneth C. Catania / Kenneth C. Catania WD description: American neuroscientist
orcid match:  Nicole Creanza 0000-0001-8821-7383
SPARQL name search:  Larisa DeSantis ['Q64091698']
orcid match:  Brandt F Eichman 0000-0002-0965-2297
No Wikidata match:  Katherine L. Friedman
No Wikidata match:  Daniel J. Funk
No Wikidata match:  Todd R. Graham
SPARQL ORCID search:  Julian F. Hillyer ['Q73061764']
SPARQL ORCID search:  Lauren Parker Jackson ['Q59535314']
name match:  100 Carl H. Johnson / Carl H. Johnson BSCI:0000-0003-2878-3193
name match:  100 Douglas G. McMahon / Douglas G. McMahon WD description: American biologist
No Wikidata match:  Maulik Patel
No Wikidata match:  James G. Patton
orcid match:  Lars Plate 0000-0003-4363-6116
orcid match:  Antonis Rokas

In [62]:
# Save the employee records, now annotated with 'wikidataId' and
# 'wikidataStatus', to a new CSV file. Keys missing from a record are written
# as empty cells (csv.DictWriter's default restval).
filename = 'bsci-employees-with-wikidata.csv'
# Write UTF-8 explicitly to match how the input files are read; relying on the
# platform default encoding can fail or garble names with non-ASCII characters.
with open(filename, 'w', newline='', encoding='utf-8') as csvFileObject:
    fieldnames = ['wikidataId', 'name', 'degree', 'category', 'orcid', 'wikidataStatus', 'role']
    writer = csv.DictWriter(csvFileObject, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(bsciEmployees)
