In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import requests
import csv
import json
import xml.etree.ElementTree as et
from time import sleep
import urllib

# Identification sent to NCBI: a tool name and an email address should accompany every request.
# See https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
emailAddress = 'steve.baskauf@vanderbilt.edu'  # replace with your own email address
toolName = 'testApiScript'  # replace with a name for your application

# Wikidata SPARQL endpoint and the HTTP headers sent with the SPARQL and CrossRef requests below.
wikidataEndpointUrl = 'https://query.wikidata.org/sparql'
acceptMediaType = 'application/json'
userAgentHeader = 'BaskaufScraper/0.1 (mailto:steve.baskauf@vanderbilt.edu)'
requestHeaderDictionary = {'Accept': acceptMediaType, 'User-Agent': userAgentHeader}

def searchWikidataDescription(qId):
    """Return descriptors of the Wikidata item with ID qId.

    The SPARQL query asks for the English schema:description, the English labels
    of the item's occupations (P106), and its ORCID iD (P496).

    Parameters:
        qId: Wikidata item identifier (e.g. 'Q21503132'); it is inserted
            directly into the query text after the 'wd:' prefix.

    Returns:
        A dictionary with one of these shapes:
        - {} when the endpoint returned no result rows;
        - {'description': str, 'orcid': str, 'occupation': list of str} where
          missing description/ORCID values are '' and the list may be empty;
        - {'error': <raw response text>} when the response was not JSON or did
          not have the expected results/bindings structure.

    Exceptions raised by requests.get itself (e.g. connection errors) are not
    caught here.
    """
    resultsDict = {}
    query = '''select distinct ?description ?orcid ?occupation where {
        optional {
            wd:'''+ qId + ''' schema:description ?description.
            FILTER(lang(?description) = 'en')
            }
        optional {
            wd:'''+ qId + ''' wdt:P106 ?occupationId.
            ?occupationId rdfs:label ?occupation.
            FILTER(lang(?occupation) = 'en')
            }
        optional {wd:'''+ qId + ''' wdt:P496 ?orcid.}
      }'''
    #print(query)
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=requestHeaderDictionary)
    try:
        statements = r.json()['results']['bindings']
        if len(statements) > 0: # if no results, the dictionary remains empty
            firstStatement = statements[0]
            # Only a single description per language is allowed, so there should only be one description
            if 'description' in firstStatement:
                resultsDict['description'] = firstStatement['description']['value']
            else:
                resultsDict['description'] = ''

            # Only a single ORCID is allowed, so there should only be one orcid value
            if 'orcid' in firstStatement:
                resultsDict['orcid'] = firstStatement['orcid']['value']
            else:
                resultsDict['orcid'] = ''

            # if there are multiple statements, that's because there are more than one occupation
            resultsDict['occupation'] = [
                statement['occupation']['value']
                for statement in statements
                if 'occupation' in statement
            ]
    except (ValueError, KeyError, TypeError, IndexError):
        # ValueError covers a body that is not JSON; the others cover JSON with an unexpected shape.
        # Catching only these lets KeyboardInterrupt and genuine programming errors propagate.
        resultsDict = {'error': r.text}
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return resultsDict

def searchWikidataEmployer(qId):
    """Return the English labels of the employers of the Wikidata item with ID qId.

    P108 is the Wikidata "employer" property.

    Parameters:
        qId: Wikidata item identifier (e.g. 'Q21503132'); it is inserted
            directly into the query text after the 'wd:' prefix.

    Returns:
        A list of employer label strings (empty when there are no results).
        When the response is not JSON or lacks the expected structure, the list
        instead holds a single element: the raw response text.

    Exceptions raised by requests.get itself (e.g. connection errors) are not
    caught here.
    """
    query = '''select distinct ?employer where {
        wd:'''+ qId + ''' wdt:P108 ?employerId.
        ?employerId rdfs:label ?employer.
        FILTER(lang(?employer) = 'en')
      }'''
    #print(query)
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=requestHeaderDictionary)
    try:
        statements = r.json()['results']['bindings']
        # if no results, the list remains empty
        resultsList = [statement['employer']['value'] for statement in statements]
    except (ValueError, KeyError, TypeError):
        # ValueError covers a body that is not JSON; the others cover JSON with an unexpected shape.
        resultsList = [r.text]
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return resultsList

def searchWikidataArticle(qId):
    """Return the articles whose author (P50) is the Wikidata item with ID qId.

    Parameters:
        qId: Wikidata item identifier (e.g. 'Q21503132'); it is inserted
            directly into the query text after the 'wd:' prefix.

    Returns:
        A list with one dictionary per result row, each with the keys 'title'
        (English label), 'pmid' (P698, PubMed ID) and 'doi' (P356); a value
        missing from the row is ''. When the response is not JSON or lacks the
        expected structure, the list instead holds a single element: the raw
        response text (a string, not a dictionary).

    Side effects:
        Prints 'title=' and the title for every row that has one.

    Exceptions raised by requests.get itself (e.g. connection errors) are not
    caught here.
    """
    resultsList = []
    # P50 is "author"; P698 is the PubMed ID of the article; P356 is the DOI of the article
    query = '''select distinct ?title ?doi ?pmid where {
      ?article wdt:P50 wd:''' + qId + '''.
      optional {
          ?article rdfs:label ?title.
          FILTER(lang(?title) = 'en')
          }
      optional {?article wdt:P698 ?pmid.}
      optional {?article wdt:P356 ?doi.}
      }'''
    #print(query)
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=requestHeaderDictionary)
    try:
        statements = r.json()['results']['bindings']
        for statement in statements:
            if 'title' in statement:
                title = statement['title']['value']
                print('title=',title)
            else:
                title = ''
            pmid = statement['pmid']['value'] if 'pmid' in statement else ''
            doi = statement['doi']['value'] if 'doi' in statement else ''
            resultsList.append({'title': title, 'pmid': pmid, 'doi': doi})
    except (ValueError, KeyError, TypeError):
        # ValueError covers a body that is not JSON; the others cover JSON with an unexpected shape.
        # Any partially built list is discarded in favor of the raw response text.
        resultsList = [r.text]
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return resultsList

def retrievePubMedData(pmid):
    """Fetch a PubMed record and return the name and affiliation of each author.

    Parameters:
        pmid: PubMed ID of the article, sent as the 'id' parameter to efetch.

    Returns:
        A list with one dictionary per <Author> element, with the keys
        'affiliation' (text of the first AffiliationInfo/Affiliation),
        'surname' (LastName) and 'forename' (ForeName). A missing or empty
        element yields ''. The list is empty when the record has no authors or
        when the response could not be parsed as XML.

    Side effects:
        Prints the article title (or an empty line), and prints a message when
        the response is not well-formed XML.

    Exceptions raised by requests.get itself (e.g. connection errors) are not
    caught here.
    """
    fetchUrl = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    paramDict = {
        'tool': toolName,
        'email': emailAddress,
        'db': 'pubmed',
         #'retmode': 'xml',
        'rettype': 'abstract',
        'id': pmid
    }
    response = requests.get(fetchUrl, params=paramDict)
    #print(response.url)
    pubData = response.text  # the response text is expected to be XML
    #print(pubData)  # uncomment this line to see the XML

    affiliations = []
    # process the returned XML, see https://docs.python.org/3/library/xml.etree.elementtree.html
    try:
        root = et.fromstring(pubData)
    except et.ParseError as parseError:
        # e.g. an error message returned instead of a record; report it and return no authors
        print('Could not parse PubMed response for ', pmid, ': ', parseError)
    else:
        titleElement = root.find('.//ArticleTitle')
        # itertext() also collects text nested inside inline markup within the title
        title = '' if titleElement is None else ''.join(titleElement.itertext())
        try:
            print(title)
        except UnicodeError:
            # the output stream could not encode the title
            print('')

        for name in root.findall('.//Author'):
            # findtext returns the default when the element is absent and '' when it has no text
            affiliations.append({
                'affiliation': name.findtext('./AffiliationInfo/Affiliation', default=''),
                'surname': name.findtext('./LastName', default=''),
                'forename': name.findtext('./ForeName', default='')
            })

    # See https://www.ncbi.nlm.nih.gov/books/NBK25497/ for usage guidelines.
    # An API key is required for more than 3 requests per second.
    sleep(0.5) # wait half a second before hitting the API again to avoid getting blocked
    return affiliations

def retrieveCrossRefDoi(doi):
    """Fetch a work from the CrossRef API by DOI and return its author records.

    Parameters:
        doi: the DOI of the work, e.g. '10.1111/jzo.12413'. It is URL-quoted
            and appended to the CrossRef 'works/' endpoint URL.

    Returns:
        A list with one dictionary per author, with the keys 'orcid',
        'givenName', 'familyName' (each '' when absent) and 'affiliation' (a
        list of affiliation name strings, possibly empty). The list is empty
        when the work has no 'author' entry.
        On failure the list holds a single element instead: the raw response
        text when the body is not JSON, or the decoded JSON when it does not
        have the expected structure.

    Exceptions raised by requests.get itself (e.g. connection errors) are not
    caught here.
    """
    # Import the submodule explicitly: a bare `import urllib` only exposes urllib.parse
    # if some other module happens to have imported it already.
    from urllib.parse import quote

    crossRefEndpointUrl = 'https://api.crossref.org/works/'
    searchUrl = crossRefEndpointUrl + quote(doi)
    response = requests.get(searchUrl, headers=requestHeaderDictionary)
    try:
        data = response.json()
    except ValueError:
        # The body is not JSON, so there is no decoded data to hand back;
        # return the raw text, as the Wikidata search functions do.
        return [response.text]
    #print(json.dumps(data, indent = 2))

    authorList = []
    try:
        message = data['message']
        if 'author' in message:
            for author in message['author']:
                authorList.append({
                    'orcid': author.get('ORCID', ''),
                    'givenName': author.get('given', ''),
                    'familyName': author.get('family', ''),
                    # if there aren't any affiliations, the list will remain empty
                    'affiliation': [affiliation['name'] for affiliation in author.get('affiliation', [])]
                })
    except (KeyError, TypeError, AttributeError):
        # JSON was returned but not in the expected shape; hand back what was received
        authorList = [data]
    return authorList

In [None]:
# Try the CrossRef lookup on a single DOI and print the list it returns.
print(retrieveCrossRefDoi('10.1111/jzo.12413'))

In [None]:
# Wikidata item IDs to look up in the cells that follow.
qIds = [
    "Q21503132",
    "Q45530486",
    "Q45579795",
    "Q45579952",
    "Q45580596",
    "Q45631936",
    "Q56480357",
    "Q57416670",
    "Q57550074",
    "Q59553435",
    "Q70150244",
]
# Strings that author affiliations and surnames are fuzzy-matched against below.
testString = 'Biological Sciences Vanderbilt'
testAuthor = 'Peng Xu'

In [None]:
# Query Wikidata for the articles authored by each item in qIds and print the raw result list.
# NOTE: qId and result stay bound to the values from the last iteration after the loop ends;
# later cells in this notebook read those names.
for qId in qIds:
    result = searchWikidataArticle(qId)
    print(result)

In [None]:
# Retrieve the PubMed author list for the first article in the most recent Wikidata result.
# searchWikidataArticle returns the raw response text (a string) in the list when its query
# fails, so check that the first entry really is an article dictionary before indexing it.
if len(result) > 0 and isinstance(result[0], dict):
    if result[0]['pmid'] != '':
        # pass the PubMed ID itself, not the whole article dictionary
        pubMedAuthors = retrievePubMedData(result[0]['pmid'])
        print(pubMedAuthors)

In [None]:
# Score every PubMed author's surname against the test name; for close name matches
# (score of 90 or more) also score the author's affiliation against the test string.
for author in pubMedAuthors:
    nameTestRatio = fuzz.token_set_ratio(author['surname'], testAuthor)
    print(nameTestRatio, author['surname'])
    if nameTestRatio < 90:
        continue  # name is not close enough, so the affiliation is not scored
    setRatio = fuzz.token_set_ratio(testString, author['affiliation'])
    print(setRatio, author['affiliation'])

In [None]:
# Screen candidate Wikidata items to see whether any of them is the test person.
# For each candidate: print its description, occupations, employers and ORCID, then look for an
# article by that candidate whose author list (from PubMed, then CrossRef) contains the test
# person's name together with a matching affiliation or ORCID.

# Alternative inputs kept for testing:
#qIds = ["Q21503132", "Q45530486", "Q45579795", "Q45579952", "Q45580596", "Q45631936", "Q56480357", "Q57416670", "Q57550074", "Q59553435", "Q70150244"]
#qIds = ["Q16910840", "Q64091655", "Q66741850", "Q67221376"]
qIds = ["Q64091698"]
testString = 'Biological Sciences Vanderbilt' # scored against affiliation strings
testEmployer = 'Vanderbilt University'
#testAuthor = 'Peng Xu'
#testAuthor = 'Thomas Clements'
testAuthor = 'Larisa DeSantis'
#testOrcid = '0000-0001-7103-3692'
testOrcid = ''

print('Checking identities for ', testAuthor)
if testOrcid == '':
    print('(no ORCID)')
else:
    print('ORCID: ', testOrcid)
print()
for qIdIndex, qId in enumerate(qIds):
    print(qIdIndex, 'Wikidata ID: ', qId)
    descriptors = searchWikidataDescription(qId)
    employers = searchWikidataEmployer(qId)
    #print(descriptors)
    if 'error' in descriptors:
        # the query failed, so the dictionary holds the raw response instead of descriptors
        print('Wikidata query error: ', descriptors['error'])
    elif descriptors != {}:
        if descriptors['description'] != '':
            print('description: ', descriptors['description'])
        for occupation in descriptors['occupation']:
            print('occupation: ', occupation)
        for employer in employers:
            print('employer: ', employer)
        if descriptors['orcid'] != '':
            if testOrcid == '':
                # **** NOTE: if the person has an ORCID, it may be possible to find articles via ORCID
                # that aren't linked in Wikidata. Not sure if this happens often enough to handle it
                print('ORCID: ', descriptors['orcid'])
            # This should always be true if the SPARQL query for ORCID was already done
            elif testOrcid != descriptors['orcid']:
                print('*** NOT the same person; ORCID ' + descriptors['orcid'] + ' does not match.')
                print()
                # Definitely not a match: skip the reference lookups for this candidate,
                # but keep screening the remaining candidates.
                continue
            else:
                print('*** An ORCID match! How did it get missed in the earlier SPARQL query?')
                break
    else:
        print('No description or occupation given.')

    result = searchWikidataArticle(qId)
    if len(result) > 0 and not isinstance(result[0], dict):
        # on failure the search returns the raw response text rather than article dictionaries
        print('Wikidata article query error: ', result[0])
    elif len(result) == 0:
        print('No articles authored by that person')
    else:
        foundMatch = False
        for article in result:
            print('Checking article: ', article['title'])
            if article['pmid'] == '':
                print('No PubMed ID')
            else:
                print('Checking authors in PubMed article: ', article['pmid'])
                pubMedAuthors = retrievePubMedData(article['pmid'])
                for author in pubMedAuthors:
                    nameTestRatio = fuzz.token_set_ratio(author['surname'], testAuthor)
                    print(nameTestRatio, author['surname'])
                    if nameTestRatio >= 90:
                        if author['affiliation'] != '':
                            setRatio = fuzz.token_set_ratio(testString, author['affiliation'])
                            print('Affiliation test: ', setRatio, author['affiliation'])
                            if setRatio >= 90:
                                foundMatch = True
                        else:
                            break # give up on this article because no affiliation string

            # Don't look up the DOI if it's already found a match with PubMed
            if not foundMatch:
                if article['doi'] == '':
                    print('No DOI')
                else:
                    print('Checking authors in DOI article: ', article['doi'])
                    doiAuthors = retrieveCrossRefDoi(article['doi'])
                    for author in doiAuthors:
                        if not isinstance(author, dict) or 'familyName' not in author:
                            # the CrossRef lookup returned error data rather than author records
                            print('Could not read author data from CrossRef')
                            break
                        nameTestRatio = fuzz.token_set_ratio(author['familyName'], testAuthor)
                        print(nameTestRatio, author['familyName'])
                        if nameTestRatio >= 90:
                            if author['orcid'] != '':
                                if testOrcid == '':
                                    print('ORCID from article: ', author['orcid'])
                                else:
                                    # CrossRef reports ORCIDs as URLs (e.g. http://orcid.org/0000-...),
                                    # so compare only the identifier after the last slash
                                    articleOrcid = author['orcid'].rsplit('/', 1)[-1]
                                    if testOrcid != articleOrcid:
                                        print('*** NOT the same person; ORCID ' + author['orcid'] + ' does not match.')
                                        break # don't check further authors since it's definitely not a match
                                    else:
                                        print('*** An ORCID match!')
                                        foundMatch = True
                                        break

                            # if there aren't any affiliations, the list is empty and nothing is tested
                            for affiliation in author['affiliation']:
                                setRatio = fuzz.token_set_ratio(testString, affiliation)
                                print('Affiliation test: ', setRatio, affiliation)
                                if setRatio >= 90:
                                    foundMatch = True

            if foundMatch:
                break # stop checking articles after one has matched
        if foundMatch:
            print('***', qId, ' has a match.')
        else:
            print('No match found.')
    print()