# VanderBot (departments)

The scripts in this notebook are part of the development of VanderBot, a system to write information about Vanderbilt University researchers and their works to Wikidata.  

This is a side project geared towards getting information about Vanderbilt departments into Wikidata.  It is a hack of https://github.com/HeardLibrary/linked-data/blob/master/publications/process_department.ipynb

This code is freely available under a CC0 license. Steve Baskauf 2019-12-16  

For more information, see [this page](https://github.com/HeardLibrary/linked-data/tree/master/publications).  

# Common Code

This code block includes import statements, function definitions, and declarations of variables that are common to the rest of the script. It needs to be run once before the other code blocks.

In [None]:
import requests   # best library to manage HTTP transactions
from bs4 import BeautifulSoup # web-scraping library
import json
from time import sleep
import csv
import math
from fuzzywuzzy import fuzz # fuzzy logic matching
from fuzzywuzzy import process
import xml.etree.ElementTree as et # library to traverse XML tree
import urllib
import datetime

# For a particular processing round, set a short name for the department here.
# This name is used to generate a set of unique processing files for that department.
# NOTE(review): presumably this must be one of the keys of deptSettings ('bsci', 'physics') -- confirm against the code that uses it.
deptShortName = 'physics'
testEmployer = 'Vanderbilt University' # employer label to test against the Wikidata employer property
employerQId = 'Q29052' # Wikidata item ID for Vanderbilt University
deathDateLimit = '2000' # a candidate whose death date is earlier than this is assumed not to be a match

# NOTE: eventually need to test against all affiliations in cases of faculty with multiple appointments

# ***********************************
# NOTE: the script fails if there is a current item in Wikidata that has the same values for both label and description. 
# A check needs to be run for this !!!
# ***********************************

# Per-department settings, keyed by the department short name.
# The default labels and descriptions can either come from a column in the table or be set as a constant:
#   'source': 'column'   -> 'value' is the column header to read from
#   'source': 'constant' -> 'value' is the literal string to assign
deptSettings = {
    'bsci': {
        'categories': [
            'primary-training-faculty',
            'research-and-teaching-faculty',
            'secondary-faculty',
            'postdoc-fellows',
            'emeriti',
        ],
        'baseUrl': 'https://as.vanderbilt.edu/biosci/people/index.php?group=',
        'departmentSearchString': 'Biological Sciences',
        'departmentQId': 'Q78041310',
        'testAuthorAffiliation': 'Biological Sciences Vanderbilt',
        'labels': {'source': 'column', 'value': 'name'},
        'descriptions': {'source': 'constant', 'value': 'biology researcher'},
    },
    'physics': {
        'categories': [
            'faculty',
            'emeritus-faculty',
            'academic-research-staff',
        ],
        'baseUrl': 'https://as.vanderbilt.edu/physics/people/index.php?group=',
        'departmentSearchString': 'Physics Astronomy',
        'departmentQId': 'Q78779260',
        'testAuthorAffiliation': 'Physics Astronomy Vanderbilt',
        'labels': {'source': 'column', 'value': 'name'},
        'descriptions': {'source': 'constant', 'value': 'physics/astronomy researcher'},
    },
}

# SPARQL endpoint that all of the Wikidata query functions send their requests to
wikidataEndpointUrl = 'https://query.wikidata.org/sparql'

# Each entry pairs a degree abbreviation ('string') with the form recorded for it ('value');
# note that both 'Ph.D.' and 'PhD' map to 'Ph.D.'
degreeList = [
    {'string': abbreviation, 'value': recordedForm}
    for abbreviation, recordedForm in (
        ('Ph.D.', 'Ph.D.'),
        ('PhD', 'Ph.D.'),
        ('D.Phil.', 'D.Phil.'),
        ('J.D.', 'J.D.'),
    )
]

# NCBI identification requirements:
# a tool name and an email address should be sent with all requests to the NCBI E-utilities
# see https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
# NOTE(review): neither variable is referenced by the code in this cell; presumably later cells use them -- confirm.
emailAddress = 'steve.baskauf@vanderbilt.edu' # put your email address here
toolName = 'VanderBot' # give your application a name here

def generateHeaderDictionary(acceptMediaType):
    """Build the header dictionary to pass as `headers` in a requests GET call.

    acceptMediaType: media type string to send in the Accept header.
    Returns a dict with the 'Accept' header and a fixed 'User-Agent' header
    identifying VanderBot.
    """
    return {
        'Accept' : acceptMediaType,
        'User-Agent': 'VanderBot/0.9 (https://github.com/HeardLibrary/linked-data/tree/master/publications; mailto:steve.baskauf@vanderbilt.edu)',
    }

def writeCsv(fileName, array):
    """Write a list of lists to a UTF-8 encoded CSV file.

    fileName: path of the file to create (an existing file is overwritten).
    array: iterable of rows, each row an iterable of cell values.
    """
    # The context manager guarantees the file is closed even if a row fails to serialize
    with open(fileName, 'w', newline='', encoding='utf-8') as fileObject:
        csv.writer(fileObject).writerows(array)

def writeDictsToCsv(table, filename, fieldnames):
    """Write a list of dictionaries to a UTF-8 encoded CSV file with a header row.

    table: iterable of dicts, one per row, keyed by column name.
    filename: path of the file to create (an existing file is overwritten).
    fieldnames: column names, in the order they should appear in the file.
    """
    # Encoding is set explicitly so that the file matches what writeCsv writes and readDict reads,
    # instead of depending on the platform's default encoding
    with open(filename, 'w', newline='', encoding='utf-8') as csvFileObject:
        writer = csv.DictWriter(csvFileObject, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(table)

def readDict(filename):
    """Read a UTF-8 encoded CSV file into a list of dictionaries.

    filename: path of the CSV file; its first row supplies the dictionary keys.
    Returns a list with one dict per data row.
    """
    # The context manager guarantees the file is closed even if parsing raises
    with open(filename, 'r', newline='', encoding='utf-8') as fileObject:
        return list(csv.DictReader(fileObject))

def extractQNumber(iri):
    """Return the Q number from a Wikidata entity IRI.

    An IRI such as http://www.wikidata.org/entity/Q6386232 splits on '/' into
    five pieces; the Q number is the piece at index 4.
    """
    return iri.split('/')[4]

def extractFromIri(iri, numberPieces):
    """Return one slash-separated piece of an IRI.

    iri: the IRI string to split on '/'.
    numberPieces: list index of the wanted piece. For a pattern like
        http://www.wikidata.org/entity/Q6386232 there are 5 pieces and the
        Q ID is at index 4.
    """
    slashSeparated = iri.split('/')
    wantedPiece = slashSeparated[numberPieces]
    return wantedPiece

def decodeSexOrGender(code):
    """Translate a short sex-or-gender code into a Wikidata item ID.

    See https://www.wikidata.org/wiki/Property:P21 for the values.
    code: one of 'm', 'f', 'i', 'tf', 'tm' (matched case-insensitively).
    Returns the corresponding Q ID, or '' for any other code.
    """
    qIdByCode = {
        'm': 'Q6581097',
        'f': 'Q6581072',
        'i': 'Q1097630',
        'tf': 'Q1052281',
        'tm': 'Q2449503',
    }
    return qIdByCode.get(code.lower(), '')

def searchWikidataForQIdByOrcid(orcid):
    """Find the Wikidata items whose ORCID iD (P496) equals `orcid`.

    orcid: the ORCID identifier string to search for.
    Returns a list of Q IDs (possibly empty). If the response cannot be parsed
    as the expected SPARQL JSON, returns a one-item list holding the raw
    response text instead.
    """
    # Build the query from the orcid argument rather than from the notebook's
    # global employees/employeeIndex state, so the function searches for what it was asked
    query = '''
select distinct ?item where {
  ?item wdt:P496 "''' + orcid + '''".
  }
'''
    acceptMediaType = 'application/json'
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers = generateHeaderDictionary(acceptMediaType))
    try:
        statements = r.json()['results']['bindings']
        results = [extractQNumber(statement['item']['value']) for statement in statements]
    except (ValueError, KeyError, TypeError, IndexError):
        # body was not JSON or did not have the expected structure; hand back the raw text
        results = [r.text]
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return results

def searchWikidataEmployer(qId):
    """Return the English labels of the employers (P108) of a Wikidata item.

    qId: Wikidata item ID (e.g. 'Q12345') of the item to look up.
    Returns a list of employer label strings (empty if there are none). If the
    response cannot be parsed as the expected SPARQL JSON, returns a one-item
    list holding the raw response text instead.
    """
    query = '''select distinct ?employer where {
        wd:'''+ qId + ''' wdt:P108 ?employerId.
        ?employerId rdfs:label ?employer.
        FILTER(lang(?employer) = 'en')
      }'''
    #print(query)
    # Build the headers here: no module-level requestHeaderDictionary is defined in this cell,
    # and JSON must be requested explicitly for r.json() to work
    acceptMediaType = 'application/json'
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=generateHeaderDictionary(acceptMediaType))
    try:
        statements = r.json()['results']['bindings']
        resultsList = [statement['employer']['value'] for statement in statements]
    except (ValueError, KeyError, TypeError):
        # body was not JSON or did not have the expected structure; hand back the raw text
        resultsList = [r.text]
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return resultsList

def searchWikidataSingleProperty(qId, propertyId, valueType):
    """Return the values of one property for one Wikidata item.

    qId: Wikidata item ID of the subject item.
    propertyId: property ID (e.g. 'P108') whose direct (wdt:) values are wanted.
    valueType: 'item' to reduce each value IRI to its Q ID; any other value
        returns the values as they come back from the query.
    Returns a list of values (empty if there are none). If the response cannot
    be parsed as the expected SPARQL JSON, returns a one-item list holding the
    raw response text instead.
    """
    query = '''select distinct ?object where {
        wd:'''+ qId + ''' wdt:''' + propertyId + ''' ?object.
      }'''
    #print(query)
    # Build the headers here: no module-level requestHeaderDictionary is defined in this cell,
    # and JSON must be requested explicitly for r.json() to work
    acceptMediaType = 'application/json'
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=generateHeaderDictionary(acceptMediaType))
    try:
        statements = r.json()['results']['bindings']
        resultsList = []
        for statement in statements:
            resultValue = statement['object']['value']
            if valueType == 'item':
                resultValue = extractQNumber(resultValue)
            resultsList.append(resultValue)
    except (ValueError, KeyError, TypeError, IndexError):
        # body was not JSON or did not have the expected structure; hand back the raw text
        resultsList = [r.text]
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return resultsList

def searchOrcidAtWikidata(qIds):
    """Retrieve the ORCID (P496) statements, with any references, for a set of items.

    qIds: iterable of Wikidata item ID strings used as the subjects of the query.
    Returns a list of dicts with keys 'qId', 'statementUuid', 'orcid',
    'referenceHash', 'referenceProperty' and 'referenceValue'; the three
    reference keys hold '' when the query row has no value for them.
    """
    # one "wd:Qxxx" line per item for the VALUES clause
    alternatives = ''.join('wd:' + qId + '\n' for qId in qIds)

    # NOTE: the MINUS clause removes properties in the http://www.wikidata.org/prop/reference/value/ namespace
    # leaving only those in the http://www.wikidata.org/prop/reference/ namespace (i.e. the direct literal values)
    query = '''
select distinct ?id ?statement ?orcid ?reference ?refProp ?refVal where {
  VALUES ?id
{
''' + alternatives + '''}
  ?id p:P496 ?statement.
  ?statement ps:P496 ?orcid.
  optional {
        ?statement prov:wasDerivedFrom ?reference.
        ?reference ?refProp ?refVal.
        MINUS {
            ?reference ?refProp ?refVal. 
            FILTER(contains(str(?refProp), "value/"))
        }
    }
  }'''
    #print(query)
    response = requests.get(
        wikidataEndpointUrl,
        params={'query' : query},
        headers=generateHeaderDictionary('application/json'),
    )
    bindings = response.json()['results']['bindings']

    # ********** NOTE: need to deal with the case where there is more than one reference per statement
    # This will result in several rows with the same qNumber, orcid, and referenceHash
    results = []
    for binding in bindings:
        # item IRI: 'http://www.wikidata.org/entity/' + Q ID
        qNumber = extractFromIri(binding['id']['value'], 4)
        # statement IRI: 'http://www.wikidata.org/entity/statement/' + Q ID + '-' + UUID;
        # keep only what follows the Q ID and hyphen
        statementTail = extractFromIri(binding['statement']['value'], 5)
        statementUuid = statementTail.partition(qNumber + '-')[2]
        orcid = binding['orcid']['value']

        # reference IRI: 'http://www.wikidata.org/reference/' + hash
        referenceHash = extractFromIri(binding['reference']['value'], 4) if 'reference' in binding else ''
        # reference property IRI: 'http://www.wikidata.org/prop/reference/' + property ID
        referenceProperty = extractFromIri(binding['refProp']['value'], 5) if 'refProp' in binding else ''

        referenceValue = ''
        if 'refVal' in binding:
            referenceValue = binding['refVal']['value']
            # a date comes down as 2019-12-05T00:00:00Z, but the API wants just the date: 2019-12-05
            # P813 ("retrieved") is the likely date property; values of other properties are left as is
            if referenceProperty == 'P813':
                referenceValue = referenceValue.split('T')[0]

        results.append({
            'qId': qNumber,
            'statementUuid': statementUuid,
            'orcid': orcid,
            'referenceHash': referenceHash,
            'referenceProperty': referenceProperty,
            'referenceValue': referenceValue,
        })
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return results

def searchHumansAtWikidata(qIds):
    """Find which of the given items have an "instance of (P31) human (Q5)" statement.

    qIds: iterable of Wikidata item ID strings used as the subjects of the query.
    Returns a list of dicts with keys 'qId' and 'statementUuid', one per
    matching statement.
    """
    # one "wd:Qxxx" line per item for the VALUES clause
    alternatives = ''.join('wd:' + qId + '\n' for qId in qIds)

    # NOTE: instanceOf human is one of the statements that Wikidata does not care about references for,
    # so references are ignored here
    query = '''
select distinct ?id ?statement where {
  VALUES ?id
{
''' + alternatives + '''}
  ?id p:P31 ?statement.
  ?statement ps:P31 wd:Q5.
  }'''
    #print(query)
    response = requests.get(
        wikidataEndpointUrl,
        params={'query' : query},
        headers=generateHeaderDictionary('application/json'),
    )

    results = []
    for binding in response.json()['results']['bindings']:
        # item IRI: 'http://www.wikidata.org/entity/' + Q ID
        qNumber = extractFromIri(binding['id']['value'], 4)
        # statement IRI: 'http://www.wikidata.org/entity/statement/' + Q ID + '-' + UUID;
        # keep only what follows the Q ID and hyphen
        statementTail = extractFromIri(binding['statement']['value'], 5)
        results.append({
            'qId': qNumber,
            'statementUuid': statementTail.partition(qNumber + '-')[2],
        })
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return results

def checkOrcid(orcid):
    """Check that an ORCID iD dereferences and return the date of the check.

    orcid: the ORCID identifier, appended to 'https://orcid.org/' to form the URL.
    Returns today's UTC date in the form Wikidata provides
    (+2019-12-05T00:00:00Z) when the request returns HTTP 200, or '' when it
    does not.
    """
    namespace = 'https://orcid.org/'
    endpointUrl = namespace + orcid
    acceptMediaType = 'application/ld+json'
    r = requests.get(endpointUrl, headers=generateHeaderDictionary(acceptMediaType))
    # Only the status code matters here; the body is deliberately not parsed, because
    # an error response is not guaranteed to be JSON and its content is never used.
    if r.status_code != 200:
        print('Attempt to dereference ORCID resulted in HTTP response code ', r.status_code)
        # empty reference value, so the failure path returns instead of hitting an unassigned variable
        wholeDateZ = ''
    else:
        print('Successfully retrieved')
        # timezone-aware replacement for the deprecated datetime.utcnow()
        dateZ = datetime.datetime.now(datetime.timezone.utc).date().isoformat() # form 2019-12-05
        wholeDateZ = '+' + dateZ + 'T00:00:00Z' # form +2019-12-05T00:00:00Z as provided by Wikidata
    # delay a quarter second to avoid hitting the API too rapidly
    sleep(0.25)
    return wholeDateZ

# if the value passed is '' then the value will be retrieved.  Otherwise, the value is used to screen.
def searchStatementAtWikidata(qIds, prop, value, refPropList):
    """Query Wikidata for statements of one property on a set of items, with optional reference values.

    qIds: iterable of Wikidata item ID strings used as the subjects of the query.
    prop: property ID string, inserted after both the p: and ps: prefixes.
    value: '' to retrieve whatever value each statement has; otherwise an item ID,
        which is inserted after the wd: prefix so that only statements with that
        item as their value are returned.
    refPropList: list of reference property ID strings whose values should be
        retrieved; an empty list skips the reference part of the query.

    Returns a list of dicts, one per query result row, with keys 'qId' and
    'statementUuid', plus 'statementValue' when value is '', plus 'referenceHash'
    and 'referenceValues' (a list parallel to refPropList, '' where a value is
    missing) when refPropList is not empty.
    """
    # create a string for all of the Wikidata item IDs to be used as subjects in the query
    alternatives = ''
    for qId in qIds:
        alternatives += 'wd:' + qId + '\n'
        
    # create a string for the query, starting with the variables to select
    query = 'select distinct ?id ?statement '
    # if no value was specified, find the value
    if value == '':
        query += '?statementValue '
    if len(refPropList) != 0:
        query += '?reference '
    # one ?refValN variable per requested reference property, numbered by list position
    for refPropIndex in range(0, len(refPropList)):
        query += '?refVal' + str(refPropIndex) + ' '
    query += '''where {
  VALUES ?id
{
''' + alternatives + '''}
  ?id p:'''+ prop + ''' ?statement.
  ?statement ps:'''+ prop
    
    # finish the ps: triple with either a variable (retrieve) or the fixed item (screen)
    if value == '':
        query += ' ?statementValue.'
    else:
        query += ' wd:' + value + '.'

    # all of the requested reference properties go inside a single optional block,
    # so a reference is matched only if it has a value for every one of them
    if len(refPropList) != 0:
        query += '''
  optional {
    ?statement prov:wasDerivedFrom ?reference.'''
        for refPropIndex in range(0, len(refPropList)):
            query +='''
    ?reference pr:''' + refPropList[refPropIndex] + ''' ?refVal''' + str(refPropIndex) + '''.'''
        query +='''
        }'''
    query +='''
  }'''
    #print(query)

    returnValue = []
    acceptMediaType = 'application/json'
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=generateHeaderDictionary(acceptMediaType))
    data = r.json()
    results = data['results']['bindings']
    # ********** NOTE: need to deal with the case where there is more than one reference per result
    # This will result in several results with the same qNumber, statement, and referenceHash
    for result in results:
        # remove wd: 'http://www.wikidata.org/entity/'
        qNumber = extractFromIri(result['id']['value'], 4)
        # remove wds: 'http://www.wikidata.org/entity/statement/'
        noDomain = extractFromIri(result['statement']['value'], 5)
        # need to remove the qNumber that's appended in front of the UUID
        statementUuid = noDomain.partition(qNumber + '-')[2]
        # if no value was specified, get the value that was found in the search
        if value == '':
            statementValue = result['statementValue']['value']
        if len(refPropList) != 0:
            if 'reference' in result:
                # remove wdref: 'http://www.wikidata.org/reference/'
                referenceHash = extractFromIri(result['reference']['value'], 4)
            else:
                referenceHash = ''
            referenceValues = []
            for refPropIndex in range(0, len(refPropList)):
                if 'refVal' + str(refPropIndex) in result:
                    refVal = result['refVal' + str(refPropIndex)]['value']
                    # if it's a date, it comes down as 2019-12-05T00:00:00Z, but the API wants just the date: 2019-12-05
                    # (the truncation below is disabled, so date values are returned in the full form)
                    #if referenceProperty == 'P813': # the likely property is "retrieved"; just leave it if it's another property
                    #    referenceValue = referenceValue.split('T')[0]
                else:
                    refVal = ''
                referenceValues.append(refVal)
        # assemble the row, including only the keys that apply to this kind of search
        resultsDict = {'qId': qNumber, 'statementUuid': statementUuid}
        # if no value was specified, get the value that was found in the search
        if value == '':
            resultsDict['statementValue'] = statementValue
        if len(refPropList) != 0:
            resultsDict['referenceHash'] = referenceHash
            resultsDict['referenceValues'] = referenceValues
        returnValue.append(resultsDict)

    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    
    return returnValue

def searchLabelsDescriptionsAtWikidata(qIds, labelType, language):
    """Retrieve labels, aliases, or descriptions in one language for a set of items.

    qIds: iterable of Wikidata item ID strings used as the subjects of the query.
    labelType: 'label', 'alias', or 'description'; anything else is treated as 'label'.
    language: language tag that the returned strings must have.
    Returns a list of dicts with keys 'qId' and 'string', one per query result row.
    """
    # one "wd:Qxxx" line per item for the VALUES clause
    alternatives = ''.join('wd:' + qId + '\n' for qId in qIds)

    predicateByType = {
        'label': 'rdfs:label',
        'alias': 'skos:altLabel',
        'description': 'schema:description',
    }
    predicate = predicateByType.get(labelType, 'rdfs:label')

    query = 'select distinct ?id ?string ' + '''where {
  VALUES ?id
{
''' + alternatives + '''}
  ?id '''+ predicate + ''' ?string.
  filter(lang(?string)="''' + language + '''")
  }'''
    #print(query)

    response = requests.get(
        wikidataEndpointUrl,
        params={'query' : query},
        headers=generateHeaderDictionary('application/json'),
    )

    returnValue = []
    for binding in response.json()['results']['bindings']:
        returnValue.append({
            # item IRI: 'http://www.wikidata.org/entity/' + Q ID
            'qId': extractFromIri(binding['id']['value'], 4),
            'string': binding['string']['value'],
        })

    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)

    return returnValue

# Scrape Medicine departments faculty directory web pages

Scrapes the faculty names, department names, and homepage URLs listed at https://wag.app.vanderbilt.edu/PublicPage/Faculty/ViewAll. The HTML of that page provides a PickLetter URL for each letter of the alphabet, and the script requests each of those pages in turn.

In [None]:
from string import ascii_uppercase

# Scrape the faculty directory one letter page at a time and accumulate one row per person.
# The first row of outputTable is the CSV header.
outputTable = [['name', 'givenName', 'surname', 'degrees', 'rank', 'department', 'url', 'date']]
fileName = 'medicine-faculty.csv'

for letter in ascii_uppercase:
    print(letter)
    acceptMediaType = 'text/html'
    letterUrl = 'https://wag.app.vanderbilt.edu//PublicPage/Faculty/PickLetter?letter=' + letter
    response = requests.get(letterUrl, headers = generateHeaderDictionary(acceptMediaType))
    soupObject = BeautifulSoup(response.text,features="html5lib")

    # retrieval date for every row on this page, in the form +2019-12-05T00:00:00Z as provided by Wikidata
    # (timezone-aware replacement for the deprecated datetime.utcnow())
    dateZ = datetime.datetime.now(datetime.timezone.utc).date().isoformat() # form 2019-12-05
    wholeDateZ = '+' + dateZ + 'T00:00:00Z'

    # get the rows of the first table on the page; a page without a table contributes no rows
    tableBodies = soupObject.find_all('tbody')
    if tableBodies:
        facultyItems = tableBodies[0].find_all('tr')
    else:
        print('no table found for letter', letter)
        facultyItems = []

    for personRecord in facultyItems:
        column = personRecord.find_all('td')
        # skip rows that do not have the five expected cells or a link in the first cell,
        # rather than letting one malformed row stop the whole scrape
        if len(column) < 5:
            print('skipped row with', len(column), 'cells for letter', letter)
            continue
        localUrl = column[0].find('a')
        if localUrl is None or not localUrl.get('href'):
            print('skipped row without a link for letter', letter)
            continue
        url = 'https://wag.app.vanderbilt.edu' + localUrl.get('href')

        # names are listed as "surname, given name"
        nameLastFirst = column[1].text.strip()
        nameParts = nameLastFirst.split(',')
        if len(nameParts) > 1:
            firstName = nameParts[1].strip()
            lastName = nameParts[0].strip()
            name = firstName + ' ' + lastName
        else:
            # no comma to split on: keep the text as the surname and full name, leave the given name empty
            firstName = ''
            lastName = nameLastFirst
            name = nameLastFirst
        degrees = column[2].text.strip()
        title = column[3].text.strip()
        department = column[4].text.strip()
        #print(name, degrees, title, department, url)

        outputTable.append([name, firstName, lastName, degrees, title, department, url, wholeDateZ])

    # rewrite the file after every letter so that the rows collected so far survive a later failure
    writeCsv(fileName, outputTable)
    sleep(0.25)
print('done')

# Create departmental CSV

Remove duplicates to get department list.

Manually dereference each website and copy and paste some description.

Also check that the labelEn is actually what they use. Manually created fields to match the Engineering departments.

After the script is done, I manually added a column named "officialWebsiteLanguageQualifier" and put the value "Q1860" (for English) in each row. TODO: have the script add this column itself instead of doing it by hand.