# Common Code

This code block includes import statements, function definitions, and declarations of variables that are common to the rest of the script. It needs to be run once before the other code blocks.

**Note: some code in this block is found in the stand-alone file vb_common_code.py**

In [None]:
import requests   # best library to manage HTTP transactions
from bs4 import BeautifulSoup # web-scraping library
import json
from time import sleep
import csv
import math
from fuzzywuzzy import fuzz # fuzzy logic matching
from fuzzywuzzy import process
import xml.etree.ElementTree as et # library to traverse XML tree
import urllib
import datetime
import string

# Module-level settings shared by the code below.
#
# For a particular processing round, set a short name for the department here.
# This name is used to generate a set of unique processing files for that department.
# NOTE(review): no department short-name variable is actually assigned in this block;
# the two lines above may be left over from an earlier version -- confirm.
testEmployer = 'Vanderbilt University' # to test against Wikidata employer property
employerQId = 'Q29052' # Vanderbilt University
deathDateLimit = '2000' # any death dates before this date will be assumed to not be a match
birthDateLimit = '1920' # any birth dates before this date will be assumed to not be a match
wikibase_instance_namespace = 'http://www.wikidata.org/entity/' # IRI prefix that precedes the Q ID in an entity IRI

# ---------------------
# function definitions
# ---------------------

# SPARQL endpoint settings; accept_media_type and user_agent_header are passed to
# generate_header_dictionary() to build the header used by send_sparql_query()
wikidata_endpoint_url = 'https://query.wikidata.org/sparql'
accept_media_type = 'application/json'
user_agent_header = 'VanderBot/1.7 (https://github.com/HeardLibrary/linked-data/tree/master/vanderbot; mailto:steve.baskauf@vanderbilt.edu)'

# NCBI identification requirements:
# tool name and email address should be sent with all requests
# see https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
emailAddress = 'steve.baskauf@vanderbilt.edu' # put your email address here
toolName = 'VanderBot' # give your application a name here

# Extracts the local name part of an IRI, e.g. a qNumber from a Wikidata IRI
def extract_local_name(iri):
    """Return everything after the last '/' in iri.

    Example: 'http://www.wikidata.org/entity/Q6386232' gives 'Q6386232'.
    A string without any '/' is returned unchanged.
    """
    _head, _separator, local_name = iri.rpartition('/')
    return local_name

# Generate the current UTC xsd:date
def generate_utc_date():
    """Return today's date in UTC as an ISO 8601 / xsd:date string, e.g. '2019-12-05'."""
    # datetime.utcnow() is deprecated as of Python 3.12; asking for an aware
    # "now" in UTC and formatting only its date part gives the same string.
    return datetime.datetime.now(datetime.timezone.utc).date().isoformat()

today = generate_utc_date()

# generates a dictionary to be passed in a requests GET method to generate the request header
def generate_header_dictionary_non_sparql(acceptMediaType):
    """Build the HTTP request headers for a non-SPARQL GET request.

    acceptMediaType: value sent in the Accept header.
    Returns a dictionary holding the Accept header and a fixed User-Agent string.
    """
    return {
        'Accept': acceptMediaType,
        'User-Agent': 'VanderBot/1.3 (https://github.com/HeardLibrary/linked-data/tree/master/vanderbot; mailto:steve.baskauf@vanderbilt.edu)'
    }

# write a list of lists to a CSV file
def write_lists_to_csv(fileName, array):
    """Write array (an iterable of rows, each a list of values) to fileName as UTF-8 CSV.

    An existing file with the same name is overwritten.
    """
    with open(fileName, 'w', newline='', encoding='utf-8') as csv_file:
        csv.writer(csv_file).writerows(array)

# Read from a CSV file into a list of dictionaries
def read_dicts_from_csv(filename):
    """Read the UTF-8 CSV file filename and return its data rows as a list.

    Each row is a dictionary keyed by the column names in the file's header row.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        # The reader is lazy, so it has to be consumed while the file is still open
        return list(csv.DictReader(csv_file))

# Write list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write table (an iterable of dictionaries) to filename as UTF-8 CSV.

    fieldnames gives the column order and is written first as the header row.
    An existing file with the same name is overwritten.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)

def lookup_department(researcher_qid, orcid_department, departments, department_labels):
    """Fuzzy-match an ORCID department name against a researcher's affiliation labels.

    Only the first row of departments whose 'qid' equals researcher_qid is examined.
    Every row of department_labels whose 'qid' equals that row's 'affiliation' is
    scored against orcid_department with fuzz.partial_ratio.

    Returns a string of the form '<score> <orcid_department>/<label_en>' for the last
    label scoring above 80, an empty string if no label scores that high, or None
    if researcher_qid does not occur in departments.
    """
    for department_row in departments:
        if researcher_qid != department_row['qid']:
            continue

        match_description = ''
        for label_row in department_labels:
            if department_row['affiliation'] != label_row['qid']:
                continue
            score = fuzz.partial_ratio(orcid_department, label_row['label_en'])
            if score > 80:
                # a later qualifying label overwrites an earlier one
                match_description = str(score) + ' ' + orcid_department + '/' + label_row['label_en']
        return match_description

def find_surname_givens(name):
    """Split a personal name string into given names and family name.

    Periods and commas are treated as spaces. A single leading non-alphabetic
    character (as in '(Kit)' or '"Kit"') is stripped from each piece; trailing
    characters are left alone. The first occurrence of each of the generational
    suffixes Jr, Sr, II, III, IV and V is dropped.
    NOTE: the suffix 'V' is removed wherever it occurs, so a middle initial
    'V' is dropped as well.

    Returns a dictionary {'given': ..., 'family': ...} in which the last remaining
    piece is the family name and the other pieces, joined by single spaces, are the
    given names. Returns False when fewer than two usable pieces remain.
    """
    # Get rid of periods and commas
    name = name.replace('.', ' ').replace(',', ' ')

    # Split name, ignoring empty pieces formed from extra spaces
    pieces = [piece for piece in name.split(' ') if piece]

    # Must be at least a surname and something else
    if len(pieces) <= 1:
        return False

    # Make sure the first character is alphabetic.
    # Only one leading non-alphabetic character is removed; more than one is rare.
    # The slice must run to the end of the piece itself, otherwise long names are truncated.
    pieces = [piece if piece[0].isalpha() else piece[1:] for piece in pieces]

    # Rebuild the list without pieces that are now empty, rather than removing
    # items from the list while iterating over it (which skips neighbours).
    pieces = [piece for piece in pieces if piece]

    # Get rid of ", Jr.", "III", etc. (first occurrence of each only)
    for suffix in ('Jr', 'Sr', 'II', 'III', 'IV', 'V'):
        if suffix in pieces:
            pieces.remove(suffix)

    # Not interested unless there are at least two pieces; the list can even be
    # empty here, e.g. when the name consisted of suffixes only.
    if len(pieces) <= 1:
        return False

    # Everything but the last piece forms the given names
    return {'given': ' '.join(pieces[:-1]), 'family': pieces[-1]}

def generate_header_dictionary(accept_media_type,user_agent_header):
    """Build the HTTP request headers for a URL-encoded POST to a SPARQL endpoint.

    accept_media_type: value sent in the Accept header.
    user_agent_header: value sent in the User-Agent header.
    Returns a dictionary with the Accept, Content-Type and User-Agent headers.
    """
    headers = {'Accept': accept_media_type}
    # 'application/sparql-query' would be the Content-Type for sending the bare query
    # as the request body; the URL-encoded form is used here instead.
    headers['Content-Type'] = 'application/x-www-form-urlencoded'
    headers['User-Agent'] = user_agent_header
    return headers

# Request header used by send_sparql_query() below for every query it sends
sparql_request_header = generate_header_dictionary(accept_media_type,user_agent_header)

# Sends a query to the query service endpoint.
# NOTE: wikidata_endpoint_url and sparql_request_header are global variables defined earlier in the script
def send_sparql_query(query_string):
    """POST query_string to the SPARQL endpoint and return the result bindings.

    query_string: a valid SPARQL query string.
    Returns the value found at ['results']['bindings'] in the JSON response.
    """
    # The two print statements can be deleted if the queries are short. For large/long
    # queries, it's good to let the user know what's going on.
    print('querying SPARQL endpoint to acquire item metadata')

    # URL-encoded method: the query is sent as the "query" form field
    form_fields = {'query': query_string}
    response = requests.post(wikidata_endpoint_url, data=form_fields, headers=sparql_request_header)
    #print(response.text) # uncomment to view the raw response, e.g. if you are getting an error

    # Extract the values from the response JSON
    bindings = response.json()['results']['bindings']

    print('done retrieving data')
    return bindings


# Manage altLabels

This process starts with the altLabels downloaded from Wikidata and then adds the alternate names recorded in ORCID.

## Download Vanderbilt people's altLabels from Wikidata

Developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/wikidata/download-vanderbilt-people-altlabels.py



In [None]:
# Find the English altLabels of every item that has an employer (P108) statement
# whose value is wd:Q29052 (Vanderbilt University) or wd:Q7914455
# (presumably a second Vanderbilt-related employer item -- confirm).
query = '''select distinct  ?person ?altLabel where {
  ?person p:P108 ?statement.
  {?statement ps:P108 wd:Q29052.}
  union
  {?statement ps:P108 wd:Q7914455.}
  ?person skos:altLabel ?altLabel.
  FILTER(lang(?altLabel)="en")
}'''

# The endpoint defaults to returning XML, so the Accept: header is required.
# The User-Agent header identifies this script to the Wikidata Query Service; without
# it the request goes out with the generic default of the requests library, which the
# Wikimedia User-Agent policy warns may be throttled or blocked.
r = requests.get(
    wikidata_endpoint_url,
    params={'query' : query},
    headers={'Accept' : 'application/json', 'User-Agent': user_agent_header}
)

data = r.json()
#print(json.dumps(data,indent = 2))

# First row holds the column headers; one row is added per (person, altLabel) result
table = [['qid', 'altLabel', 'source']]
items = data['results']['bindings']
for item in items:
    # despite its name, this variable holds only the local name (the Q ID) of the IRI
    wikidata_iri = extract_local_name(item['person']['value'])
    alt_label = ''
    if 'altLabel' in item:
        alt_label = item['altLabel']['value']
    table.append([wikidata_iri, alt_label, 'wikidata'])
    
filename = 'vanderbilt_wikidata_altlabels.csv'
write_lists_to_csv(filename, table)
print('done')

In [None]:
# Load the three CSV tables used to merge ORCID alternate names into the altLabels list.

# altLabels previously downloaded from Wikidata
filename = 'vanderbilt_wikidata_altlabels.csv'
researcher_altlabels = read_dicts_from_csv(filename)
# NOTE: raises IndexError if the file contains a header row but no data rows
fieldnames = researcher_altlabels[0].keys() # get the field names from the existing file

# alternate names from ORCID; the rows are read below via their 'orcid' and 'altName' columns
filename = 'orcid_other_names.csv'
orcid_other = read_dicts_from_csv(filename)

# researcher records; the rows are read below via their 'orcid' and 'qid' columns
filename = 'researchers.csv'
researcher_items = read_dicts_from_csv(filename)


In [None]:
# Print the first row of each loaded table, separated by blank lines,
# e.g. to eyeball the column names before merging.
print(orcid_other[0])
print()
print(researcher_items[0])
print()
print(researcher_altlabels[0])

In [None]:
# Index the researchers by ORCID iD so that each alternate name is matched with one
# dictionary lookup instead of a scan through the whole researcher list.
# setdefault() keeps the first researcher listed for an ORCID iD, which is the row
# a front-to-back scan that stops at the first match would have found.
qid_by_orcid = {}
for researcher in researcher_items:
    qid_by_orcid.setdefault(researcher['orcid'], researcher['qid'])

for name in orcid_other:
    # Note: the alternate names list contains all VU affiliated people, including students. If they didn't also
    # indicate that VU was their employer, they got screened out of the main ORCID list. So there will be a lot
    # of non-matches here.
    if name['orcid'] in qid_by_orcid:
        researcher_dict = {'qid': qid_by_orcid[name['orcid']], 'altLabel': name['altName'], 'source': 'orcid'}
        researcher_altlabels.append(researcher_dict)

# The combined list (Wikidata rows plus the new ORCID rows) replaces the existing file
filename = 'vanderbilt_wikidata_altlabels.csv'
write_dicts_to_csv(researcher_altlabels, filename, fieldnames)

print('done')