# Common Code

This code block includes import statements, function definitions, and declarations of variables that are common to the rest of the script. It needs to be run once before the other code blocks.

**Note: the code in this block is found in the stand-alone file vb_common_code.py**

In [None]:
import requests   # best library to manage HTTP transactions
from bs4 import BeautifulSoup # web-scraping library
import json
from time import sleep
import csv
import math
from fuzzywuzzy import fuzz # fuzzy logic matching
from fuzzywuzzy import process
import xml.etree.ElementTree as et # library to traverse XML tree
import urllib
import datetime
import string

# For a particular processing round, set a short name for the department here.
# This name is used to generate a set of unique processing files for that department.
# NOTE(review): no department short-name variable is actually assigned in this cell;
# confirm whether it is set elsewhere or whether the comment above is stale.
testEmployer = 'Vanderbilt University' # to test against Wikidata employer property
employerQId = 'Q29052' # Vanderbilt University
deathDateLimit = '2000' # any death dates before this date will be assumed to not be a match
birthDateLimit = '1920' # any birth dates before this date will be assumed to not be a match
# IRI prefix of Wikidata entities; presumably prepended to (or stripped from) Q-IDs
# by code outside this section -- TODO confirm
wikibase_instance_namespace = 'http://www.wikidata.org/entity/'

# ---------------------
# utility functions used across blocks
# ---------------------

# URL of the Wikidata SPARQL endpoint that queries are sent to
wikidataEndpointUrl = 'https://query.wikidata.org/sparql'

# NCBI identification requirements:
# tool name and email address should be sent with all requests
# see https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
emailAddress = 'steve.baskauf@vanderbilt.edu' # put your email address here
toolName = 'VanderBot' # give your application a name here

# build the header dictionary that is passed to a requests GET method
def generate_header_dictionary_non_sparql(acceptMediaType):
    """Return the HTTP request headers for a GET request.

    acceptMediaType: media type string sent as the value of the Accept header.

    Returns a new dictionary holding the Accept header and a fixed User-Agent
    header that identifies this tool.
    """
    return {
        'Accept': acceptMediaType,
        'User-Agent': 'VanderBot/1.3 (https://github.com/HeardLibrary/linked-data/tree/master/vanderbot; mailto:steve.baskauf@vanderbilt.edu)',
    }

# write a list of lists to a CSV file
def write_lists_to_csv(fileName, array):
    """Write each inner list of array as one row of a CSV file.

    fileName: path of the file to create (an existing file is overwritten).
    array: iterable of rows, each row being an iterable of cell values.

    The file is written as UTF-8; newline='' leaves line endings to the csv writer.
    """
    with open(fileName, 'w', newline='', encoding='utf-8') as csv_file:
        csv.writer(csv_file).writerows(array)



# Query ORCID for Vanderbilt University people

Script developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/orcid/orcid-get-json.ipynb

Retrieves search results 100 at a time, then fetches each person's ORCID record and extracts the name, alternative names, and Vanderbilt University employment details.  **NOTE: takes hours to run.**

Saves the employment results in the file orcid_data.csv and the alternative names in a second file, orcid_other_names.csv.

In [None]:
# Harvest ORCID records of people affiliated with Vanderbilt University.
# The first row of each output table holds the column headers.
table = [['orcid', 'given_names', 'family_name', 'start_date', 'end_date', 'department', 'organization']]
other_name_list = [['orcid', 'altName']]

# The page size is sent explicitly as the rows parameter so that the paging arithmetic
# below does not depend on the API's default page size.
results_per_page = 100
search_base_uri = 'https://pub.orcid.org/v2.0/search/?q=affiliation-org-name:"Vanderbilt+University"'
accept_media_type = 'application/json'
# Send the same identifying headers (Accept and User-Agent) with every request.
request_headers = generate_header_dictionary_non_sparql(accept_media_type)


def _format_partial_date(date_dict):
    """Return 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD' from an ORCID date dictionary.

    date_dict may be None/empty, and its month and day parts may be missing.
    An empty string is returned when there is no date or no year; the day is
    only used when a month is present.
    """
    if not date_dict or not date_dict.get('year'):
        return ''
    date_string = date_dict['year']['value']
    month = date_dict.get('month')
    if month:
        date_string += '-' + month['value']
        day = date_dict.get('day')
        if day:
            date_string += '-' + day['value']
    return date_string


# use the API to search for people associated with Vanderbilt University
# First search is for only one record, just to get the number of hits found
search_uri = search_base_uri + '&start=1&rows=1'
response = requests.get(search_uri, headers=request_headers)
response.raise_for_status()  # fail with a clear HTTP error rather than a confusing JSON/KeyError
data = response.json()
number_results = data["num-found"]
print(number_results)
# Round up so a final partial page is included, without requesting an empty extra page
# when the number of hits is an exact multiple of the page size.
number_pages = math.ceil(number_results / results_per_page)

for page_count in range(number_pages):
    print('page: ', page_count)
    # NOTE(review): start values of 1, 101, 201, ... are kept from the original script;
    # confirm against the ORCID API documentation whether start is 0- or 1-based.
    search_uri = (search_base_uri
                  + '&start=' + str(page_count * results_per_page + 1)
                  + '&rows=' + str(results_per_page))
    response = requests.get(search_uri, headers=request_headers)
    response.raise_for_status()
    print(response.url)
    data = response.json()
    # Guard against a page with no hits (a missing or null 'result' value)
    orcids_dicts_list = data.get('result') or []

    # extract the identifier strings from the data structure
    orcids = [
        {'id': orcid_dict['orcid-identifier']['path'], 'iri': orcid_dict['orcid-identifier']['uri']}
        for orcid_dict in orcids_dicts_list
    ]

    for orcid in orcids:
        response = requests.get(orcid['iri'], headers=request_headers)
        # Pause after every record request, including records skipped below,
        # so the request rate stays limited.
        sleep(.25)
        if response.status_code != 200:
            # One unavailable record should not abort a run that takes hours
            print('skipping', orcid['id'], '- HTTP status', response.status_code)
            continue
        data = response.json()
        orcid_id = data['orcid-identifier']['path']

        # if there isn't a name, then go on to the next ORCID
        name = data['person']['name']
        if not name:
            continue
        # Both name parts are required.
        # This has been a big pain when people don't have surnames.
        # It causes matches with everyone who has the same first name!
        if not name['given-names'] or not name['family-name']:
            continue
        given_names = name['given-names']['value']
        family_name = name['family-name']['value']

        # Record every alternative name; tolerate a missing/null other-names section
        other_names_section = data['person']['other-names'] or {}
        for other_name in other_names_section.get('other-name') or []:
            other_name_list.append([orcid_id, other_name['content']])

        employments = data['activities-summary']['employments'] or {}
        affiliations = employments.get('affiliation-group') or []
        for affiliation in affiliations:
            for summary in affiliation['summaries']:
                employment = summary['employment-summary']
                start_date = _format_partial_date(employment['start-date'])
                end_date = _format_partial_date(employment['end-date'])
                # if there is no value for department, set it to empty string
                department = employment['department-name'] or ''
                # Reset for every summary so that a missing organization cannot reuse
                # the value left over from a previous employment or person.
                organization = ''
                if employment['organization']:
                    organization = employment['organization']['name'] or ''
                if 'Vanderbilt University' in organization:
                    print(orcid_id, given_names, family_name, start_date, end_date, department, organization)
                    table.append([orcid_id, given_names, family_name, start_date, end_date, department, organization])

print()
print('Done')
file_name = 'orcid_data.csv'
write_lists_to_csv(file_name, table)
file_name = 'orcid_other_names.csv'
write_lists_to_csv(file_name, other_name_list)


# Download Vanderbilt people's altLabels from Wikidata

Developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/wikidata/download-vanderbilt-people-altlabels.py

These values aren't used for anything currently (2020-03-19), so running this is optional. But it will be useful in the future when we want to start collecting aliases.

In [None]:
# Find every item with an employer (P108) statement whose value is the configured
# employer, and return its English alternative labels.
# employerQId comes from the common code cell, so the query follows that setting.
query = '''select distinct  ?person ?altLabel where {
  ?person p:P108 ?statement.
  ?statement ps:P108  wd:''' + employerQId + '''.
  ?person skos:altLabel ?altLabel.
  FILTER(lang(?altLabel)="en")
}'''

# The endpoint defaults to returning XML, so the Accept: header is required.
# The shared header helper is used so the request also carries the identifying
# User-Agent header rather than the requests library default.
r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=generate_header_dictionary_non_sparql('application/json'))
r.raise_for_status()  # report HTTP errors directly instead of failing while parsing JSON

data = r.json()

# First row holds the column headers; one row follows per (person, altLabel) binding
table = [['wikidataIri', 'altLabel']]
items = data['results']['bindings']
for item in items:
    wikidata_iri = item['person']['value']
    # altLabel is a required variable in the query; the empty default is only a safeguard
    alt_label = ''
    if 'altLabel' in item:
        alt_label = item['altLabel']['value']
    table.append([wikidata_iri, alt_label])

filename = 'vanderbilt_wikidata_altlabels.csv'
write_lists_to_csv(filename, table)
print('done')