# Common Code

This code block includes import statements, function definitions, and declarations of variables that are common to the rest of the script. It needs to be run once before the other code blocks.

**Note: some code in this block is found in the stand-alone file vb_common_code.py**

In [21]:
import requests   # best library to manage HTTP transactions
from bs4 import BeautifulSoup # web-scraping library
import json
from time import sleep
import csv
import math
from fuzzywuzzy import fuzz # fuzzy logic matching
from fuzzywuzzy import process
import xml.etree.ElementTree as et # library to traverse XML tree
import urllib
import datetime
import string

# Employer-specific settings used when matching people against Wikidata.
# NOTE(review): this spot previously described a per-department short name used to
# name processing files; no such variable is defined in this section -- confirm
# whether it is still needed elsewhere.
testEmployer = 'Vanderbilt University' # to test against Wikidata employer property
employerQId = 'Q29052' # Vanderbilt University
deathDateLimit = '2000' # any death dates before this date will be assumed to not be a match
birthDateLimit = '1920' # any birth dates before this date will be assumed to not be a match
wikibase_instance_namespace = 'http://www.wikidata.org/entity/' # prepended to a Q ID to form the full entity IRI

# -------------------------------------------------------
# HTTP settings shared by the function definitions below
# -------------------------------------------------------

wikidata_endpoint_url = 'https://query.wikidata.org/sparql' # SPARQL endpoint that receives the queries
accept_media_type = 'application/json' # value sent in the Accept request header
user_agent_header = 'VanderBot/1.7 (https://github.com/HeardLibrary/linked-data/tree/master/vanderbot; mailto:steve.baskauf@vanderbilt.edu)'

# NCBI identification requirements:
# tool name and email address should be sent with all requests
# see https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
emailAddress = 'steve.baskauf@vanderbilt.edu' # put your email address here
toolName = 'VanderBot' # give your application a name here

# Generate the current UTC xsd:date
def generate_utc_date():
    """Return today's date in UTC as an xsd:date string (form: 2019-12-05).

    A timezone-aware "now" is used because datetime.datetime.utcnow() is
    deprecated as of Python 3.12.
    """
    return datetime.datetime.now(datetime.timezone.utc).date().isoformat()

# UTC date stamp (xsd:date form) computed once, when this cell is run
today = generate_utc_date()

# Builds the request header for plain (non-SPARQL) HTTP GET requests
def generate_header_dictionary_non_sparql(acceptMediaType):
    """Return a header dictionary to pass as the headers argument of a requests GET call.

    acceptMediaType: media type string placed in the Accept header.
    The User-Agent value is fixed inside this function.
    """
    return {
        'Accept': acceptMediaType,
        'User-Agent': 'VanderBot/1.3 (https://github.com/HeardLibrary/linked-data/tree/master/vanderbot; mailto:steve.baskauf@vanderbilt.edu)',
    }

# Write a list of lists to a CSV file
def write_lists_to_csv(fileName, array):
    """Write each inner list of array as one row of the CSV file fileName.

    The file is created or overwritten and is encoded as UTF-8.
    """
    with open(fileName, 'w', newline='', encoding='utf-8') as csv_file:
        csv.writer(csv_file).writerows(array)

# Read a CSV file into a list of dictionaries
def read_dicts_from_csv(filename):
    """Return the rows of the UTF-8 CSV file filename as a list of dictionaries.

    The first row of the file supplies the dictionary keys.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))

# Write a list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write table (a list of dictionaries) to the UTF-8 CSV file filename.

    fieldnames gives the column names; they are written as the header row and
    determine the column order. The file is created or overwritten.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)

def lookup_department(researcher_qid, orcid_department, departments, department_labels):
    """Fuzzy-match an ORCID department name against a researcher's affiliation label.

    researcher_qid: Q ID of the researcher to look up.
    orcid_department: department name string to compare.
    departments: list of dictionaries with 'qid' (researcher) and 'affiliation' keys.
    department_labels: list of dictionaries with 'qid' (affiliation) and 'label_en' keys.

    Returns a string of the form '<score> <orcid_department>/<label>' when the
    fuzzy partial ratio is above 80, otherwise an empty string. An empty string
    is also returned when the researcher has no row in departments (previously
    that case fell off the end of the function and returned None).
    """
    for department in departments:
        if researcher_qid != department['qid']:
            continue
        result = ''
        for label in department_labels:
            if department['affiliation'] == label['qid']:
                partial_ratio = fuzz.partial_ratio(orcid_department, label['label_en'])
                if partial_ratio > 80:
                    result = str(partial_ratio) + ' ' + orcid_department + '/' + label['label_en']
        # NOTE: only the first departments row for the researcher is examined;
        # any later rows with the same qid are not checked.
        return result
    # The researcher does not appear in departments at all
    return ''

def find_surname_givens(name):
    """Split a personal name string into given name(s) and family name.

    Periods and commas are treated as spaces, non-alphabetic characters are
    stripped from both ends of every piece (so '(Kit)' and '"Kit"' become 'Kit'),
    and one occurrence of each generational suffix (Jr, Sr, II, III, IV, V) is
    dropped. The last remaining piece is taken as the family name and all
    earlier pieces, joined by single spaces, as the given names.

    Returns a dictionary {'given': ..., 'family': ...}, or False when fewer
    than two usable pieces are available.
    """
    # Get rid of periods and commas
    name = name.replace('.', ' ').replace(',', ' ')

    # split() without an argument discards the empty pieces formed from extra spaces
    pieces = name.split()

    # Must be at least a surname and something else
    if len(pieces) <= 1:
        return False

    # Trim non-alphabetic characters from the start and end of each piece.
    # Pieces with nothing left are dropped; a new list is built rather than
    # removing items from the list that is being iterated.
    cleaned_pieces = []
    for piece in pieces:
        start = 0
        end = len(piece)
        while start < end and not piece[start].isalpha():
            start += 1
        while end > start and not piece[end - 1].isalpha():
            end -= 1
        if start < end:
            cleaned_pieces.append(piece[start:end])
    pieces = cleaned_pieces

    # Get rid of ", Jr.", "III", etc.
    # NOTE: 'V' is removed wherever it occurs, so a middle initial "V" is also dropped.
    for suffix in ('Jr', 'Sr', 'II', 'III', 'IV', 'V'):
        if suffix in pieces:
            pieces.remove(suffix)

    # Not interested unless there are at least two pieces (covers zero pieces too)
    if len(pieces) < 2:
        return False

    # Everything but the last piece forms the given names
    return {'given': ' '.join(pieces[:-1]), 'family': pieces[-1]}

def generate_header_dictionary(accept_media_type,user_agent_header):
    """Return the request header dictionary for a URL-encoded SPARQL POST request.

    accept_media_type: media type string placed in the Accept header.
    user_agent_header: string placed in the User-Agent header.
    """
    headers = {}
    headers['Accept'] = accept_media_type
    # The query is sent as a URL-encoded form rather than as 'application/sparql-query'
    headers['Content-Type'] = 'application/x-www-form-urlencoded'
    headers['User-Agent'] = user_agent_header
    return headers

# Build the SPARQL request header once from the module-level Accept and User-Agent settings.
# send_sparql_query() below reads it as a global variable.
sparql_request_header = generate_header_dictionary(accept_media_type,user_agent_header)
# The query_string passed to send_sparql_query() must be a valid SPARQL query string

# Sends a query to the query service endpoint.
# NOTE: sparql_request_header and wikidata_endpoint_url are global variables defined earlier in the script
def send_sparql_query(query_string):
    """POST a SPARQL query to the endpoint and return the list of result bindings.

    query_string: a valid SPARQL query string; it is sent URL-encoded in the
        form field "query".

    Returns the value of results.bindings from the JSON response.
    Raises requests.exceptions.HTTPError when the endpoint answers with an
    error status, so that a failed query is reported as such instead of as a
    JSON decoding error on the error page.
    """
    # The print statements can be deleted if the queries are short. However, for large/long
    # queries, it's good to let the user know what's going on.
    print('querying SPARQL endpoint to acquire item metadata')
    response = requests.post(wikidata_endpoint_url, data=dict(query=query_string), headers=sparql_request_header) # use URL-encoded method
    #print(response.text) # uncomment to view the raw response, e.g. if you are getting an error
    response.raise_for_status()

    # Extract the values from the response JSON
    results = response.json()['results']['bindings']

    print('done retrieving data')
    return results


# Query ORCID for Vanderbilt University people

Script developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/orcid/orcid-get-json.ipynb

Retrieves results 100 at a time, then processes them by extracting desired information.  **NOTE: takes hours to run.**

Saves the employment results in `orcid_data.csv` and the alternative names in a second file, `orcid_other_names.csv`.

In [None]:
# Header rows of the two output tables; data rows are appended below
table = [['orcid', 'given_names', 'family_name', 'start_date', 'end_date', 'department', 'organization']]
other_name_list = [['orcid', 'altName']]


def _format_orcid_date(date_dict):
    """Return a YYYY, YYYY-MM or YYYY-MM-DD string built from an ORCID date object.

    An empty string is returned when the date object or its year is missing.
    A day is only added when a month is present.
    """
    if not date_dict or not date_dict['year']:
        return ''
    date_string = date_dict['year']['value']
    month = date_dict['month']
    if month:
        date_string += '-' + month['value']
        day = date_dict['day']
        if day:
            date_string += '-' + day['value']
    return date_string


# use the API to search for people associated with Vanderbilt University
search_base_uri = 'https://pub.orcid.org/v2.0/search/?q=affiliation-org-name:"Vanderbilt+University"'
accept_media_type = 'application/json'
# Send the same identifying headers (Accept and User-Agent) with every ORCID request
orcid_request_header = generate_header_dictionary_non_sparql(accept_media_type)

# First search is for only one record, just to get the number of hits found
search_uri = search_base_uri + '&start=1&rows=1'
response = requests.get(search_uri, headers=orcid_request_header)
data = response.json()
number_results = data["num-found"]
print(number_results)
number_pages = math.floor(number_results/100)
remainder = number_results - 100*number_pages

for page_count in range(0, number_pages + 1):  # the remainder will be caught when page_count = number_pages
    print('page: ', page_count)
    search_uri = search_base_uri + '&start=' + str(page_count*100+1)
    response = requests.get(search_uri, headers=orcid_request_header)
    print(response.url)
    data = response.json()
    # When the number of hits is an exact multiple of 100 the last page has no hits.
    # Presumably the API then reports a null result list -- TODO confirm; treat it as empty.
    orcids_dicts_list = data['result'] or []

    # extract the identifier strings from the data structure
    orcids = [
        {'id': orcid_dict['orcid-identifier']['path'], 'iri': orcid_dict['orcid-identifier']['uri']}
        for orcid_dict in orcids_dicts_list
    ]

    for orcid in orcids:
        response = requests.get(orcid['iri'], headers=orcid_request_header)
        data = response.json()
        orcid_id = data['orcid-identifier']['path']

        # Skip records lacking a name, given names, or a family name.
        # Missing surnames have been a big pain: they cause matches with
        # everyone who has the same first name!
        name = data['person']['name']
        if not name or not name['given-names'] or not name['family-name']:
            continue
        given_names = name['given-names']['value']
        family_name = name['family-name']['value']

        other_names = data['person']['other-names']['other-name']
        for other_name in other_names:
            other_name_list.append([orcid_id, other_name['content']])

        affiliations = data['activities-summary']['employments']['affiliation-group']
        for affiliation in affiliations:
            for summary in affiliation['summaries']:
                employment = summary['employment-summary']
                start_date = _format_orcid_date(employment['start-date'])
                end_date = _format_orcid_date(employment['end-date'])
                # if there is no value for department, set it to empty string
                department = employment['department-name'] or ''
                # Reset for every employment so that a record without an organization
                # cannot reuse the organization name of the previous record
                organization = ''
                if employment['organization']:
                    organization = employment['organization']['name'] or ''
                if 'Vanderbilt University' in organization:
                    print(orcid_id, given_names, family_name, start_date, end_date, department, organization)
                    table.append([orcid_id, given_names, family_name, start_date, end_date, department, organization])
        # pause between record requests
        sleep(.25)

print()
print('Done')
file_name = 'orcid_data.csv'
write_lists_to_csv(file_name, table)
file_name = 'orcid_other_names.csv'
write_lists_to_csv(file_name, other_name_list)


# Download Vanderbilt people's altLabels from Wikidata

Developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/wikidata/download-vanderbilt-people-altlabels.py

These values aren't used for anything currently (2020-03-19), so running this is optional. But it will be useful in the future when we want to start collecting aliases.

In [None]:
# English alternative labels of every item having an employer (P108) statement for Q29052
query = '''select distinct  ?person ?altLabel where {
  ?person p:P108 ?statement.
  ?statement ps:P108  wd:Q29052.
  ?person skos:altLabel ?altLabel.
  FILTER(lang(?altLabel)="en")
}'''

# Use the shared query function so that the request carries the same Accept and
# identifying User-Agent headers as the other queries to the endpoint.
# It returns the list found at results.bindings in the JSON response.
items = send_sparql_query(query)

table = [['wikidataIri', 'altLabel']]
for item in items:
    wikidata_iri = item['person']['value']
    alt_label = item['altLabel']['value'] if 'altLabel' in item else ''
    table.append([wikidata_iri, alt_label])

filename = 'vanderbilt_wikidata_altlabels.csv'
write_lists_to_csv(filename, table)
print('done')

# Check Wikidata people for new ORCIDs

Check the downloaded records for Wikidata against the ORCID download to see if any people in Wikidata whose employer is Vanderbilt have new ORCIDs.

In [29]:
# Load the Wikidata researcher table that will be updated
researcher_items = read_dicts_from_csv('wikidata/researchers.csv')
# get the field names from the existing file
fieldnames = researcher_items[0].keys()

# Load the alternative labels and the two ORCID download tables
researcher_altlabels = read_dicts_from_csv('vanderbilt_wikidata_altlabels.csv')
orcid_other = read_dicts_from_csv('orcid_other_names.csv')
orcid_people = read_dicts_from_csv('orcid_data.csv')

# Load the department tables passed to lookup_department()
departments = read_dicts_from_csv('wikidata/departments.csv')
filename = 'wikidata/department_labels.csv'
department_labels = read_dicts_from_csv(filename)

# Show the first record of the researcher and ORCID tables, separated by a blank line
print(researcher_items[0], orcid_people[0], sep='\n\n')


OrderedDict([('qid', 'Q100300828'), ('label_en', 'William C. Binkley'), ('description_en', 'American historian'), ('instance_of_uuid', '00540db8-46ca-177f-0bdf-2b39aa35a4c7'), ('instance_of', 'Q5'), ('gender_uuid', '442a23e6-4aaa-92ab-27b8-2a4461f951e4'), ('gender', 'Q6581097'), ('orcid_uuid', ''), ('orcid', ''), ('orcid_ref1_hash', ''), ('orcid_ref1_retrieved_nodeId', ''), ('orcid_ref1_retrieved_val', ''), ('orcid_ref1_retrieved_prec', ''), ('employer_uuid', '1efa2fed-45da-5ca3-9aff-b5a8ad5c6972'), ('employer', 'Q29052'), ('employer_start_time_nodeId', 'dbd0be1d7c1dc59596f8f5fa3fc01747'), ('employer_start_time_val', '1930-01-01T00:00:00Z'), ('employer_start_time_prec', '9'), ('employer_end_time_nodeId', '986ecd7803f8bd906d8deabf3fc68bd8'), ('employer_end_time_val', '1953-01-01T00:00:00Z'), ('employer_end_time_prec', '9'), ('employer_ref1_hash', 'eb45baba528a09bfbcbc6fc05adca6f3d929672f'), ('employer_ref1_referenceUrl', 'https://www.tshaonline.org/handbook/entries/binkley-william-campb

In [31]:

# NOTES for future work.
# Family name mismatches caused multiple errors, so an exact family name match is required below.
# Also, if this is run repeatedly, there needs to be a list of Q IDs and ORCIDs known to not be the same person so
# that they don't need to be checked again each time.


def _orcid_names(orcid_person):
    """Return (given names, family name) from one row of the ORCID table.

    The ORCID download cell writes the columns 'given_names' and 'family_name';
    the camelCase headers 'givenNames' and 'familyName' are accepted as well for
    files that were created with those column names.
    """
    given = orcid_person.get('given_names', orcid_person.get('givenNames', ''))
    family = orcid_person.get('family_name', orcid_person.get('familyName', ''))
    return given, family


# Progress is reported after every 100 researchers processed
for count, researcher_item in enumerate(researcher_items, start=1):
    if count % 100 == 0:
        print(count)

    # Only researchers without an ORCID are checked
    if researcher_item['orcid'] != '':
        continue

    researcher_label = researcher_item['label_en']
    researcher_names = find_surname_givens(researcher_label)
    # find_surname_givens() returns False when the label can't be split into given and family names
    if not researcher_names:
        continue

    for orcid_person in orcid_people:
        orcid_given, orcid_family = _orcid_names(orcid_person)
        if researcher_names['family'] != orcid_family: # require exact match to family name
            continue
        orcid_full_name = orcid_given + ' ' + orcid_family
        w_ratio = fuzz.WRatio(orcid_full_name, researcher_label)
        if w_ratio <= 90:
            continue

        print(w_ratio, researcher_label, wikibase_instance_namespace + researcher_item['qid'])
        dept_match = lookup_department(researcher_item['qid'], orcid_person['department'], departments, department_labels)
        print('   ', orcid_full_name, 'https://orcid.org/' + orcid_person['orcid'])
        print(orcid_person['department'])
        # A truth test also covers a None result from lookup_department()
        if dept_match:
            print('Dept. match:', dept_match)
        print()

        # Record the match and save the whole table immediately, so that matches
        # already found are on disk if the run is interrupted
        researcher_item['orcid'] = orcid_person['orcid']
        researcher_item['orcid_ref1_retrieved_val'] = today
        write_dicts_to_csv(researcher_items, 'wikidata/researchers.csv', fieldnames)
        break

print('done')

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
100 Tao Liang http://www.wikidata.org/entity/Q83422369
    Tao Liang https://orcid.org/0000-0001-6355-5546


1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
95 Zhen Wang http://www.wikidata.org/entity/Q90280723
    Zheng Wang https://orcid.org/0000-0001-5943-344X
Psychology

95 Sharon Elizabeth Davis http://www.wikidata.org/entity/Q90330601
    Elizabeth Davis https://orcid.org/0000-0001-7626-2435


95 Kevin B. Johnson http://www.wikidata.org/entity/Q90330651
    Kevin Johnson https://orcid.org/0000-0003-0479-4242
Biochemistry

3600
95 Zachary J. Jones http://www.wikidata.org/entity/Q90408955
    Zachary Jones https://orcid.org/0000-0002-2515-786X
Pathology, Microbiology, and Immunoogy

3700
3800
3900
96 Jian-Chun Chen http://www.wikidata.org/entity/Q91133915
    Jianchun Chen https://orcid.org/0000-0002-9438-1638
medicine
Dept. match: 88 medicine/Vanderbilt Department 

95 Daniel J. Benedetti http://www.wikidata.org/entity/Q91485869
    Daniel Benedetti https://orcid.org/0000-0003-1524-3518
Pediatrics
Dept. match: 100 Pediatrics/Vanderbilt Department of Pediatrics

95 Deborah Price Jones http://www.wikidata.org/entity/Q91485917
    Deborah Jones https://orcid.org/0000-0002-9321-6286
Pediatrics
Dept. match: 100 Pediatrics/Vanderbilt Department of Pediatrics

95 Debra L. Friedman http://www.wikidata.org/entity/Q91485932
    Debra Friedman https://orcid.org/0000-0002-0589-4176
Pediatric Hematology / Oncology

4900
5000
95 Rosemary J. Hunter http://www.wikidata.org/entity/Q91486709
    Rosemary Hunter https://orcid.org/0000-0003-3602-2335
Pediatrics
Dept. match: 100 Pediatrics/Vanderbilt Department of Pediatrics

100 Travis Crook http://www.wikidata.org/entity/Q91486867
    Travis Crook https://orcid.org/0000-0002-5363-9428
Pediatrics
Dept. match: 100 Pediatrics/Vanderbilt Department of Pediatrics

95 Fang Yan http://www.wikidata.org/entity/Q91486962
    

# Check new orcid download for people not associated with VU

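This part of the script looks for people whose ORCID record says they worked at Vanderbilt and whose ORCID iD (P496) is already on a Wikidata item, but whose item has no employer (P108) statement for Vanderbilt University (Q29052) or for Q7914455. Only the first 500 rows of the ORCID download are checked.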

In [20]:
print(len(orcid_people))

# One quoted ORCID per line for the VALUES clause; only the first 500 rows are used
orcid_list = ''.join('"' + person['orcid'] + '"\n' for person in orcid_people[:500])

# Opening part of the query, up to the start of the VALUES list
query_head = '''select distinct ?person ?name ?orcid where {
  values ?orcid
  {
'''

# Remainder of the query: find the item having each ORCID and its English label,
# leaving out items with an employer (P108) value of Q29052 or Q7914455
query_tail = '''  }
  ?person wdt:P496 ?orcid.
  ?person rdfs:label ?name.
  filter(lang(?name) = "en")
  minus {
    {?person wdt:P108 wd:Q29052.}
    union
    {?person wdt:P108 wd:Q7914455.}
  }
  }'''

query_string = query_head + orcid_list + query_tail

data = send_sparql_query(query_string)
print(json.dumps(data, indent = 2))

3130
querying SPARQL endpoint to acquire item metadata
done retrieving data
[
  {
    "orcid": {
      "type": "literal",
      "value": "0000-0001-5815-1176"
    },
    "person": {
      "type": "uri",
      "value": "http://www.wikidata.org/entity/Q90330691"
    },
    "name": {
      "xml:lang": "en",
      "type": "literal",
      "value": "Brian O Bachmann"
    }
  },
  {
    "orcid": {
      "type": "literal",
      "value": "0000-0002-6669-4514"
    },
    "person": {
      "type": "uri",
      "value": "http://www.wikidata.org/entity/Q59749182"
    },
    "name": {
      "xml:lang": "en",
      "type": "literal",
      "value": "Kate Sborov"
    }
  },
  {
    "orcid": {
      "type": "literal",
      "value": "0000-0002-5723-5834"
    },
    "person": {
      "type": "uri",
      "value": "http://www.wikidata.org/entity/Q88284427"
    },
    "name": {
      "xml:lang": "en",
      "type": "literal",
      "value": "Jacob I Feldman"
    }
  },
  {
    "orcid": {
      "type": "