# Configuration

In [2]:
import pandas as pd
import csv
import json
import datetime
import requests
from time import sleep
from fuzzywuzzy import fuzz # fuzzy logic matching
from langdetect import detect_langs
import re
import xml.etree.ElementTree as et # library to traverse XML tree

# Delay passed to sleep() after each SPARQL query, to avoid hitting the Query Service too fast
sparql_sleep = 0.1
# Language tag used in the label/description language filters of the SPARQL queries
default_language = 'en'
# Value sent as the Accept header of SPARQL requests
accept_media_type = 'application/json'
# URL that SPARQL queries are POSTed to
endpoint = 'https://query.wikidata.org/sparql'
# Value sent as the User-Agent header of SPARQL requests
user_agent_header = 'write_edt/0.1 (https://github.com/HeardLibrary/linked-data/; mailto:steve.baskauf@vanderbilt.edu)'

# PubMed required info
email_address = 'steve.baskauf@vanderbilt.edu' # put your email address here
tool_name = 'retrieve_doi_data0.1' # give your application a name here, no spaces


# ----------------
# Define functions
# ----------------

def generate_sparql_header_dictionary(accept_media_type,user_agent_header):
    """Build the HTTP request headers used when querying a SPARQL endpoint.

    Returns a dictionary with Accept, Content-Type, and User-Agent entries. The
    Content-Type is always the URL-encoded form type (not 'application/sparql-query'),
    so the query has to be sent as a form field.
    """
    headers = {}
    headers['Accept'] = accept_media_type
    headers['Content-Type'] = 'application/x-www-form-urlencoded'
    headers['User-Agent'] = user_agent_header
    return headers

# Build the request header once at module level; send_sparql_query() reads it as a global variable
sparql_request_header = generate_sparql_header_dictionary(accept_media_type, user_agent_header)

# Read from a CSV file into a list of dictionaries
def read_dicts_from_csv(filename):
    """Load a UTF-8 CSV file and return its rows as a list of dictionaries keyed by the header row."""
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        rows = list(csv.DictReader(csv_file))
    return rows

# Write list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write a list of dictionaries to a UTF-8 CSV file.

    fieldnames gives the column order and is written as the header row; each
    dictionary in table becomes one data row.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)

def detect_language(string):
    """Guess the language of a string. Returns a tuple of language code and confidence.

    The most probable guess from detect_langs() is used. When no guess can be
    made (e.g. the string is empty or contains only numbers), the code 'zxx'
    and a confidence of 0.0 are returned.
    """
    try:
        best_guess = detect_langs(string)[0]
        # Read the attributes of the result rather than slicing its string form,
        # which only works for two-letter codes (it fails on codes like "zh-cn").
        lang = best_guess.lang
        confidence = float(best_guess.prob)
    except Exception: # exceptions occur when no info to decide, e.g. numbers
        lang = 'zxx'
        confidence = float(0)
    return lang, confidence

# Use language detection on the title of the thesis
def determine_language_from_title(language_list, title):
    """Append the Wikidata Q ID for the language detected in the title to language_list.

    English (Q1860) is only recorded when the detection confidence is above 0.99;
    Spanish (Q1321) and French (Q150) are recorded at any confidence. For every
    other outcome an empty string is appended. The same list object that was
    passed in is returned.
    """
    lang, confidence = detect_language(title)
    if lang == 'en':
        # English is only accepted when the detector is nearly certain
        language_qid = 'Q1860' if confidence > 0.99 else ''
    else:
        language_qid = {'es': 'Q1321', 'fr': 'Q150'}.get(lang, '')
    language_list.append(language_qid)

    return language_list


# Rule-based function for deciding whether to combine columns
def merge_column_values(header_list, etd_frame):
    """Uses some rules to combine all columns in a single column. Returns a list.

    header_list[0] names the basic column; the remaining headers name the extra
    columns that are folded into it. For each row of etd_frame:
    - if the basic value is blank, the first non-blank extra value replaces it;
    - for the 'dc.identifier.uri' column, an extra value replaces the basic one
      only when it contains a Handle URL;
    - otherwise an extra value that differs from the basic value is appended
      after a ' *merged* ' marker so that the conflict can be reviewed.
    Extra values are stripped of surrounding whitespace, and blank extra values
    (including whitespace-only ones) are ignored. The data frame is not modified.
    """
    basic_header = header_list[0]
    extra_headers = header_list[1:]
    column_list = []
    for identifier, data in etd_frame.iterrows():
        # Work on a local value instead of writing into the row being iterated
        merged_value = data[basic_header]
        for extra_header in extra_headers:
            extra_value = data[extra_header].strip()
            if extra_value == '':
                continue
            # The basic column doesn't have a value, use the other column
            if merged_value.strip() == '':
                merged_value = extra_value
            # Special handling for Handle
            elif basic_header == 'dc.identifier.uri':
                # If the other column has the Handle in it, use it, otherwise leave the first value
                if 'http://hdl.handle.net/' in extra_value:
                    merged_value = extra_value
            # Basic column value is different from the other column
            elif merged_value.strip() != extra_value:
                merged_value += ' *merged* ' + extra_value
        column_list.append(merged_value)
    return column_list

def extract_author_values(dataframe):
    """Combine the dc.contributor.author and dc.creator columns into one author per row.

    Returns a list with one item per row; each item is whatever
    invert_author_names() returns for the chosen name string. When the two
    columns hold different non-empty values, both are joined with ' *merged* '
    so that the conflict stays visible.
    """
    author_list = []
    for identifier, data in dataframe.iterrows():
        contributor = data['dc.contributor.author']
        creator = data['dc.creator']

        if contributor == '':
            # No dc.contributor.author value: fall back to dc.creator (which may also be empty)
            last_first_string = creator
        elif creator == '':
            # Only dc.contributor.author has a value
            last_first_string = contributor
        elif contributor == creator:
            # Both columns agree, so either one will do
            last_first_string = creator
        else:
            # The columns disagree: that's an error, so concatenate both
            last_first_string = contributor + ' *merged* ' + creator

        author_list.append(invert_author_names(last_first_string))

    return author_list

def _split_trailing_suffix(text, suffix_list):
    """Split a trailing name suffix off a name piece. Returns a tuple of remaining text and suffix value.

    The suffix only counts when it is a separate token, i.e. preceded by
    whitespace, so that all-caps names such as "RAVI" or "LEVI" are not mistaken
    for a name followed by "VI". The suffix value is '' when none is found.
    """
    for suffix in suffix_list:
        if text.endswith(suffix['string']):
            remainder = text[:-len(suffix['string'])]
            if remainder != '' and remainder[-1].isspace():
                return remainder.strip(), suffix['value']
    return text, ''

def invert_author_names(name):
    """Input inverted author names, handle suffixes, and invert. Returns a tuple of name and suffix.

    The name is split on commas:
    - more than three pieces can't be interpreted; they are joined with spaces
      and flagged with ' *error*';
    - with three pieces, a piece that is exactly a suffix is removed and the
      remaining two are inverted (if no piece is a suffix the name is left as is);
    - with two pieces, a trailing suffix is removed from either piece and the
      pieces are inverted;
    - a single piece only has a trailing suffix removed.
    The resulting name is passed through fix_all_caps(). The suffix is '' when
    none was found.
    """
    # Look for suffixes. Note: must check longer versions before shorter (i.e. III before II, Jr. before Jr)
    suffix_list = [
        {'string': 'Jr.', 'value': 'Jr.'},
        {'string': 'Jr', 'value': 'Jr.'},
        {'string': 'JR.', 'value': 'Jr.'},
        {'string': 'JR', 'value': 'Jr.'},
        {'string': 'III', 'value': 'III'},
        {'string': 'II', 'value': 'II'},
        {'string': 'IV', 'value': 'IV'},
        {'string': 'VI', 'value': 'VI'},
        {'string': 'V', 'value': 'V'}
    ]

    # Remove whitespace from all pieces of the name
    pieces = [piece.strip() for piece in name.split(',')]
    name_suffix = ''

    # Don't know how to interpret, put pieces back together and flag as error.
    if len(pieces) > 3:
        name = ' '.join(pieces) + ' *error*'

    # If three pieces search for a piece that is a suffix
    elif len(pieces) == 3:
        for piece_number, piece in enumerate(pieces):
            matches = [suffix['value'] for suffix in suffix_list if suffix['string'] == piece]
            if matches:
                name_suffix = matches[0]
                del pieces[piece_number] # Remove the suffix piece from the list
                name = pieces[1] + ' ' + pieces[0] # Invert and join the remaining two pieces
                break # Stop here: the list was changed and the suffix has been handled

    # If two pieces, look for trailing suffixes
    elif len(pieces) == 2:
        for piece_number in range(2):
            pieces[piece_number], found_suffix = _split_trailing_suffix(pieces[piece_number], suffix_list)
            if found_suffix != '':
                name_suffix = found_suffix

        name = pieces[1] + ' ' + pieces[0]

    # One piece, just look for suffix.
    else:
        pieces[0], name_suffix = _split_trailing_suffix(pieces[0], suffix_list)
        name = pieces[0]

    return ' '.join(fix_all_caps(name.split(' '))), name_suffix

def extract_advisor_values(dataframe):
    """Uses some rules to combine advisor names from two columns. Returns a list of lists of names with proper order.

    For every row, the dc.contributor.advisor and dc.contributor.committeeChair
    columns are combined: if only one has a value, or both have the same value,
    that value is used. Multiple advisors separated by '||' are split, stripped
    of honorifics, and inverted with invert_author_names(), so each row yields a
    list of (name, suffix) tuples.

    If the two columns hold different non-empty values, the row can't be
    interpreted and an empty list is recorded for it.
    """
    advisor_list = []
    for identifier, data in dataframe.iterrows():
        advisor = data['dc.contributor.advisor']
        chair = data['dc.contributor.committeeChair']

        # Start every row with its own empty list so that a skipped row can
        # never reuse (or depend on) the list built for a previous row.
        coadvisors = []

        # Both columns have values and they are different: that's an error
        conflict = advisor != '' and chair != '' and advisor != chair
        if not conflict:
            # Use the advisor column if it has a value, otherwise the committee chair (possibly empty)
            last_first_string = advisor if advisor != '' else chair

            # Split the multiple advisors if more than one
            for single_advisor in last_first_string.split('||'):
                # Strip first so that honorifics next to the '||' separator are recognized
                coadvisors.append(invert_author_names(remove_honorifics(single_advisor.strip())))

        advisor_list.append(coadvisors)

    return advisor_list

def remove_honorifics(name):
    """Removes prefixes like Dr. and Prof. and suffixes like Ph.D. and M.D. Returns a name string.

    An honorific is only removed when it is a separate token: a suffix must be
    preceded by whitespace or a comma, and a prefix that does not end in a
    period must be followed by whitespace. This keeps names such as "Drake" or
    "Profitt" intact. A comma left dangling after a removed suffix is also removed.
    """
    # List Ph.D. first since it usually comes after M.D.
    suffix_list = ['Ph.D.', 'PhD', 'M.D.', 'MD']

    # List longer or punctuated forms before their shorter versions
    prefix_list = ['Dr.', 'Dr', 'Professor', 'Prof.', 'Prof']

    # Remove suffix honorifics
    for suffix in suffix_list:
        if len(name) > len(suffix) and name.endswith(suffix):
            remainder = name[:-len(suffix)]
            if remainder[-1].isspace() or remainder[-1] == ',':
                name = remainder.strip()
                # remove any trailing commas
                if name.endswith(','):
                    name = name[:-1].strip()

    # Remove prefix honorifics
    for prefix in prefix_list:
        if len(name) > len(prefix) and name.startswith(prefix):
            remainder = name[len(prefix):]
            # The period already ends the token for forms like "Dr."; bare forms need a space after them
            if prefix.endswith('.') or remainder[0].isspace():
                name = remainder.strip()

    return name
        
# Function copied from https://github.com/HeardLibrary/linked-data/blob/master/publications/crossref/retrieve_doi_data.ipynb
def title_if_no_lowercase(string):
    """Return the string in titlecase, unless it already contains a lowercase letter a-z.

    Strings that contain at least one of the letters a-z in lowercase are
    returned unchanged.
    """
    has_lowercase = any(character in 'abcdefghijklmnopqrstuvwxyz' for character in string)
    return string if has_lowercase else string.title()

# Function copied from https://github.com/HeardLibrary/linked-data/blob/master/publications/crossref/retrieve_doi_data.ipynb
def fix_all_caps(name_pieces):
    """Fix the capitalization of name pieces written in all caps. Returns a list of name strings.

    Input is a list of name strings from name split by spaces. For each piece:
    - an apostrophe-based prefix at the start (e.g. O', d', 't) is set aside and
      put back unchanged afterwards;
    - particles such as von, de, van, der are lowercased;
    - everything else, including each part of a hyphenated name, is passed
      through title_if_no_lowercase().
    """
    apostrophe_list = ("van't", "'t", "O'", "D'", "d'", "N'")
    lower_case_list = ('von', 'de', 'van', 'der')

    clean_pieces = []
    for piece in name_pieces:
        # Special handling for names starting with apostrophe-based prefixes.
        # Only the start of the piece is checked, so an apostrophe in the middle
        # of a name is left alone.
        apostrophe_prefix = ''
        for possible_apostrophe_prefix in apostrophe_list:
            if piece.startswith(possible_apostrophe_prefix):
                apostrophe_prefix = possible_apostrophe_prefix
                piece = piece[len(possible_apostrophe_prefix):]
                break

        # Special handling for name parts that are lowercase
        if piece.lower() in lower_case_list:
            piece = piece.lower()
        else:
            # Fix every part of a hyphenated name separately; this keeps all the
            # parts regardless of how many hyphens there are
            piece = '-'.join(title_if_no_lowercase(part) for part in piece.split('-'))

        # put any apostrophe prefix back on the front
        clean_pieces.append(apostrophe_prefix + piece)
    return clean_pieces
# ---------------
# The following functions were modified from https://github.com/HeardLibrary/linked-data/blob/master/publications/crossref/retrieve_doi_data.ipynb
# ---------------

def find_surname_givens(name):
    """Split a name written in given-names-first order into given names and family name.

    Periods and commas are treated as spaces, a single leading non-alphabetic
    character is dropped from each piece (typical cases are like (Kit) or "Kit"),
    and the suffixes Jr, Sr, II, III, IV, and V are removed (at most one
    occurrence of each). The last remaining piece is the family name and the
    rest are the given names.

    Returns a dictionary with 'given' and 'family' keys, or False when fewer
    than two name pieces are available.
    """
    # Get rid of periods and commas
    name = name.replace('.', ' ').replace(',', ' ')

    # Split name, getting rid of empty pieces formed from extra spaces
    pieces = [piece for piece in name.split(' ') if piece != '']

    # Must be at least a surname and something else
    if len(pieces) <= 1:
        return False

    # Make sure first character is alphabetic
    # only fixes the case where there is one non-alphabetic character, but more than one is rare
    cleaned_pieces = []
    for piece in pieces:
        if not piece[0].isalpha():
            piece = piece[1:] # remove the first non-alphabetic character
        if piece != '': # if there's nothing left, get rid of the piece
            cleaned_pieces.append(piece)
    pieces = cleaned_pieces

    # Get rid of ", Jr.", "III", etc.
    for suffix in ('Jr', 'Sr', 'II', 'III', 'IV', 'V'):
        if suffix in pieces:
            pieces.remove(suffix)

    # Not interested unless there are at least two pieces
    if len(pieces) < 2:
        return False

    # Put all but last piece together again
    return {'given': ' '.join(pieces[:-1]), 'family': pieces[-1]}


def disambiguate_authors(authors):
    """Try to determine the Wikidata Q ID of each author by a series of increasingly expensive screens.

    authors is a list of dictionaries; the keys read here are 'givenName',
    'familyName', 'orcid', and 'affiliation'. The 'givenName' and 'familyName'
    values are rewritten in place after capitalization is fixed.

    Screens are tried in this order, stopping at the first that succeeds:
    1. exact match to a label in researchers.csv
    2. exact match to an alternate label in vanderbilt_wikidata_altlabels.csv
    3. lookup of the author's ORCID at Wikidata
    4. fuzzy match (ratio > 90, identical family name) to the researcher labels
    5. fuzzy match (ratio > 90, identical family name) to the alternate labels
    6. name search at Wikidata, followed by checks of employer and of author
       affiliation/ORCID in PubMed articles

    Returns a tuple of two lists: the matched authors (dictionaries with 'qid',
    'stated_as', and 'series_ordinal') and the unmatched authors (dictionaries
    with 'name_string', 'series_ordinal', and 'possible_matches').

    Progress and results are printed as the authors are processed.
    """
    # NOTE(review): file_path is not defined in this function; presumably it is a global
    # set elsewhere in the script -- confirm.
    filename = 'researchers.csv'
    researchers = read_dicts_from_csv(file_path + filename)

    filename = 'vanderbilt_wikidata_altlabels.csv'
    altnames = read_dicts_from_csv(file_path + filename)

    filename = 'departments.csv'
    departments = read_dicts_from_csv(file_path + filename)

    filename = 'department_labels.csv'
    department_labels = read_dicts_from_csv(file_path + filename)

    # Upper limit on the number of articles with PubMed IDs that are checked per Wikidata hit
    max_pmids_to_check = 10

    # screens.json is a configuration file that defines the kinds of screens to be performed on potential Q ID matches from Wikidata
    screens = load_json_into_data_struct('screens.json')

    # Perform screening operations on authors to try to determine their Q IDs
    found_qid_values = []
    not_found_author_list = []
    author_count = 1
    for author in authors:
        print(author_count)
        found = False

        # First eliminate the case where all of the name pieces are empty
        # NOTE(review): break ends the processing of ALL remaining authors, not only this
        # one -- confirm that this is intended (rather than skipping just the empty name).
        if (author['givenName'] + ' ' + author['familyName']).strip() == '':
            break

        # Record stated_as
        stated_as = (author['givenName'] + ' ' + author['familyName']).strip()

        # Fix case where names are stupidly in all caps
        name_pieces = author['givenName'].strip().split(' ')
        author['givenName'] = ' '.join(fix_all_caps(name_pieces))
        name_pieces = author['familyName'].strip().split(' ')
        author['familyName'] = ' '.join(fix_all_caps(name_pieces))

        # Screen for exact match to Wikidata labels
        for researcher in researchers:
            if researcher['label_en'] == author['givenName'] + ' ' + author['familyName']:
                found = True
                result_string = 'researcher exact label match: ' + researcher['qid'] + ' ' + researcher['label_en']
                name = researcher['label_en']
                qid = researcher['qid']
                break
        if not found:
            # screen for exact match to alternate names
            for altname in altnames:
                if altname['altLabel'] == author['givenName'] + ' ' + author['familyName']:
                    found = True
                    result_string = 'researcher altname match: ' + altname['qid'] + ' ' + altname['altLabel']
                    name = altname['altLabel']
                    qid = altname['qid']
                    break
            if not found:
                # If the researcher has an ORCID, see if it's at Wikidata
                if author['orcid'] != '':
                    hit = searchWikidataForQIdByOrcid(author['orcid'])
                    if hit != {}:
                        found = True
                        result_string = 'Wikidata ORCID search: ' + hit['qid'] + ' ' + hit['label'] + ' / ' + hit['description']
                        name = hit['label']
                        qid = hit['qid']

                if not found:
                    # screen for fuzzy match to Wikidata-derived labels
                    for researcher in researchers:
                        # Require the surname to match the label surname exactly
                        split_names = find_surname_givens(researcher['label_en']) # returns False if no family name
                        if split_names: # skip names that don't have 2 parts !!! also misses non-English labels!
                            if split_names['family'] == author['familyName']: # require exact match to family name
                                w_ratio = fuzz.ratio(author['givenName'] + ' ' + author['familyName'], researcher['label_en'])
                                #w_ratio = fuzz.WRatio(author['givenName'] + ' ' + author['familyName'], researcher['label_en'])
                                if w_ratio > 90:
                                    found = True
                                    result_string = 'fuzzy label match: ' + str(w_ratio) + ' ' + researcher['qid'] + ' ' + researcher['label_en'] + ' / ' + author['givenName'] + ' ' + author['familyName']
                                    name = researcher['label_en']
                                    qid = researcher['qid']
                                    break
                    if not found:
                        # screen for fuzzy match to alternate names
                        for altname in altnames:
                            split_names = find_surname_givens(altname['altLabel'])
                            if split_names: # skip names that don't have 2 parts
                                if split_names['family'] == author['familyName']: # require exact match to family name
                                    w_ratio = fuzz.ratio(author['givenName'] + ' ' + author['familyName'], altname['altLabel'])
                                    #w_ratio = fuzz.WRatio(author['givenName'] + ' ' + author['familyName'], altname['altLabel'])
                                    if w_ratio > 90:
                                        found = True
                                        result_string = 'researcher altname fuzzy match: ' + str(w_ratio) + ' ' + altname['qid'] + ' ' + altname['altLabel'] + ' / ' + author['givenName'] + ' ' + author['familyName']
                                        name = altname['altLabel']
                                        qid = altname['qid']
                                        break
                        if not found:
                            # Last resort: search Wikidata by name and screen the hits
                            name = author['givenName'] + ' ' + author['familyName']
                            print('Searching Wikidata for', name)
                            print('researcher known affiliations: ', author['affiliation'])
                            print()
                            hits = search_name_at_wikidata(name)
                            #print(hits)

                            qids = []
                            for hit in hits:
                                qids.append(hit['qid'])
                            return_list = screen_qids(qids, screens)
                            #print(return_list)

                            # Save discovered data to return if not matched
                            discovered_data = []
                            for hit in return_list:
                                # No copy is made: hit_data is the same object as hit, so the
                                # values added to hit_data below are also visible through hit
                                hit_data = hit
                                split_names = find_surname_givens(hit['label'])

                                # Require the surname to match the Wikidata label surname exactly
                                # This prevents a high fraction of fuzzy matches where the last names are similar but not the same
                                if split_names: # skip names that don't have 2 parts
                                    if split_names['family'] == author['familyName']: # require exact match to family name
                                        #print(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                        #print(hit)
                                        w_ratio = fuzz.WRatio(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                        #print('w_ratio:', w_ratio)
                                        #ratio = fuzz.ratio(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                        #print('ratio:', ratio)
                                        #partial_ratio = fuzz.partial_ratio(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                        #print('partial_ratio:', partial_ratio)
                                        #token_sort_ratio = fuzz.token_sort_ratio(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                        #print('token_sort_ratio:', token_sort_ratio)
                                        #token_set_ratio = fuzz.token_set_ratio(author['givenName'] + ' ' + author['familyName'], hit['label'])
                                        #print('token_set_ratio:', token_set_ratio)

                                        # This screen requires a high degree of similarity between the overall ORCID names and Wikidata labels
                                        if w_ratio > 80:
                                            print('Wikidata search fuzzy match:', w_ratio, author['givenName'] + ' ' + author['familyName'], ' / ', 'https://www.wikidata.org/wiki/'+ hit['qid'], hit['label'])
                                            print('Wikidata description: ', hit['description'])

                                            # Here we need to check Wikidata employer and affiliation and fuzzy match against known affiliations
                                            occupations, employers, affiliations = search_wikidata_occ_emp_aff(hit['qid'])
                                            print('occupations:', occupations)
                                            hit_data['occupations'] = occupations
                                            print('employers:', employers)
                                            hit_data['employers'] = employers
                                            print('affiliations', affiliations)
                                            hit_data['affiliations'] = affiliations
                                            print()

                                            # Perform a check of the employer to make sure we didn't miss somebody in the earlier
                                            # string matching
                                            for employer in employers:
                                                if 'Vanderbilt University' in employer: # catch university and med center
                                                    found = True
                                                    result_string = 'Match Vanderbilt employer in Wikidata: ' + hit['qid'] + ' ' + author['givenName'] + ' ' + author['familyName']
                                                    qid = hit['qid']

                                            # If the author doesn't have any known affiliations, there is no point in checking PubMed
                                            # (the PubMed check below runs even when the employer check above already matched)
                                            if author['affiliation'] != []:
                                                # Search Wikidata for articles written by this match
                                                articles_in_wikidata = search_wikidata_article(hit['qid'])
                                                #print(articles_in_wikidata)

                                                # Step through articles with PubMed IDs found in Wikidata and see if the author affiliation or ORCID matches any of the articles
                                                check = 0
                                                for article_in_wikidata in articles_in_wikidata:
                                                    if article_in_wikidata['pmid'] != '':
                                                        check += 1
                                                        if check > max_pmids_to_check:
                                                            print('More articles, but stopping after checking', max_pmids_to_check)
                                                            break # break out of article-checking loop
                                                        print('Checking article, PMID:', article_in_wikidata['pmid'], article_in_wikidata['title'])
                                                        pubmed_match = identified_in_pubmed(article_in_wikidata['pmid'], author['givenName'] + ' ' + author['familyName'], author['affiliation'], author['orcid'])
                                                        if not pubmed_match:
                                                            #print('no match')
                                                            print()
                                                        else:
                                                            found = True
                                                            result_string = 'PubMed affilation match: ' + hit['qid'] + ' ' + author['givenName'] + ' ' + author['familyName']
                                                            qid = hit['qid']
                                                            break # break out of article-checking loop

                                            if found:
                                                break # break out of hit list loop
                                            print()
                                            # If none of the matching criteria are met, save the data for future use
                                            discovered_data.append(hit_data)

        if not found:
            # discovered_data is always set here: found can only still be False
            # after the Wikidata name search branch above has run
            not_found_author_list.append({'name_string': author['givenName'] + ' ' + author['familyName'], 'series_ordinal': author_count, 'possible_matches': discovered_data})
            print('not found:', author['givenName'] + ' ' + author['familyName'])

        else:
            found_qid_values.append({'qid': qid, 'stated_as': stated_as, 'series_ordinal': author_count})
            print(result_string)
            # Print the label of the first department found for the matched researcher's affiliation
            for department in departments:
                if qid == department['qid']:
                    for department_label in department_labels:
                        if department_label['qid'] == department['affiliation']:
                            print(department_label['label_en'])
                            break
        print()
        author_count += 1

    print()
    return found_qid_values, not_found_author_list

def searchWikidataForQIdByOrcid(orcid):
    """Look up the Wikidata item that has the given ORCID (property P496).

    Returns a dictionary with 'qid', 'label', and 'description' keys taken from
    the first query result, or an empty dictionary when the query returns no
    results. The description is an empty string when the result has none.
    A warning is printed when more than one result comes back.
    """
    query_string = '''
select distinct ?qid ?label ?description where {
    ?qid wdt:P496 "''' + orcid + '''".
    ?qid rdfs:label ?label.
    FILTER(lang(?label)="'''+ default_language + '''")
    OPTIONAL {
    ?qid schema:description ?description.
    FILTER(lang(?description)="'''+ default_language + '''")
    }
      }
    '''

    matches = send_sparql_query(query_string)

    if len(matches) > 1:
        print('Warning!!! Multiple items with same ORCID!')
        print(matches)

    # Nothing found: signal that with an empty dictionary
    if len(matches) == 0:
        return {}

    # Only the first result is reported, even if there were several
    first_match = matches[0]
    out_dict = {
        'qid': extract_local_name(first_match['qid']['value']),
        'label': first_match['label']['value'],
    }
    # The description is optional in the query, so it may be missing from the result
    out_dict['description'] = first_match['description']['value'] if 'description' in first_match else ''
    return out_dict

# Generate the current UTC xsd:date
def generate_utc_date():
    """Return the current UTC date as an xsd:date string, form: 2019-12-05."""
    # datetime.utcnow() is deprecated (since Python 3.12); request a timezone-aware
    # UTC time instead. The date part is the same.
    return datetime.datetime.now(datetime.timezone.utc).date().isoformat()

# Extracts the local name part of an IRI, e.g. a qNumber from a Wikidata IRI
def extract_local_name(iri):
    """Return the part of an IRI after its last slash.

    Example: http://www.wikidata.org/entity/Q6386232 gives Q6386232. A string
    without any slash is returned unchanged.
    """
    return iri.split('/')[-1]

# Load JSON file data from local drive into a Python data structure
def load_json_into_data_struct(path):
    """Read a UTF-8 JSON file from the local drive and return the parsed Python data structure."""
    with open(path, 'rt', encoding='utf-8') as json_file:
        parsed = json.load(json_file)
    # to view the data while debugging: print(json.dumps(parsed, indent = 2))
    return parsed

# Sends a query to the query service endpoint. 
# NOTE: sparql_request_header, endpoint, and sparql_sleep are global variables defined earlier in the script
def send_sparql_query(query_string):
    """POST a SPARQL query to the endpoint and return the list of result bindings.

    The query is sent as the 'query' form field, using the module-level endpoint
    and sparql_request_header. The response body is parsed as JSON and the value
    of results/bindings is returned. After every query the function sleeps for
    sparql_sleep to avoid hitting the Query Service too fast.
    """
    form_fields = {'query': query_string}
    response = requests.post(endpoint, data=form_fields, headers=sparql_request_header)
    # print(response.text) # uncomment to view the raw response, e.g. if you are getting an error

    # Extract the values from the response JSON
    bindings = response.json()['results']['bindings']

    sleep(sparql_sleep) # delay to avoid hitting the Query Service too fast
    return bindings

def generate_name_alternatives(name):
    """Return a list of alternative written forms of a personal name.

    Commas and periods in name are treated as spaces and the name is split into
    pieces. A trailing "Jr", "II", "III", "IV", "V" or "the elder" is set aside as a
    suffix. The last remaining piece is always used as the surname. The variants
    generated are: the full name, the full name with suffix, and several forms in
    which the leading pieces are reduced to initials (with or without periods,
    spaced or concatenated).

    Parameters:
        name: the name string to vary.

    Returns:
        A list of unique strings. The list is empty when name contains no name
        pieces (empty, only punctuation/whitespace, or only a suffix). A name
        consisting of a single piece yields only that piece (plus the suffixed
        form, if there was a suffix), since initial-based forms need a surname
        and at least one other piece.
    """
    # treat commas as if they were spaces
    # get rid of periods, sometimes periods are close up with no spaces
    name = name.replace(',', ' ').replace('.', ' ')

    pieces = [piece for piece in name.split(' ') if piece != '']
    if not pieces:
        # nothing to work with; previously this raised IndexError
        return []

    # Remove ", Jr.", "III", etc. from end of name
    suffix_forms = {'Jr': ', Jr.', 'II': ' II', 'III': ' III', 'IV': ' IV', 'V': ' V'}
    suffix = ''
    if pieces[-1] in suffix_forms:
        suffix = suffix_forms[pieces[-1]]
        pieces = pieces[:-1]
    elif len(pieces) > 3 and pieces[-2] == 'the' and pieces[-1] == 'elder':
        pieces = pieces[:-2]
        suffix = ' the elder'
    if not pieces:
        # the name consisted of a suffix only
        return []

    # Fix situation where name is written in ALL CAPS
    # (fix_all_caps is defined elsewhere in this file; presumably it returns the
    # same pieces with normalized capitalization -- TODO confirm)
    pieces = fix_all_caps(pieces)

    # Generate one initial per piece, keeping the list aligned with pieces.
    # If the first character is not alphabetic, it is skipped; this only fixes the
    # case where there is one such character, e.g. (Kit) or "Kit".
    # A piece that yields no initial gets '' as a placeholder so that positions in
    # initials always correspond to positions in pieces.
    initials = []
    for piece in pieces:
        if not piece[0:1].isalpha():
            piece = piece[1:]
        initials.append(piece[0:1])

    def join_tokens(tokens):
        # Join the non-empty tokens with single spaces
        return ' '.join(token for token in tokens if token != '')

    def with_period(initial):
        # Add a period to an initial, leaving an empty placeholder empty
        return initial + '.' if initial != '' else ''

    # full name, and full name with suffix
    full_name = ' '.join(pieces)
    alternatives = [full_name]
    if suffix != '':
        alternatives.append(full_name + suffix)

    if len(pieces) > 1:
        first_name = pieces[0]
        surname = pieces[-1]
        given_initials = initials[:-1]       # initials of everything before the surname
        middle_initials = given_initials[1:]  # initials between first name and surname

        # first and last name with initials
        alternatives.append(join_tokens([first_name] + middle_initials + [surname]))
        # first and last name with initials and periods
        alternatives.append(join_tokens([first_name] + [with_period(i) for i in middle_initials] + [surname]))
        # first and last name only
        alternatives.append(join_tokens([first_name, surname]))
        # first initial and last name only
        alternatives.append(join_tokens([given_initials[0], surname]))
        # first initial with period and last name only
        alternatives.append(join_tokens([with_period(given_initials[0]), surname]))
        # all name initials with last name
        alternatives.append(join_tokens(given_initials + [surname]))
        # all name initials with periods with last name
        alternatives.append(join_tokens([with_period(i) for i in given_initials] + [surname]))
        # all name initials concatenated with last name
        alternatives.append(join_tokens([''.join(given_initials), surname]))

    # remove duplicates, keeping the order in which the variants were generated
    return list(dict.fromkeys(alternatives))

def search_name_at_wikidata(name):
    """Search Wikidata for items whose label or alias matches a variant of name.

    The variants come from generate_name_alternatives(). Each variant is tried as a
    language-tagged literal in every language listed below, and the query returns
    the English label of each matching item.

    Parameters:
        name: the name string to search for.

    Returns:
        A list of dictionaries with keys 'qid' (local name of the item IRI) and
        'name' (English label). If the response cannot be processed, the list
        holds a single dictionary {'error': <raw response text>} instead.
    """
    # carry out search for most languages that use Latin characters, plus some other commonly used languages
    # See https://doi.org/10.1145/3233391.3233965
    language_codes = [
        'en',
        'es',
        'pt',
        'fr',
        'it',
        'nl',
        'de',
        'da',
        'et',
        'hu',
        'ga',
        'ro',
        'sk',
        'sl',
        'zu',
        'tr',
        'sv',
        'zh',
        'ru',
        'ja',
        'ar',
        'pl',
        'uk',
        'ca',
        'cs',
        'la',
        'nb',
        'he',
        'eo',
        'fi',
        'ko'
      ]

    # get rid of quotes and backslashes, which will break the quoted literals in the query
    name_list = []
    for alternative in generate_name_alternatives(name):
        for bad_character in ('"', "'", '\\'):
            alternative = alternative.replace(bad_character, '')
        name_list.append(alternative)

    value_lines = []
    for language_code in language_codes:
        for alternative in name_list:
            value_lines.append('"' + alternative + '"@' + language_code + '\n')
    alternatives = ''.join(value_lines)

    query = '''
select distinct ?item ?label where {
  VALUES ?value
  {
  ''' + alternatives + '''}
?item rdfs:label|skos:altLabel ?value.
?item rdfs:label ?label.
FILTER(lang(?label)='en')
  }
'''
    #print(query)
    #print('searching for ', name)
    results = []
#    r = requests.post(endpoint, data=query.encode('utf-8'), headers=sparql_request_header)
    r = requests.post(endpoint, data=dict(query=query), headers=sparql_request_header)
    try:
        data = r.json()
        #print(data)
        statements = data['results']['bindings']
        for statement in statements:
            wikidata_iri = statement['item']['value']
            if 'label' in statement:
                label = statement['label']['value']
            else:
                label = ''
            qnumber = extract_local_name(wikidata_iri)
            results.append({'qid': qnumber, 'name': label})
    except Exception:
        # Best effort: report the raw response rather than failing.
        # (Exception, not a bare except, so Ctrl-C and SystemExit still propagate.)
        results = [{'error': r.text}]
    # delay to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)
    return results

def screen_qids(qids, screens):
    """Return the label and description of those qids that pass at least one screen.

    Parameters:
        qids: iterable of Q ID strings (without the "wd:" prefix).
        screens: list of screens. Each screen is a list of requirement dictionaries;
            all requirements of a screen must be satisfied (AND), and an item passes
            if it satisfies any one screen (OR, built with UNION).
            Each requirement is read through these keys:
              'entity'        '' for any value, a Q ID, or otherwise a string literal
              'lang'          language tag for a string literal ('' for none);
                              only read when 'entity' is a string literal
              'property'      'label', 'description', or a P ID
              'position'      'object' puts the value in the object position and ?qid
                              in the subject position; anything else reverses them
              'filter_type'   '<' or '>' (string comparison), or 'in' (substring test)
              'filter_string' the string used by the filter
              'require'       'exclude' wraps the pattern in MINUS

    Returns:
        A list of dictionaries with keys 'qid', 'label' and 'description'
        ('' when the item has no description in default_language).
    """
    qid_values = ''.join('wd:' + qid + '\n' for qid in qids)

    screen_patterns = []
    for screen in screens:
        # Each requirement in a screen has an AND relationship (all must be satisfied)
        subgraph_pattern = ''
        for requirement in screen:
            entity = requirement['entity']
            prop = requirement['property']

            # Set the value if required or use a dummy variable if any value is allowed
            if entity == '':
                value = '?var' + prop # add the property string to the variable to guarantee uniqueness
            elif re.fullmatch(r'Q\d+', entity):
                # The whole string must be a Q ID; a literal that merely starts
                # with one (e.g. "Q1 report") is handled as a string below.
                value = 'wd:' + entity
            elif requirement['lang'] == '': # if not nothing or a Q ID, assume it's a string literal
                value = '"' + entity + '"'
            else:
                value = '"' + entity + '"@' + requirement['lang']

            # Set the predicate (label, description, or P value)
            if prop == 'label':
                predicate = 'rdfs:label'
            elif prop == 'description':
                predicate = 'schema:description'
            else:
                predicate = 'wdt:' + prop

            # Place the value in either the subject or object position in the triple
            if requirement['position'] == 'object':
                triple_pattern = '?qid ' + predicate + ' ' + value + '.'
            else:
                triple_pattern = value + ' ' + predicate + ' ?qid.'

            # Add filters if needed
            filter_type = requirement['filter_type']
            if filter_type == '<' or filter_type == '>':
                # note: string comparison only e.g. for datetimes, needs modification for actual numbers
                triple_pattern += '\nFILTER (STR(?var' + prop + ') ' + filter_type + ' "' + requirement['filter_string'] + '")'

            if filter_type == 'in':
                # note: string comparison only
                triple_pattern += '\nFILTER (CONTAINS(?var' + prop + ', "' + requirement['filter_string'] + '"))'

            # Use MINUS if you want to exclude items that fit the pattern.
            if requirement['require'] == 'exclude':
                triple_pattern = 'minus {' + triple_pattern + '}'

            #print(triple_pattern)
            subgraph_pattern += triple_pattern + '\n'

        # Wrap each screen in braces so that several can be UNIONed
        screen_patterns.append('{\n' + subgraph_pattern + '}\n')

    # Attach the subgraph patterns to each other using UNION to create an OR relationship
    graph_pattern = 'UNION\n'.join(screen_patterns)

    query_string = '''
    select distinct ?qid ?label ?description where {
      VALUES ?qid
      {
      ''' + qid_values + '''}
    ''' + graph_pattern + '''
    
    ?qid rdfs:label ?label.
    FILTER(lang(?label)="'''+ default_language + '''")
    
    OPTIONAL {
    ?qid schema:description ?description.
    FILTER(lang(?description)="'''+ default_language + '''")
    }
      }
    '''
    #print(query_string)

    results = send_sparql_query(query_string)
    #print(json.dumps(results, indent=2))

    return_list = []
    for result in results:
        out_dict = {
            'qid': extract_local_name(result['qid']['value']),
            'label': result['label']['value']
            }
        if 'description' in result:
            out_dict['description'] = result['description']['value']
        else:
            out_dict['description'] = ''
        return_list.append(out_dict)
    return return_list

# returns lists of occupations, employers, and affiliations for a person with Wikidata ID qid
def search_wikidata_occ_emp_aff(qid):
    """Look up the occupation (P106), employer (P108) and affiliation (P1416) labels of qid.

    Labels are requested in default_language. Returns a tuple of three lists
    (occupations, employers, affiliations), each without duplicates.
    """
    query_string = '''select distinct ?occupation ?employer ?affiliation where {
        optional {
            wd:'''+ qid + ''' wdt:P106 ?occupationId.
            ?occupationId rdfs:label ?occupation.
            FILTER(lang(?occupation) = "'''+ default_language + '''")            
            }

        optional {
            wd:'''+ qid + ''' wdt:P108 ?employerId.
            ?employerId rdfs:label ?employer.
            FILTER(lang(?employer) = "'''+ default_language + '''")            
            }

        optional {
            wd:'''+ qid + ''' wdt:P1416 ?affiliationId.
            ?affiliationId rdfs:label ?affiliation.
            FILTER(lang(?affiliation) = "'''+ default_language + '''")            
            }
        }'''

    #print(query_string)
    statements = send_sparql_query(query_string)
    #print(statements)

    # Collect every value bound to each of the three variables
    collected = {'occupation': [], 'employer': [], 'affiliation': []}
    for statement in statements:
        for variable, values in collected.items():
            if variable in statement:
                values.append(statement[variable]['value'])

    # Passing through a set removes the repeats produced by combinations of the three optionals
    occupationList = list(set(collected['occupation']))
    employerList = list(set(collected['employer']))
    affiliationList = list(set(collected['affiliation']))
    #print(occupationList)
    #print(employerList)
    #print(affiliationList)

    # delay to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)
    return occupationList, employerList, affiliationList

# returns a list of results of articles by person with Wikidata ID qid
def search_wikidata_article(qid):
    """Return the articles in Wikidata that list qid as an author.

    Parameters:
        qid: Q ID string of the person (without the "wd:" prefix).

    Returns:
        A list of dictionaries with keys 'title' (label in default_language) and
        'pmid' (PubMed ID); either is '' when not available. If the response
        cannot be processed, the list holds the raw response text as its only item.
    """
    results_list = []
    # P50 is "author"; P698 is the PubMed ID of the article
    query = '''select distinct ?title ?pmid where {
      ?article wdt:P50 wd:''' + qid + '''.
      optional {
          ?article rdfs:label ?title.
          FILTER(lang(?title) = "'''+ default_language + '''")
          }
      optional {?article wdt:P698 ?pmid.}
      }'''
    #print(query)
    r = requests.post(endpoint, data=dict(query=query), headers=sparql_request_header)
    try:
        data = r.json()
        statements = data['results']['bindings']
        for statement in statements:
            if 'title' in statement:
                title = statement['title']['value']
                #print('title=',title)
            else:
                title = ''
            if 'pmid' in statement:
                pmid = statement['pmid']['value']
            else:
                pmid = ''
            results_list.append({'title': title, 'pmid': pmid})
    except Exception:
        # Best effort: report the raw response rather than failing.
        # (Exception, not a bare except, so Ctrl-C and SystemExit still propagate.)
        results_list = [r.text]
    # delay to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)
    return results_list

# NOTE: The affiliation is a list of strings. All other arguments are strings
def identified_in_pubmed(pmid, name, affiliations, orcid):
    """Decide whether the person appears as an author of PubMed article pmid.

    Authors are retrieved with retrieve_pubmed_data(). Only authors whose SURNAME
    is similar enough to name are considered, so coauthors sharing a surname can
    cause problems. For such an author:
      - if both the author and the person have an ORCID, the result is decided
        immediately: True when they match, False when they differ;
      - otherwise, the result is True as soon as one of affiliations is similar
        enough to the author's affiliation string.

    Returns True on a match; False on an ORCID mismatch or when no author matched.
    Progress and results of the checks are printed.
    """
    department_test_ratio = 70 # ratio required when a generic name similarity is crosschecked with dept name
    test_ratio = 90 # similarity required for a potential match of a generic wikidata match

    #print('Checking authors in PubMed article: ', pmid)
    pubmed_authors = retrieve_pubmed_data(pmid)
    if pubmed_authors == []:
        print('PubMed ID does not seem to be valid.')
    #print(pubmed_authors)

    for pubmed_author in pubmed_authors:
        # Skip authors whose surname is not similar enough to the name
        if fuzz.token_set_ratio(pubmed_author['surname'], name) < test_ratio:
            continue

        author_orcid = pubmed_author['orcid']
        if author_orcid != '' and orcid != '':
            # both employee and pubmed_author must have ORCIDs to do this check
            full_name = pubmed_author['forename'] + ' ' + pubmed_author['surname']
            if orcid != extract_local_name(author_orcid):
                # Reject the article if the matched surname has an inconsistent ORCID
                print('*** ' + full_name + ' is NOT the same person; ORCID ' + author_orcid + ' does not match.')
                return False
            print(full_name + ' has matching ORCID ' + author_orcid)
            return True # don't continue the loop since ORCIDs match

        # If there is an affiliation, display it.
        # If the department name matches the affiliation, call it a match
        author_affiliation = pubmed_author['affiliation']
        if author_affiliation == '':
            continue
        for affiliation in affiliations:
            set_ratio = fuzz.token_set_ratio(affiliation, author_affiliation)
            print('Affiliation test: ', set_ratio, author_affiliation)
            if set_ratio >= department_test_ratio:
                print('*** pubmed_author/affiliation match!')
                return True # stop looking once there is an affiliation match

    return False

def retrieve_pubmed_data(pmid):
    """Retrieve the author list of a PubMed article through the NCBI efetch service.

    Parameters:
        pmid: the PubMed ID to look up.

    Returns:
        A list with one dictionary per <Author> element of the returned XML, with
        keys 'affiliation', 'surname', 'forename' and 'orcid'. A value is '' when
        the corresponding element is missing or has no text. The list is empty
        when the service responds with HTTP status 404.

    NOTE: tool_name and email_address are global variables defined earlier in the script.
    """
    fetch_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    param_dict = {
        'tool': tool_name,
        'email': email_address,
        'db': 'pubmed',
         #'retmode': 'xml',
        'rettype': 'abstract',
        'id': pmid
    }
    response = requests.get(fetch_url, params=param_dict)
    #print(response.url)
    if response.status_code == 404:
        authors = [] # return an empty list if the constructed URL won't dereference
    else:
        pub_data = response.text  # the response text is XML
        #print(pub_data)  # uncomment this line to see the XML

        # process the returned XML, see https://docs.python.org/3/library/xml.etree.elementtree.html
        root = et.fromstring(pub_data)
        authors = []
        for author in root.findall('.//Author'):
            # findtext() gives the default when the element is absent and '' when it is
            # present but empty, so every value is a string.
            authors.append({
                'affiliation': author.findtext('./AffiliationInfo/Affiliation', default=''),
                'surname': author.findtext('./LastName', default=''),
                'forename': author.findtext('./ForeName', default=''),
                # pick the identifier marked as an ORCID, wherever it is among the identifiers
                'orcid': author.findtext("./Identifier[@Source='ORCID']", default='')
            })

    # See https://www.ncbi.nlm.nih.gov/books/NBK25497/ for usage guidelines.
    # An API key is required for more than 3 requests per second.
    sleep(0.35) # wait before hitting the API again to avoid getting blocked
    #print(authors)
    return authors



# Load data



In [6]:
# Read every column as a string; na_filter=False keeps empty cells as '' rather than NaN
etd_frame = pd.read_csv('etd_metadata.csv', dtype=str, na_filter=False)
#etd_frame = etd_frame.iloc[3759:3765].copy() # uncomment to test on a subset
#etd_frame = etd_frame.iloc[:5].copy() # uncomment to test on a subset
# Use the 'id' column as the row index
etd_frame = etd_frame.set_index('id')

# Get a list of the column headers ('id' is now the index, so it is not among them)
headers_list = etd_frame.columns.tolist()
#print(headers_list)

#etd_frame = etd_frame.head(1) # Uncomment to test on first line of table only
etd_frame.head()


Unnamed: 0_level_0,collection,dc.contributor.advisor,dc.contributor.advisor[],dc.contributor.author,dc.contributor.author[],dc.contributor.committeeChair,dc.contributor.committeeChair[],dc.contributor.committeeMember,dc.contributor.committeeMember[],dc.creator,...,local.embargo.terms[],thesis.degree.department,thesis.degree.discipline,thesis.degree.discipline[],thesis.degree.grantor,thesis.degree.grantor[],thesis.degree.level,thesis.degree.level[],thesis.degree.name,thesis.degree.name[]
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8a231b27-b5db-4ce2-9949-bc6652275652,1803/9599,"Hornberger, George M",,,,,,,,"De Silva Manikkuwahandi, Thushara",...,,,Environmental Engineering,,Vanderbilt University,,Doctoral,,PhD,
bc0f7f0d-a207-436c-b319-b05b74e04c9c,1803/9599,"Lopez, Carlos",,,,,,,,"Kochen, Michael Allen",...,,,Biomedical Informatics,,Vanderbilt University,,Doctoral,,PhD,
b9bc7f0a-38f8-48d2-b0ff-844a38d3643d,1803/9599,"Rosenthal, Sandra J",,,,,,,,"Taylor, Devin Alexander",...,,,Chemistry,,Vanderbilt University,,Masters,,MS,
8d7a88cc-9356-48c7-9e5f-f2305923a568,1803/9599,"Weiss, Sharon M",,,,,,,,"Ryder, Landen Daniel",...,,,Electrical Engineering,,Vanderbilt University,,Masters,,MS,
df1d7425-cdfb-4b78-9277-54b4ab1b9ad8,1803/9599,"Gokhale, Aniruddha S",,,,,,,,"Bhattacharjee, Anirban",...,,,Computer Science,,Vanderbilt University,,Doctoral,,PhD,


# Process data for output

- collapse duplicate columns that share the same base name, and merge the date columns into a single value
- generate derived or fixed values
- rename columns where necessary

In [8]:
# Build output_dict, a dictionary of columns (header -> list of values, one per row
# of etd_frame), then turn it into a data frame and save it as collapsed.csv.
output_dict = {}

# Combine columns with the same beginning of column header
# (e.g. a header and the same header followed by a bracketed part)
print('collapsing duplicate columns')
for header in headers_list:
    if '[' not in header:
        header_list = []
        for multi_header in headers_list:
            # NOTE(review): this is a substring test, so a header that merely contains
            # this one and has the same number of periods is also merged -- confirm
            # that no such pair of unrelated headers exists in the data.
            if header in multi_header and header.count('.') == multi_header.count('.'):
                header_list.append(multi_header)
        #print(header_list)
        # Add the column to the output dict with the column header as the key
        output_dict[header_list[0]] = merge_column_values(header_list, etd_frame)

# Combine date columns into one "created" column (using the earliest date)
print('merging published dates')
created_date_list = []
for identifier, data in etd_frame.iterrows():
    # First four characters of three of the fields and the last four of dc.date.submitted
    # (presumably the year is at the start or end of each field, respectively -- TODO confirm)
    date = [data['dc.date.accessioned'][:4], data['dc.date.created'][:4], data['dc.date.issued'][:4], data['dc.date.submitted'][-4:]]
    date = [year for year in date if year != ''] # remove missing data
    # Dates are strings, but min will return first in alphabetical order, so earliest.
    # A record with no date at all gets '' instead of raising ValueError.
    created_date_list.append(min(date, default=''))
output_dict['published'] = created_date_list

# Get rid of the source date columns
del output_dict['dc.date.accessioned']
del output_dict['dc.date.created']
del output_dict['dc.date.issued']
del output_dict['dc.date.submitted']

# Rename columns by changing the keys for the columns in the output_dict
print('renaming columns')
output_dict['label_en'] = output_dict.pop('dc.title')
output_dict['handle'] = output_dict.pop('dc.identifier.uri')
# The same values are also used for the full_text_available column
output_dict['full_text_available'] = output_dict['handle'].copy()

# Make a list with the same number of items as all other columns
# Use the Q ID for VU
output_dict['dissert_submit_to'] = ['Q29052'] * len(created_date_list)

# Determine the language of the work
print('determining the work language')
language_list = []
for identifier, data in etd_frame.iterrows():
    if data['dc.language.iso'] != '':
        if data['dc.language.iso'][:2].lower() == 'en':
            language_list.append('Q1860')
        elif data['dc.language.iso'][:2].lower() == 'es':
            language_list.append('Q1321')
        elif data['dc.language.iso'][:2].lower() == 'fr':
            language_list.append('Q150')
        else:
            # Unrecognized language code: fall back to the title
            language_list = determine_language_from_title(language_list, data['dc.title'])
    else:
        # No language code given: fall back to the title
        language_list = determine_language_from_title(language_list, data['dc.title'])
output_dict['language'] = language_list

# Remove non-English titles from the title column (they'll have to be added manually after item creation)
print('removing non-English titles')
scrub = []
for title in output_dict['label_en']:
    lang, confidence = detect_language(title)
    if lang == 'en' and confidence > 0.99:
        scrub.append(title)
    else:
        scrub.append('')
output_dict['title_en'] = scrub

# Determine whether it's a masters or doctoral thesis
print('determining thesis type')
description = []
instance_of = []
for identifier, data in etd_frame.iterrows():
    print(data['dc.title'])
    if data['thesis.degree.name'] != '' and data['thesis.degree.name'][0].lower() == 'm': # check first letter since can be MS or MA; others are PhD
        description.append("master's thesis")
        instance_of.append('Q1907875')
    elif data['thesis.degree.name'] != '' and data['thesis.degree.name'][0].lower() == 'p':
        description.append('doctoral thesis')
        instance_of.append('Q187685')
    else:
        # Degree name missing or not recognized: use the generic thesis type
        description.append('thesis')
        instance_of.append('Q1266946')
output_dict['description_' + default_language] = description
output_dict['instance_of'] = instance_of

print('outputing data')
collapsed_columns_frame = pd.DataFrame(output_dict, index = etd_frame.index)
collapsed_columns_frame.to_csv('collapsed.csv') # Comment out to skip saving the data at this point

#print(collapsed_columns_frame.head())
print('done')


collapsing duplicate columns
merging published dates
renaming columns
determining the work language
removing non-English titles
determining thesis type
Development of Decision Support Tools for Water Energy Food Infrastructure Management of Sri Lanka
Mechanistic Hypothesis Exploration of Signaling Network Processes  via Bayesian Inference Methods
Elucidating the Membrane Diffusion Dynamics of Muscarinic-1 Acetylcholine Receptors with Quantum Dots
Simulation of Optical Energy Deposition for Pulsed Laser-Induced Single Event Effects Testing in Microelectronic Devices
Algorithms and Techniques for Automated Deployment and Efficient Management of Large-Scale Distributed Data Analytics Services
Endocannabinoid Signaling Collapse Mediates Stress-Induced Amygdalo-Cortical Strengthening
Regulation of iron homeostasis by the sulfur assimilation pathway
Enhanced Charge Collection in Silicon Carbide Power MOSFETs Demonstarted by Pulsed-Laser Two-Photon Absroption SEE Experiments
COMPUTATIONAL FLU

National Security, Economization, and the Rhetoric of Refugee and Veteran PTSD
Fabrication, Characterization, and Applications of Porous Silicon Metal-Oxide Nanocomposites
Putting Together the Pieces of the Puzzle: The Development of an Opioid Abuse Triage Checklist for Emergency Departments
Synthesis and Validation of a Trifunctional Trimethoprim-based Probe for Use with Degradation Domain System
Analysis of Friction Stir Welding Behavior of Aluminum Cerium Alloy and Viability for Industrial Application
Structural and Functional Dynamics of Serotonin Transporter Gene Variants
A Change in Structure: Perceptions of Mental Health & Illness at the University
Quality of Life for Pediatric Cancer Patients and their Families: The Application of Palliative Care Principles into Care by Children’s Hospitals
Storytelling, Memory, and Nostalgia:
The Identities of Iranian Revolutionary Migrants and First-Generation Persian-Americans
Antibody Affinity Maturation in Antigen-Distal Residues
Sustainab

Using Daily Progress Note Data to Predict Discharge Date from the Neonatal Intensive Care Unit
The role of the BMP antagonist Gremlin 2 during cardiac tissue repair
Thinking Sex with the Great Whore (Rev 17-18): Deviant Sexualities in the context of Empire
Somatomotor functioning in marmosets and the evolution of spinal cords in primates
Human hemoglobin as an iron source of Staphylococcus aureus
Neural Correlates of Obesity: Disgust, Inflammation, and Brain Function
On Groups of Large Exponents n and
n-periodic Products
Structural Studies of Fluxional Lesions in Deoxyribonucleic Acid
High Affinity Peptide Neurotoxin Quantum Dot Conjugates for Detecting Endogenous Targets in Live Cells and Ex Vivo Tissue
Narrowing the Window:  Multisensory Perceptual Learning and its Neural Correlates
Structural biology of the C-terminal domain of eukaryotic replication factor Mcm10
Polyester based ‘nanosponges’ as a delivery platform for diverse therapeutics to advance the treatment of a broad range o

SIMON: a distributed real-time system for critical care patient monitoring and event detection
Free Electron Laser Ablation of Soft Tissue: The Effects of Chromophore and Pulse Characteristics on Ablation Mechanics
Sequential analysis of parent-adolescent interactions as a function of adolescent depressive symptoms
“There is no happiness at work!”: emotion management, inauthenticity, and psychological distress in the workplace
Forging Ethnic Identity Through Faith: Religion and the Syrian-Lebanese Community in São Paulo
Study of Titanium Dioxide Nanoparticles via Molecular Dynamics Simulations
Alternative instructional strategies for low-literate adults: An investigation of the effects of static and dynamic visuals on learning outcomes
A Near Infrared Spectroscopy Study of Counterfactual Thinking
Assembly and regulation of signaling proteins at fission yeast microtubule organizing centers
Non-Hermitian Orthogonality and Meromorphic Approximation
Democracy, Dliberation, and Political Le

Trajectory Auto-Corrected Image Reconstruction
Lead Optimization for Discovery of Potent and Selective Dopamine Receptor D<sub>4</sub> Antagonist
What is the Zone and Are We in It? 
Visions of the Anthropocene in Andrei Tarkovsky’s Stalker
"Restless and still Unsatisfied We Roam": Politics and Gender in Eliza Haywood's <i>The Fair Captive
Globalization, Postmodernity, and the Bildungsroman: Tracing Narratives of Development in
Three Versions of Orson Scott Card’s “Ender’s Game” Story
How Technology, Strategic Decision Making, and School Context Influence Principals' Use of a Data Warehouse: A Latent Class Growth Analysis
Simulation using Transaction Level Modeling : Implementation for ARA Modules
Uncertainty in Image-to-Physical Registration for Soft-Tissue Image Guided Surgery
Colistin-Functionalized Nanoparticles for the Rapid Capture of Acinetobacter baumannii
The Origins and Consequences of Compulsory Voting in Latin America
Taking the Law into Our Hands: Trust, Social Capital and 

Targeted Interrogation of Blood-Brain Barrier Biology
ADDITIVELY MANUFACTURED THERMITE-BASED ENERGETICS: CHARACTERIZATION AND APPLICATIONS
Understanding Epo-dependent Enhancer-Promoter Interactions in the Regulation of Erythroid Gene Expression
Does Lying Require More or Less Working Memory and What Does it Mean for the Legal System?
Scientific Modeling for Sensemaking and Expression: Understanding the Multimodal Affordances of Modeling for Emerging Bilingual Students
Exploring the Relationship Between Adolescent Media Use, Sexual Socialization and Sexual Behavior
Physiological Response Patterns During Social Interaction to Predict Internalizing Symptoms in Children with Autism Spectrum Disorder
Drosophila as a model system to study neuropsychiatric and neurological disorders
Toward Using Membrane Distillation for Brine Treatment: Understanding Energy Efficiency and the Challenge of Mineral Scaling
LAMENT BEYOND BLAME: CONSEQUENCES OF WOMEN’S POETRY IN LAMENTATIONS 1-2
An Evaluation of

                                     collection dc.contributor.advisor  \
id                                                                       
8a231b27-b5db-4ce2-9949-bc6652275652  1803/9599   Hornberger, George M   
bc0f7f0d-a207-436c-b319-b05b74e04c9c  1803/9599          Lopez, Carlos   
b9bc7f0a-38f8-48d2-b0ff-844a38d3643d  1803/9599    Rosenthal, Sandra J   
8d7a88cc-9356-48c7-9e5f-f2305923a568  1803/9599       Weiss, Sharon  M   
df1d7425-cdfb-4b78-9277-54b4ab1b9ad8  1803/9599   Gokhale, Aniruddha S   

                                     dc.contributor.author  \
id                                                           
8a231b27-b5db-4ce2-9949-bc6652275652                         
bc0f7f0d-a207-436c-b319-b05b74e04c9c                         
b9bc7f0a-38f8-48d2-b0ff-844a38d3643d                         
8d7a88cc-9356-48c7-9e5f-f2305923a568                         
df1d7425-cdfb-4b78-9277-54b4ab1b9ad8                         

                                     dc.contri

In [9]:
# Label and description columns of the output CSV; populated without references
out_fields_labels = ['{}_{}'.format(kind, default_language) for kind in ('label', 'description')]

# Property columns of the output CSV that are populated without references
out_fields_noref = ['instance_of']

# Property columns whose reference fields hold only a retrieved date
out_fields_no_url = ['handle', 'full_text_available']

# Property columns whose reference fields hold both a reference URL and a retrieved date
out_fields_ref = ['published', 'title_{}'.format(default_language), 'language', 'dissert_submit_to']

# Function hacked from extract_doi_metadata() function of linked-data/publications/crossref/retrieve_doi_data.ipynb
def extract_metadata(crossref_results, handle, today):
    """Flatten one metadata record into a dict keyed by output CSV column names.

    The columns emitted are driven by the module-level lists
    out_fields_labels, out_fields_noref, out_fields_no_url and out_fields_ref.

    crossref_results: object indexable by every field name in those lists
    handle: value stored as the reference URL for fields in out_fields_ref
    today: value stored as the retrieved date of every populated reference
    Returns the dict; its key insertion order is the intended column order.
    """
    row = {'qid': ''}

    # Labels and descriptions are copied straight across
    for name in out_fields_labels:
        row[name] = crossref_results[name]

    # Unreferenced properties get a statement UUID column plus the value
    for name in out_fields_noref:
        row[name + '_uuid'] = ''
        row[name] = crossref_results[name]

    def add_referenced_field(name, with_reference_url):
        # Emit the value column(s) for one property followed by its reference columns
        value = crossref_results[name]
        is_blank = value == ''

        row[name + '_uuid'] = ''
        if name == 'published':
            # The publication date is split into node ID, value and precision columns
            row[name + '_nodeId'] = ''
            row[name + '_val'] = value
            row[name + '_prec'] = ''
        else:
            row[name] = value

        # Reference details are only filled in when the field itself has a value
        row[name + '_ref1_hash'] = ''
        if with_reference_url:
            row[name + '_ref1_referenceUrl'] = '' if is_blank else handle
        row[name + '_ref1_retrieved_nodeId'] = ''
        row[name + '_ref1_retrieved_val'] = '' if is_blank else today
        row[name + '_ref1_retrieved_prec'] = ''

    # Fields with a retrieved date, but reference URL not needed
    for name in out_fields_no_url:
        add_referenced_field(name, False)

    # Fields with both reference URLs and retrieved dates
    for name in out_fields_ref:
        add_referenced_field(name, True)

    return row

today = generate_utc_date()
articles_list = []

# Build one output row per record in the collapsed metadata frame
for identifier, data in collapsed_columns_frame.iterrows():

    handle = data['handle']
    print(handle)  # progress indicator

    primary_metadata = extract_metadata(data, handle, today)
    articles_list.append(primary_metadata)
#print(json.dumps(articles_list, indent = 2))

# Write all rows in a single pass once the loop has finished.
# The column names are taken from the first row, so when no rows were
# produced there is nothing to derive them from and the file is not written.
if articles_list:
    fieldnames = list(articles_list[0].keys())  # get field names from first dict in list
    write_dicts_to_csv(articles_list, 'articles.csv', fieldnames)
else:
    print('no records found; articles.csv not written')
print('done')

http://hdl.handle.net/1803/9845
http://hdl.handle.net/1803/9695||http://hdl.handle.net/1803/9790
http://hdl.handle.net/1803/9825
http://hdl.handle.net/1803/9826
https://ir.vanderbilt.edu/xmlui/handle/1803/9849
https://ir.vanderbilt.edu/xmlui/handle/1803/9850
https://ir.vanderbilt.edu/xmlui/handle/1803/9851
https://ir.vanderbilt.edu/xmlui/handle/1803/9852
https://ir.vanderbilt.edu/xmlui/handle/1803/9853
https://ir.vanderbilt.edu/xmlui/handle/1803/9854
https://ir.vanderbilt.edu/xmlui/handle/1803/9855
https://ir.vanderbilt.edu/xmlui/handle/1803/9856
http://hdl.handle.net/1803/9920
http://hdl.handle.net/1803/16355
http://hdl.handle.net/1803/16356
http://hdl.handle.net/1803/16357
http://hdl.handle.net/1803/16358
http://hdl.handle.net/1803/16359
http://hdl.handle.net/1803/16360
http://hdl.handle.net/1803/16361
http://hdl.handle.net/1803/16362
http://hdl.handle.net/1803/16363
http://hdl.handle.net/1803/16364
http://hdl.handle.net/1803/16365
http://hdl.handle.net/1803/16366
http://hdl.handle.n

http://hdl.handle.net/1803/11163
http://hdl.handle.net/1803/11164
http://hdl.handle.net/1803/11165
http://hdl.handle.net/1803/11166
http://hdl.handle.net/1803/11167
http://hdl.handle.net/1803/11168
http://hdl.handle.net/1803/11169
http://hdl.handle.net/1803/11170
http://hdl.handle.net/1803/11171
http://hdl.handle.net/1803/11172
http://hdl.handle.net/1803/11173
http://hdl.handle.net/1803/11174
http://hdl.handle.net/1803/11175
http://hdl.handle.net/1803/11176
http://hdl.handle.net/1803/11177
http://hdl.handle.net/1803/11178
http://hdl.handle.net/1803/11179
http://hdl.handle.net/1803/11180
http://hdl.handle.net/1803/11181
http://hdl.handle.net/1803/11182
http://hdl.handle.net/1803/11183
http://hdl.handle.net/1803/11184
http://hdl.handle.net/1803/11185
http://hdl.handle.net/1803/11186
http://hdl.handle.net/1803/11187
http://hdl.handle.net/1803/11188
http://hdl.handle.net/1803/11189
http://hdl.handle.net/1803/11190
http://hdl.handle.net/1803/11191
http://hdl.handle.net/1803/11192
http://hdl

http://hdl.handle.net/1803/11926
http://hdl.handle.net/1803/11927
http://hdl.handle.net/1803/11928
http://hdl.handle.net/1803/11929
http://hdl.handle.net/1803/11930
http://hdl.handle.net/1803/11931
http://hdl.handle.net/1803/11932
http://hdl.handle.net/1803/11933
http://hdl.handle.net/1803/11934
http://hdl.handle.net/1803/11935
http://hdl.handle.net/1803/11936
http://hdl.handle.net/1803/11937
http://hdl.handle.net/1803/11938
http://hdl.handle.net/1803/11939
http://hdl.handle.net/1803/11940
http://hdl.handle.net/1803/11941
http://hdl.handle.net/1803/11942
http://hdl.handle.net/1803/11943
http://hdl.handle.net/1803/11944
http://hdl.handle.net/1803/11945
http://hdl.handle.net/1803/11946
http://hdl.handle.net/1803/11947
http://hdl.handle.net/1803/11948
http://hdl.handle.net/1803/11949
http://hdl.handle.net/1803/11950
http://hdl.handle.net/1803/11951
http://hdl.handle.net/1803/11952
http://hdl.handle.net/1803/11953
http://hdl.handle.net/1803/11954
http://hdl.handle.net/1803/11955
http://hdl

http://hdl.handle.net/1803/12926
http://hdl.handle.net/1803/12927
http://hdl.handle.net/1803/12928
http://hdl.handle.net/1803/12929
http://hdl.handle.net/1803/12930
http://hdl.handle.net/1803/12931
http://hdl.handle.net/1803/12932
http://hdl.handle.net/1803/12933
http://hdl.handle.net/1803/12934
http://hdl.handle.net/1803/12935
http://hdl.handle.net/1803/12936
http://hdl.handle.net/1803/12937
http://hdl.handle.net/1803/12938
http://hdl.handle.net/1803/12939
http://hdl.handle.net/1803/12940
http://hdl.handle.net/1803/12941
http://hdl.handle.net/1803/12942
http://hdl.handle.net/1803/12943
http://hdl.handle.net/1803/12944
http://hdl.handle.net/1803/12945
http://hdl.handle.net/1803/12946
http://hdl.handle.net/1803/12947
http://hdl.handle.net/1803/12948
http://hdl.handle.net/1803/12949
http://hdl.handle.net/1803/12950
http://hdl.handle.net/1803/12951
http://hdl.handle.net/1803/12952
http://hdl.handle.net/1803/12953
http://hdl.handle.net/1803/12954
http://hdl.handle.net/1803/12955
http://hdl

http://hdl.handle.net/1803/13851
http://hdl.handle.net/1803/13852
http://hdl.handle.net/1803/13853
http://hdl.handle.net/1803/13854
http://hdl.handle.net/1803/13855
http://hdl.handle.net/1803/13856
http://hdl.handle.net/1803/13857
http://hdl.handle.net/1803/13858
http://hdl.handle.net/1803/13859
http://hdl.handle.net/1803/13860
http://hdl.handle.net/1803/13861
http://hdl.handle.net/1803/13862
http://hdl.handle.net/1803/13863
http://hdl.handle.net/1803/13864
http://hdl.handle.net/1803/13865
http://hdl.handle.net/1803/13866
http://hdl.handle.net/1803/13867
http://hdl.handle.net/1803/13868
http://hdl.handle.net/1803/13869
http://hdl.handle.net/1803/13870
http://hdl.handle.net/1803/13871
http://hdl.handle.net/1803/13872
http://hdl.handle.net/1803/13873
http://hdl.handle.net/1803/13874
http://hdl.handle.net/1803/13875
http://hdl.handle.net/1803/13876
http://hdl.handle.net/1803/13877
http://hdl.handle.net/1803/13878
http://hdl.handle.net/1803/13879
http://hdl.handle.net/1803/13880
http://hdl

http://hdl.handle.net/1803/14494
http://hdl.handle.net/1803/14495
http://hdl.handle.net/1803/14496
http://hdl.handle.net/1803/14497
http://hdl.handle.net/1803/14498
http://hdl.handle.net/1803/14499
http://hdl.handle.net/1803/14500
http://hdl.handle.net/1803/14501
http://hdl.handle.net/1803/14502
http://hdl.handle.net/1803/14503
http://hdl.handle.net/1803/14504
http://hdl.handle.net/1803/14505
http://hdl.handle.net/1803/14506
http://hdl.handle.net/1803/14507
http://hdl.handle.net/1803/14508
http://hdl.handle.net/1803/14509
http://hdl.handle.net/1803/14510
http://hdl.handle.net/1803/14511
http://hdl.handle.net/1803/14512
http://hdl.handle.net/1803/14513
http://hdl.handle.net/1803/14514
http://hdl.handle.net/1803/14515
http://hdl.handle.net/1803/14516
http://hdl.handle.net/1803/14517
http://hdl.handle.net/1803/14518
http://hdl.handle.net/1803/14519
http://hdl.handle.net/1803/14520
http://hdl.handle.net/1803/14521
http://hdl.handle.net/1803/14522
http://hdl.handle.net/1803/14523
http://hdl

http://hdl.handle.net/1803/15426
http://hdl.handle.net/1803/15427
http://hdl.handle.net/1803/15428
http://hdl.handle.net/1803/15429
http://hdl.handle.net/1803/15430
http://hdl.handle.net/1803/15431
http://hdl.handle.net/1803/15432
http://hdl.handle.net/1803/15433
http://hdl.handle.net/1803/15434
http://hdl.handle.net/1803/15435
http://hdl.handle.net/1803/15436
http://hdl.handle.net/1803/15437
http://hdl.handle.net/1803/15438
http://hdl.handle.net/1803/15439
http://hdl.handle.net/1803/15440
http://hdl.handle.net/1803/15441
http://hdl.handle.net/1803/15442
http://hdl.handle.net/1803/15443
http://hdl.handle.net/1803/15444
http://hdl.handle.net/1803/15445
http://hdl.handle.net/1803/15446
http://hdl.handle.net/1803/15447
http://hdl.handle.net/1803/15448
http://hdl.handle.net/1803/15449
http://hdl.handle.net/1803/15450
http://hdl.handle.net/1803/15451
http://hdl.handle.net/1803/15452
http://hdl.handle.net/1803/15453
http://hdl.handle.net/1803/15454
http://hdl.handle.net/1803/15455
http://hdl

done


# Extract author and advisor information

Note: the output of this is designed to mimic what is produced after the first stage of https://github.com/HeardLibrary/linked-data/blob/master/publications/crossref/retrieve_doi_data.ipynb

In [None]:
authors = extract_author_values(collapsed_columns_frame)
advisors = extract_advisor_values(collapsed_columns_frame)

# The column names follow the layout of the CrossRef-derived file this output
# mimics: 'doi' carries the dc.identifier.uri value and 'editors' carries the advisors.
column_headers = ['doi', 'authors', 'editors']
output_list = []
# NOTE(review): only the first 10 records are processed -- looks like a testing limit; confirm
for edt_number, author_values in enumerate(authors[:10]):
    author_name_parts = find_surname_givens(author_values[0])
    record = collapsed_columns_frame.iloc[edt_number]
    output_dict = {
        # .iloc[0] makes the positional lookup explicit; a bare [0] on a
        # label-indexed Series is deprecated in pandas
        'doi': record[['dc.identifier.uri']].iloc[0],
        'authors': json.dumps([
            {'orcid': record['dc.creator.orcid'],
             'sequence': 'first',
             'givenName': author_name_parts['given'],
             'familyName': author_name_parts['family'],
             'affiliation': ''}
        ])
    }

    # Advisors are stored in the 'editors' column, using the same structure as authors
    advisor_list = []
    for advisor in advisors[edt_number]:
        advisor_name_parts = find_surname_givens(advisor[0])
        advisor_list.append({'orcid': '',
                             'sequence': '',
                             'givenName': advisor_name_parts['given'],
                             'familyName': advisor_name_parts['family'],
                             'affiliation': ''})
    output_dict['editors'] = json.dumps(advisor_list)
    output_list.append(output_dict)

#print(json.dumps(output_list, indent = 2))
write_dicts_to_csv(output_list, 'stored_retrieved_authors.csv', column_headers)
print('done')


## Generate the author and advisor ("editor") CSVs in form required to write with VanderBot

In [None]:
today = generate_utc_date()
file_path = ''
alt_reference = ''  # not used; when non-empty it replaces the article's 'doi' value as the reference URL

# Accumulated output rows, one list per kind of person
people_dict = {'author': [], 'editor': []}
author_strings_list = []
# Load existing data if any (primarily if script crashes and has to be rerun)
#people_dict['author'] = read_dicts_from_csv(file_path + 'authors.csv')
#people_dict['editor'] = read_dicts_from_csv(file_path + 'editors.csv')
#author_strings_list = read_dicts_from_csv(file_path + 'author_strings.csv')

# Open the file containing the stored author and editor data
stored_retrieved_authors = read_dicts_from_csv(file_path + 'stored_retrieved_authors.csv')

# Index the stored rows by their 'doi' value so each article needs a single lookup.
# setdefault keeps the first row seen for a given value.
authors_by_doi = {}
for stored_row in stored_retrieved_authors:
    authors_by_doi.setdefault(stored_row['doi'], stored_row)

# Open the article items file after upload in order to get the Q IDs for the newly written articles
articles = read_dicts_from_csv(file_path + 'articles.csv')

# One entry per article that has people who could not be matched to a Q ID
unidentified = []

for article in articles:
    qid = article['qid']
    doi = article['doi']
    print(qid, doi)
    #pmid = article['pmid']
    unidentified_for_article = {'qid': 'https://wikidata.org/entity/' + qid, 'doi': doi}
    reference_url = doi if alt_reference == '' else alt_reference

    article_authors = authors_by_doi.get(doi)
    if article_authors is not None:
        # Parse both JSON columns once, before any disambiguation is attempted
        people_by_type = {
            'author': json.loads(article_authors['authors']),
            'editor': json.loads(article_authors['editors']),
        }

        for persontype, people in people_by_type.items():

            # Disambiguate people against existing Wikidata people items
            found_author_qids, author_name_strings = disambiguate_authors(people)

            # Add data about unidentified people with possible Q ID matches to the list for further work.
            unidentified_for_article[persontype] = author_name_strings

            # The key order below determines the column order of the output CSV
            for author in found_author_qids:
                people_dict[persontype].append({
                    'qid': qid,
                    'label_en': article['label_en'],
                    persontype + '_uuid': '',
                    persontype: author['qid'],
                    persontype + '_series_ordinal': author['series_ordinal'],
                    persontype + '_stated_as': author['stated_as'],
                    persontype + '_ref1_hash': '',
                    persontype + '_ref1_referenceUrl': reference_url,
                    persontype + '_ref1_retrieved_nodeId': '',
                    persontype + '_ref1_retrieved_val': today,
                    persontype + '_ref1_retrieved_prec': '',
                })

            # The whole file is rewritten after each article so that progress survives a crash
            if people_dict[persontype]:
                fieldnames = list(people_dict[persontype][0].keys())
                write_dicts_to_csv(people_dict[persontype], file_path + persontype + 's.csv', fieldnames)

            if persontype == 'author':
                # Authors without a Q ID match are recorded as author name strings
                for author in author_name_strings:
                    author_strings_list.append({
                        'qid': qid,
                        'label_en': article['label_en'],
                        'author_string_uuid': '',
                        'author_string': author['name_string'],
                        'author_string_series_ordinal': author['series_ordinal'],
                        'author_string_ref1_hash': '',
                        'author_string_ref1_referenceUrl': reference_url,
                        'author_string_ref1_retrieved_nodeId': '',
                        'author_string_ref1_retrieved_val': today,
                        'author_string_ref1_retrieved_prec': '',
                    })

                if author_strings_list:
                    fieldnames = list(author_strings_list[0].keys())
                    write_dicts_to_csv(author_strings_list, file_path + 'author_strings.csv', fieldnames)

        # Only keep the article if at least one author or editor remained unidentified
        if unidentified_for_article['author'] != [] or unidentified_for_article['editor'] != []:
            unidentified.append(unidentified_for_article)

    # Save the potential author and editor matches in a file
    # Save after each article in case of crash; maybe later just write at end
    with open(file_path + 'unidentified_people.json', 'wt', encoding='utf-8') as file_object:
        file_object.write(json.dumps(unidentified, indent=2))

print('done')
