In [None]:
import requests   # best library to manage HTTP transactions
from bs4 import BeautifulSoup # web-scraping library
import json
from time import sleep
import csv
import math
from fuzzywuzzy import fuzz # fuzzy logic matching
from fuzzywuzzy import process
import xml.etree.ElementTree as et # library to traverse XML tree
import urllib
import datetime
import string
from pathlib import Path
import copy # import the copy module from the standard library

import vb_common_code as vbc

# ---- Configuration ----
# Q ID of the employer; it is appended to the SPARQL screens that select the
# people whose labels and descriptions are downloaded.
employerQId = 'Q29052' # Vanderbilt University
sparqlSleep = 0.1 # number of seconds to wait between queries to SPARQL endpoint
# Minimum fuzz.token_set_ratio score for an author name to count as a match
# to one of the downloaded people labels.
testRatio = 90
# Minimum fuzz.ratio score for a publication title to count as a match to a
# journal label. Set higher than testRatio; the journal-matching code notes
# false positives among journals with overlapping name parts.
journalTestRatio = 94

# Input files are expected in the "divinity-law" folder of the user's home directory.
home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
csv_path = home + '/divinity-law/vulibDivinity.csv' # publications (Zotero download)
known_name_strings_path = home + '/divinity-law/known-name-strings.csv' # previously discovered author Q IDs and names
journals_path = home + '/divinity-law/journals.csv' # Q IDs and names of journals in Wikidata

def reverse_names(nameLastFirst):
    """Convert a "Last, First[, Suffix]" name string to "First Last[ Suffix]".

    Parameters
    ----------
    nameLastFirst : str
        A personal name, possibly written family-name-first with commas
        separating the family name, the given name(s) and an optional
        generational suffix.

    Returns
    -------
    str
        The name in given-name-first order. A string that contains no comma
        is assumed not to be reversed and is returned with surrounding
        whitespace removed. "Jr"/"Jr." and "Sr"/"Sr." are appended as
        ", Jr." / ", Sr."; "II" through "V" are appended after a space.
        Any other third part is dropped, and parts after the third are ignored.
    """
    # How each recognized suffix is attached to the end of the name.
    suffix_endings = {
        'Jr.': ', Jr.',
        'Jr': ', Jr.',
        'Sr.': ', Sr.',
        'Sr': ', Sr.',
        'II': ' II',
        'III': ' III',
        'IV': ' IV',
        'V': ' V',
    }

    nameParts = [part.strip() for part in nameLastFirst.strip().split(',')]
    if len(nameParts) < 2: # name probably isn't reversed
        return nameParts[0]

    lastName = nameParts[0]
    firstName = nameParts[1]
    # Join only the non-empty parts so that input such as "Smith," does not
    # produce a name with a stray leading or trailing space.
    name = ' '.join(part for part in (firstName, lastName) if part)

    suffix = nameParts[2] if len(nameParts) > 2 else ''
    return name + suffix_endings.get(suffix, '')

In [None]:
# ---- Local CSV inputs ----

# Publications from the Zotero download
publications = vbc.readDict(csv_path)
print(len(publications),'publications read from CSV')

# Q IDs and names discovered previously
known_name_strings = vbc.readDict(known_name_strings_path)
print(len(known_name_strings),'discovered authors Q IDs read from CSV')

# Q IDs and names of all journals in Wikidata
journals_in_wikidata = vbc.readDict(journals_path)
print(len(journals_in_wikidata),'journals read from CSV')

# ---- Labels and descriptions of existing institutional people ----

# Screen shared by both queries: ?id has a P1416 value that is linked to the
# employer item through one or more P749 steps.
institutional_people_screen = '?id wdt:P1416 ?deptOrCollege.?deptOrCollege wdt:P749+ wd:' + employerQId + '.'

org_label_query = vbc.Query(sleep=sparqlSleep, labelscreen=institutional_people_screen)
org_labels = org_label_query.labels_descriptions('')
print(len(org_labels), 'labels downloaded')

org_description_query = vbc.Query(sleep=sparqlSleep, labelscreen=institutional_people_screen, labeltype='description')
org_descriptions = org_description_query.labels_descriptions('')
print(len(org_descriptions), 'descriptions downloaded')


In [None]:
# Match the authors and journal of every publication against known Q IDs.
#
# Results accumulated here:
#   output_list                - one dict per publication: the input row plus
#                                'author_qids_names' and, when a journal match
#                                is found, 'publication_title_qid'
#   identified_authors_list    - unique matched authors (by 'string')
#   unidentified_authors_list  - unique author name strings with no match
#   identified_journals_list   - journals matched so far ('qid', 'label')
#   unidentified_journals_list - unique journal titles with no match
output_list = []
identified_authors_list = [] # list of dictionaries
unidentified_authors_list = [] # list of strings
identified_journals_list = [] # list of dictionaries
unidentified_journals_list = [] # list of strings

count = 0
for publication in publications:
    count += 1
    # print something every 100 rows
    if count % 100 == 0:
        print(count)

    # Set up the output row by starting with the input row.
    # The input row is a dictionary; deepcopy keeps the input unmodified.
    output_row = copy.deepcopy(publication)

    # *********** Author matching ***********

    # Separate the authors; split() returns a one-item list when there is no ';'
    names = publication['Author'].split(';')
    names_list = []

    # Process each of the authors of the publication
    for name in names:
        # Guard against errors in the case where authors are missing
        if name.strip() == '':
            continue

        # Need to reverse the names if they are last name first
        name_string = reverse_names(name)

        # The name dict is where we store the name and Q ID (if discovered)
        # for this author; it stays None until a match is found.
        name_dictionary = None

        # First check whether it's a name that matches a known institutional person
        for label in org_labels:
            # NOTE: used token_set_ratio here because of the wide variety of
            # ways that names are sliced and diced. Need to determine how
            # often there are false positives.
            setRatio = fuzz.token_set_ratio(name_string, label['string'])
            if setRatio >= testRatio: # We get a name match
                # Copy the entry so that nothing done to the result can
                # modify the shared org_labels data.
                name_dictionary = dict(label)
                break

        # Only if there was no institutional match, move on to checking the
        # list we keep of identified people who don't match the VU list
        if name_dictionary is None:
            for known_name in known_name_strings:
                if name_string == known_name['string']: # We get a name match
                    name_dictionary = {
                        'qid': known_name['qid'], # the discovered Q ID
                        'string': known_name['label'], # the discovered label
                    }
                    break

        if name_dictionary is not None:
            # Find out whether we need to add this match to the list of IDed authors
            already_listed = any(
                name_dictionary['string'] == id_name['string']
                for id_name in identified_authors_list
            )
            if not already_listed:
                identified_authors_list.append(name_dictionary)
        else:
            # No match to either name source, so don't give a Q ID
            name_dictionary = {'qid': '', 'string': name_string}
            # Record the name if it's not already on the list of unIDed people
            if name_string not in unidentified_authors_list:
                unidentified_authors_list.append(name_string)

        names_list.append(name_dictionary)
    output_row['author_qids_names'] = names_list

    # *********** Journal matching ***********

    # only process rows for journal articles
    if publication['Item Type'] == 'journalArticle':
        publication_title = publication['Publication Title']
        journal_found = False

        # first see if the Publication Title is one we already know about
        for known_journal in identified_journals_list:
            setRatio = fuzz.ratio(publication_title, known_journal['label'])
            if setRatio >= journalTestRatio: # We get a name match
                journal_found = True
                output_row['publication_title_qid'] = known_journal['qid'] # assign this Q ID for the journal of the row
                # don't need to keep looking so quit the search loop
                break

        # need to check the big list if the journal is new
        if not journal_found:
            for possible_journal in journals_in_wikidata:
                # Used regular ratio rather than token_set_ratio here. There were false positives
                # due to the many similar journals that had overlapping name parts.
                setRatio = fuzz.ratio(possible_journal['label'], publication_title)
                if setRatio >= journalTestRatio: # We get a title string match
                    journal_found = True
                    output_row['publication_title_qid'] = possible_journal['qid'] # assign this Q ID for the journal of the row
                    print(possible_journal['qid'], possible_journal['label'], setRatio, publication_title)
                    # Remember the journal so later rows can be matched from the short list
                    identified_journals_list.append({'qid': possible_journal['qid'], 'label': possible_journal['label']})
                    break

        # Record the title if it matched nothing and isn't already recorded
        if not journal_found and publication_title not in unidentified_journals_list:
            unidentified_journals_list.append(publication_title)
    output_list.append(output_row)

# ---- Write the results to the "divinity-law" folder of the home directory ----

# Save the data
filename = home + '/divinity-law/output.csv'
# Column names come from the first input row; guard against an empty input
# file so the output files are still written instead of raising IndexError.
input_fieldnames = list(publications[0].keys()) if publications else []
fieldnames = ['author_qids_names', 'publication_title_qid'] + input_fieldnames
vbc.writeDictsToCsv(output_list, filename, fieldnames)

# Save the identified authors
filename = home + '/divinity-law/identified-authors.csv'
fieldnames = ['qid', 'string']
vbc.writeDictsToCsv(identified_authors_list, filename, fieldnames)

# Save the identified journals
filename = home + '/divinity-law/identified-journals.csv'
fieldnames = ['qid', 'label']
vbc.writeDictsToCsv(identified_journals_list, filename, fieldnames)

# Save the list of unidentified authors, one name per line
filename = home + '/divinity-law/unidentified-authors.txt'
with open(filename, 'wt', encoding='utf-8') as fileObject:
    for author in unidentified_authors_list:
        print(author, file=fileObject)

# Save the list of unidentified journals, one title per line
filename = home + '/divinity-law/unidentified-journals.txt'
with open(filename, 'wt', encoding='utf-8') as fileObject:
    for journal in unidentified_journals_list:
        print(journal, file=fileObject)

print('done')