In [57]:
import requests   # best library to manage HTTP transactions
from bs4 import BeautifulSoup # web-scraping library
import json
from time import sleep
import csv
import math
from fuzzywuzzy import fuzz # fuzzy logic matching
from fuzzywuzzy import process
import xml.etree.ElementTree as et # library to traverse XML tree
import urllib
import datetime
import string
from pathlib import Path

import vb_common_code as vbc

# ---- Settings used throughout the notebook ----
employerQId = 'Q29052'  # Wikidata Q ID of the employer (Vanderbilt University)
sparqlSleep = 0.1       # seconds to pause between queries sent to the SPARQL endpoint
testRatio = 90          # minimum fuzzy-match score (0-100) for a label to count as a match

# ---- Input file locations (all inside the "divinity-law" folder of the home directory) ----
home = str(Path.home())  # home directory as a string; intended to work on both Win and Mac
csv_path = ''.join([home, '/divinity-law/vulibDivinity.csv'])
known_name_strings_path = ''.join([home, '/divinity-law/known-name-strings.csv'])
journals_path = ''.join([home, '/divinity-law/journals.csv'])

def reverse_names(nameLastFirst):
    """Convert a "Last, First[, Suffix]" name into "First Last[ Suffix]" order.

    Parameters
    ----------
    nameLastFirst : str
        A personal name, normally in comma-separated "Last, First" form,
        optionally followed by a third comma-separated suffix.

    Returns
    -------
    str
        The name in natural order. A string without a comma is returned
        unchanged apart from surrounding whitespace being stripped.
        Suffixes "Jr" and "Jr." are normalized to ", Jr."; "II", "III",
        "IV" and "V" are appended after a space; any other suffix is dropped.
    """
    # Strip each comma-separated piece once, up front
    nameParts = [part.strip() for part in nameLastFirst.split(',')]
    if len(nameParts) < 2: # name probably isn't reversed
        return nameParts[0]

    lastName = nameParts[0]
    firstName = nameParts[1]
    suffix = nameParts[2] if len(nameParts) > 2 else ''

    # Join only the non-empty pieces so input such as "Smith," or ", John"
    # does not produce a leading or trailing space
    name = ' '.join(part for part in (firstName, lastName) if part)

    if suffix in ('Jr.', 'Jr'):
        name += ', Jr.'
    elif suffix in ('II', 'III', 'IV', 'V'):
        name += ' ' + suffix
    return name

In [59]:
# --- Local CSV inputs -------------------------------------------------------

# publication records exported from Zotero
publications = vbc.readDict(csv_path)
print(len(publications),'publications read from CSV')

# author name strings whose Q IDs were worked out on earlier runs
known_name_strings = vbc.readDict(known_name_strings_path)
print(len(known_name_strings),'discovered authors Q IDs read from CSV')

# Q IDs and names of the journals known to Wikidata
journals_in_wikidata = vbc.readDict(journals_path)
print(len(journals_in_wikidata),'journals read from CSV')

# --- Labels and descriptions of existing institutional people ----------------

# Graph pattern shared by both downloads: items whose P1416 value is connected
# to the employer item through one or more P749 links.
people_screen = '?id wdt:P1416 ?deptOrCollege.?deptOrCollege wdt:P749+ wd:' + employerQId + '.'

org_label_query = vbc.Query(
    labelscreen=people_screen,
    sleep=sparqlSleep,
)
org_labels = org_label_query.labels_descriptions('')
print(len(org_labels), 'labels downloaded')

org_description_query = vbc.Query(
    labeltype='description',
    labelscreen=people_screen,
    sleep=sparqlSleep,
)
org_descriptions = org_description_query.labels_descriptions('')
print(len(org_descriptions), 'descriptions downloaded')


2819 publications read from CSV
17 discovered authors Q IDs read from CSV
18100 journals read from CSV
4438 labels downloaded
4438 descriptions downloaded


In [72]:
# Spot check: which Wikidata journal labels are a close fuzzy match for one title?
journal_name = 'Black Theology'

# Score every journal label against the title and keep those at or above the threshold
hits = []
for entry in journals_in_wikidata:
    similarity = fuzz.ratio(journal_name, entry['label'])
    if similarity < testRatio:
        continue # not close enough to count as a name match
    hits.append((similarity, entry))

# Report each match as: score, Q ID, label
for similarity, entry in hits:
    print(similarity, entry['qid'], entry['label'])

found = len(hits) > 0
if not found:
    print('No match')

100 Q15816830 Black Theology


In [56]:
# Resolve every author string in the publications list to a Wikidata Q ID where possible.
#
# Results built by this cell:
#   identified_authors_list   - one dictionary per distinct matched author string
#   unidentified_authors_list - each distinct author name that could not be matched
#
# Matching rules for each author name (after reverse_names() puts it in natural order):
#   1. An exact match on 'string' in known_name_strings wins; its 'qid' and 'label'
#      values are used. (Previously this lookup ran second and overwrote any fuzzy
#      match, so the precedence is unchanged.)
#   2. Otherwise the first entry of org_labels whose token-set similarity to the
#      name is at least testRatio is used.
#   3. Otherwise the name is recorded as unidentified with an empty Q ID.
output_list = [] # not filled in by this cell
identified_authors_list = [] # list of dictionaries
unidentified_authors_list = [] # list of strings
count = 0
for publication in publications:
    count += 1
    # print something every 100 rows
    if count % 100 == 0:
        print(count)
    # split() returns a one-item list when there is no ';', so no special case is needed
    names = publication['Author'].split(';')
    names_list = []
    for name in names:
        if name.strip() == '':
            continue # nothing between separators, e.g. a trailing ';'
        name_string = reverse_names(name)
        name_dictionary = {}
        found = False

        # 1. exact match against previously discovered name strings
        for known in known_name_strings:
            if name_string == known['string']: # We get a name match
                name_dictionary = {'qid': known['qid'], 'string': known['label']}
                found = True
                break

        # 2. fuzzy match against the labels of institutional people
        if not found:
            for label in org_labels:
                setRatio = fuzz.token_set_ratio(name_string, label['string'])
                if setRatio >= testRatio: # We get a name match
                    # Copy the row so nothing done to name_dictionary later can
                    # alter the entry held in org_labels
                    name_dictionary = dict(label)
                    found = True
                    break

        if found:
            # record each matched author only once, keyed on the label string
            already_listed = any(
                name_dictionary['string'] == id_name['string']
                for id_name in identified_authors_list
            )
            if not already_listed:
                identified_authors_list.append(name_dictionary)
        else:
            # 3. no match: keep the name with an empty Q ID
            name_dictionary = {'qid': '', 'string': name_string}
            # The duplicate check is evaluated fresh for every name, so a result
            # left over from an earlier author can never suppress this one
            if name_string not in unidentified_authors_list:
                unidentified_authors_list.append(name_string)
        names_list.append(name_dictionary)
    #print(names_list)
print('\n')
print(identified_authors_list)
print(unidentified_authors_list)
                

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800


[{'qid': 'Q83505898', 'string': 'Lisa L. Thompson'}, {'qid': 'Q92216692', 'string': 'Dale P. Andrews'}, {'qid': 'Q83505887', 'string': 'Phillis Isabella Sheppard'}, {'qid': 'Q92207043', 'string': 'Dawn Ottoni Wilhelm'}, {'qid': 'Q92207288', 'string': 'Ronald J. Allen'}, {'qid': 'Q29447340', 'string': 'Emilie M. Townes'}, {'qid': 'Q27643173', 'string': 'Joerg Rieger'}, {'qid': 'Q83500312', 'string': 'Annalisa Azzoni'}, {'qid': 'Q83505894', 'string': 'Trudy Hawkins Stringer'}, {'qid': 'Q16122562', 'string': 'Fernando Segovia'}, {'qid': 'Q83505860', 'string': 'Bruce Morrill'}, {'qid': 'Q63038670', 'string': 'Mark Miller-McLemore'}, {'qid': 'Q83505836', 'string': 'Herbert R. Marbury'}, {'qid': 'Q53901122', 'string': 'Paul C.H. Lim'}, {'qid': 'Q4749053', 'string': 'Amy-Jill Levine'}, {'qid': 'Q83422314', 'string': 'James Hudnut-Beumler'}, {'qid': 'Q7595851', '