In [63]:
import requests   # best library to manage HTTP transactions
from bs4 import BeautifulSoup # web-scraping library
import json
from time import sleep
import csv
import math
from fuzzywuzzy import fuzz # fuzzy logic matching
from fuzzywuzzy import process
import xml.etree.ElementTree as et # library to traverse XML tree
import urllib
import datetime
import string
from pathlib import Path
import copy # import the copy module from the standard library

import vb_common_code as vbc

employerQId = 'Q29052' # Vanderbilt University
sparqlSleep = 0.1 # number of seconds to wait between queries to SPARQL endpoint
testRatio = 90 # similarity required for a potential match of a generic wikidata match
journalTestRatio = 94 # similarity required for a potential match of a generic wikidata match

home = str(Path.home()) # gets path to home directory; supposed to work for both Win and Mac
csv_path = home + '/divinity-law/vulibDivinity.csv'
known_name_strings_path = home + '/divinity-law/known-name-strings.csv'
wikidata_article_matches_path  = home + '/divinity-law/wikidata-article-matches.csv'
duplicate_works_path  = home + '/divinity-law/duplicate-works.csv'
journals_path = home + '/divinity-law/journals.csv'

# read from a CSV file beginning with a bit order mark (BOM) into a list of dictionaries
# Change encoding type to utf-8-sig instead of utf-8 to fix this
def read_dict_bom(filename):
    with open(filename, 'r', newline='', encoding='utf-8-sig') as fileObject: 
        dictObject = csv.DictReader(fileObject)
        array = []
        for row in dictObject:
            array.append(row)
    return array

def reverse_names(nameLastFirst):
    nameLastFirst = nameLastFirst.strip()
    nameParts = nameLastFirst.split(',')
    if len(nameParts) < 2: # name probably isn't reversed
        name = nameLastFirst.strip()
    else:
        firstName = nameParts[1].strip()
        lastName = nameParts[0].strip()
        name = firstName + ' ' + lastName

        if len(nameParts) > 2:
            suffix = nameParts[2].strip()
        else:
            suffix = ''
        name = firstName + ' ' + lastName
        if suffix == 'Jr.':
            name += ', Jr.'
        elif suffix == 'Jr':
            name += ', Jr.'
        elif suffix == '':
            pass
        elif suffix == 'II':
            name += ' ' + suffix
        elif suffix == 'III':
            name += ' ' + suffix
        elif suffix == 'IV':
            name += ' ' + suffix
        elif suffix == 'V':
            name += ' ' + suffix
    return name

def generateHeaderDictionary(acceptMediaType):
    userAgentHeader = 'VanderBot/1.0 (https://github.com/HeardLibrary/linked-data/tree/master/publications; mailto:steve.baskauf@vanderbilt.edu)'
    requestHeaderDictionary = {
        'Accept' : acceptMediaType,
        'User-Agent': userAgentHeader
    }
    return requestHeaderDictionary

# extracts the qNumber from a Wikidata IRI
def extract_qnumber(iri):
    # pattern is http://www.wikidata.org/entity/Q6386232
    pieces = iri.split('/')
    return pieces[4]

# extracts the reference hash from a reference IRI
def extract_qnumber(iri):
    # pattern is http://www.wikidata.org/entity/Q6386232
    pieces = iri.split('/')
    return pieces[5]

# extracts the UUID and qId from a statement IRI
def extract_statement_uuid(iri):
    # pattern is http://www.wikidata.org/entity/statement/Q7552806-8B88E0CA-BCC8-49D5-9AC2-F1755464F1A2
    pieces = iri.split('/')
    statement_id = pieces[5]
    pieces = statement_id.split('-')
    return pieces[1] + '-' + pieces[2] + '-' + pieces[3] + '-' + pieces[4] + '-' + pieces[5], pieces[0]


In [4]:
# read in data from Zotero download CSV
publications = read_dict_bom(csv_path) # need to use special function since Zotero output has BOM
print(len(publications),'publications read from CSV')

# read in previously discovered Q IDs and names
known_name_strings = vbc.readDict(known_name_strings_path)
print(len(known_name_strings),'discovered authors Q IDs read from CSV')

# read in previously discovered matches of articles to Wikidata
wikidata_article_matches = vbc.readDict(wikidata_article_matches_path)
print(len(wikidata_article_matches),'previously discovered article matches read from CSV')

# read in list of works determined to be duplicates
duplicate_works = vbc.readDict(duplicate_works_path)
print(len(duplicate_works),'duplicate works list items read from CSV')

# read in list of all Q IDs and names of journals in Wikidata
journals_in_wikidata = vbc.readDict(journals_path)
print(len(journals_in_wikidata),'journals read from CSV')

# Download the labels and descriptions of all existing institutional people
org_label_query = vbc.Query(labelscreen='?id wdt:P1416 ?deptOrCollege.?deptOrCollege wdt:P749+ wd:' + employerQId + '.', sleep=sparqlSleep)
org_labels = org_label_query.labels_descriptions('')
print(len(org_labels), 'Wikidata people labels downloaded')

'''
org_description_query = vbc.Query(labeltype='description', labelscreen='?id wdt:P1416 ?deptOrCollege.?deptOrCollege wdt:P749+ wd:' + employerQId + '.', sleep=sparqlSleep)
org_descriptions = org_description_query.labels_descriptions('')
print(len(org_descriptions), 'descriptions downloaded')
'''

2819 publications read from CSV
114 discovered authors Q IDs read from CSV
0 previously discovered article matches read from CSV
0 duplicate works list items read from CSV
18100 journals read from CSV
4649 Wikidata people labels downloaded


"\norg_description_query = vbc.Query(labeltype='description', labelscreen='?id wdt:P1416 ?deptOrCollege.?deptOrCollege wdt:P749+ wd:' + employerQId + '.', sleep=sparqlSleep)\norg_descriptions = org_description_query.labels_descriptions('')\nprint(len(org_descriptions), 'descriptions downloaded')\n"

In [None]:
output_list = []
identified_authors_list = [] # list of dictionaries
unidentified_authors_list = [] # list of strings
identified_journals_list = [] # list of dictionaries
unidentified_journals_list = [] # list of dictionaries

count = 0
for publication in publications:
    count += 1
    # print something every 100 rows
    if count % 100 == 0:
        print(count)
        
    # Set up the output row by starting with the input row
    # The input row is a dictionary
    output_row = copy.deepcopy(publication) # use deepcopy rather than a reference
        
    # *********** Author matching ***********
    
    # First separate the authors if more than one
    if ';' in publication['Author']:
        names = publication['Author'].split(';')
    else:
        names = [publication['Author']]
    names_list = []
    
    # Process each of the authors of the publication
    for name in names:
        # The name dict is where we store names and Q IDs (if discovered)
        # for all of the authors of the paper
        name_dictionary = {}
        
        # Guard against errors in the case where authors are missing
        if name.strip() != '':
            
            # Neet to revers the names if they are last name first
            name_string = reverse_names(name)
            
            # First check whether it's a name that matches a know Div School person
            found = False
            for label in org_labels:
                # NOTE: used token_set_ratio here because of the wide variety of 
                # ways that names are sliced and diced. Need to determine how 
                # often there are false positives.
                setRatio = fuzz.token_set_ratio(name_string, label['string'])
                if setRatio >= testRatio: # We get a name match
                    name_dictionary['string'] = label['string'] # Add the match to the name dict
                    name_dictionary['qid'] = label['qid'] # Add the match to the name dict
                    found = True
                    matched = False
                    
                    # Find out whether we need to add to the list of IDed authors
                    for id_name in identified_authors_list:
                        if name_dictionary['string'] == id_name['string']:
                            matched = True
                            break # already in the list, do nothing more
                    if not matched: # all the way through the loop without finding, so add to list
                        identified_authors_list.append(name_dictionary)
                        
                    break # since found, quit the loop looking for that author's label
                    
            # If the name loop wasn't broken, move on to checking the 
            # list we keep of identified people who don't match the VU list
            for label in known_name_strings:
                if name_string == label['string']: # We get a name match with dirty name string
                    name_dictionary['qid'] = label['qid'] # Add the discovered name to article name dict
                    name_dictionary['string'] = label['label'] # Add the Wikidata label for the match
                    found = True
                    matched = False
                    
                    # Find out whether we need to add this match to the list of IDed authors
                    for id_name in identified_authors_list:
                        if name_dictionary['string'] == id_name['string']:
                            matched = True
                            break
                    if not matched:
                        identified_authors_list.append(name_dictionary)
                    break
                
            # If there is no match to either name source, then don't give a Q ID
            if not found:
                #print('none', name_string)
                name_dictionary['qid'] = ''
                name_dictionary['string'] = name_string
                matched = False
                
                # Find out if this is a new unidentified name or 
                # if it's already on the list of unIDed people
                for unid_name in unidentified_authors_list:
                    if name_string == unid_name['label']:
                        matched = True
                        break # quit checking on more unIDed people
                if not matched: # need to add to the list if never found
                    unidentified_authors_list.append({'label': name_string, 'listing': name.strip()})
            names_list.append(name_dictionary)
    output_row['author_qids_names'] = names_list 
    #print(names_list)
    
    
    # *********** Journal matching ***********

    # only process rows for journal articles
    if publication['Item Type'] == 'journalArticle':
        # first see if the Publication Title is one we already know about
        found_in_id_journals = False
        for known_journal in identified_journals_list:
            setRatio = fuzz.ratio(publication['Publication Title'], known_journal['label'])
            if setRatio >= journalTestRatio: # We get a name match
                found_in_id_journals = True
                output_row['publication_title_qid'] = known_journal['qid'] # assign this Q ID for the journal of the row
                # don't need to keep looking so quit the search loop
                break
                
        # need to check the big list if the journal is new
        if not found_in_id_journals:
            found_in_poss_journals = False
            for possible_journal in journals_in_wikidata:
                # Used regular ratio rather than token_set_ratio here. There were false positives
                # Due to the many similar journals that had overlapping name parts.
                setRatio = fuzz.ratio(possible_journal['label'], publication['Publication Title'])
                if setRatio >= journalTestRatio: # We get a title string match
                    found_in_poss_journals = True
                    output_row['publication_title_qid'] = possible_journal['qid'] # assign this Q ID for the journal of the row
                    print(possible_journal['qid'], possible_journal['label'], setRatio, publication['Publication Title'])
                    identified_journals_list.append({'qid': possible_journal['qid'], 'label': possible_journal['label']})
                    break
            if not found_in_poss_journals:
                matched = False
                for unid_journal in unidentified_journals_list:
                    if publication['Publication Title'] == unid_journal:
                        matched = True
                        break
                if not matched:
                    unidentified_journals_list.append(publication['Publication Title'])
    output_list.append(output_row)

# Save the data
filename = home + '/divinity-law/output.csv'
fieldnames = ['author_qids_names', 'publication_title_qid'] + list(publications[0].keys())
vbc.writeDictsToCsv(output_list, filename, fieldnames)

# Save the identified authors
filename = home + '/divinity-law/identified-authors.csv'
fieldnames = ['qid', 'string']
vbc.writeDictsToCsv(identified_authors_list, filename, fieldnames)

# Save the identified journals
filename = home + '/divinity-law/identified-journals.csv'
fieldnames = ['qid', 'label']
vbc.writeDictsToCsv(identified_journals_list, filename, fieldnames)

# Save the list of unidentified authors
filename = home + '/divinity-law/unidentified-authors.csv'
fieldnames = ['label', 'listing']
vbc.writeDictsToCsv(unidentified_authors_list, filename, fieldnames)

# Save the list of unidentified journals
filename = home + '/divinity-law/unidentified-journals.txt'
with open(filename, 'wt', encoding='utf-8') as fileObject:
    for journal in unidentified_journals_list:
        print(journal, file=fileObject)
'''
# Save the discovered article matches
fieldnames = ['Key', 'qid']
vbc.writeDictsToCsv(wikidata_article_matches_path, filename, fieldnames)

# Save the discovered duplicate works
fieldnames = ['Key', 'duplicatekey']
vbc.writeDictsToCsv(duplicate_works_path, filename, fieldnames)
'''
print('done')


# Load matched journals from CSV and get metadata from Wikidata



In [31]:
# Load journal data from file
print('loading data from file')
filename = home + '/divinity-law/identified-journals.csv'
identified_journals = vbc.readDict(filename)
journal_qids = ''
for journal in identified_journals:
    journal_qids += 'wd:' + journal['qid'] + '\n'
# remove trailing newline
journal_qids = journal_qids[:len(journal_qids)-1]

# create properties dictionary
prop_list = [
    {'pid': 'rdfs:label', 'variable': 'label', 'value_type': 'string'},
    {'pid': 'schema:description', 'variable': 'description', 'value_type': 'string'},
    {'pid': 'wdt:P31', 'variable': 'instance_of', 'value_type': 'item'},
    {'pid': 'wdt:P1476', 'variable': 'title', 'value_type': 'string'},
    {'pid': 'wdt:P407', 'variable': 'language_of_work', 'value_type': 'item'},
    {'pid': 'wdt:P495', 'variable': 'country_of_origin', 'value_type': 'item'},
    {'pid': 'wdt:P123', 'variable': 'publisher', 'value_type': 'item'},
    {'pid': 'wdt:P571', 'variable': 'inception', 'value_type': 'date'},
    {'pid': 'wdt:P2669', 'variable': 'discontinued_date', 'value_type': 'item'},
    {'pid': 'wdt:P856', 'variable': 'official_website', 'value_type': 'uri'},
    {'pid': 'wdt:P155', 'variable': 'follows', 'value_type': 'item'},
    {'pid': 'wdt:P156', 'variable': 'followed_by', 'value_type': 'item'},
    {'pid': 'wdt:P921', 'variable': 'main_subject', 'value_type': 'item'},
    {'pid': 'wdt:P2896', 'variable': 'publication_interval', 'value_type': 'decimal'},
    {'pid': 'wdt:P236', 'variable': 'issn', 'value_type': 'string'}
]

# create a string for the query
query = '''
select distinct ?id '''

for property in prop_list:
    query += '?' + property['variable'] + ' '

query += 'where {'
        
query += '''
  VALUES ?id
{
''' + journal_qids + '''
}

'''

for property in prop_list:
    query += 'OPTIONAL {?id ' + property['pid'] + ' ?' + property['variable'] + '.}\n'

query += '''
filter(lang(?label)="en")
filter(lang(?description)="en")
}'''

#print(query)

# Send SPARQL query to the Wikidata Query Service
print('retrieving data from Wikidata')
endpoint = 'https://query.wikidata.org/sparql'
useragent = 'VanderDiv/0.1 (https://github.com/HeardLibrary/linked-data/tree/master/publications/divinity-law; mailto:steve.baskauf@vanderbilt.edu)' 
requestheader = {
'Content-Type': 'application/sparql-query',
'Accept' : 'application/json',
'User-Agent': useragent
}

# send request to Wikidata Query Service
response = requests.post(endpoint, data=query, headers=requestheader)
data = response.json()

# extract the values from the response JSON
results = data['results']['bindings']
output_list = []
for result in results:
    row_dict = {'qid': extract_qnumber(result['id']['value'])}
    for property in prop_list:
        try:
            if property['value_type'] == 'item':
                # remove wd: 'http://www.wikidata.org/entity/'
                value = extract_qnumber(result[property['variable']]['value'])
            else:
                value = result[property['variable']]['value']
            row_dict[property['variable']] = value
        except:
            pass
    output_list.append(row_dict)
#print(output_list)

# create the list of column headers
fieldnames = ['qid']
for property in prop_list:
    fieldnames.append(property['variable'])

# write the data to a CSV file
print('writing data to file')
vbc.writeDictsToCsv(output_list, home + '/divinity-law/journal_metadata_from_wikidata.csv', fieldnames)

print('done')

loading data from file
retrieving data from Wikidata
writing data to file
done


generic script

In [106]:
# Load journal data from file
print('loading data from file')
filename = home + '/divinity-law/identified-journals.csv'
identified_journals = vbc.readDict(filename)

# Create VALUES list for journals
journal_qids = ''
for journal in identified_journals:
    journal_qids += 'wd:' + journal['qid'] + '\n'
# remove trailing newline
journal_qids = journal_qids[:len(journal_qids)-1]

# create properties dictionary
prop_list = [
    {'pid': 'rdfs:label', 'variable': 'label', 'value_type': 'string'},
    {'pid': 'schema:description', 'variable': 'description', 'value_type': 'string'},
    {'pid': 'P31', 'variable': 'instance_of', 'value_type': 'item'},
    {'pid': 'P1476', 'variable': 'title', 'value_type': 'string'},
    {'pid': 'P407', 'variable': 'language_of_work', 'value_type': 'item'},
    {'pid': 'P495', 'variable': 'country_of_origin', 'value_type': 'item'},
    {'pid': 'P123', 'variable': 'publisher', 'value_type': 'item'},
    {'pid': 'P571', 'variable': 'inception', 'value_type': 'date'},
    {'pid': 'P2669', 'variable': 'discontinued_date', 'value_type': 'item'},
    {'pid': 'P856', 'variable': 'official_website', 'value_type': 'uri'},
    {'pid': 'P155', 'variable': 'follows', 'value_type': 'item'},
    {'pid': 'P156', 'variable': 'followed_by', 'value_type': 'item'},
    {'pid': 'P921', 'variable': 'main_subject', 'value_type': 'item'},
    {'pid': 'P2896', 'variable': 'publication_interval', 'value_type': 'decimal'},
    {'pid': 'P236', 'variable': 'issn', 'value_type': 'string'}
]

prop_list[2]

loading data from file


{'pid': 'P31', 'variable': 'instance_of', 'value_type': 'item'}

In [114]:
#for property in prop_list[14:15]:
for property in prop_list[2:3]:
    # create a string for the query
    query = '''
    select distinct ?id '''

    query += '?' + property['variable'] + '_value '
    query += '?' + property['variable'] + '_statement '

    query += 'where {'

    query += '''
      VALUES ?id
    {
    ''' + journal_qids + '''
    }

    '''


    query += '?id wdt:' + property['pid'] + ' ?' + property['variable'] + '_value.\n'
    query += '?id p:' + property['pid'] + ' ?' + property['variable'] + '_statement.\n'

    query += '''
    #filter(lang(?label)="en")
    #filter(lang(?description)="en")
    }'''

    print(query)


    select distinct ?id ?instance_of_value ?instance_of_statement where {
      VALUES ?id
    {
    wd:Q7311312
wd:Q4984286
wd:Q15749660
wd:Q4041903
wd:Q38312865
wd:Q4051386
wd:Q57615071
wd:Q7743622
wd:Q15710071
wd:Q15767535
wd:Q6295853
wd:Q15816830
wd:Q63871713
wd:Q91678397
wd:Q15764425
wd:Q15762047
wd:Q18350356
wd:Q50432461
wd:Q15763977
wd:Q15750019
wd:Q15751172
wd:Q68836075
wd:Q6295487
wd:Q6294702
wd:Q7989227
wd:Q7437800
wd:Q15753995
wd:Q57070350
wd:Q15755134
wd:Q50815593
wd:Q7311115
wd:Q4041879
wd:Q15754681
wd:Q15816920
wd:Q7628082
wd:Q5442869
wd:Q6087079
wd:Q15755037
wd:Q15767635
wd:Q15817868
wd:Q3523724
wd:Q5024509
wd:Q15754942
wd:Q15760154
wd:Q8075881
wd:Q6888892
wd:Q15759717
wd:Q5158991
wd:Q63871731
wd:Q15707782
wd:Q15710287
wd:Q15761107
wd:Q15762548
wd:Q4903269
wd:Q15763974
wd:Q5676604
wd:Q15763357
wd:Q15763031
wd:Q15755002
wd:Q50478535
wd:Q15716275
wd:Q63871802
wd:Q15756294
wd:Q15716423
wd:Q15746447
wd:Q15755905
wd:Q15749151
wd:Q62259203
wd:Q1941982
wd:Q50814896
wd:Q5042628

In [115]:
# Send SPARQL query to the Wikidata Query Service
print('retrieving data from Wikidata')
endpoint = 'https://query.wikidata.org/sparql'
useragent = 'VanderDiv/0.1 (https://github.com/HeardLibrary/linked-data/tree/master/publications/divinity-law; mailto:steve.baskauf@vanderbilt.edu)' 
requestheader = {
'Content-Type': 'application/sparql-query',
'Accept' : 'application/json',
'User-Agent': useragent
}

# send request to Wikidata Query Service
response = requests.post(endpoint, data=query, headers=requestheader)
data = response.json()

# extract the values from the response JSON
results = data['results']['bindings']
results

retrieving data from Wikidata


[{'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q68836075'},
  'instance_of_value': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/Q737498'},
  'instance_of_statement': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/statement/Q68836075-524F3A33-FD71-47D6-84A6-2C4E008B4BF0'}},
 {'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q91678397'},
  'instance_of_value': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/Q737498'},
  'instance_of_statement': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/statement/Q91678397-5cc36403-4360-14b0-3f4e-879b88125a8c'}},
 {'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q62259203'},
  'instance_of_value': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/Q5633421'},
  'instance_of_statement': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/statement/Q62259203-6E280AAE-1E0B-45B8-8F7B-32B3ECA5FF34'}},
 {'id': {'type': 'uri', 'value': 'http:

In [116]:
statements_list = []
for result in results:
    row_dict = {}
    row_dict = {'qid': extract_qnumber(result['id']['value'])}

    if property['value_type'] == 'item':
        # remove wd: 'http://www.wikidata.org/entity/'
        value = extract_qnumber(result[property['variable'] + '_value']['value'])
    else:
        value = result[property['variable'] + '_value']['value']
    row_dict[property['variable'] + '_value'] = value
    temp_uuid, temp_qid = extract_statement_uuid(result[property['variable'] + '_statement']['value'])
    row_dict[property['variable'] + '_statement'] = temp_qid + '-' + temp_uuid
    statements_list.append(row_dict)
    
print(statements_list)


[{'qid': 'Q68836075', 'instance_of_value': 'Q737498', 'instance_of_statement': 'Q68836075-524F3A33-FD71-47D6-84A6-2C4E008B4BF0'}, {'qid': 'Q91678397', 'instance_of_value': 'Q737498', 'instance_of_statement': 'Q91678397-5cc36403-4360-14b0-3f4e-879b88125a8c'}, {'qid': 'Q62259203', 'instance_of_value': 'Q5633421', 'instance_of_statement': 'Q62259203-6E280AAE-1E0B-45B8-8F7B-32B3ECA5FF34'}, {'qid': 'Q62259203', 'instance_of_value': 'Q5633421', 'instance_of_statement': 'Q62259203-80C4942D-123A-46CB-BF81-A25A6B00C307'}, {'qid': 'Q88977207', 'instance_of_value': 'Q737498', 'instance_of_statement': 'Q88977207-735CC0AB-754A-49FE-8FF1-22799E2D58CE'}, {'qid': 'Q63871731', 'instance_of_value': 'Q737498', 'instance_of_statement': 'Q63871731-7A024214-F5CB-425A-B167-4571CA6C7B0A'}, {'qid': 'Q63871802', 'instance_of_value': 'Q737498', 'instance_of_statement': 'Q63871802-E9E66F81-0778-4F51-963C-1F88FFC19382'}, {'qid': 'Q63871713', 'instance_of_value': 'Q737498', 'instance_of_statement': 'Q63871713-EC003

Try to get references for the statements

In [117]:
# Create VALUES list for statmements
statement_uuids = ''
for statement in statements_list:
    statement_uuids += 'wds:' + statement[property['variable'] + '_statement'] + '\n'
# remove trailing newline
statement_uuids = statement_uuids[:len(statement_uuids)-1]
print(statement_uuids)

wds:Q68836075-524F3A33-FD71-47D6-84A6-2C4E008B4BF0
wds:Q91678397-5cc36403-4360-14b0-3f4e-879b88125a8c
wds:Q62259203-6E280AAE-1E0B-45B8-8F7B-32B3ECA5FF34
wds:Q62259203-80C4942D-123A-46CB-BF81-A25A6B00C307
wds:Q88977207-735CC0AB-754A-49FE-8FF1-22799E2D58CE
wds:Q63871731-7A024214-F5CB-425A-B167-4571CA6C7B0A
wds:Q63871802-E9E66F81-0778-4F51-963C-1F88FFC19382
wds:Q63871713-EC003478-FBE5-4812-B414-868AC673033D
wds:Q50814896-DF665463-9CB2-4F05-A287-D5359512EEF8
wds:Q50814896-FDBA9022-8407-4FCE-BCEE-425E1BFB98CE
wds:Q50814896-DF665463-9CB2-4F05-A287-D5359512EEF8
wds:Q50814896-FDBA9022-8407-4FCE-BCEE-425E1BFB98CE
wds:Q50426286-83586CB7-1672-447E-A9C1-CE3D4DE88E18
wds:Q50426286-975B042C-A24C-4F39-A802-1746A1D7CE9A
wds:Q50426286-83586CB7-1672-447E-A9C1-CE3D4DE88E18
wds:Q50426286-975B042C-A24C-4F39-A802-1746A1D7CE9A
wds:Q15759717-A3C798F2-297B-41B8-A44E-B51678E43752
wds:Q15762548-1A25729F-E7BA-4A0E-8207-88A4FC7DD86D
wds:Q15762548-20760FE6-651D-48AD-B808-B8918B4A8E2F
wds:Q15762548-944364C9-23EB-4BE

Used this query:
```
select distinct ?gprop ?prop_label where {
     VALUES ?id
    {
wd:Q7311312
wd:Q4984286
wd:Q15749660
etc.
    }
?id p:P236 ?issn_statement.
?issn_statement prov:wasDerivedFrom ?reference.
?reference ?prop ?value.
?gprop wikibase:reference ?prop.
?gprop rdfs:label ?prop_label.
filter(lang(?prop_label)='en')
}
```
to find out what reference properties were in use in the journals of interest. They were:

```
gprop   prop_label
wd:P143  imported from Wikimedia project
wd:P248  stated in
wd:P813  retrieved
wd:P854  reference URL
wd:P4327 BHL bibliography ID
```
The only Wikimedia project they were imported from was Wikipedia, so not a useful one to track. I don't think BHL ID is either, so that leaves P248, P813, and P845 as the useful ones to track.

In [118]:
# create a string for the query
query = 'select distinct ?label ?' + property['variable'] + ' ?statement ?reference ?statedIn ?refUrl ?retrDate ?retrPrecision where {'

query += '''
  VALUES ?statement
{
''' + statement_uuids + '''
}

'''
query += '''
?statement ps:''' + property['pid'] + ' ?' + property['variable'] + '''.
?qid p:''' + property['pid'] + ''' ?statement.
?qid rdfs:label ?label.
filter(lang(?label)='en')
optional{
?statement prov:wasDerivedFrom ?reference.
?reference pr:P248 ?statedIn.
}
optional{
?statement prov:wasDerivedFrom ?reference.
?reference pr:P854 ?refUrl.
}
optional{
?statement prov:wasDerivedFrom ?reference.
?reference prv:P813 ?retrievedNode.
?retrievedNode wikibase:timeValue ?retrDate.
?retrievedNode wikibase:timePrecision ?retrPrecision.
}
}'''

print(query)

select distinct ?label ?instance_of ?statement ?reference ?statedIn ?refUrl ?retrDate ?retrPrecision where {
  VALUES ?statement
{
wds:Q68836075-524F3A33-FD71-47D6-84A6-2C4E008B4BF0
wds:Q91678397-5cc36403-4360-14b0-3f4e-879b88125a8c
wds:Q62259203-6E280AAE-1E0B-45B8-8F7B-32B3ECA5FF34
wds:Q62259203-80C4942D-123A-46CB-BF81-A25A6B00C307
wds:Q88977207-735CC0AB-754A-49FE-8FF1-22799E2D58CE
wds:Q63871731-7A024214-F5CB-425A-B167-4571CA6C7B0A
wds:Q63871802-E9E66F81-0778-4F51-963C-1F88FFC19382
wds:Q63871713-EC003478-FBE5-4812-B414-868AC673033D
wds:Q50814896-DF665463-9CB2-4F05-A287-D5359512EEF8
wds:Q50814896-FDBA9022-8407-4FCE-BCEE-425E1BFB98CE
wds:Q50814896-DF665463-9CB2-4F05-A287-D5359512EEF8
wds:Q50814896-FDBA9022-8407-4FCE-BCEE-425E1BFB98CE
wds:Q50426286-83586CB7-1672-447E-A9C1-CE3D4DE88E18
wds:Q50426286-975B042C-A24C-4F39-A802-1746A1D7CE9A
wds:Q50426286-83586CB7-1672-447E-A9C1-CE3D4DE88E18
wds:Q50426286-975B042C-A24C-4F39-A802-1746A1D7CE9A
wds:Q15759717-A3C798F2-297B-41B8-A44E-B51678E43752
wd

In [119]:
# send request to Wikidata Query Service
response = requests.post(endpoint, data=query, headers=requestheader)
data = response.json()

# extract the values from the response JSON
results = data['results']['bindings']
results

[{'statement': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/statement/Q50815593-87C17CD1-F50C-4BDE-804C-0AC06F958B37'},
  'reference': {'type': 'uri',
   'value': 'http://www.wikidata.org/reference/e544413e0106f7253a231dfffe136d9d9fff837a'},
  'retrDate': {'datatype': 'http://www.w3.org/2001/XMLSchema#dateTime',
   'type': 'literal',
   'value': '2018-05-10T00:00:00Z'},
  'retrPrecision': {'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
   'type': 'literal',
   'value': '11'},
  'instance_of': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/Q773668'},
  'label': {'xml:lang': 'en',
   'type': 'literal',
   'value': 'Journal of French and Francophone Philosophy'},
  'statedIn': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/Q1227538'}},
 {'statement': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/statement/Q50426426-CC1BBEFB-DA93-4EF8-9D9D-899A08417EC2'},
  'reference': {'type': 'uri',
   'value': 'http://www.wikidata.org/refe

In [120]:
# extract results
references_list = []
for result in results:
    row_dict = {}
    row_dict[property['variable'] + '_uuid'], row_dict['qid'] = extract_statement_uuid(result['statement']['value'])
    row_dict[property['variable']] = result[property['variable']]['value']
    row_dict['label'] = result['label']['value']
    try:
        row_dict[property['variable'] + '_ref1_hash'] = extract_qnumber(result['reference']['value'])
    except:
        row_dict[property['variable'] + '_ref1_hash'] = ''
    try:
        row_dict[property['variable'] + '_ref1_statedIn'] = extract_qnumber(result['statedIn']['value'])
    except:
        row_dict[property['variable'] + '_ref1_statedIn'] = ''
    try:
        row_dict[property['variable'] + '_ref1_referenceUrl'] = result['refUrl']['value']
    except:
        row_dict[property['variable'] + '_ref1_referenceUrl'] = ''
    try:
        row_dict[property['variable'] + '_ref1_retrieved_val'] = result['retrDate']['value']
    except:
        row_dict[property['variable'] + '_ref1_retrieved_val'] = ''
    try:
        row_dict[property['variable'] + '_ref1_retrieved_prec'] = result['retrPrecision']['value']
    except:
        row_dict[property['variable'] + '_ref1_retrieved_prec'] = ''
    references_list.append(row_dict)
    

print(json.dumps(references_list, indent=2))

[
  {
    "instance_of_uuid": "87C17CD1-F50C-4BDE-804C-0AC06F958B37",
    "qid": "Q50815593",
    "instance_of": "http://www.wikidata.org/entity/Q773668",
    "label": "Journal of French and Francophone Philosophy",
    "instance_of_ref1_hash": "e544413e0106f7253a231dfffe136d9d9fff837a",
    "instance_of_ref1_statedIn": "Q1227538",
    "instance_of_ref1_referenceUrl": "",
    "instance_of_ref1_retrieved_val": "2018-05-10T00:00:00Z",
    "instance_of_ref1_retrieved_prec": "11"
  },
  {
    "instance_of_uuid": "CC1BBEFB-DA93-4EF8-9D9D-899A08417EC2",
    "qid": "Q50426426",
    "instance_of": "http://www.wikidata.org/entity/Q773668",
    "label": "Estudos de Religi\u00e3o",
    "instance_of_ref1_hash": "e544413e0106f7253a231dfffe136d9d9fff837a",
    "instance_of_ref1_statedIn": "Q1227538",
    "instance_of_ref1_referenceUrl": "",
    "instance_of_ref1_retrieved_val": "2018-05-10T00:00:00Z",
    "instance_of_ref1_retrieved_prec": "11"
  },
  {
    "instance_of_uuid": "FDBA9022-8407-4FCE-BC

In [121]:
# create the list of column headers
fieldnames = ['qid', property['variable'] + '_uuid', property['variable'], property['variable'] + '_ref1_hash', property['variable'] + '_ref1_statedIn', property['variable'] + '_ref1_referenceUrl', property['variable'] + '_ref1_retrieved_val', property['variable'] + '_ref1_retrieved_prec', 'label']

# write the data to a CSV file
print('writing data to file')
vbc.writeDictsToCsv(references_list, home + '/divinity-law/test-journal-instanceof.csv', fieldnames)

print('done')

writing data to file
done


Other stuff

In [158]:
print(list(publications[0].keys()))
print(publications[0]['Key'])

['Key', 'Item Type', 'Publication Year', 'Author', 'Title', 'Publication Title', 'ISBN', 'ISSN', 'DOI', 'Url', 'Abstract Note', 'Date', 'Date Added', 'Date Modified', 'Access Date', 'Pages', 'Num Pages', 'Issue', 'Volume', 'Number Of Volumes', 'Journal Abbreviation', 'Short Title', 'Series', 'Series Number', 'Series Text', 'Series Title', 'Publisher', 'Place', 'Language', 'Rights', 'Type', 'Archive', 'Archive Location', 'Library Catalog', 'Call Number', 'Extra', 'Notes', 'File Attachments', 'Link Attachments', 'Manual Tags', 'Automatic Tags', 'Editor', 'Series Editor', 'Translator', 'Contributor', 'Attorney Agent', 'Book Author', 'Cast Member', 'Commenter', 'Composer', 'Cosponsor', 'Counsel', 'Interviewer', 'Producer', 'Recipient', 'Reviewed Author', 'Scriptwriter', 'Words By', 'Guest', 'Number', 'Edition', 'Running Time', 'Scale', 'Medium', 'Artwork Size', 'Filing Date', 'Application Number', 'Assignee', 'Issuing Authority', 'Country', 'Meeting Name', 'Conference Name', 'Court', 'Refe

In [129]:
alternatives = ''
for author in identified_authors_list:
    alternatives += 'wd:' + author['qid'] + '\n'

# create a string for the query
query = '''
select distinct ?article ?articleLabel
    where {
        VALUES ?id
    {
''' + alternatives + '''
    }
    ?article wdt:P50 ?id.
    {?article wdt:P31 wd:Q18918145.} union {?article wdt:P31 wd:Q13442814}
    ?article rdfs:label ?articleLabel.
    filter(lang(?articleLabel)='en')
}'''

#print(query)



select distinct ?article ?articleLabel
    where {
        VALUES ?id
    {
wd:Q83505898
wd:Q92216692
wd:Q83505887
wd:Q92207043
wd:Q92207288
wd:Q29447340
wd:Q27643173
wd:Q83500312
wd:Q83505894
wd:Q16122562
wd:Q83505860
wd:Q63038670
wd:Q83505836
wd:Q53901122
wd:Q4749053
wd:Q91188434
wd:Q87400652
wd:Q83422314
wd:Q7595851
wd:Q83432379
wd:Q82775133
wd:Q6525772
wd:Q20708438
wd:Q55583731
wd:Q83389821
wd:Q63430468
wd:Q92206914
wd:Q92207578
wd:Q91909582
wd:Q83500331
wd:Q52274406
wd:Q73037010
wd:Q83505891
wd:Q4772041
wd:Q83505877
wd:Q83505983
wd:Q63038665
wd:Q6113796
wd:Q86366619
wd:Q92207880
wd:Q67554073
wd:Q83492880
wd:Q71850074
wd:Q83505989
wd:Q83505844
wd:Q84486
wd:Q16973499
wd:Q16830980
wd:Q83505863
wd:Q83500320
wd:Q86835641
wd:Q92224966
wd:Q83260986
wd:Q83505820
wd:Q92212610
wd:Q92212616
wd:Q15379207
wd:Q63183857
wd:Q83505948
wd:Q92213987
wd:Q5218399
wd:Q30302655
wd:Q19595159
wd:Q1328650
wd:Q6200439
wd:Q5473986
wd:Q18390705
wd:Q630122
wd:Q194754
wd:Q25455836
wd:Q89052941
wd:Q6303611
wd:Q

In [136]:
existing_articles = []
wikidataEndpointUrl = 'https://query.wikidata.org/sparql'
acceptMediaType = 'application/json'
r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers = generateHeaderDictionary(acceptMediaType))
try:
    data = r.json()
    statements = data['results']['bindings']
    for statement in statements:
        wikidataIri = statement['article']['value']
        article_qnumber = extract_qnumber(wikidataIri)
        article_label = statement['articleLabel']['value']
        existing_articles.append({'qid': article_qnumber, 'label': article_label})
except:
    existing_articles = [r.text]
# delay a quarter second to avoid hitting the SPARQL endpoint to rapidly
sleep(sparqlSleep)

print(json.dumps(existing_articles, indent=2))


[
  {
    "qid": "Q53141308",
    "label": "Revisiting the living human web: theological education and the role of clinical pastoral education."
  },
  {
    "qid": "Q69685319",
    "label": "Marital therapy caught between person and public: Christian traditions on marriage"
  },
  {
    "qid": "Q69685326",
    "label": "Also a pastoral theologian: in pursuit of dynamic theology (or: meditations from a recalcitrant heart)"
  },
  {
    "qid": "Q69685321",
    "label": "Teaching practical theology: introducing six perspectives"
  },
  {
    "qid": "Q69685324",
    "label": "Embodied knowing, embodied theology: what happened to the body?"
  },
  {
    "qid": "Q69685325",
    "label": "Five misunderstandings about practical theology"
  },
  {
    "qid": "Q69685331",
    "label": "Cognitive neuroscience and the question of theological method"
  },
  {
    "qid": "Q69685330",
    "label": "Words made flesh: writings in pastoral and practical theology"
  },
  {
    "qid": "Q69685333",
    "l

In [159]:

count = 0
for publication in publications:
    count += 1
    # print something every 100 rows
    if count % 100 == 0:
        print(count)
        
    # only process rows for journal articles
    if publication['Item Type'] == 'journalArticle':
        for article in existing_articles:
            #setRatio = fuzz.token_set_ratio(article['label'], publication['Title'])
            setRatio = fuzz.ratio(article['label'].lower(), publication['Title'].lower())
            #if setRatio >= 95: # We get a name match
            if setRatio >= 80: # We get a name match
                if article['label'] != publication['Title']:
                    print('Match ratio:', setRatio)
                    print(article['label'])
                    print('https://www.wikidata.org/wiki/' + article['qid'])
                    print('row:', count + 1)
                    print(publication['Title'])
                    print(publication['Author'])
                    print()
print('done')

100
Match ratio: 92
Eros for the other: retaining truth in a pluralistic world
https://www.wikidata.org/wiki/Q69685414
row: 149
Review of Eros for the other: retaining truth in a pluralistic world
Armour, Ellen T.

200
300
400
500
600
700
800
900
Match ratio: 100
The wrestle of Christ and culture in pragmatic public theology
https://www.wikidata.org/wiki/Q92212019
row: 923
The Wrestle of Christ and Culture in Pragmatic Public Theology
Anderson, Victor

Match ratio: 100
The narrative turn in Christian ethics : a critical appraisal
https://www.wikidata.org/wiki/Q92214070
row: 925
The Narrative Turn in Christian Ethics : A Critical Appraisal
Anderson, Victor

Match ratio: 100
Black scholarly aesthetics and the religious critic : black experience as manifolds of manifestations and powers of presentations
https://www.wikidata.org/wiki/Q91649499
row: 945
Black scholarly aesthetics and the religious critic: black experience as manifolds of manifestations and powers of presentations
Anderson, 

In [236]:
def retrieveCrossRefDoi(doi):
    metadata = {}
    authorList = []
    crossRefEndpointUrl = 'https://api.crossref.org/works/'
    encodedDoi = urllib.parse.quote(doi)
    searchUrl = crossRefEndpointUrl + encodedDoi
    acceptMediaType = 'application/json'
    response = requests.get(searchUrl, headers=vbc.generateHeaderDictionary(acceptMediaType))
    if response.status_code == 404:
        authorList = [] # return an empty list if the DOI won't dereference at CrossRef
    else:
        if 1==1:
        #try:
            data = response.json()
            message = data['message']
            #print(json.dumps(data, indent = 2))
            if 'author' in message:
                authors = message['author']
                for author in authors:
                    authorDict = {}
                    if 'ORCID' in author:
                        authorDict['orcid'] = author['ORCID']
                    else:
                        authorDict['orcid'] = ''
                    if 'given' in author:
                        authorDict['givenName'] = author['given']
                    else:
                        authorDict['givenName'] = ''
                    if 'family' in author:
                        authorDict['familyName'] = author['family']
                    else:
                        authorDict['familyName'] = ''
                    affiliationList = []
                    if 'affiliation' in author:
                        for affiliation in author['affiliation']:
                            affiliationList.append(affiliation['name'])
                    # if there aren't any affiliations, the list will remain empty
                    authorDict['affiliation'] = affiliationList
                    authorList.append(authorDict)
            metadata['authors'] = authorList
            if 'issue' in message:
                metadata['issue'] = message['issue']
            else:
                metadata['issue'] = ''
                
            if 'page' in message:
                metadata['page'] = message['page']
            else:
                metadata['page'] = ''
            
            if 'volume' in message:
                metadata['volume'] = message['volume']
            else:
                metadata['volume'] = ''
            
            if 'language' in message:
                metadata['language'] = message['language']
            else:
                metadata['language'] = ''
            
            if 'issued' in message:
                metadata['date'] = message['issued']['date-parts'][0][0]
            else:
                metadata['date'] = ''
            
            if 'title' in message:
                metadata['title'] = message['title'][0]
            else:
                metadata['title'] = ''
            
            if 'container-title' in message:
                metadata['journaltitle'] = message['container-title'][0]
            else:
                metadata['journaltitle'] = ''
            
            if 'issn-type' in message:
                print_flag = False
                electronic_flag = False
                for issntype in message['issn-type']:
                    if issntype['type'] == 'print':
                        print_flag = True
                        metadata['printissn'] = issntype['value']
                    if issntype['type'] == 'electronic':
                        electronic_flag = True
                        metadata['electronicissn'] = issntype['value']
                if not print_flag:
                    metadata['printissn'] = ''
                if not electronic_flag:
                    metadata['electronicissn'] = ''
            else:
                metadata['printissn'] = ''
                metadata['electronicissn'] = ''
            
       #except:
        #    metadata = data
    return metadata


In [237]:
data = retrieveCrossRefDoi('10.5325/jafrireli.3.1.0044')
print(json.dumps(data, indent=2))

{
  "authors": [
    {
      "orcid": "",
      "givenName": "",
      "familyName": "Floyd-Thomas",
      "affiliation": []
    }
  ],
  "issue": "1",
  "page": "44",
  "volume": "3",
  "language": "",
  "date": 2015,
  "title": "Gaining One's Definition",
  "journaltitle": "Journal of Africana Religions",
  "printissn": "2165-5405",
  "electronicissn": ""
}


In [223]:
def search_cross_ref(query):
    metadata = {}
    #crossRefEndpointUrl = 'https://api.crossref.org/journal-article/works?'
    crossRefEndpointUrl = 'https://api.crossref.org/works?'
    #encodedDoi = urllib.parse.quote(doi)
    searchUrl = crossRefEndpointUrl + query
    acceptMediaType = 'application/json'
    response = requests.get(searchUrl, headers=vbc.generateHeaderDictionary(acceptMediaType))
    print(response.url)
    if response.status_code == 404:
        metadata = {} # return an empty list if the DOI won't dereference at CrossRef
    else:
        metadata = response.json()
    return metadata

In [234]:
query = 'query.author=Amy-Jill'
data = search_cross_ref(query)
print(json.dumps(data, indent=2))

https://api.crossref.org/works?query.author=Amy-Jill
{
  "status": "ok",
  "message-type": "work-list",
  "message-version": "1.0.0",
  "message": {
    "facets": {},
    "total-results": 203411,
    "items": [
      {
        "indexed": {
          "date-parts": [
            [
              2020,
              4,
              7
            ]
          ],
          "date-time": "2020-04-07T07:08:24Z",
          "timestamp": 1586243304715
        },
        "reference-count": 3,
        "publisher": "Informa UK Limited",
        "issue": "2",
        "content-domain": {
          "domain": [],
          "crossmark-restriction": false
        },
        "short-container-title": [
          "Religion"
        ],
        "published-print": {
          "date-parts": [
            [
              2002,
              4
            ]
          ]
        },
        "DOI": "10.1006/reli.2002.0419",
        "type": "journal-article",
        "created": {
          "date-parts": [
            [


In [235]:
print(json.dumps(data['message']['items'][19], indent=2))

{
  "indexed": {
    "date-parts": [
      [
        2020,
        3,
        30
      ]
    ],
    "date-time": "2020-03-30T05:14:02Z",
    "timestamp": 1585545242528
  },
  "reference-count": 34,
  "publisher": "Informa UK Limited",
  "issue": "2",
  "funder": [
    {
      "DOI": "10.13039/501100000882",
      "name": "University of Aberdeen",
      "doi-asserted-by": "publisher",
      "award": []
    }
  ],
  "content-domain": {
    "domain": [
      "www.tandfonline.com"
    ],
    "crossmark-restriction": true
  },
  "short-container-title": [
    "Journal of Agromedicine"
  ],
  "published-print": {
    "date-parts": [
      [
        2018,
        4,
        3
      ]
    ]
  },
  "DOI": "10.1080/1059924x.2017.1423000",
  "type": "journal-article",
  "created": {
    "date-parts": [
      [
        2018,
        4,
        12
      ]
    ],
    "date-time": "2018-04-12T17:18:51Z",
    "timestamp": 1523553531000
  },
  "page": "154-165",
  "update-policy": "http://dx.doi.org/10