In [49]:
import yaml
import datetime
import json
import pandas as pd
from fuzzywuzzy import fuzz # fuzzy logic matching
from langdetect import detect_langs

# Tunable settings read by the functions defined in later cells.
default_language = 'en'  # language tag used to filter labels in the Wikidata SPARQL query
precision_cutoff = 0.95  # detect_language() prints a warning when its confidence is below this
phrase_length_cutoff = 2  # NOTE(review): not referenced in the visible cells -- confirm it is still needed

# !!! Need to set up an error log!

# Read both YAML files explicitly as UTF-8. Without an explicit encoding, open() falls back to the
# platform's locale encoding, so non-ASCII content could load differently from machine to machine.
# config.yaml supplies the output column layout consumed by extract_metadata().
with open('config.yaml', 'r', encoding='utf-8') as file_object:
    config = yaml.safe_load(file_object)

# mapping.yaml names, for each output variable, the source field and the function applied to it.
with open('mapping.yaml', 'r', encoding='utf-8') as file_object:
    mapping = yaml.safe_load(file_object)

mapping

{'constants': [],
 'properties': [{'variable': 'full_work_available',
   'value': 'identity',
   'source': 'full_work_available',
   'ref': [{'variable': 'retrieved', 'value': 'today'}]},
  {'variable': 'edition_version',
   'value': 'identity',
   'source': 'edition_version'},
  {'variable': 'language', 'value': 'detect_language', 'source': 'label'},
  {'variable': 'number_pages', 'value': 'calculate_pages', 'source': 'page'},
  {'variable': 'doi',
   'value': 'clean_doi',
   'source': 'doi',
   'ref': [{'variable': 'referenceUrl', 'value': 'reference'},
    {'variable': 'retrieved', 'value': 'today'}]},
  {'variable': 'pmid',
   'value': 'identity',
   'source': 'pmid',
   'ref': [{'variable': 'referenceUrl', 'value': 'reference'},
    {'variable': 'retrieved', 'value': 'today'}]},
  {'variable': 'publication_date',
   'value': 'identity',
   'source': 'publication_date',
   'ref': [{'variable': 'referenceUrl', 'value': 'reference'},
    {'variable': 'retrieved', 'value': 'today'}]},

In [1]:
def extract_local_name(iri):
    """Return whatever follows the final slash of an IRI.

    For a Wikidata entity IRI such as http://www.wikidata.org/entity/Q6386232 this is the Q ID.
    A string containing no slash at all is returned unchanged.
    """
    _, _, local_name = iri.rpartition('/')
    return local_name

def _add_annotation_columns(out_dict, column, source_key, value_type, work_data, no_value):
    """Add the column(s) for one qualifier or reference property to out_dict.

    A 'date' value type produces three columns (column + '_nodeId', '_val', '_prec'); any other
    type produces the single column named by `column`. The value is read from
    work_data[source_key] unless the parent statement has no value, in which case it is left empty.
    """
    value = '' if no_value else work_data[source_key]
    if value_type == 'date':
        out_dict[column + '_nodeId'] = ''
        out_dict[column + '_val'] = value
        out_dict[column + '_prec'] = ''
    else:
        out_dict[column] = value


def extract_metadata(config, work_data):
    """Build one flat output row (a dict of column name -> value) for a work.

    Parameters
    ----------
    config : dict
        Configuration whose config['outfiles'][0]['prop_list'] is a list of property
        descriptions. Each description has 'variable' and 'value_type' keys and may have 'qual'
        and 'ref' lists of {'variable', 'value_type'} dicts; a missing or empty 'qual'/'ref' is
        treated as "no qualifiers"/"no references".
    work_data : dict
        Source values keyed by field name. Quantity units are read from '<field>_unit',
        globecoordinate parts from '<field>_long' and '<field>_prec', qualifier values from
        '<field>_<qualifier variable>' and reference values from '<field>_<reference variable>'.

    Returns
    -------
    dict
        Columns in output order, starting with 'qid'. Identifier-like columns ('_uuid',
        '_nodeId', '_ref1_hash') and '_prec' columns of dates are always left empty.

    Raises
    ------
    KeyError
        If a field has a non-empty value in work_data but one of its companion keys (unit,
        qualifier or reference value) is missing.
    """
    out_dict = {'qid': ''}

    # `prop` rather than `property`, so the builtin of that name is not shadowed.
    for prop in config['outfiles'][0]['prop_list']:
        field = prop['variable']
        value_type = prop['value_type']
        out_dict[field + '_uuid'] = ''

        # A field that is absent from work_data is treated the same as one with an empty value.
        output_value = work_data.get(field, '')
        no_value = output_value == ''

        # Populate the values-related columns
        if value_type == 'date':
            out_dict[field + '_nodeId'] = ''
            out_dict[field + '_val'] = output_value
            out_dict[field + '_prec'] = ''

        elif value_type == 'quantity':
            out_dict[field + '_nodeId'] = ''
            out_dict[field + '_val'] = output_value
            out_dict[field + '_unit'] = '' if no_value else work_data[field + '_unit']

        elif value_type == 'globecoordinate':
            out_dict[field + '_nodeId'] = ''
            out_dict[field + '_val'] = output_value
            out_dict[field + '_long'] = '' if no_value else work_data[field + '_long']
            out_dict[field + '_prec'] = '' if no_value else work_data[field + '_prec']

        else:
            out_dict[field] = output_value

        # Populate the qualifier columns.
        # To my knowledge, dates are the only complex types used as qualifiers
        # (no quantities or globecoordinates).
        for qualifier in prop.get('qual') or []:
            qual_field = field + '_' + qualifier['variable']
            _add_annotation_columns(out_dict, qual_field, qual_field,
                                    qualifier['value_type'], work_data, no_value)

        # Populate the reference columns.
        # There's only a hash ID column if there's at least one reference.
        references = prop.get('ref') or []
        if len(references) > 0:
            out_dict[field + '_ref1_hash'] = ''
        # As with qualifiers, dates are the only complex type handled for references.
        for reference in references:
            # The output column carries the '_ref1_' infix, but the source key does not.
            _add_annotation_columns(out_dict,
                                    field + '_ref1_' + reference['variable'],
                                    field + '_' + reference['variable'],
                                    reference['value_type'], work_data, no_value)

    return out_dict


# Sample input record used to exercise extract_metadata() and the mapping loop.
# Keys follow the pattern read by extract_metadata(): '<field>' holds the statement value,
# '<field>_unit' the unit of a quantity, and '<field>_<reference variable>' (here
# '_referenceUrl' and '_retrieved') the values of that statement's reference properties.
# The commented-out 'label' lines are alternative titles that can be swapped in by hand;
# mapping.yaml feeds 'label' to detect_language(), so they presumably exist to try other languages.
work_data = {'qid': '',
 #'label': 'Book Review: Theologie aus asiatischen Quellen: Der theologische Weg Choan-Seng Songs vor dem Hintergrund der asiatischen ökumenischen Diskussion',
 #'label': 'Chinese Accounts of the Strange: A Study in the History of Religions',
 'label': 'Le péché contre le sang : la syphilis et la construction de l’identité juive',
 #'label': 'Shenxian zhuan 神仙傳',
 'full_work_available': 'https://purl.stanford.edu/gc695hq2680',
 'full_work_available_retrieved': '2022-02-18',
 'edition_version': '',
 'language': 'Q1860',
 'number_pages': '18',
 'number_pages_unit': 'Q1069725',
 'doi': '10.1093/oxfordhb/9780190221171.013.34',
 'doi_referenceUrl': 'http://doi.org/10.1353/JBL.2013.0032',
 'doi_retrieved': '2022-08-21',
 'pmid': '29097536',
 'pmid_referenceUrl': 'http://doi.org/10.1353/JBL.2013.0032',
 'pmid_retrieved': '2022-08-21',
 'publication_date': '2013-07-14',
 'publication_date_referenceUrl': 'http://doi.org/10.1353/JBL.2013.0032',
 'publication_date_retrieved': '2022-08-21',
 'title_en': 'Orthography, Textual Criticism, and the Poetry of Job',
 'title_en_referenceUrl': 'http://doi.org/10.1353/JBL.2013.0032',
 'title_en_retrieved': '2022-08-21',
 'published_in': 'Q4041879',
 'published_in_referenceUrl': 'http://doi.org/10.1353/JBL.2013.0032',
 'published_in_retrieved': '2022-08-21',
 'volume': '101',
 'volume_referenceUrl': 'http://doi.org/10.1353/JBL.2013.0032',
 'volume_retrieved': '2022-08-21',
 'page': '1464-1465',
 'page_referenceUrl': 'http://doi.org/10.1353/JBL.2013.0032',
 'page_retrieved': '2022-08-21',
 'issue': '4',
 'issue_referenceUrl': 'http://doi.org/10.1353/JBL.2013.0032',
 'issue_retrieved': '2022-08-21',
 'isbn10': '0-8070-8592-8',
 'isbn10_referenceUrl': 'http://doi.org/10.1353/JBL.2013.0032',
 'isbn10_retrieved': '2022-08-21',
 'isbn13': '978-1-5416-4497-7',
 'isbn13_referenceUrl': 'http://doi.org/10.1353/JBL.2013.0032',
 'isbn13_retrieved': '2022-08-21',
 'publisher': 'Stanford University Press',
 'publisher_referenceUrl': 'http://doi.org/10.1353/JBL.2013.0032',
 'publisher_retrieved': '2022-08-21',
 'place_of_publication': 'Q173813',
 'place_of_publication_referenceUrl': 'http://doi.org/10.1353/JBL.2013.0032',
 'place_of_publication_retrieved': '2022-08-21'}
# The bare string literal below is inert: it only holds the text of an alternative record
# (an author statement with two qualifier values) and is never assigned or evaluated.
"""
work_data = work_data = {'qid': '',
 'author': 'Q86508256',
 'author_series_ordinal': '1',
 'author_stated_as': 'Fernando F. Segovia'}
"""

def identity(value):
    """Pass the value through, trimming only its leading and trailing whitespace."""
    trimmed = value.strip()
    return trimmed

def detect_language(string):
    """Detect the language of the label.

    Returns the primary language subtag of the most probable language reported by
    detect_langs() (e.g. 'fr', or 'zh' for a 'zh-cn' result). Returns 'zxx' when detection
    fails, e.g. when the string holds nothing to decide on, such as only numbers.

    A warning is printed whenever the confidence is below the module-level precision_cutoff;
    a failed detection counts as confidence 0.
    """
    try:
        best_guess = detect_langs(string)[0]
        # Read the attributes of the result instead of slicing its 'lang:prob' text form at fixed
        # offsets: slicing broke on codes longer than two letters ('zh-cn:0.99'), and the resulting
        # error was swallowed so that such text was reported as 'zxx'.
        lang = best_guess.lang.split('-')[0]
        confidence = float(best_guess.prob)
    except Exception:  # exceptions occur when no info to decide, e.g. numbers
        lang = 'zxx'
        confidence = float(0)
    if confidence < precision_cutoff:
        print('Warning: language confidence below', precision_cutoff, ':', confidence)
    return lang

def integer_value(r):
    """Look up the value of a single upper-case Roman numeral symbol.

    Returns -1 for anything that is not one of I, V, X, L, C, D or M.

    Note:
    -----
    Code from https://www.geeksforgeeks.org/python-program-for-converting-roman-numerals-to-decimal-lying-between-1-to-3999/"""
    symbol_values = (
        ('I', 1),
        ('V', 5),
        ('X', 10),
        ('L', 50),
        ('C', 100),
        ('D', 500),
        ('M', 1000),
    )
    for symbol, amount in symbol_values:
        if r == symbol:
            return amount
    return -1

def roman_to_decimal(numeral):
    """Convert a Roman numeral (any letter case) to an integer.

    Returns -1 as soon as a character that is not a Roman numeral symbol is examined.
    An empty string yields 0.

    Note:
    -----
    Code from https://www.geeksforgeeks.org/python-program-for-converting-roman-numerals-to-decimal-lying-between-1-to-3999/"""
    symbols = numeral.upper()
    length = len(symbols)
    total = 0
    position = 0

    while position < length:
        current = integer_value(symbols[position])
        # A negative value signals an invalid symbol.
        if current < 0:
            return -1

        # Last symbol: nothing to compare against, so it is simply added.
        if position + 1 >= length:
            total += current
            position += 1
            continue

        following = integer_value(symbols[position + 1])
        if following < 0:
            return -1

        if current < following:
            # A smaller symbol in front of a larger one is subtractive (e.g. IV);
            # both symbols are consumed together.
            total += following - current
            position += 2
        else:
            # Current symbol is greater than or equal to the next one: plain addition.
            total += current
            position += 1

    return total

def calculate_pages(range):
    """Calculate the number of pages from a hyphen-separated page range.

    Parameters
    ----------
    range : str
        Page range such as '1464-1465' or 'iv-x'. (The parameter name shadows the builtin
        `range` inside this function; it is kept so existing callers are unaffected.)

    Returns
    -------
    str
        The page count as a string, or the empty string when the input is not a well-formed
        range: a single number or empty cell, more than one hyphen, an empty endpoint, an
        endpoint that is neither a decimal integer nor a Roman numeral, or an end page that
        precedes the start page.

    Note
    ----
    Supports properly formatted Roman numerals and doesn't care about whitespace."""
    numbers = range.split('-')

    # Exactly two endpoints are required: fewer means a single number or an empty cell,
    # more means a malformed range with multiple hyphens.
    if len(numbers) != 2:
        return ''

    endpoints = []
    for number in numbers:
        number = number.strip()

        # An empty endpoint (e.g. '-5') must not be read as the "Roman numeral" zero.
        if number == '':
            return ''

        # isdecimal() only accepts characters that int() can parse, unlike isnumeric(),
        # which also accepts characters such as fractions and superscripts.
        if number.isdecimal():
            endpoints.append(int(number))
            continue

        converted = roman_to_decimal(number)
        # -1 signals characters that are not valid in a Roman numeral.
        if converted < 0:
            return ''
        endpoints.append(converted)

    number_pages = endpoints[1] - endpoints[0] + 1  # Need to add one since first page in range counts
    if number_pages < 1:
        return ''
    return str(number_pages)

def clean_doi(value):
    """Normalize a DOI string: convert it to uppercase, then drop surrounding whitespace."""
    return value.upper().strip()

def disambiguate_published_in(value):
    """Use the value in the ISSN column to try to find the containing work.

    Parameters
    ----------
    value : str
        Either an ISSN, a Q ID (recognized by a leading 'Q'), or the empty string.

    Returns
    -------
    str
        The Q ID of the containing work. An empty or Q ID input is returned unchanged. For an
        ISSN, the Q ID is taken from the Wikidata query results; if several items match, the
        last one is used, and if none match, the empty string is returned.

    Note:
    -----
    For journal articles, this performs a legitimate WQS search for the journal title using the ISSN.
    For book chapters, the ISSN column may contain the Q ID of the containing book, inserted there during
    a pre-processing step (a hack, but typically books would not have an ISSN and this column would be empty)."""
    if value == '':
        return value

    # The value is a Q ID and was determined during a pre-processing step (i.e. for book chapters)
    if value[0] == 'Q':
        return value

    # Look up the ISSN from CrossRef in Wikidata.
    # The ISSN has to be a quoted string literal in the query; backslashes and double quotes are
    # escaped so that an odd value cannot break out of the literal.
    issn_literal = '"' + value.replace('\\', '\\\\').replace('"', '\\"') + '"'
    query_string = '''select distinct ?container ?containerLabel where {
      ?container wdt:P236 ''' + issn_literal + '''.
      optional {
      ?container rdfs:label ?containerLabel.
      filter(lang(?containerLabel)="''' + default_language + '''")
      }
    }'''

    # Send query to endpoint
    query_results = send_sparql_query(query_string)

    # TODO: once the error log is set up, write a warning to it when more than one container
    # in Wikidata matched the ISSN (len(query_results) > 1), including the results themselves.
    # TODO: the label returned as ?containerLabel is not checked, since the expected container
    # name isn't passed into this function. An earlier version fuzzy-matched it against the
    # CrossRef journal title with fuzz.WRatio and logged a warning when the score was below 99.

    # Extract Q ID from SPARQL query results. If there is more than one result, the last one
    # will be used for the Q ID. Start from the empty string so that an ISSN with no match in
    # Wikidata gives an empty result instead of an unbound local variable.
    container_qid = ''
    for result in query_results:
        container_qid = extract_local_name(result['container']['value'])
    return container_qid


def disambiguate_publisher(name_string):
    """Find the Q ID of the publisher whose label best matches name_string.

    Candidates come from the global `publishers` DataFrame (indexed by Q ID, with a 'label'
    column), a list derived from a SPARQL query https://w.wiki/4pbi. Labels are scored with
    fuzz.WRatio; the first candidate with the highest score wins. A warning is printed when
    the winning score is below 98, but the best match is returned regardless.

    Returns the empty string when name_string is empty or no candidate scores above zero.
    """
    # No publisher string means no publisher Q ID.
    if name_string == '':
        return ''

    top_score, top_qid, top_label = 0, '', ''
    for candidate_qid, candidate_label in publishers['label'].items():
        score = fuzz.WRatio(name_string, candidate_label)
        # Strictly greater, so ties keep the earlier candidate.
        if score > top_score:
            top_score, top_qid, top_label = score, candidate_qid, candidate_label

    # This warning goes to stdout for now; it belongs in the error log once that exists.
    if top_score < 98:
        print('w_ratio:', top_score, 'Warning: poor match of: "' + top_label + '"', top_qid, 'to CrossRef publisher: "' + name_string + '"\n')
    return top_qid

def disambiguate_place_of_publication(value):
    """Pass the place of publication through as-is; no lookup or cleanup is performed."""
    return value

def today():
    """Generate the current UTC date as an xsd:date string (form: 2019-12-05)."""
    # datetime.utcnow() is deprecated as of Python 3.12; take a timezone-aware UTC timestamp
    # instead and keep only its date part.
    return datetime.datetime.now(datetime.timezone.utc).date().isoformat()


In [2]:
# Quick check of extract_local_name() on a sample Wikidata entity IRI.
iri = 'http://www.wikidata.org/entity/Q6386232'
extract_local_name(iri)

'Q6386232'

In [98]:
# Publisher lookup table used by disambiguate_publisher(), indexed by Q ID. Every column is read
# as text, and empty cells stay empty strings instead of becoming NaN.
publishers = pd.read_csv('publishers.csv', na_filter=False, dtype=str).set_index('qid')

# Apply the function named in each mapping entry to that entry's source field of the sample record.
# The function is looked up by name and called with the value directly. Building a source string
# for eval() broke on values containing a single quote or a backslash and would execute whatever
# text the data contained.
# NOTE: mapping.yaml still decides which module-level callable runs, so treat it as trusted input.
for prop in mapping['properties']:
    print(prop['variable'], prop['value'])
    transform = globals()[prop['value']]
    output = transform(work_data[prop['source']])
    print(output)
    print()
    
    

full_work_available identity
https://purl.stanford.edu/gc695hq2680

edition_version identity


language detect_language
fr

number_pages calculate_pages
2

doi clean_doi
10.1093/OXFORDHB/9780190221171.013.34

pmid identity
29097536

publication_date identity
2013-07-14

title_en identity
Le péché contre le sang : la syphilis et la construction de l’identité juive

published_in disambiguate_published_in
Q4041879

volume identity
101

page identity
1464-1465

issue identity
4

isbn10 identity
0-8070-8592-8

isbn13 identity
978-1-5416-4497-7

publisher disambiguate_publisher
Q1479937

place_of_publication disambiguate_place_of_publication
Q173813



In [22]:
# Build the flat output row for the sample record with the two-argument extract_metadata()
# and display it.
out_dict = extract_metadata(config, work_data)
out_dict

{'qid': '',
 'full_work_available_uuid': '',
 'full_work_available': 'https://purl.stanford.edu/gc695hq2680',
 'full_work_available_ref1_hash': '',
 'full_work_available_ref1_retrieved_nodeId': '',
 'full_work_available_ref1_retrieved_val': '2022-02-18',
 'full_work_available_ref1_retrieved_prec': '',
 'edition_version_uuid': '',
 'edition_version': '',
 'language_uuid': '',
 'language': 'Q1860',
 'number_pages_uuid': '',
 'number_pages_nodeId': '',
 'number_pages_val': '18',
 'number_pages_unit': 'Q1069725',
 'doi_uuid': '',
 'doi': '10.1093/OXFORDHB/9780190221171.013.34',
 'doi_ref1_hash': '',
 'doi_ref1_referenceUrl': 'http://doi.org/10.1353/JBL.2013.0032',
 'doi_ref1_retrieved_nodeId': '',
 'doi_ref1_retrieved_val': '2022-08-21',
 'doi_ref1_retrieved_prec': '',
 'pmid_uuid': '',
 'pmid': '29097536',
 'pmid_ref1_hash': '',
 'pmid_ref1_referenceUrl': 'http://doi.org/10.1353/JBL.2013.0032',
 'pmid_ref1_retrieved_nodeId': '',
 'pmid_ref1_retrieved_val': '2022-08-21',
 'pmid_ref1_retrie

In [16]:
def extract_metadata(config):
    """Build one flat output row from the global crossref_results dict.

    The columns are driven by four global field lists, processed in this order:
    out_fields_labels (value only), out_fields_noref (uuid + value), out_fields_no_url
    (uuid + value + a reference holding only a retrieved date) and out_fields_ref
    (uuid + value + a reference holding a reference URL and a retrieved date).
    The 'published' field is written as the three columns of a date value.
    Reference values are filled in only when the field itself has a value.

    NOTE(review): `config` is never used here, and this one-argument function shares its name
    with the two-argument extract_metadata(config, work_data) defined earlier in this file;
    whichever cell runs last wins. It looks like a superseded version -- confirm and remove.
    NOTE(review): the global `today` is stored as-is, not called. That is only right if `today`
    is a date string in the context this was written for -- confirm.
    """
    def add_statement(row, name, with_url):
        """Add the uuid, value and reference columns for one field to row."""
        row[name + '_uuid'] = ''
        if name == 'published':
            row[name + '_nodeId'] = ''
            row[name + '_val'] = crossref_results[name]
            row[name + '_prec'] = ''
        else:
            row[name] = crossref_results[name]

        # Only fill in the reference if there is a value for that field.
        is_empty = crossref_results[name] == ''
        row[name + '_ref1_hash'] = ''
        if with_url:
            row[name + '_ref1_referenceUrl'] = '' if is_empty else handle
        row[name + '_ref1_retrieved_nodeId'] = ''
        row[name + '_ref1_retrieved_val'] = '' if is_empty else today
        row[name + '_ref1_retrieved_prec'] = ''

    out_dict = {'qid': ''}

    for label_field in out_fields_labels:
        out_dict[label_field] = crossref_results[label_field]

    for plain_field in out_fields_noref:
        out_dict[plain_field + '_uuid'] = ''
        out_dict[plain_field] = crossref_results[plain_field]

    # Fields with a retrieved date, but reference URL not needed
    for dated_field in out_fields_no_url:
        add_statement(out_dict, dated_field, with_url=False)

    # Fields with both reference URLs and retrieved dates
    for referenced_field in out_fields_ref:
        add_statement(out_dict, referenced_field, with_url=True)

    return out_dict
