In [None]:
# Run only once if you need to install the python-Levenshtein package
# NOTE(review): presumably installed as the optional speed-up backend for fuzzywuzzy, which the
# configuration cell imports -- confirm before pinning or removing it.

# Install a pip package in the current Jupyter kernel
# sys.executable is the interpreter running this kernel, so the package lands in the kernel's own
# environment rather than in whatever `pip` happens to be first on PATH.
import sys
!{sys.executable} -m pip install python-Levenshtein

# Retrieve data using DOI

Currently the script uses only the CrossRef API, but it could potentially be extended to use other registration agencies, such as DataCite.

## Configuration

This section imports libraries, sets default values, and defines functions

In [85]:
# (c) 2020 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf

import requests   # best library to manage HTTP transactions
#from bs4 import BeautifulSoup # web-scraping library
import json
from time import sleep
import csv
import sys
#import math
from fuzzywuzzy import fuzz # fuzzy logic matching
#from fuzzywuzzy import process
#import xml.etree.ElementTree as et # library to traverse XML tree
import urllib
import datetime
#import string
import pprint
pp = pprint.PrettyPrinter(indent=4)

# Media type requested in the Accept header of both CrossRef and SPARQL requests
accept_media_type = 'application/json'
# SPARQL endpoint that send_sparql_query() and search_name_at_wikidata() POST to
endpoint = 'https://query.wikidata.org/sparql'
# User-Agent string sent with every HTTP request (identifies the tool and a contact address)
user_agent_header = 'VanderBot/1.7 (https://github.com/HeardLibrary/linked-data/tree/master/vanderbot; mailto:steve.baskauf@vanderbilt.edu)'
# Seconds to pause after each SPARQL query
sparql_sleep = 0.1
# Language code used as the suffix of label/description/title field names (e.g. label_en)
default_language = 'en'

# Option to log to a file instead of the console
log_path = '' # path to log file, default to none
log_object = sys.stdout # log output defaults to the console screen

# Split the command line into option flags (start with '-') and positional values.
# NOTE(review): the two lists are filtered independently, so args[opts.index(flag)] only picks the
# right value when every option flag is followed by exactly one positional value -- confirm usage.
opts = [opt for opt in sys.argv[1:] if opt.startswith('-')]
args = [arg for arg in sys.argv[1:] if not arg.startswith('-')]

if '--log' in opts: # set output to specified log file or path including file name
    log_path = args[opts.index('--log')]
    log_object = open(log_path, 'wt', encoding='utf-8') # direct output sent to log_object to log file instead of sys.stdout
if '-L' in opts: # set output to specified log file or path including file name
    log_path = args[opts.index('-L')]
    log_object = open(log_path, 'wt', encoding='utf-8') # direct output sent to log_object to log file instead of sys.stdout

# --------------------------
# *** For now, hard-code logging to errors.log
# This unconditionally replaces whatever log_path/log_object were set above; 'wt' truncates any
# existing errors.log each time this cell runs.
# NOTE(review): a file opened by the --log / -L branches above is not closed before being replaced here.
log_path = 'errors.log'
log_object = open(log_path, 'wt', encoding='utf-8')
# --------------------------

# List of work types used by CrossRef
# Each entry maps a CrossRef "type" string to the Wikidata class (qid) written to instance_of and to
# the description text used for the item; retrieve_crossref_data() looks entries up by
# crossref_type_string.
work_types = [
    {
    'crossref_type_string': 'journal-article',
    'qid': 'Q18918145', # academic journal article, alternatively Q13442814 scholarly article
    'description': 'journal article'
    },
    {
    'crossref_type_string': 'book-chapter',
    'qid': 'Q21481766', # NOTE(review): presumably the Wikidata class for an academic book chapter (see description below) -- confirm
    'description': 'academic book chapter'
    }
]

# Output CSV label/description fields to be populated without references
# (field names carry the default language as a suffix, e.g. label_en, description_en)
out_fields_labdes = ['label_' + default_language, 'description_' + default_language]

# Output CSV property fields to be populated without references
out_fields_noref = ['instance_of']

# Output CSV fields that include reference fields
# ('published' is handled specially when the output row is built: it gets _nodeId/_val/_prec columns)
out_fields_ref = ['doi', 'published', 'title_' + default_language, 'journal', 'volume', 'page', 'issue']
    
# Builds the dictionary passed as the `headers` argument of a requests GET call
def generate_header_dictionary(accept_media_type):
    """Return the HTTP request headers for a GET request.

    accept_media_type: value to send in the Accept header.
    The User-Agent value is read from the module-level user_agent_header.
    """
    return {'Accept': accept_media_type, 'User-Agent': user_agent_header}

# Determine the current CrossRef rate limit by an initial ping.
# The response headers report the limit as <x-rate-limit-limit> requests per <x-rate-limit-interval>
# (e.g. "50" per "1s"); api_sleep is the resulting per-request delay plus a small safety margin.
response = requests.get('https://api.crossref.org/works/10.3233/SW-150203', headers=generate_header_dictionary(accept_media_type))
crossref_headers = response.headers
try:
    limit_count = int(crossref_headers['x-rate-limit-limit'])
    interval_string = crossref_headers['x-rate-limit-interval']
    interval_sec = int(interval_string.rstrip('s')) # remove the "s" from the end
    api_sleep = interval_sec / limit_count + 0.005
except (KeyError, ValueError, ZeroDivisionError):
    # The headers were missing or malformed (e.g. the ping returned an error page). Rather than
    # aborting the whole notebook, fall back to a deliberately conservative one request per second.
    limit_count = 1
    interval_string = '1s'
    interval_sec = 1
    api_sleep = interval_sec / limit_count + 0.005
    print('Warning: could not read the CrossRef rate limit headers; using', api_sleep, 'seconds between API calls')

# Direct POST of a UTF-8 query body caused problems, so queries are POSTed as URL-encoded form parameters.
# See https://www.w3.org/TR/sparql11-protocol/#update-via-post-urlencoded
# and https://stackoverflow.com/questions/34618149/post-unicode-string-to-web-service-using-python-requests-library
def generate_sparql_header_dictionary(accept_media_type,user_agent_header):
    """Return the HTTP request headers for a form-encoded SPARQL POST.

    accept_media_type: value to send in the Accept header.
    user_agent_header: value to send in the User-Agent header.
    """
    sparql_headers = {}
    sparql_headers['Accept'] = accept_media_type
    # form-encoded body is used in place of 'application/sparql-query' (see links above)
    sparql_headers['Content-Type'] = 'application/x-www-form-urlencoded'
    sparql_headers['User-Agent'] = user_agent_header
    return sparql_headers

# Build the SPARQL request header once, when this cell runs; send_sparql_query() and
# search_name_at_wikidata() read it as a global.
sparql_request_header = generate_sparql_header_dictionary(accept_media_type, user_agent_header)

# Generate the current UTC xsd:date
def generate_utc_date():
    """Return today's date in UTC as an xsd:date string of the form 2019-12-05.

    A timezone-aware "now" is used because datetime.datetime.utcnow() is deprecated
    (it emits a DeprecationWarning from Python 3.12 onward).
    """
    return datetime.datetime.now(datetime.timezone.utc).date().isoformat()

# Extracts the local name part of an IRI, e.g. a qNumber from a Wikidata IRI
def extract_local_name(iri):
    """Return everything after the last '/' in iri.

    Example: 'http://www.wikidata.org/entity/Q6386232' -> 'Q6386232'.
    A string without any '/' is returned unchanged.
    """
    return iri.rsplit('/', 1)[-1]

# Read from a CSV file into a list of dictionaries
def read_dicts_from_csv(filename):
    """Read the UTF-8 CSV file at filename and return its rows as a list of dictionaries.

    The first row of the file supplies the dictionary keys.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        return [record for record in csv.DictReader(csv_file)]

# Write list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write table (an iterable of dictionaries) to a UTF-8 CSV file at filename.

    fieldnames gives the column names and their order; a header row is written first.
    An existing file at filename is overwritten.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)

# Sends a query to the query service endpoint.
# NOTE: sparql_request_header, endpoint and sparql_sleep are global variables defined earlier in the script
def send_sparql_query(query_string):
    """POST query_string to the SPARQL endpoint and return the list of result bindings.

    The query is sent as the form parameter "query". The response is parsed as JSON and the
    value at ['results']['bindings'] is returned. The function pauses for sparql_sleep seconds
    before returning to avoid hitting the Query Service too fast.
    """
    form_fields = {'query': query_string}
    reply = requests.post(endpoint, data=form_fields, headers=sparql_request_header)
    # print(reply.text) # uncomment to view the raw response, e.g. if you are getting an error
    bindings = reply.json()['results']['bindings']
    sleep(sparql_sleep)
    return bindings

# Use the CrossRef registration agency test to find out who issued a non-CrossRef DOI
def discover_issuing_agency(doi):
    """Ask the CrossRef API which registration agency issued doi.

    Returns the agency id string from the JSON response, the string 'not found' if the
    request returns HTTP 404, or the raw response text if the response cannot be
    interpreted as the expected JSON structure.
    """
    # `import urllib` alone does not guarantee that the urllib.parse submodule is loaded
    import urllib.parse

    encoded_doi = urllib.parse.quote(doi)
    search_url = 'https://api.crossref.org/works/' + encoded_doi + '/agency'
    response = requests.get(search_url, headers=generate_header_dictionary(accept_media_type))
    sleep(api_sleep) # delay to avoid hitting the API faster than acceptable rate
    if response.status_code == 404: # return "not found" if URL doesn't dereference
        return 'not found'
    try: # Try to parse as JSON
        return response.json()['message']['agency']['id']
    except (ValueError, KeyError, TypeError):
        # Not JSON (ValueError) or not the expected structure (KeyError/TypeError): return the
        # response text. Only these are caught so that e.g. KeyboardInterrupt still propagates.
        return response.text

# Convert the author records of a CrossRef work into the author dictionaries used by this script
def _extract_crossref_authors(crossref_authors):
    author_list = []
    for author in crossref_authors:
        author_list.append({
            'orcid': author.get('ORCID', ''),
            'sequence': author.get('sequence', ''),
            'givenName': author.get('given', ''),
            'familyName': author.get('family', ''),
            # if there aren't any (named) affiliations, the list will be empty
            'affiliation': [affiliation['name'] for affiliation in author.get('affiliation', []) if 'name' in affiliation]
        })
    return author_list

# Format a CrossRef date-parts list ([year], [year, month] or [year, month, day]) as YYYY, YYYY-MM or YYYY-MM-DD
def _format_crossref_date(date_parts):
    known_parts = []
    for part in date_parts[:3]:
        if part is None: # missing parts are reported as null; stop at the first one
            break
        known_parts.append(part)
    if len(known_parts) == 0:
        return ''
    formatted_date = str(known_parts[0])
    for part in known_parts[1:]:
        formatted_date += '-' + str(part).zfill(2) # zero-pad single-digit months and days
    return formatted_date

# See https://github.com/CrossRef/rest-api-doc for API details
# Note: no authentication required, but must be "nice": observe rate limit, provide mailto:
def retrieve_crossref_data(doi, article=None):
    """Retrieve metadata for doi from the CrossRef API.

    doi: the DOI to look up.
    article: optional dictionary with a 'qid' key identifying the record the DOI belongs to;
        it is used only in the log message written when the DOI is not found. When omitted,
        a module-level variable named `article` is used if one exists (this is what the
        function relied on before the parameter was added).

    Returns:
      - an empty dictionary if CrossRef answers with HTTP 404 (the issuing agency is logged);
      - {'message': <response text>} if the response is not the expected JSON structure;
      - otherwise a dictionary with the keys doi, authors (only when CrossRef lists authors),
        published, volume, issue, page, journal_issn, title_<lang>, label_<lang>,
        journal_title, instance_of and description_<lang>. Missing values are '' (or [] for
        journal_issn), so every key other than authors is always present.

    The function sleeps for api_sleep seconds before returning to respect the rate limit.
    """
    # `import urllib` alone does not guarantee that the urllib.parse submodule is loaded
    import urllib.parse

    crossref_endpoint_url = 'https://api.crossref.org/works/'
    # urllib.parse.quote performs URL encoding of a string
    search_url = crossref_endpoint_url + urllib.parse.quote(doi)
    response = requests.get(search_url, headers=generate_header_dictionary(accept_media_type))
    article_dict = {}

    if response.status_code == 404: # return empty dict if not found
        # *** NOTE: at some future time, look up data at alternative issuing agencies
        # For example, see https://support.datacite.org/docs/api-get-doi

        # For now, just log it
        if article is None:
            # legacy behavior: fall back to the notebook-level loop variable, if there is one
            article = globals().get('article')
        qid = article.get('qid', '') if isinstance(article, dict) else ''
        print('article:', qid, 'doi:', doi, 'DOI issuing agency:', discover_issuing_agency(doi), '\n', file=log_object)
        sleep(api_sleep)
        return article_dict

    try:
        data = response.json()['message']
        if not isinstance(data, dict):
            # e.g. an error response whose "message" is a list or a string
            raise TypeError('unexpected CrossRef message payload')
    except (ValueError, KeyError, TypeError):
        # if not (the expected) JSON, just return the response text
        article_dict['message'] = response.text
        sleep(api_sleep)
        return article_dict

    article_dict['doi'] = doi
    if 'author' in data:
        article_dict['authors'] = _extract_crossref_authors(data['author'])

    if 'issued' in data:
        all_date_parts = data['issued'].get('date-parts') or [[]]
        article_dict['published'] = _format_crossref_date(all_date_parts[0])
    else:
        article_dict['published'] = ''

    article_dict['volume'] = data.get('volume', '')
    article_dict['issue'] = data.get('issue', '')
    article_dict['page'] = data.get('page', '')
    article_dict['journal_issn'] = data.get('ISSN', [])

    # CrossRef returns titles as lists, which may be empty; use the first entry if there is one
    titles = data.get('title') or []
    title = titles[0] if len(titles) > 0 else ''
    article_dict['title_' + default_language] = title
    article_dict['label_' + default_language] = title

    container_titles = data.get('container-title') or []
    article_dict['journal_title'] = container_titles[0] if len(container_titles) > 0 else ''

    # Map the CrossRef work type to a Wikidata class and description; blank if the type is unknown
    instance_of = ''
    description = ''
    for work_type in work_types:
        if data.get('type') == work_type['crossref_type_string']:
            instance_of = work_type['qid']
            description = work_type['description']
    article_dict['instance_of'] = instance_of
    article_dict['description_' + default_language] = description

    sleep(api_sleep)
    return article_dict

# Look up the ISSN from CrossRef in Wikidata
def extract_journal_qid(crossref_results, article):
    """Add the Wikidata Q ID of the journal to crossref_results under the key 'journal'.

    crossref_results: dictionary produced by retrieve_crossref_data(); its 'journal_issn' list is
        matched against ISSN (P236) values in Wikidata and its 'journal_title' is compared with
        the English label of each matching item.
    article: dictionary with a 'qid' key, used only to identify the record in log messages.

    Returns crossref_results (modified in place). 'journal' is set to '' when there is no ISSN
    or when no Wikidata item matches. If several items match, a warning is logged and the last
    one is used. A warning is also logged when the Wikidata label and the CrossRef journal
    title differ noticeably.
    """
    issns = crossref_results.get('journal_issn', [])
    if len(issns) == 0:
        crossref_results['journal'] = ''
        print('article:', article['qid'], 'has no ISSN.\n', file=log_object)
        return crossref_results

    # Create VALUES list for items, one quoted ISSN per line
    issns_string = '\n'.join('"' + issn + '"' for issn in issns)

    # Build query string
    query_string = '''select distinct ?journal ?journalLabel where {
      VALUES ?issn
        {
    ''' + issns_string + '''
        }
      ?journal wdt:P236 ?issn.
      ?journal rdfs:label ?journalLabel.
      filter(lang(?journalLabel)="en")
    }'''

    # Send query to endpoint
    query_results = send_sparql_query(query_string)

    if len(query_results) == 0:
        # No match: record a blank journal instead of failing on an unassigned Q ID
        crossref_results['journal'] = ''
        print('article:', article['qid'], 'no publication in Wikidata matched the ISSN', issns, '\n', file=log_object)
        return crossref_results

    if len(query_results) > 1:
        print('Warning! More than one publication in Wikidata matched the ISSN for article', article['qid'], file=log_object)
        print(query_results, '\n', file=log_object)

    crossref_title = crossref_results.get('journal_title', '')

    # Extract Q ID from SPARQL query results. If there is more than one result, the last one will be used for the Q ID
    journal_qid = ''
    for result in query_results:
        journal_qid = extract_local_name(result['journal']['value'])
        journal_name = result['journalLabel']['value']
        if journal_name != crossref_title:
            # NOTE: empirical testing of the fuzzywuzzy ratios showed WRatio worked best here
            w_ratio = fuzz.WRatio(journal_name, crossref_title)
            if w_ratio < 99:
                print('article:', article['qid'], 'w_ratio:', w_ratio, 'Warning: Wikidata journal: "' + journal_name + '"', journal_qid, 'does not match CrossRef journal title: "' + crossref_title + '"\n', file=log_object)
    crossref_results['journal'] = journal_qid
    return crossref_results


## Retrieve DOI data for existing work records



In [3]:
# Check the CrossRef metadata of a batch of existing records; problems are written to the log
filename = 'articles.csv'
articles = read_dicts_from_csv(filename)
for article in articles[50:100]: # NOTE: only this slice of the records is processed per run
    if article['doi'] != '':
        crossref_results = retrieve_crossref_data(article['doi'])
        # Only a successful lookup contains the 'doi' key; a failed one is either an empty
        # dictionary or holds just the raw response text, which has no ISSN data to look up.
        if 'doi' in crossref_results:
            crossref_results = extract_journal_qid(crossref_results, article)

# Flush rather than close the log: later cells still write to log_object, and writing to a
# closed file would raise an error when the notebook is run from top to bottom.
log_object.flush()
print('done')

done


## Retrieve DOI data for new record

In [87]:
doi = '10.1186/S13643-020-01393-8'
today = generate_utc_date()

filename = 'articles.csv'
articles = read_dicts_from_csv(filename)
fieldnames = articles[0].keys() # get the field names from the existing file

# The new record has no Q ID yet; this placeholder identifies it in log messages instead of
# reusing whatever `article` was left over from an earlier cell.
new_article = {'qid': 'new record', 'doi': doi}

crossref_results = retrieve_crossref_data(doi)
# Only a successful lookup contains the 'doi' key (failures return {} or just the response text)
if 'doi' in crossref_results:
    crossref_results = extract_journal_qid(crossref_results, new_article)

    out_dict = {}
    # label and description: plain values
    for field in out_fields_labdes:
        out_dict[field] = crossref_results.get(field, '')
    # properties without references: statement UUID column plus the value
    for field in out_fields_noref:
        out_dict[field + '_uuid'] = ''
        out_dict[field] = crossref_results.get(field, '')
    # properties with references
    for field in out_fields_ref:
        value = crossref_results.get(field, '')
        out_dict[field + '_uuid'] = ''
        if field == 'published':
            # dates are stored as node ID / value / precision columns
            out_dict[field + '_nodeId'] = ''
            out_dict[field + '_val'] = value
            out_dict[field + '_prec'] = ''
        else:
            out_dict[field] = value
        # Only add a reference (URL and retrieved date) if there is a value for that field
        has_value = value != ''
        out_dict[field + '_ref1_hash'] = ''
        out_dict[field + '_ref1_referenceUrl'] = ('http://doi.org/' + doi) if has_value else ''
        out_dict[field + '_ref1_retrieved_nodeId'] = ''
        out_dict[field + '_ref1_retrieved_val'] = today if has_value else ''
        out_dict[field + '_ref1_retrieved_prec'] = ''

    print(out_dict)
    articles.append(out_dict)
else:
    # Nothing usable came back, so no row is added
    print('No CrossRef data retrieved for', doi)

#write_dicts_to_csv(articles, filename, fieldnames)
print('done')


{'label_en': 'Leftover opioids following adult surgical procedures: a systematic review and meta-analysis', 'description_en': 'journal article', 'instance_of_uuid': '', 'instance_of': 'Q18918145', 'doi_uuid': '', 'doi': '10.1186/S13643-020-01393-8', 'doi_ref1_hash': '', 'doi_ref1_referenceUrl': 'http://doi.org/10.1186/S13643-020-01393-8', 'doi_ref1_retrieved_nodeId': '', 'doi_ref1_retrieved_val': '2021-03-26', 'doi_ref1_retrieved_prec': '', 'published_uuid': '', 'published_nodeId': '', 'published_val': '2020-06-11', 'published_prec': '', 'published_ref1_hash': '', 'published_ref1_referenceUrl': 'http://doi.org/10.1186/S13643-020-01393-8', 'published_ref1_retrieved_nodeId': '', 'published_ref1_retrieved_val': '2021-03-26', 'published_ref1_retrieved_prec': '', 'title_en_uuid': '', 'title_en': 'Leftover opioids following adult surgical procedures: a systematic review and meta-analysis', 'title_en_ref1_hash': '', 'title_en_ref1_referenceUrl': 'http://doi.org/10.1186/S13643-020-01393-8', 't

In [88]:
# Pretty-print the complete result dictionary from the previous cell (including the author list) for inspection
print(json.dumps(crossref_results, indent=2))

{
  "doi": "10.1186/S13643-020-01393-8",
  "authors": [
    {
      "orcid": "http://orcid.org/0000-0003-2551-019X",
      "sequence": "first",
      "givenName": "Lori",
      "familyName": "Schirle",
      "affiliation": []
    },
    {
      "orcid": "",
      "sequence": "additional",
      "givenName": "Amanda L.",
      "familyName": "Stone",
      "affiliation": []
    },
    {
      "orcid": "",
      "sequence": "additional",
      "givenName": "Matthew C.",
      "familyName": "Morris",
      "affiliation": []
    },
    {
      "orcid": "",
      "sequence": "additional",
      "givenName": "Sarah S.",
      "familyName": "Osmundson",
      "affiliation": []
    },
    {
      "orcid": "",
      "sequence": "additional",
      "givenName": "Philip D.",
      "familyName": "Walker",
      "affiliation": []
    },
    {
      "orcid": "",
      "sequence": "additional",
      "givenName": "Mary S.",
      "familyName": "Dietrich",
      "affiliation": []
    },
    {
      "or

In [89]:
# Suffixes recognized at the end of a name, mapped to the text re-attached to the full name
_NAME_SUFFIXES = {'Jr': ', Jr.', 'II': ' II', 'III': ' III', 'IV': ' IV', 'V': ' V'}

# Return the initial of one name piece: its first alphabetic character, so that pieces
# like (Kit) or "Kit" give K; a piece with no letters at all falls back to its first character
def _first_initial(piece):
    for character in piece:
        if character.isalpha():
            return character
    return piece[0:1]

def generate_name_alternatives(name):
    """Generate alternative written forms of a personal name.

    name: the name as a single string, e.g. 'Christine R.F. Rukasin'. Commas and periods are
        treated as spaces, so initials written without spaces ('R.F.') are separated.

    Returns a list of unique name variants in a stable order: the full name, the full name with
    its suffix (Jr., II, III, IV, V, "the elder") when one was present, and -- for names with at
    least two pieces -- combinations of first name, initials (with and without periods) and last
    name. A name with a single piece yields just that piece; a name with no usable pieces
    (empty, or only punctuation/whitespace) yields an empty list.
    """
    # treat commas as if they were spaces, and get rid of periods (sometimes periods are close
    # up with no spaces); split() also discards the resulting empty pieces
    pieces = name.replace(',', ' ').replace('.', ' ').split()
    if len(pieces) == 0:
        return []

    # Remove ", Jr.", "III", etc. from end of name. A lone piece is never treated as a suffix,
    # so there is always at least one name piece left.
    suffix = ''
    if len(pieces) > 1 and pieces[-1] in _NAME_SUFFIXES:
        suffix = _NAME_SUFFIXES[pieces[-1]]
        pieces = pieces[:-1]
    elif len(pieces) > 3 and pieces[-2:] == ['the', 'elder']:
        suffix = ' the elder'
        pieces = pieces[:-2]

    # full name, then full name with suffix
    full_name = ' '.join(pieces)
    alternatives = [full_name]
    if suffix != '':
        alternatives.append(full_name + suffix)

    # The abbreviated forms only make sense when there is both a first and a last name
    if len(pieces) > 1:
        # one initial per piece, so initials always line up with pieces
        initials = [_first_initial(piece) for piece in pieces]
        first_name = pieces[0]
        last_name = pieces[-1]
        middle_initials = initials[1:-1]
        leading_initials = initials[:-1] # first and middle initials

        # first and last name with middle initials
        alternatives.append(' '.join([first_name] + middle_initials + [last_name]))
        # first and last name with middle initials and periods
        alternatives.append(' '.join([first_name] + [initial + '.' for initial in middle_initials] + [last_name]))
        # first and last name only
        alternatives.append(first_name + ' ' + last_name)
        # first initial and last name only
        alternatives.append(initials[0] + ' ' + last_name)
        # first initial with period and last name only
        alternatives.append(initials[0] + '. ' + last_name)
        # all name initials with last name
        alternatives.append(' '.join(leading_initials + [last_name]))
        # all name initials with periods with last name
        alternatives.append(' '.join([initial + '.' for initial in leading_initials] + [last_name]))
        # all name initials concatenated with last name
        alternatives.append(''.join(leading_initials) + ' ' + last_name)

    # remove duplicates, keeping the order in which the variants were generated
    return list(dict.fromkeys(alternatives))

def search_name_at_wikidata(name):
    """Search Wikidata for items whose label or alias matches a variant of name.

    name: a personal name; its variants come from generate_name_alternatives() and each
        variant is tried with every language tag in language_codes.

    Returns a list of {'qId': ..., 'name': ...} dictionaries (name is the English label of the
    item), an empty list if nothing matches or the name yields no variants, or
    [{'error': <response text>}] if the response is not the expected JSON structure.
    """
    # carry out search for most languages that use Latin characters, plus some other commonly used languages
    # See https://doi.org/10.1145/3233391.3233965
    # (each code appears once, so no VALUES entry is sent twice)
    language_codes = [
        'en',
        'es',
        'pt',
        'fr',
        'it',
        'nl',
        'de',
        'da',
        'et',
        'hu',
        'ga',
        'ro',
        'sk',
        'sl',
        'zu',
        'tr',
        'sv',
        'zh',
        'ru',
        'ja',
        'ar',
        'pl',
        'uk',
        'ca',
        'cs',
        'la',
        'nb',
        'he',
        'eo',
        'fi',
        'ko'
      ]
    name_list = generate_name_alternatives(name)
    if len(name_list) == 0:
        # nothing to search for, so don't send a query with an empty VALUES list
        return []

    value_lines = []
    for language_code in language_codes:
        for alternative in name_list:
            # get rid of quotes and backslashes, which will break the quoted literal in the query
            for forbidden_character in ('"', "'", '\\'):
                alternative = alternative.replace(forbidden_character, '')
            value_lines.append('"' + alternative + '"@' + language_code + '\n')
    alternatives = ''.join(value_lines)
    query = '''
select distinct ?item ?label where {
  VALUES ?value
  {
  ''' + alternatives + '''}
?item rdfs:label|skos:altLabel ?value.
?item rdfs:label ?label.
FILTER(lang(?label)='en')
  }
'''
    results = []
    r = requests.post(endpoint, data=dict(query=query), headers=sparql_request_header)
    try:
        statements = r.json()['results']['bindings']
        for statement in statements:
            if 'label' in statement:
                label = statement['label']['value']
            else:
                label = ''
            qnumber = extract_local_name(statement['item']['value'])
            results.append({'qId': qnumber, 'name': label})
    except (ValueError, KeyError, TypeError):
        # Not JSON (ValueError) or not the expected structure (KeyError/TypeError). Only these
        # are caught so that e.g. KeyboardInterrupt still propagates.
        results = [{'error': r.text}]
    # delay to avoid hitting the SPARQL endpoint too rapidly
    sleep(sparql_sleep)
    return results



In [62]:
# Spot check of the label search using a name written in a non-Latin script.
# The commented-out assignments are alternative test inputs; uncomment one to try it.
#name = '尼可罗·马基亚维利'
#name = 'Nicolás Maquiavelo'
name = 'Никколо Макиавелли'
#generate_name_alternatives(name)
search_name_at_wikidata(name)

[{'qId': 'Q1399', 'name': 'Niccolò Machiavelli'}]

In [90]:
# Search Wikidata for each author of the retrieved work and show the candidate matches.
# The 'authors' key is only present when CrossRef lists authors, so default to an empty list.
for author in crossref_results.get('authors', []):
    name = (author['givenName'] + ' ' + author['familyName']).strip()
    if name == '':
        # no usable name parts for this author, so there is nothing to search for
        continue
    hit = search_name_at_wikidata(name)
    print(name)
    print(hit)
    print()

Lori Schirle
[]

Amanda L. Stone
[{'qId': 'Q96192760', 'name': 'Amanda Stone'}, {'qId': 'Q67224428', 'name': 'A Stone'}, {'qId': 'Q4757730', 'name': 'Andrew Leete Stone'}, {'qId': 'Q20646521', 'name': 'Amanda Addams'}, {'qId': 'Q52156179', 'name': 'Alice Balch Stone'}]

Matthew C. Morris
[{'qId': 'Q6789094', 'name': 'Matthew Morris'}, {'qId': 'Q6790991', 'name': 'Matthew Morris'}, {'qId': 'Q56935999', 'name': 'Matthew C Morris'}, {'qId': 'Q64597505', 'name': 'Matthew C Morris'}, {'qId': 'Q71304625', 'name': 'Matthew Morris'}, {'qId': 'Q57488067', 'name': 'Mary Morris'}, {'qId': 'Q75568259', 'name': 'Matthew Morris'}, {'qId': 'Q92420603', 'name': 'Mackenzie Morris'}, {'qId': 'Q1188592', 'name': 'MattyBRaps'}]

Sarah S. Osmundson
[{'qId': 'Q57059281', 'name': 'Sarah S. Osmundson'}]

Philip D. Walker
[{'qId': 'Q96400978', 'name': 'Philip D. Walker'}, {'qId': 'Q60025039', 'name': 'Philip Walker'}, {'qId': 'Q76759075', 'name': 'Philip Walker'}, {'qId': 'Q59292007', 'name': 'Peter Walker'}]


In [76]:
# Example: initials written with periods but no spaces ("R.F.") are still separated into
# individual pieces, because periods are replaced by spaces before the name is split.
name = 'Christine R.F. Rukasin'
generate_name_alternatives(name)

['C Rukasin',
 'Christine R. F. Rukasin',
 'C. R. F. Rukasin',
 'C. Rukasin',
 'CRF Rukasin',
 'Christine R F Rukasin',
 'C R F Rukasin',
 'Christine Rukasin']