In [None]:
# Run only once if you need to install the python-Levenshtein package
# NOTE(review): the only fuzzy-matching imports in the following import cell (fuzzywuzzy)
# are commented out, so this install may no longer be needed -- confirm before running.

# Install a pip package in the current Jupyter kernel
# (pip is invoked through sys.executable, i.e. the interpreter running this kernel)
import sys
!{sys.executable} -m pip install python-Levenshtein

In [87]:
import requests   # best library to manage HTTP transactions
#from bs4 import BeautifulSoup # web-scraping library
import json
from time import sleep
import csv
#import math
#from fuzzywuzzy import fuzz # fuzzy logic matching
#from fuzzywuzzy import process
#import xml.etree.ElementTree as et # library to traverse XML tree
import urllib
#import datetime
#import string

accept_media_type = 'application/json'

# Determine the current CrossRef rate limit by an initial ping.
# Two response headers are read: 'x-rate-limit-limit' (number of requests) and
# 'x-rate-limit-interval' (a number of seconds followed by "s").
# api_sleep is the resulting delay between calls: interval / limit plus a small pad.
# NOTE(review): this ping sends only the Accept header, not the User-Agent/mailto header
# used by later requests -- confirm that the reported limit also applies to those requests.
crossref_headers = requests.get('https://api.crossref.org/works/10.3233/SW-150203', headers={'Accept' : accept_media_type}).headers
try:
    limit_count = int(crossref_headers['x-rate-limit-limit'])
    interval_string = crossref_headers['x-rate-limit-interval']
    interval_sec = int(interval_string.rstrip('s')) # remove the "s" from the end, if present
    api_sleep = interval_sec / limit_count + 0.005
except (KeyError, ValueError, ZeroDivisionError):
    # Headers missing, malformed, or reporting a zero limit: rather than failing when the
    # cell loads, fall back to a conservative one request per second.
    limit_count = 1
    interval_string = '1s'
    interval_sec = 1
    api_sleep = interval_sec / limit_count + 0.005

# generates a dictionary to be passed in a requests GET method to generate the request header
def generate_header_dictionary(accept_media_type):
    """Build the header dictionary to pass to a requests GET call.

    accept_media_type: value sent in the 'Accept' header.
    Returns a dict with the 'Accept' header and this script's fixed 'User-Agent' string.
    """
    return {
        'Accept' : accept_media_type,
        'User-Agent': 'VanderBot/1.7 (https://github.com/HeardLibrary/linked-data/tree/master/vanderbot; mailto:steve.baskauf@vanderbilt.edu)'
    }


In [88]:
# See https://github.com/CrossRef/rest-api-doc for API details
# Note: no authentication required, but must be "nice": observe rate limit, provide mailto:
def retrieve_crossref_data(doi):
    """Look up one work in the CrossRef REST API by DOI and return selected metadata.

    doi: DOI string; it is URL-encoded and appended to the CrossRef works endpoint.

    Returns a dict:
      - empty if the API answers with HTTP 404;
      - {'message': <response text>} if the body is not JSON containing a 'message' item;
      - otherwise the keys 'issued', 'volume', 'issue', 'pages', 'type', 'journal_issn'
        and 'journal_title' (empty string / empty list when absent), plus 'authors'
        (a list of author dicts) only when the record has an 'author' item.

    The function sleeps for api_sleep seconds before returning to respect the rate limit.
    """
    # Imported locally so the function does not depend on some other package having
    # already loaded the urllib.parse submodule (a bare "import urllib" does not load it).
    from urllib.parse import quote

    crossref_endpoint_url = 'https://api.crossref.org/works/'
    # quote performs URL encoding of the DOI string
    search_url = crossref_endpoint_url + quote(doi)
    response = requests.get(search_url, headers=generate_header_dictionary(accept_media_type))
    article_dict = {}
    if response.status_code != 404: # return empty dict if not found
        try:
            data = response.json()['message']
        except (ValueError, KeyError, TypeError):
            # if not JSON (or no 'message' item), just return the response text
            article_dict['message'] = response.text
            sleep(api_sleep)
            return article_dict

        if 'author' in data:
            article_dict['authors'] = [_parse_crossref_author(author) for author in data['author']]
        article_dict['issued'] = _format_crossref_date(data['issued']) if 'issued' in data else ''
        article_dict['volume'] = data.get('volume', '')
        article_dict['issue'] = data.get('issue', '')
        article_dict['pages'] = data.get('page', '')
        article_dict['type'] = data.get('type', '')
        article_dict['journal_issn'] = data.get('ISSN', [])
        # 'container-title' is a list and may be present but empty
        container_titles = data.get('container-title') or []
        article_dict['journal_title'] = container_titles[0] if container_titles else ''

    sleep(api_sleep)
    return article_dict


def _parse_crossref_author(author):
    """Flatten one CrossRef author record; missing fields become '' (or [] for affiliation)."""
    return {
        'orcid': author.get('ORCID', ''),
        'sequence': author.get('sequence', ''),
        'givenName': author.get('given', ''),
        'familyName': author.get('family', ''),
        # affiliation entries that lack a 'name' are skipped instead of raising KeyError
        'affiliation': [affiliation['name']
                        for affiliation in (author.get('affiliation') or [])
                        if 'name' in affiliation],
    }


def _format_crossref_date(date_field):
    """Format a CrossRef date item ({'date-parts': [[year, month, day]]}) as 'YYYY[-MM[-DD]]'."""
    date_parts = (date_field or {}).get('date-parts') or [[]]
    parts = date_parts[0]
    # An absent or null year means there is no usable date: return '' rather than 'None'
    if not parts or parts[0] is None:
        return ''
    formatted = str(parts[0])
    # month and day are optional; single-digit values are zero-padded to two characters
    for part in parts[1:3]:
        if part is None:
            break
        formatted += '-' + str(part).zfill(2)
    return formatted


In [89]:
# DOI to look up; the commented-out lines below are alternative DOIs that can be swapped in
doi = '10.1186/S13643-020-01393-8'
#doi = '10.1603/0046-225X-32.5.915'
#doi = '10.3233/SW-150203'
# Query CrossRef for this DOI (makes a network request, then pauses for api_sleep seconds)
results = retrieve_crossref_data(doi)

In [90]:
# Show the retrieved metadata dictionary as indented JSON
print(json.dumps(results, indent=2))

{
  "authors": [
    {
      "orcid": "http://orcid.org/0000-0003-2551-019X",
      "sequence": "first",
      "givenName": "Lori",
      "familyName": "Schirle",
      "affiliation": []
    },
    {
      "orcid": "",
      "sequence": "additional",
      "givenName": "Amanda L.",
      "familyName": "Stone",
      "affiliation": []
    },
    {
      "orcid": "",
      "sequence": "additional",
      "givenName": "Matthew C.",
      "familyName": "Morris",
      "affiliation": []
    },
    {
      "orcid": "",
      "sequence": "additional",
      "givenName": "Sarah S.",
      "familyName": "Osmundson",
      "affiliation": []
    },
    {
      "orcid": "",
      "sequence": "additional",
      "givenName": "Philip D.",
      "familyName": "Walker",
      "affiliation": []
    },
    {
      "orcid": "",
      "sequence": "additional",
      "givenName": "Mary S.",
      "familyName": "Dietrich",
      "affiliation": []
    },
    {
      "orcid": "",
      "sequence": "additional