# Scripts associated with Otolaryngology network analysis

Settings, function definitions, global variables, etc.

In [None]:
# (c) 2023 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# The sparqler class is (c) 2023 Steve Baskauf and is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf

# ----------------
# Module imports
# ----------------

from typing import List, Dict, Tuple, Any, Optional
#import yaml
import sys
import time
#import csv
import datetime
from pathlib import Path
import json
import pandas as pd
import requests
import requests_cache
from fuzzywuzzy import fuzz  # fuzzy logic matching
# import re # regex
import logging  # See https://docs.python.org/3/howto/logging.html

# Set up cache for HTTP requests
# Cached responses are kept for one hour (expire_after=3600 seconds) so that repeated identical calls within a
# session do not count against the limit on the number of API calls per week. Both GET and POST responses are cached.
requests_cache.install_cache(
    'http_cache', backend='sqlite', expire_after=3600, allowable_methods=['GET', 'POST'])

# Set up log for warnings
# This is a system file and hard to look at, so its data are harvested and put into a plain text log file later.
# filemode='w' starts a fresh warnings.log on every run; only the bare message text is written (no level or timestamp).
logging.basicConfig(filename='warnings.log', filemode='w',
                    format='%(message)s', level=logging.WARNING)

# The low cutoff for the fuzzy match score for an institution to be considered a match
INSTITUTION_NO_MATCH_CUTOFF = 80
# The low cutoff for the fuzzy match score for an institution to be considered a match that does not require review
INSTITUTION_REVIEW_CUTOFF = 90

# ----------------
# Utility functions
# ----------------


def load_credential(filename: str, directory: str) -> str:
    """Load a credential string from a plain text file.

    Keeping the credential in the home directory prevents accidentally exposing the credential if the
    directory containing the script is shared.

    Args:
        filename: The name of the file containing the credential.
        directory: The directory where the file is located. Value of 'home' loads from the home directory.
            For any other value, the filename argument is an absolute or relative path to the file.

    Returns:
        A string containing the credential with surrounding whitespace (including any trailing newline
        added by a text editor) removed. If the credential file cannot be read, a message is printed
        and an empty string is returned.
    """
    if directory == 'home':
        # Path.home() works for both Win and Mac.
        credential_path = Path.home() / filename
    else:
        # Any other value means "use filename as given"; the label is only used in the error message.
        directory = 'working'
        credential_path = Path(filename)

    try:
        with open(credential_path, 'rt', encoding='utf-8') as file_object:
            cred = file_object.read()
    except (OSError, UnicodeError):
        # Only file-access and decoding problems are treated as "no credential";
        # unrelated exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
        print(filename + ' credentials file not found in ' +
              directory + ' directory.')
        return ''

    # A stray newline in a token makes it invalid as an HTTP header value, so strip it here.
    return cred.strip()


def csv_read(path: str, **kwargs) -> pd.DataFrame:
    """Read a CSV table into a Pandas DataFrame in which every cell is a string.

    Blank cells are loaded as empty strings rather than NaN.

    Keyword argument:
    rows -- the number of rows of the table to return when used for testing. When omitted, all rows are returned.
    """
    table = pd.read_csv(path, dtype=str, na_filter=False)

    # Without a row limit the whole table is handed back unchanged.
    if 'rows' not in kwargs:
        return table

    # With a row limit, return an independent copy of the leading rows.
    row_limit = kwargs['rows']
    leading_rows = table.head(row_limit)
    return leading_rows.copy(deep=True)


def look_up_alternative_institutional_ids(ror_iri: str) -> dict:
    """Find the Wikidata item for a ROR ID and report the other identifiers recorded on it.

    Args:
        ror_iri: The ROR ID for the institution in IRI form.

    Returns:
        dict: Keys 'label', 'ror', 'ringgold', 'grid', and 'qid'. Only the first result row is used.
            Identifiers missing from that row are empty strings. When the query returns no rows,
            every value (including 'ror') is an empty string.
    """
    # The local name is whatever follows the last slash of the IRI.
    local_id = ror_iri.split('/')[-1]

    query_string = (
        '''SELECT DISTINCT ?qid ?ringgold ?grid ?label WHERE {
    ?qid wdt:P6782 "'''
        + local_id
        + '''".
    ?qid rdfs:label ?label.
    FILTER (LANG(?label) = "en")
    OPTIONAL { ?qid wdt:P3500 ?ringgold. }
    OPTIONAL { ?qid wdt:P2427 ?grid. }
    }
'''
    )

    # put your own script name and email address here
    user_agent = 'id_lookup/0.1 (mailto:steve.baskauf@vanderbilt.edu)'
    data = Sparqler(useragent=user_agent).query(query_string)

    # No matching item: every field is blank.
    if len(data) == 0:
        return {
            'label': '',
            'ror': '',
            'ringgold': '',
            'grid': '',
            'qid': ''
        }

    first_row = data[0]

    def bound_value(variable: str) -> str:
        """Return the value bound to a query variable in the first row, or '' when it is unbound."""
        return first_row[variable]['value'] if variable in first_row else ''

    return {
        'label': bound_value('label'),
        'ror': ror_iri,
        'ringgold': bound_value('ringgold'),
        'grid': bound_value('grid'),
        # The item is returned as an entity IRI; keep only the Q ID at the end.
        'qid': bound_value('qid').split('/')[-1]
    }

# ----------------
# ROR API functions
# ----------------

# ORCID supports Ringgold, GRID, and ROR identifiers.
# https://www.ringgold.com/ Limited to 10 searches per day
# https://www.grid.ac/institutes GRID discontinued public releases at the end of 2021
# https://ror.org/ ROR is now the principal identifier for organizations

# ROR documentation: https://ror.readme.io/
# ROR API documentation: https://ror.readme.io/docs/rest-api
# ROR API endpoint URL: https://api.ror.org/organizations


def _ror_record_name(record: Dict) -> str:
    """Return the display name of a ROR organization record.

    Records with a top-level 'name' string use it directly. Records that instead carry a 'names' list
    (the ROR schema v2 layout) use the entry typed 'ror_display', falling back to the first entry.
    """
    if 'name' in record:
        return record['name']
    names = record.get('names') or []
    for entry in names:
        if 'ror_display' in (entry.get('types') or []):
            return entry.get('value', '')
    return names[0].get('value', '') if names else ''


def search_for_institution_id(institution: str, query_type: str) -> List[Dict]:
    """Search for the ROR ID for an institution using the ROR API.

    Args:
        institution: The name of the institution to search for.
        query_type: The type of query to perform. Must be one of 'query' or 'affiliation'.

    Returns:
        A list of dictionaries for possible institution matches, each with the keys 'name' and 'id',
        in the order returned by the API. For an unknown query type or a non-200 response, an error
        is printed and an empty string is returned (kept for compatibility with existing callers;
        it iterates like an empty list).
    """
    # ROR API endpoint
    ror_api_endpoint = 'https://api.ror.org/organizations'

    if query_type not in ('query', 'affiliation'):
        print(f'Error: Unknown query type: {query_type}')
        return ''

    # The query type doubles as the name of the URL parameter.
    ror_api_response = requests.get(
        ror_api_endpoint, params={query_type: institution})

    status_code = ror_api_response.status_code
    if status_code != 200:
        print('Error: ROR API returned status code', status_code)
        return ''

    results = []
    for item in ror_api_response.json().get('items', []):
        # 'affiliation' searches wrap each organization record in a match object under the
        # 'organization' key; 'query' searches return the organization records themselves.
        record = item.get('organization', item)
        results.append({
            'name': _ror_record_name(record),
            'id': record['id']
        })
    return results


def fuzzy_match_institutions(institution_name: str, search_results: List[Dict]) -> Tuple:
    """Pick the search result whose name best matches an institution name.

    Each candidate is scored with both fuzz.WRatio and fuzz.token_set_ratio. The WRatio winner is
    the one returned; the token_set_ratio winner is only used as a cross-check.

    Args:
        institution_name: The name of the institution to match.
        search_results: A list of dictionaries with the name and id of the institution.

    Returns:
        A tuple of (best WRatio match dictionary, its WRatio score, flag). The flag is True when the
        two scoring methods disagree about the best match, in which case the disagreement is also
        printed and logged. With no candidates the result is ({}, 0, False).
    """
    best_by_w_ratio, best_w_ratio = {}, 0
    best_by_token_set, best_token_set = {}, 0

    for candidate in search_results:
        candidate_name = candidate['name']
        w_score = fuzz.WRatio(institution_name, candidate_name)
        token_score = fuzz.token_set_ratio(institution_name, candidate_name)

        # Strict comparison: on ties the earliest candidate keeps the top spot.
        if w_score > best_w_ratio:
            best_by_w_ratio, best_w_ratio = candidate, w_score
        if token_score > best_token_set:
            best_by_token_set, best_token_set = candidate, token_score

    # Both methods agree (or there was nothing to score): nothing to report.
    if best_by_w_ratio == best_by_token_set:
        return best_by_w_ratio, best_w_ratio, False

    # The methods disagree: report to the console and the warnings log, then flag for review.
    print('Warning: Top w_ratio match is not the top token_set_ratio match for', institution_name)
    logging.warning(
        'Top w_ratio match is not the top token_set_ratio match for ' + institution_name)
    print('Top w_ratio match:', best_w_ratio, best_by_w_ratio)
    logging.warning(f'Top w_ratio match: {best_w_ratio} {best_by_w_ratio}')
    print('Top token_set_ratio match:', best_token_set, best_by_token_set)
    logging.warning(
        f'Top token_set_ratio match: {best_token_set} {best_by_token_set}')
    logging.warning('')  # blank line separates entries in the log

    return best_by_w_ratio, best_w_ratio, True

# ----------------
# Elsevier API functions
# ----------------

def extract_author_data_from_scopus(results: List[Dict]) -> List[Dict]:
    """Extract the key author fields from Scopus author search results.

    Args:
        results: The 'entry' list from a Scopus author search response. Each entry must contain
            'preferred-name' (with 'given-name' and 'surname'), 'document-count', and 'dc:identifier'.

    Returns:
        One dictionary per entry with the keys 'name', 'document_count', 'scopus_id', 'orcid',
        'affiliation', and 'subject_area'. The last three are empty strings when the entry does not
        provide them.
    """
    author_list = []
    for result in results:
        preferred_name = result['preferred-name']

        # If there is a single subject area, a dictionary is returned. If there are multiple subject
        # areas, a list of dictionaries is returned. Normalize both (and a missing, null, or empty
        # value) to a list so that only the first abbreviation needs to be picked.
        subject_area = result.get('subject-area')
        if isinstance(subject_area, dict):
            subject_areas = [subject_area]
        else:
            subject_areas = subject_area or []
        first_abbreviation = subject_areas[0].get('@abbrev', '') if subject_areas else ''

        # The current affiliation may be absent, null, or lack a name.
        current_affiliation = result.get('affiliation-current') or {}

        author_list.append({
            'name': preferred_name['given-name'] + ' ' + preferred_name['surname'],
            'document_count': result['document-count'],
            # The identifier has the form 'PREFIX:number'; keep the part after the colon.
            'scopus_id': result['dc:identifier'].split(':')[1],
            'orcid': result.get('orcid', ''),
            'affiliation': current_affiliation.get('affiliation-name', ''),
            'subject_area': first_abbreviation
        })
    return author_list

def find_author_at_elsevier(query_string: str) -> Tuple:
    """Find author records at Elsevier using the Scopus Author Search API.

    Args:
        query_string: A Scopus author search query, e.g. 'ORCID(...)' or 'AUTHLASTNAME(...) AND AUTHFIRST(...)'.

    Returns:
        A 3-tuple (authors, remaining_queries, reset_date_string):
            authors: list of author dictionaries as built by extract_author_data_from_scopus, or None
                when the search was not performed, failed, or found nothing.
            remaining_queries: value of the X-RateLimit-Remaining response header ('' if unavailable).
            reset_date_string: X-RateLimit-Reset converted to local 'YYYY-MM-DDTHH:MM:SS' ('' if unavailable).
        A tuple is returned in every case so that callers can always unpack three values.
    """
    # API specification landing page: https://dev.elsevier.com/api_docs.html
    # For author search info, see https://dev.elsevier.com/documentation/AuthorSearchAPI.wadl
    # General search tips are at: https://dev.elsevier.com/sc_author_search_tips.html
    # ORCID is one of the possible field restrictions.

    # Endpoint for author search
    endpoint_resource_url = 'https://api.elsevier.com/content/search/author'

    # NOTE: I was able to generate the API key myself using the Elsevier Developer Portal. However, for some reason it gave me access to the
    # Scopus API, but not the Author Search API. I had to request access to the Author Search API from Elsevier support and they gave me
    # an institutional token that is an additional requirement for me to get the Author Search API to work.

    api_key = load_credential('elsevier-api-key.txt', 'home')
    inst_token = load_credential('elsevier-inst-token.txt', 'home')
    if api_key == '' or inst_token == '':
        print('Error: API key or institutional token not found. Search not performed.')
        # Keep the 3-tuple shape; a bare '' cannot be unpacked by callers.
        return None, '', ''

    header_parameters = {
        'Accept': 'application/json',
        'X-ELS-APIKey': api_key,
        'X-ELS-Insttoken': inst_token
    }
    # See https://dev.elsevier.com/sc_author_search_views.html for the list of possible fields
    query_parameters = {
        'query': query_string,
        'field': 'dc:identifier,affiliation-current,preferred-name,orcid,subject-area,document-count'
    }

    response = requests.get(endpoint_resource_url, headers=header_parameters, params=query_parameters)

    # Get the number of remaining queries for the week. The headers are read defensively because
    # an error response is not guaranteed to carry them.
    remaining_queries = response.headers.get('X-RateLimit-Remaining', '')
    reset_date = response.headers.get('X-RateLimit-Reset')
    try:
        # Convert the reset time (epoch seconds) to a readable local timestamp.
        reset_date_string = datetime.datetime.fromtimestamp(int(reset_date)).strftime('%Y-%m-%dT%H:%M:%S')
    except (TypeError, ValueError, OverflowError, OSError):
        reset_date_string = ''

    status_code = response.status_code
    if status_code != 200:
        print('Error: API returned status code', status_code)
        print(response.text)
        return None, remaining_queries, reset_date_string

    results_list = response.json().get('search-results', {}).get('entry', [])

    # This doesn't normally happen, since a single result with an error is returned where there are no hits.
    if len(results_list) == 0:
        return None, remaining_queries, reset_date_string
    if len(results_list) > 1:
        # Reported, but all of the results are still returned so that the caller can choose.
        print('Error: Multiple results found for query', query_string)

    # An entry carrying an 'error' key is not an author record. 'Result set was empty' is the normal
    # "no hits" signal; anything else is reported before giving up.
    if 'error' in results_list[0]:
        if results_list[0]['error'] != 'Result set was empty':
            print('Error: API reported', results_list[0]['error'])
        return None, remaining_queries, reset_date_string

    abbreviated_results_list = extract_author_data_from_scopus(results_list)

    return abbreviated_results_list, remaining_queries, reset_date_string

def find_author_at_elsevier_by_orcid(orcid: str) -> Tuple[Optional[List[Dict]], str, str]:
    """Find author records at Elsevier from an ORCID.

    Args:
        orcid: The ORCID iD to search for.

    Returns:
        A 3-tuple (authors, remaining_queries, reset_date_string) as returned by
        find_author_at_elsevier. authors is None when nothing was found or the search failed.
    """
    # List of field restrictions is at https://dev.elsevier.com/sc_author_search_tips.html
    query_string = 'ORCID(' + orcid + ')'
    outcome = find_author_at_elsevier(query_string)
    if not isinstance(outcome, tuple):
        # The search was not performed (e.g. credentials were missing); report it as "no result"
        # instead of failing while unpacking a non-tuple value.
        return None, '', ''
    authors, remaining_queries, reset_date_string = outcome
    return authors, remaining_queries, reset_date_string

def find_author_at_elsevier_by_names(family_name: str, given_name: str, affiliation: str, middle_name=None) -> Tuple[Optional[List[Dict]], str, str]:
    """Find author records at Elsevier using name and affiliation.

    Args:
        family_name: The author's family name.
        given_name: The author's given name.
        affiliation: Affiliation text to restrict the search. Skipped if it is an empty string.
        middle_name: Optional middle name, appended to the given name with a space.

    Returns:
        A 3-tuple (authors, remaining_queries, reset_date_string) as returned by
        find_author_at_elsevier. authors is None when nothing was found or the search failed.
    """
    # General search tips are at: https://dev.elsevier.com/sc_author_search_tips.html
    if middle_name is not None:
        given_name += ' ' + middle_name

    query_string = 'AUTHLASTNAME(' + family_name + ') AND AUTHFIRST(' + given_name + ')'
    if affiliation != '':
        query_string = 'AFFIL(' + affiliation + ') AND ' + query_string

    outcome = find_author_at_elsevier(query_string)
    if not isinstance(outcome, tuple):
        # The search was not performed (e.g. credentials were missing); report it as "no result"
        # instead of failing while unpacking a non-tuple value.
        return None, '', ''
    authors, remaining_queries, reset_date_string = outcome
    return authors, remaining_queries, reset_date_string


def find_affiliation_ids_at_elsevier(search_string: str) -> Optional[Tuple[Dict, str, str]]:
    """Search the Elsevier Affiliation Search API to get the affiliation identifier for the institution.

    Args:
        search_string: The institution name to search for. It is also the name that the returned
            candidates are fuzzy-matched against.

    Returns:
        On success, a 3-tuple (match, remaining_queries, reset_date_string) where match is the best
        candidate as a dictionary with the keys 'name', 'id', and 'document_count'.
        None when the API call fails or there are no results.
        An empty string when the credentials could not be loaded (kept for compatibility with
        existing callers).
    """
    # Endpoint for affiliation API
    endpoint_resource_url = 'https://api.elsevier.com/content/search/affiliation'

    api_key = load_credential('elsevier-api-key.txt', 'home')
    inst_token = load_credential('elsevier-inst-token.txt', 'home')
    if api_key == '' or inst_token == '':
        print('Error: API key or institutional token not found. Search not performed.')
        return ''

    header_parameters = {
        'Accept': 'application/json',
        'X-ELS-APIKey': api_key,
        'X-ELS-Insttoken': inst_token
    }
    query_parameters = {
        'query': 'affil(' + search_string + ')'
    }

    response = requests.get(endpoint_resource_url, headers=header_parameters, params=query_parameters)

    # Get the number of remaining queries for the week. The headers are read defensively because
    # an error response is not guaranteed to carry them.
    remaining_queries = response.headers.get('X-RateLimit-Remaining', '')
    reset_date = response.headers.get('X-RateLimit-Reset')
    try:
        # Convert the reset time (epoch seconds) to a readable local timestamp.
        reset_date_string = datetime.datetime.fromtimestamp(int(reset_date)).strftime('%Y-%m-%dT%H:%M:%S')
    except (TypeError, ValueError, OverflowError, OSError):
        reset_date_string = ''

    status_code = response.status_code
    if status_code != 200:
        print('Error: API returned status code', status_code)
        print(response.text)
        return None

    results_list = response.json().get('search-results', {}).get('entry', [])

    if len(results_list) == 0:
        return None
    # Where there are no hits, a single entry carrying an 'error' key is returned instead of an
    # affiliation record; it has none of the fields read below.
    if 'error' in results_list[0]:
        if results_list[0]['error'] != 'Result set was empty':
            print('Error: API reported', results_list[0]['error'])
        return None

    # Extract the names and IDs from the results, then do fuzzy matching to find the best matches
    match_list = [
        {
            'name': result['affiliation-name'],
            # The identifier has the form 'PREFIX:number'; keep the part after the colon.
            'id': result['dc:identifier'].split(':')[1],
            'document_count': result['document-count']
        }
        for result in results_list
    ]

    # The data are really bad because there are a lot of duplicates. However, usually there is one with a lot more documents
    # than the others, and the results seem to be sorted descending by document count.
    # So if there are equally good matches, the first one encountered will be used. Since they are sorted descending, it will
    # be the one with the most documents.
    print(json.dumps(match_list, indent=2))

    # NOTE: Optimally, the matching function would be modified to return the IDs of all of the good matches.
    # However, in the interest of time, I'm going to use the function as it's already written and just return the
    # match with the largest number of articles. That might cause some authors to be missed, but there will be some
    # that will have to be looked up manually anyway.

    # Only the matched candidate is used; the score and mismatch flag are not reported to the caller.
    id_match, _score, _flagged_mismatch = fuzzy_match_institutions(search_string, match_list)

    return id_match, remaining_queries, reset_date_string

def get_single_metric_from_elsevier_author_api(scopus_author_id: str) -> Optional[str]:
    """Look up the h-index of one author using the Elsevier SciVal author metrics API.

    Args:
        scopus_author_id: The Scopus author ID to look up.

    Returns:
        The h-index value from the first result. None when the API call fails, no metrics are
        returned, or no h-index is among them. An empty string when the credentials could not
        be loaded.
    """
    # API specifiction landing page: https://dev.elsevier.com/api_docs.html
    # For author metrics info, see https://dev.elsevier.com/documentation/SciValAuthorAPI.wadl

    # NOTE: I was able to generate the API key myself using the Elsevier Developer Portal. However, for some reason it gave me access to the
    # Scopus API, but not the Author Search API. I had to request access to the Author Search API from Elsevier support and they gave me
    # an institutional token that is an additional requirement for me to get the Author Search API to work.

    api_key = load_credential('elsevier-api-key.txt', 'home')
    inst_token = load_credential('elsevier-inst-token.txt', 'home')
    if '' in (api_key, inst_token):
        print('Error: API key or institutional token not found. Search not performed.')
        return ''

    # Endpoint for author metrics
    metrics_url = 'https://api.elsevier.com/analytics/scival/author/metrics'
    request_headers = {
        'Accept': 'application/json',
        'X-ELS-APIKey': api_key,
        'X-ELS-Insttoken': inst_token
    }
    request_params = {
        'metricTypes': 'HIndices',
        'byYear': False,
        'authors': scopus_author_id
    }

    response = requests.get(metrics_url, headers=request_headers, params=request_params)

    if response.status_code != 200:
        print('Error: API returned status code', response.status_code)
        print(response.text)
        return None

    author_results = response.json()['results']
    if len(author_results) == 0:
        print('Error: No author metrics found for Scopus author ID', scopus_author_id)
        return None

    # Only the first result is examined; return as soon as its h-index is seen.
    for metric in author_results[0]['metrics']:
        if metric['indexType'] == 'h-index':
            return metric['value']

    print('Error: No h-index found for Scopus author ID', scopus_author_id)
    return None

def get_metrics_from_elsevier_author_api(scopus_author_id: List[str]) -> Optional[List[Dict]]:
    """Look up the h-index of several authors using the Elsevier SciVal author metrics API.

    The query will accept up to 200 author IDs per call.

    Args:
        scopus_author_id: A list of Scopus author IDs (strings). A single ID passed as a plain
            string is treated as a one-item list.

    Returns:
        A list of dictionaries with the keys 'scopus_id' and 'h_index', one for each returned author
        that has an h-index. None when more than 200 IDs are given or the API call fails. An empty
        string when the credentials could not be loaded or the API returned no results (kept for
        compatibility with existing callers).
    """
    # API specification landing page: https://dev.elsevier.com/api_docs.html
    # For author metrics info, see https://dev.elsevier.com/documentation/SciValAuthorAPI.wadl

    # A bare string would otherwise be joined character by character below.
    if isinstance(scopus_author_id, str):
        scopus_author_id = [scopus_author_id]

    if len(scopus_author_id) > 200:
        print('Error: Too many author IDs. Maximum is 200.')
        return None

    # Concatenate the author IDs into a comma-separated string
    author_id_string = ','.join(scopus_author_id)

    # Endpoint for author metrics
    endpoint_resource_url = 'https://api.elsevier.com/analytics/scival/author/metrics'

    # NOTE: I was able to generate the API key myself using the Elsevier Developer Portal. However, for some reason it gave me access to the
    # Scopus API, but not the Author Search API. I had to request access to the Author Search API from Elsevier support and they gave me
    # an institutional token that is an additional requirement for me to get the Author Search API to work.

    api_key = load_credential('elsevier-api-key.txt', 'home')
    inst_token = load_credential('elsevier-inst-token.txt', 'home')
    if api_key == '' or inst_token == '':
        print('Error: API key or institutional token not found. Search not performed.')
        return ''

    header_parameters = {
        'Accept': 'application/json',
        'X-ELS-APIKey': api_key,
        'X-ELS-Insttoken': inst_token
    }
    query_parameters = {
        'metricTypes': 'HIndices',
        'byYear': False,
        'authors': author_id_string
    }

    response = requests.get(endpoint_resource_url, headers=header_parameters, params=query_parameters)

    status_code = response.status_code
    if status_code != 200:
        print('Error: API returned status code', status_code)
        print(response.text)
        return None

    data_list = response.json()['results']

    if len(data_list) == 0:
        print('Error: No author metrics found for Scopus author ID', author_id_string)
        return ''

    # Extract the h-index for each author and build a list of author ID / h-index pairs
    h_index_list = []
    for data in data_list:
        for metric in data['metrics']:
            if metric['indexType'] == 'h-index':
                h_index_list.append({'scopus_id': data['author']['id'], 'h_index': metric['value']})
                break
        else:
            # Name the specific author that lacks an h-index rather than the whole request.
            missing_id = (data.get('author') or {}).get('id', 'unknown')
            print('Error: No h-index found for Scopus author ID', missing_id)

    return h_index_list

# ----------------
# ORCID API functions
# ----------------


def query_orcid_api(given_name_string: str, family_name_string: str, **kwargs) -> List:
    """Query the ORCID API for a person's ORCID ID.

    Args:
        given_name_string: Given name(s) to search for.
        family_name_string: Family name to search for.
        **kwargs: Optional institution restrictions, ORed together and ANDed with the name:
            ror: ROR ID as a full IRI.
            ringgold: Ringgold ID.
            grid: GRID ID.
            email: Email domain (the part after the @).
            name: Organization name.
            text: List of organization name strings that are ORed together. An empty list adds
                no restriction.
            Any other keyword is reported and ignored.

    Returns:
        A list of matching ORCID iDs. The list is empty when there are no hits or the API returns
        an error status.
    """
    # ORCID API search information: https://info.orcid.org/documentation/api-tutorials/api-tutorial-searching-the-orcid-registry/
    # ORCID FAQ on finding record holders: https://info.orcid.org/ufaqs/how-do-i-find-orcid-record-holders-at-my-institution/
    # ORCID API information on organization identifiers https://info.orcid.org/documentation/integration-guide/working-with-organization-identifiers/#Determining_your_Identifier
    # Limit is 1000 results per call, so paging is required. I did that on vb1_process_department.ipynb

    # Solr searches are supported: https://solr.apache.org/guide/6_6/the-standard-query-parser.html

    # Construct institution part of search string: one clause per recognized keyword argument.
    institution_clauses = []
    for key, value in kwargs.items():
        if key == 'ror':
            # Contrary to the example, the ROR ID is the full IRI, not just the local name.
            # It must be enclosed in quotes because it has a colon.
            clause = 'ror-org-id:"' + value + '"'
        elif key == 'ringgold':
            clause = 'ringgold-org-id:' + value
        elif key == 'grid':
            clause = 'grid-org-id:' + value
        elif key == 'email':
            clause = 'email:*@' + value
        elif key == 'name':  # Documentation says exact match with name.
            # If quotes are not used around the search string, it does an OR search. So searching for Vanderbilt University
            # returns 3095010 results, while searching with quotes returns 7475 results.
            # It is not clear to me what the parentheses accomplish.
            clause = 'affiliation-org-name:("' + value + '")'
        elif key == 'text':
            # The text keyword argument has a list value. It is a list of strings that are ORed together.
            if len(value) == 0:
                # Nothing to add. Without this guard the clause from the previous keyword would be
                # repeated, or the name would be undefined if 'text' came first.
                continue
            clause = ' OR '.join('affiliation-org-name:"' + text + '"' for text in value)
        else:
            print('Error: unknown key', key)
            print('Not included in search string')
            continue
        institution_clauses.append(clause)

    # Construct the name part of the search string
    built_search_string = 'given-names:' + given_name_string + \
        ' AND family-name:' + family_name_string
    if institution_clauses:
        built_search_string += ' AND (' + ' OR '.join(institution_clauses) + ')'

    # Search endpoint
    endpoint_url = 'https://pub.orcid.org/v3.0/search/'

    header_parameters = {
        'Accept': 'application/json'
    }

    # Try to load an authorization token from a file in the user's home directory. If none are loaded, the query will be unauthenticated.
    # Determined the form of this parameter by setting up Bearer Token Authentication in Postman and then looking at the request headers.
    # I think this is correct because if an invalid token is sent, I get a 401 status code.

    # NOTE: As of 2023-04-06 there are no errors for a valid but unauthenticated search. All of the instructions show how to authenticate.
    # So at some point in the future authentication may be required.
    access_token = load_credential('orcid_access_token.txt', 'home')
    if access_token != '':
        header_parameters['Authorization'] = 'Bearer ' + access_token

    # Example name search q=family-name:Haak+AND+given-names:Laurel+AND+digital-object-ids:%2210.1087/20120404%22
    # The 'fl' field specification parameter seems to only work for CSV format. For JSON, only the ORCID ID is returned.
    query_parameters = {
        'q': built_search_string
    }

    response = requests.get(
        endpoint_url, headers=header_parameters, params=query_parameters)

    status_code = response.status_code
    if status_code != 200:
        print('Error: API returned status code', status_code)
        print(response.text)
        return []

    data = response.json()

    if data['num-found'] == 0:
        return []

    # Extract the ORCID IDs; a null 'result' is treated like an empty list.
    return [result['orcid-identifier']['path'] for result in (data.get('result') or [])]

# ----------------
# Wikidata Query Service functions
# ----------------


class Sparqler:
    """Build SPARQL queries of various sorts

    Parameters
    -----------
    method: str
        Possible values are "post" (default) or "get". Use "get" if read-only query endpoint.
        Must be "post" for update endpoint.
    endpoint: URL
        Defaults to Wikidata Query Service if not provided.
    useragent : str
        Required if using the Wikidata Query Service, otherwise optional.
        Use the form: appname/v.v (URL; mailto:email@domain.com)
        See https://meta.wikimedia.org/wiki/User-Agent_policy
    session: requests.Session
        If provided, the session will be used for all queries. Note: required for the Commons Query Service.
        If not provided, a generic requests method (get or post) will be used.
        NOTE: Currently only implemented for the .query() method since I don't have any way to test the methods that write.
    sleep: float
        Number of seconds to wait between queries. Defaults to 0.1

    Required modules:
    -------------
    requests, datetime, time
    """

    def __init__(self, method='post', endpoint='https://query.wikidata.org/sparql', useragent=None, session=None, sleep=0.1):
        # attributes for all methods
        self.http_method = method
        self.endpoint = endpoint
        if useragent is None and self.endpoint == 'https://query.wikidata.org/sparql':
            print(
                'You must provide a value for the useragent argument when using the Wikidata Query Service.')
            print()
            # Use keyboard interrupt instead of sys.exit() because it works in Jupyter notebooks
            raise KeyboardInterrupt
        self.session = session
        self.sleep = sleep

        # Headers sent with every request. The Accept header is added per query in .query().
        self.requestheader = {}
        if useragent:
            self.requestheader['User-Agent'] = useragent

        if self.http_method == 'post':
            self.requestheader['Content-Type'] = 'application/x-www-form-urlencoded'

    def query(self, query_string, form='select', verbose=False, **kwargs):
        """Sends a SPARQL query to the endpoint.

        Parameters
        ----------
        query_string : str
            The text of the SPARQL query.
        form : str
            The SPARQL query form.
            Possible values are: "select" (default), "ask", "construct", and "describe".
        mediatype: str
            The response media type (MIME type) of the query results.
            Some possible values for "select" and "ask" are: "application/sparql-results+json" (default) and "application/sparql-results+xml".
            Some possible values for "construct" and "describe" are: "text/turtle" (default) and "application/rdf+xml".
            See https://docs.aws.amazon.com/neptune/latest/userguide/sparql-media-type-support.html#sparql-serialization-formats-neptune-output
            for response serializations supported by Neptune.
        verbose: bool
            Prints status when True. Defaults to False.
        default: list of str
            The graphs to be merged to form the default graph. List items must be URIs in string form.
            If omitted, no graphs will be specified and default graph composition will be controlled by FROM clauses
            in the query itself. 
            See https://www.w3.org/TR/sparql11-query/#namedGraphs and https://www.w3.org/TR/sparql11-protocol/#dataset
            for details.
        named: list of str
            Graphs that may be specified by IRI in a query. List items must be URIs in string form.
            If omitted, named graphs will be specified by FROM NAMED clauses in the query itself.

        Returns
        -------
        If the form is "select" and mediatype is "application/sparql-results+json", a list of dictionaries containing the data.
        If the form is "ask" and mediatype is "application/sparql-results+json", a boolean is returned.
        If the mediatype is "application/sparql-results+json" and the response body cannot be parsed as JSON, None is returned.
        For other forms and mediatypes, the raw response text is returned.

        Notes
        -----
        To get UTF-8 text in the SPARQL queries to work properly, send URL-encoded text rather than raw text.
        That is done automatically by the requests module for GET. I guess it also does it for POST when the
        data are sent as a dict with the urlencoded header. 
        See SPARQL 1.1 protocol notes at https://www.w3.org/TR/sparql11-protocol/#query-operation        
        """
        returns_graph = form in ('construct', 'describe')
        if 'mediatype' in kwargs:
            media_type = kwargs['mediatype']
        elif returns_graph:
            media_type = 'text/turtle'
        else:
            # default for SELECT and ASK query forms
            media_type = 'application/sparql-results+json'
        self.requestheader['Accept'] = media_type

        # Build the payload dictionary (query and graph data) to be sent to the endpoint
        payload = {'query': query_string}
        if 'default' in kwargs:
            payload['default-graph-uri'] = kwargs['default']

        if 'named' in kwargs:
            payload['named-graph-uri'] = kwargs['named']

        if verbose:
            print('querying SPARQL endpoint')

        # Use the supplied session when there is one; otherwise fall back to the module-level requests functions.
        http_client = requests if self.session is None else self.session

        start_time = datetime.datetime.now()
        if self.http_method == 'post':
            response = http_client.post(
                self.endpoint, data=payload, headers=self.requestheader)
        else:
            response = http_client.get(
                self.endpoint, params=payload, headers=self.requestheader)
        elapsed_time = (datetime.datetime.now() - start_time).total_seconds()
        # Keep the raw text of the most recent response available for troubleshooting.
        self.response = response.text
        # Throttle as a courtesy to avoid hitting the endpoint too fast.
        time.sleep(self.sleep)

        if verbose:
            print('done retrieving data in', int(elapsed_time), 's')

        # Graph results and non-JSON serializations are passed back unparsed.
        if returns_graph or media_type != 'application/sparql-results+json':
            return response.text

        try:
            data = response.json()
        except ValueError:
            # The JSON decoding errors raised by requests are subclasses of ValueError.
            # Catching only that keeps KeyboardInterrupt and unrelated bugs from being silently swallowed.
            return None  # Returns no value if the response is not valid JSON.

        if form == 'select':
            # Extract the values from the response JSON
            return data['results']['bindings']
        # True or False result from ASK query
        return data['boolean']


## Script to retrieve institutional identifiers from ROR

In [None]:
# Load the raw data from the CSV file
raw_data = csv_read('oto_network_analysis.csv')
# raw_data = csv_read('oto_network_analysis.csv', rows=15) # Use this for testing

# Pull the unique values from the INSTITUTION column, leaving out the empty-string value.
# A new list is built rather than removing items from the list while looping over it.
# NOTE(review): this relies on csv_read() returning empty cells as '' rather than NaN -- confirm.
institutions = [name for name in raw_data['INSTITUTION'].unique() if name != '']
# print(institutions)

# Column order for the results table
result_columns = ['match_score', 'flagged', 'name', 'ror_label', 'ror_id']

# The result rows are accumulated as dictionaries and the data frame is rebuilt from them,
# because DataFrame.append() was removed in pandas 2.0.
result_rows = []
results_df = pd.DataFrame(columns=result_columns)

# Loop through the institutions and search for the ROR ID
for institution in institutions:
    clean_name = institution.strip()
    print(clean_name)

    # Search for the institution
    institution_search_results = search_for_institution_id(
        institution, 'affiliation')
    #print(json.dumps(institution_search_results, indent=2))

    # Start with the "no match" record; it is replaced below only if a good enough match is found.
    results_dict = {
        'match_score': 0,
        'name': clean_name,
        'ror_label': '',
        'ror_id': '',
        'flagged': 'no match'
    }

    if len(institution_search_results) != 0:
        # Fuzzy match the institution name to the search results
        id_match, score, flagged_mismatch = fuzzy_match_institutions(
            institution, institution_search_results)
        # print(id_match)

        if score >= INSTITUTION_NO_MATCH_CUTOFF:  # Score high enough to be a match
            if flagged_mismatch:  # w_ratio match disagrees with token_set_ratio match
                flag = 'mismatch'
            elif score < INSTITUTION_REVIEW_CUTOFF:  # Score too low to be accepted without review
                flag = 'review'
            else:
                flag = ''
            results_dict = {
                'match_score': score,
                'name': clean_name,
                'ror_label': id_match['name'],
                'ror_id': id_match['id'],
                'flagged': flag
            }

    # Add the results to the data frame
    result_rows.append(results_dict)
    results_df = pd.DataFrame(result_rows, columns=result_columns)
    print()

    # Save the results to a CSV file after each institution in case the script crashes
    results_df.to_csv('ror_id_search_results.csv', index=False)

# Read the warnings log
# For some reason, the log is considered a binary file. So when it is read in as text,
# it contains many null characters. So they are removed from the string read from the file.
with open('warnings.log', 'rt') as file_object:
    warnings_text = file_object.read().replace('\0', '')

# Direct output to the text log file instead of sys.stdout.
# The context manager closes the file even if writing fails.
with open('log_error.txt', 'at', encoding='utf-8') as error_log_object:
    if warnings_text == '':
        print('No errors occurred.', file=error_log_object)
    else:
        print(warnings_text, file=error_log_object)
    print('', file=error_log_object)

print('done')


## Look up alternative identifiers in Wikidata

In [None]:
# Load the institutions data from the CSV file
institutions_df = csv_read('ror_id_search_results.csv')
# institutions_df = csv_read('ror_id_search_results.csv', rows=1) # Use this for testing

# Create the identifier columns up front so that they exist (as empty strings) even for
# institutions without a ROR ID, and even if no institution has one at all.
for column_name in ['wikidata_label', 'qid', 'ringgold', 'grid']:
    institutions_df[column_name] = ''

# Loop through the institutions and look up the alternative IDs
for index, row in institutions_df.iterrows():
    print(row['name'])
    if row['ror_id'] == '':
        # Nothing to look up for institutions that were not matched to a ROR ID.
        continue

    alt_ids = look_up_alternative_institutional_ids(row['ror_id'])
    # print(alt_ids)
    institutions_df.at[index, 'wikidata_label'] = alt_ids['label']
    institutions_df.at[index, 'qid'] = alt_ids['qid']
    institutions_df.at[index, 'ringgold'] = alt_ids['ringgold']
    institutions_df.at[index, 'grid'] = alt_ids['grid']
    # Delay to avoid hitting the API too fast.
    time.sleep(0.1)  # Delay for 0.1 seconds

# Save the results to a CSV file
institutions_df.to_csv('institutional_identifiers.csv', index=False)
print('done')


## Wrangle author spreadsheet

The author spreadsheet is not tidy data and has the institution name on a separate row ahead of the list of authors from that institution

In [None]:
# Load the raw data from the CSV file
raw_data = csv_read('oto_network_analysis.csv')

# Name of the institution that the author rows currently being read belong to.
# It starts empty so that author rows appearing before the first institution heading
# get an empty institution instead of raising a NameError.
next_institution = ''

# Index labels of the rows that only carry an institution name. They are removed
# after the loop rather than dropping rows from the data frame while iterating over it.
institution_heading_rows = []

# Loop through each row to see if has the name of an institution
for index, row in raw_data.iterrows():
    if row['INSTITUTION'] != '':
        # Get the institution name
        next_institution = row['INSTITUTION'].strip()
        print(next_institution)
        institution_heading_rows.append(index)
    else:
        # Author row: fill in the institution from the most recent heading row.
        raw_data.at[index, 'INSTITUTION'] = next_institution

# Remove the institution heading rows from the data frame
raw_data.drop(institution_heading_rows, inplace=True)

# Save the results to a CSV file
raw_data.to_csv('authors.csv', index=False)

print('done')

## Script to search ORCID for an author's ORCID ID

Note: this does not perform a search for the person without specifying any institutional information. This will cause some people to be missed. However, without the institutional affiliation, the probability of getting the wrong person goes way up. Since the point of this part of the search process is to get reliable globally unique identifiers for the person, if ORCID doesn't have the institutional affiliation for the person being searched for, it's probably better to just do the search with affiliation when searching the Elsevier API to try to find their Scopus ID.

In [None]:
# Read the institution identifiers and the tidy author table
institutions_df = csv_read('institutional_identifiers.csv')
authors_df = csv_read('authors.csv')

# Every author starts out without an ORCID
authors_df['ORCID'] = ''

# Work through the institutions one at a time
for index, row in institutions_df.iterrows():
    # Use this for testing (one hospital only)
    #if index != 11:
    #    continue
    print(row['name'])

    # Collect whichever institutional identifiers are available as keyword arguments
    # for the ORCID API query function.
    kwargs = {}
    for keyword, column_name in (('ringgold', 'ringgold'), ('ror', 'ror_id'), ('grid', 'grid')):
        if row[column_name] != '':
            kwargs[keyword] = row[column_name]

    # Text labels to search on: the ROR label is always included, the raw name is added
    # when it differs from the ROR label, and the Wikidata label is added when it exists
    # and differs from the ROR label.
    search_labels = [row['ror_label']]
    if row['ror_label'] != row['name']:
        search_labels.append(row['name'])
    if not (row['wikidata_label'] == '' or row['wikidata_label'] == row['ror_label']):
        search_labels.append(row['wikidata_label'])
    kwargs['text'] = search_labels
    # print(kwargs)

    # Look up each author whose institution is the one being processed
    n_authors = 0
    for author_index, author_row in authors_df.iterrows():
        if author_row['INSTITUTION'] != row['name']:
            continue
        n_authors += 1

        first_name = author_row['FIRST'].strip()
        middle_name = author_row['MIDDLE'].strip()
        last_name = author_row['LAST'].strip()

        # NOTE: The wildcard lets the first name match longer versions of the first name
        # and first names followed by middle names.
        candidates = query_orcid_api(first_name + '*', last_name, **kwargs)

        if len(candidates) == 0:
            # No results found; leave the ORCID empty.
            continue

        if len(candidates) == 1:
            # Unambiguous hit
            authors_df.at[author_index, 'ORCID'] = candidates[0]
            continue

        # More than one hit: query again with the first name and middle name
        narrowed = query_orcid_api(first_name + ' ' + middle_name, last_name, **kwargs)
        if len(narrowed) == 1:
            authors_df.at[author_index, 'ORCID'] = narrowed[0]
            continue

        # Still ambiguous. Report the narrowed results if there are any; otherwise nothing was
        # found when the middle name was added, so report the results of the first query.
        if len(narrowed) > 1:
            unresolved = narrowed
        else:
            unresolved = candidates
        print(first_name, middle_name, last_name)
        print('Multiple results found. Need to disambiguate manually.')
        print(unresolved)
        print()

    if n_authors == 0:
        print('Warning! No authors found for', row['name'])

# Write the author table with the ORCIDs that were found
authors_df.to_csv('authors_with_orcids.csv', index=False)

print('done')

## Search for an author in the Elsevier API



First look for SCOPUS IDs for authors that have ORCIDs

In [None]:
# !!! This code was modified after the find_author_at_elsevier function was changed to support multiple results.
# It has not been tested yet.

# NOTE: At https://dev.elsevier.com/api_key_settings.html
# the rate limits for Author search is 5000 requests per week and 2 requests per second.
requests_per_second = 2

# Load the authors data from the CSV file
authors_df = csv_read('authors_with_orcids.csv')

# Make sure the Scopus ID column exists (as empty strings) even if no Scopus ID is found.
if 'SCOPUS_ID' not in authors_df.columns:
    authors_df['SCOPUS_ID'] = ''

# Loop through the authors and look up the scopus ID using the ORCID
for index, row in authors_df.iterrows():
    if row['ORCID'] == '':
        continue
    print(row['FIRST'], row['MIDDLE'], row['LAST'])
    results, remaining_queries, reset_date_string = find_author_at_elsevier_by_orcid(row['ORCID'])

    if not results:
        # Covers both None and an empty list of results.
        print('No results found.')
    elif len(results) > 1:
        print('Multiple results found. Need to disambiguate manually.')
        print(results)
        authors_df.at[index, 'SCOPUS_ID'] = str(results)
    else:
        scopus_id = results[0]['scopus_id']
        print(scopus_id)
        authors_df.at[index, 'SCOPUS_ID'] = scopus_id

    # Save the results to a CSV file after each author to avoid having to start over if there is an error.
    authors_df.to_csv('authors_with_orcids_and_scopus_ids.csv', index=False)

    # Delay after every lookup, whatever its outcome, to avoid hitting the API too fast.
    time.sleep(1/requests_per_second)

# Save once more so that the output file exists even if no author had an ORCID to look up.
authors_df.to_csv('authors_with_orcids_and_scopus_ids.csv', index=False)

print('done')

I thought that to search effectively, we need to find the Elsevier affiliation codes for the institutions. However, after experimenting and looking at the results, there were so many duplicate IDs with the same name (some with a significant number of publications) that there would be a lot of false negatives if the search were done with only one.

So I've decided that the safest thing is to do the author search using all of the labels we have ORed together. That might result in some false positives, but those could probably be checked out by getting the author field codes or descriptions and seeing if they listed something OTO related or not.

In [None]:
# DID NOT END UP USING THIS CODE

# Add the Elsevier affiliation code to the institutions data frame

# The Affiliation Retrieval API allows 5000 requests per week and 6 requests per second.
requests_per_second = 6

# Load the institutions data from the CSV file
institutions_df = csv_read('institutional_identifiers.csv')

# For testing, use only the first 3 rows.
# The explicit copy makes this an independent data frame, so adding columns below
# does not trigger pandas' SettingWithCopyWarning.
institutions_df = institutions_df.head(3).copy()

# Add columns to the institutions data frame to hold the Elsevier affiliation ID, the name as it appears in Elsevier, 
# and count of documents for that institution.
for column_name in ['elsevier_affiliation_id', 'elsevier_name', 'elsevier_document_count']:
    institutions_df[column_name] = ''

# Step through each institution and search for it at Elsevier
for index, row in institutions_df.iterrows():
    print(row['ror_label'])
    affiliation_id, remaining_queries, reset_date_string = find_affiliation_ids_at_elsevier(row['ror_label'])
    institutions_df.at[index, 'elsevier_affiliation_id'] = affiliation_id['id']
    institutions_df.at[index, 'elsevier_name'] = affiliation_id['name']
    institutions_df.at[index, 'elsevier_document_count'] = affiliation_id['document_count']
    # Delay to avoid hitting the API too fast.
    time.sleep(1/requests_per_second)

    # Save the results to a CSV file after each institution to avoid having to start over if there is an error.
    institutions_df.to_csv('institutional_identifiers_with_elsevier.csv', index=False)

print('done')


Search for authors by name and institutional name strings.

Note: this block of code was run again at the end to look for authors without using institutional affiliation as a search term. This results in a more permissive search that finds more of the authors, but is also prone to false positives.

In [None]:
# Add a flag to determine if searches should be done with institutional affiliation (first pass=True, second pass=False)
search_with_institution = True

# ----------------------------------------------
# Construct a dictionary of unique institutional names
# ----------------------------------------------

# Scopus Affiliation Search Guide: https://dev.elsevier.com/sc_affil_search_tips.html
# API playground: https://dev.elsevier.com/scopus.html#!/Affiliation_Search/AffiliationSearch

if search_with_institution:
    # Load the institutions data from the CSV file
    institutions_df = csv_read('institutional_identifiers.csv')

    # Alphabetized list of the unique raw institution names
    name_list = sorted(set(institutions_df['name'].tolist()))
    #print(name_list)
    #print(len(name_list))

    # Collect every label (raw name, ROR label, Wikidata label) for each name in a single pass through the rows.
    labels_by_name = {name: set() for name in name_list}
    for index, row in institutions_df.iterrows():
        labels_by_name[row['name']].update(
            [row['name'], row['ror_label'], row['wikidata_label']])

    # Create a dictionary to hold the list of labels for each name.
    # Empty labels are left out so that they do not end up as "" terms in the search string,
    # and the labels are sorted so that the search string is the same from run to run
    # (set ordering of strings varies between Python sessions, which would change the request URL).
    name_label_dict = {}
    for name in name_list:
        name_label_dict[name] = sorted(
            label for label in labels_by_name[name]
            if isinstance(label, str) and label != '')
    #print(name_label_dict)

# ---------------------------------------------
# Author search
# ---------------------------------------------

# Author search guide: https://dev.elsevier.com/sc_author_search_tips.html
# Search returns these fields: dc:identifier,affiliation-current,preferred-name,orcid,subject-area,document-count

# NOTE: At https://dev.elsevier.com/api_key_settings.html
# the rate limits for Author search is 5000 requests per week and 2 requests per second.
requests_per_second = 2

# Open the author CSV file and choose the output file for this pass
if search_with_institution:
    authors_df = csv_read('authors_with_orcids_and_scopus_ids.csv')
    output_filename = 'authors_full_results.csv'
else:
    # This file is the output of the following cell after the first pass of this cell is run.
    authors_df = csv_read('authors_full_results_transfer.csv')
    output_filename = 'authors_full_results_no_affil.csv'

# For testing, use only the first 3 rows
#authors_df = authors_df.head(20)

# Add columns for Scopus name, affiliation, document count, and subject area
if search_with_institution: # These columns only need to be added for the first pass. They are already there in the second pass.
    authors_df['SCOPUS_NAME'] = ''
    authors_df['AFFILIATION'] = ''
    authors_df['DOCUMENT_COUNT'] = ''
    authors_df['SUBJECT_AREA'] = ''
    authors_df['MULTIPLE_RESULTS'] = ''
    multiple_results_column = 'MULTIPLE_RESULTS'
else:
    # Add column for multiple results when no affiliation is provided
    authors_df['MULTIPLE_RESULTS_NO_AFFIL'] = ''
    multiple_results_column = 'MULTIPLE_RESULTS_NO_AFFIL'

# These are reported at the end. They stay None if every author is skipped and no query is made.
remaining_queries = None
reset_date_string = None

# Step through each author and search for them at Elsevier
for index, row in authors_df.iterrows():
    given_name = row['FIRST']
    middle_name = row['MIDDLE']
    family_name = row['LAST']
    print(given_name, middle_name, family_name)

    # If the author already has a Scopus ID, skip them
    if row['SCOPUS_ID'] != '':
        print('Skipping because they already have a Scopus ID.')
        print()
        continue

    if search_with_institution:
        # Build the affiliation search string: all of the labels for the institution, quoted and ORed together.
        # If the institution is not in the dictionary, fall back to searching on its raw name.
        affiliation_to_search = row['INSTITUTION']
        affiliation_labels = name_label_dict.get(affiliation_to_search, [affiliation_to_search])
        search_term = ' OR '.join('"' + name + '"' for name in affiliation_labels)
    else:
        search_term = ''

    if middle_name == '':
        scopus_author_results, remaining_queries, reset_date_string = find_author_at_elsevier_by_names(family_name, given_name, search_term)
    else:
        scopus_author_results, remaining_queries, reset_date_string = find_author_at_elsevier_by_names(family_name, given_name, search_term, middle_name=middle_name)
    print(scopus_author_results)
    print()

    # No results (None or an empty list): nothing to record for this author.
    if not scopus_author_results:
        # Delay to avoid hitting the API too fast.
        time.sleep(1/requests_per_second + 0.1)
        continue

    # If there is a single result, add the data to the authors data frame.
    if len(scopus_author_results) == 1:
        single_result = scopus_author_results[0]
        authors_df.at[index, 'SCOPUS_ID'] = single_result['scopus_id']
        authors_df.at[index, 'SCOPUS_NAME'] = single_result['name']
        authors_df.at[index, 'AFFILIATION'] = single_result['affiliation']
        authors_df.at[index, 'DOCUMENT_COUNT'] = single_result['document_count']
        authors_df.at[index, 'SUBJECT_AREA'] = single_result['subject_area']
        # Don't overwrite the ORCID if it is already there.
        if row['ORCID'] == '':
            authors_df.at[index, 'ORCID'] = single_result['orcid']

    # If there are multiple results, just put the results in the multiple results column in the CSV file.
    else:
        authors_df.at[index, multiple_results_column] = json.dumps(scopus_author_results)

    # Save the results to a CSV file after each author to avoid having to start over if there is an error.
    authors_df.to_csv(output_filename, index=False)

    # Delay to avoid hitting the API too fast.
    time.sleep(1/requests_per_second + 0.1)

# Save once more so that the output file exists even if no search returned a result.
authors_df.to_csv(output_filename, index=False)

print('Remaining queries:', remaining_queries)
print('Reset date:', reset_date_string)
print('done')

This cell is run after the first pass using the cell above with institutional affiliation. It is not run after the second pass.

It appears that the first of multiple results contains most of the publications, with second and beyond only containing one or a few. So copy the data from the first result into the cells of the table.

In [None]:
# Open the author CSV file
authors_df = csv_read('authors_full_results.csv')

# Test with first 15 rows
#authors_df = authors_df.head(15)

# Columns of the author table paired with the key of the search result that fills each one.
column_result_keys = (
    ('SCOPUS_ID', 'scopus_id'),
    ('SCOPUS_NAME', 'name'),
    ('AFFILIATION', 'affiliation'),
    ('DOCUMENT_COUNT', 'document_count'),
    ('SUBJECT_AREA', 'subject_area'),
)

# Loop through the rows and extract the fields from the first item in the MULTIPLE_RESULTS column.
for index, row in authors_df.iterrows():
    if row['MULTIPLE_RESULTS'] == '':
        continue

    # !!!!! Since I originally forgot to use json.dumps(), substitute single quotes for double quotes so that the string can be converted to a dictionary.
    # Also, replace None with an empty string. If rerun in the future, comment out this line.
    #converted = row['MULTIPLE_RESULTS'].replace("{'", '{"').replace("':", '":').replace(", '", ', "').replace("'}", '"}').replace(": '", ': "').replace("',", '",').replace('None', '""')
    #multiple_results = json.loads(converted)
    multiple_results = json.loads(row['MULTIPLE_RESULTS'])

    # An empty list (stored as "[]") has no first result to copy, so leave the row as it is.
    if not multiple_results:
        continue

    first_result = multiple_results[0]
    for column_name, result_key in column_result_keys:
        authors_df.at[index, column_name] = first_result[result_key]
    # Don't overwrite the ORCID if it is already there.
    if row['ORCID'] == '':
        authors_df.at[index, 'ORCID'] = first_result['orcid']

# Save the results to a CSV file.
authors_df.to_csv('authors_full_results_transfer.csv', index=False)

print('done')

## Find duplicate Scopus IDs, which will crash the next part of the script

In [None]:
import collections

# Find duplicate Scopus IDs.
# The IDs are read as strings with empty cells kept as '' (the same way the h-index cell reads this file),
# so that IDs are not converted to floating point numbers when the column contains blanks
# and so that the duplicates reported here are the same values that cell uses as its index.
scopus_id_worksheet_df = pd.read_csv('scopus_id_worksheet.csv', na_filter=False, dtype=str)

# List of Scopus IDs, leaving out rows that have none
scopus_id_list = [
    scopus_id for scopus_id in scopus_id_worksheet_df['SCOPUS_ID'].tolist()
    if scopus_id != ''
]

# Find duplicates (IDs that occur more than once)
duplicates = [item for item, count in collections.Counter(scopus_id_list).items() if count > 1]

# Print duplicates
print(duplicates)




# h-index for the author

If the author is found, use the author metrics API to get the h-index for the author.



In [None]:
import pandas as pd
import numpy as np

requests_per_second = 2 # Limit for Author Search API

# Number of Scopus IDs sent in one metrics request.
# Despite what the API says about a limit of 200, for the metrics, the limit is 100.
batch_size = 100

# Open the scopus_id_worksheet CSV file
scopus_id_worksheet_df = pd.read_csv('scopus_id_worksheet.csv', na_filter=False, dtype=str)
#scopus_id_worksheet_df = scopus_id_worksheet_df.head(20) # Test with first 20 rows

# Add columns to the dataframe for the metrics.
# H_INDEX is created here so that it exists even if no metrics are returned.
scopus_id_worksheet_df['METRICS'] = ''
scopus_id_worksheet_df['H_INDEX'] = ''
# Set the scopus_id column as the index, leaving the column intact.
scopus_id_worksheet_df = scopus_id_worksheet_df.set_index('SCOPUS_ID', drop=False)


def _record_h_indexes(scopus_ids):
    """Retrieve the metrics for a batch of Scopus IDs and store each author's h-index in the dataframe."""
    author_data = get_metrics_from_elsevier_author_api(scopus_ids)
    for author in author_data:
        # The dataframe is indexed by Scopus ID as a string.
        scopus_id_worksheet_df.at[str(author['scopus_id']), 'H_INDEX'] = author['h_index']


# Extract the Scopus IDs from the dataframe and process them in batches.
scopus_id_list = []
row_number = 0

for index, row in scopus_id_worksheet_df.iterrows():
    row_number += 1
    if row['SCOPUS_ID'] != '':
        print(row['SCOPUS_ID'])
        scopus_id_list.append(row['SCOPUS_ID'])

    # Keep adding Scopus IDs to the list until the batch is full.
    if len(scopus_id_list) < batch_size:
        continue # Go to the next row

    # The batch is full, so retrieve the metrics for its Scopus IDs.
    _record_h_indexes(scopus_id_list)

    # Start a new batch.
    scopus_id_list = []
    print(row_number)

    # Delay to avoid hitting the API too fast.
    time.sleep(1/requests_per_second + 0.1)

# Handle the last, partly filled batch. Skip the API call if there is nothing left to look up.
if scopus_id_list:
    _record_h_indexes(scopus_id_list)

# Save the results to a CSV file.
scopus_id_worksheet_df.to_csv('scopus_id_worksheet_with_metrics.csv', index=False)

print('done')



In [None]:
# Ad hoc check: call the single-author metrics function for one hard-coded Scopus ID.
# In a notebook, the returned value is displayed as the output of the cell.
get_single_metric_from_elsevier_author_api('58195276000')

# Code to find number of remaining Elsevier API calls

This cell was developed for testing and parts of its code were used to provide the remaining calls after completion of the cell that searches for authors.

In [None]:
# Query the Scopus Author Search API to find out how many API calls are left for the week and when the week resets.
# https://dev.elsevier.com/api_key_settings.html

# Make a minimal search to get some response headers
#response = requests.get('https://api.elsevier.com/content/search/author?query=authlast(Anderson)&apiKey=' + api_key)
api_key = load_credential('elsevier-api-key.txt', 'home')
inst_token = load_credential('elsevier-inst-token.txt', 'home')

header_parameters = {
    'Accept': 'application/json',
    'X-ELS-APIKey': api_key,
    'X-ELS-Insttoken': inst_token
}
# See https://dev.elsevier.com/sc_author_search_views.html for the list of possible fields
query_parameters = {
    'query': 'authlast(Baskauf)',
    'field': 'dc:identifier,affiliation-current,preferred-name,orcid,subject-area,document-count'
}
endpoint_resource_url = 'https://api.elsevier.com/content/search/author'
response = requests.get(endpoint_resource_url, headers=header_parameters, params=query_parameters)

# Report the quota information from the response headers.
# .get() is used so that a missing header shows up as None instead of raising a KeyError.
print('Status code:', response.status_code)
remaining_queries = response.headers.get('X-RateLimit-Remaining')
print('Remaining queries:', remaining_queries)

# NOTE(review): the reset header is expected to hold epoch seconds -- confirm against the API documentation.
reset_epoch = response.headers.get('X-RateLimit-Reset')
reset_date_string = reset_epoch
if reset_epoch is not None and reset_epoch.isdigit():
    try:
        reset_date_string = datetime.datetime.fromtimestamp(int(reset_epoch)).isoformat()
    except (OverflowError, OSError, ValueError):
        # The value is not a usable timestamp, so report it as received.
        reset_date_string = reset_epoch
print('Reset date:', reset_date_string)



## Generate an ORCID access token from credentials

The token created here is long-lived, so this only needs to be run once. In many cases, read-only functions at the ORCID API don't seem to require it. But it's probably better to use it.

In [None]:
# !!!!!!!!!!!!!!!!!!!!!!!
# RUN THIS CELL ONLY ONCE
# !!!!!!!!!!!!!!!!!!!!!!!

# Get an access token
# See https://info.orcid.org/documentation/api-tutorials/api-tutorial-searching-the-orcid-registry/
# "Obtain a search token" section for details.

# NOTE: The access token is long-lived (approximately 20 years). It can be used for multiple queries.
# I suppose there is a way to revoke it if it is compromised, but I don't know how to do that yet.

# The credentials file should be plain text with the client ID on the first line and the client secret on the second line.
filename = 'orcid_client_credentials.txt'
# gets path to home directory; works for both Win and Mac
home = str(Path.home())
credential_path = home + '/' + filename

# Only a missing file is reported as "not found"; other errors are allowed to surface with their own message.
try:
    with open(credential_path, 'rt', encoding='utf-8') as file_object:
        cred = file_object.read()
except FileNotFoundError:
    print(filename + ' file not found - is it in your home directory?')
    sys.exit()

# Check the file layout explicitly so that a malformed file is not misreported as a missing file.
lines = cred.split('\n')
if len(lines) < 2 or not lines[0].strip() or not lines[1].strip():
    print(filename + ' must have the client ID on the first line and the client secret on the second line')
    sys.exit()
# Strip stray whitespace (e.g. trailing spaces or carriage returns) that would corrupt the credentials.
client_id = lines[0].strip()
client_secret = lines[1].strip()

url = 'https://orcid.org/oauth/token'
headers = {
    'Accept': 'application/json',
    'Content-Type': 'application/x-www-form-urlencoded'
}
data = {
    'client_id': client_id,
    'client_secret': client_secret,
    'grant_type': 'client_credentials',
    'scope': '/read-public'
}

# Bypass the HTTP cache installed at the top of the notebook (it also caches POST requests), so that the
# response containing the token is not written to the cache file in the working directory and so that
# a stale cached response is never used in place of a real token request.
with requests_cache.disabled():
    response = requests.post(url, headers=headers, data=data, timeout=30)
print(response.status_code)
# print(response.text) # If you uncomment this, make sure that you don't upload this notebook to GitHub without clearing the output.

# Stop with a clear message instead of failing later with a KeyError on the missing token.
if response.status_code != 200:
    print('token request failed - no access token was saved')
    sys.exit()

data = response.json()
access_token = data['access_token']
# print(access_token)

# Save the access token to a file in the home directory.
with open(home + '/orcid_access_token.txt', 'wt', encoding='utf-8') as file_object:
    file_object.write(access_token)

print('access token saved')
