# Scripts associated with Otolaryngology network analysis

Settings, function definitions, global variables, etc.

In [None]:
# (c) 2023 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# The sparqler class is (c) 2023 Steve Baskauf and is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf

# ----------------
# Module imports
# ----------------

from typing import List, Dict, Tuple, Any, Optional
#import yaml
import sys
import time
#import csv
import datetime
from pathlib import Path
import json
import pandas as pd
import requests
import requests_cache
from fuzzywuzzy import fuzz  # fuzzy logic matching
# import re # regex
import logging  # See https://docs.python.org/3/howto/logging.html

# Set up a cache for HTTP requests.
# The cache is installed for the requests module as a whole, is stored in a SQLite database named 'http_cache',
# and covers both GET and POST requests. Cached responses expire after 300 (seconds, per the requests_cache documentation).
requests_cache.install_cache(
    'http_cache', backend='sqlite', expire_after=300, allowable_methods=['GET', 'POST'])

# Set up the log for warnings.
# filemode='w' starts a fresh warnings.log each time logging is configured, and only the bare message text is recorded.
# This is a system file and hard to look at, so its data are harvested and put into a plain text log file later.
logging.basicConfig(filename='warnings.log', filemode='w',
                    format='%(message)s', level=logging.WARNING)

# Fuzzy match scores below this value are treated as "no match" for an institution.
INSTITUTION_NO_MATCH_CUTOFF = 80
# Fuzzy match scores at or above the no-match cutoff but below this value are accepted as a match that is flagged for review.
INSTITUTION_REVIEW_CUTOFF = 90

# ----------------
# Utility functions
# ----------------


def load_credential(filename: str, directory: str) -> str:
    """Load a credential string from a plain text file.

    The credential is expected to be a single line of text. Leading and trailing whitespace (including a
    trailing newline added by a text editor) is removed so that the value can be used safely in an HTTP header.
    Keeping the credential in the home directory prevents accidentally exposing the credential if the
    directory containing the script is shared.

    Args:
        filename: The name of the file containing the credential.
        directory: The directory where the file is located. Value of 'home' loads from the home directory.
            For any other value, the filename argument is an absolute or relative path to the file.

    Returns:
        A string containing the credential. If the credential file cannot be read, a message is printed
        and an empty string is returned.
    """
    if directory == 'home':
        # Path.home() works for both Win and Mac.
        credential_path = Path.home() / filename
        location = 'home'
    else:
        credential_path = Path(filename)
        # Any value other than 'home' is reported as the working directory in the message below.
        location = 'working'

    try:
        cred = credential_path.read_text(encoding='utf-8')
    except (OSError, UnicodeError):
        # Only file access/decoding problems are treated as "no credential"; anything else
        # (e.g. a keyboard interrupt) is allowed to propagate.
        print(filename + ' credentials file not found in ' +
              location + ' directory.')
        return ''
    return cred.strip()


def csv_read(path: str, **kwargs) -> pd.DataFrame:
    """Read a CSV file into a Pandas DataFrame in which every cell is a string.

    Blank cells are kept as empty strings instead of being converted to missing values.

    Keyword argument:
    rows -- when supplied, only that many rows from the top of the table are returned (as an independent copy).
            Intended for testing. When omitted, the whole table is returned.
    """
    table = pd.read_csv(path, dtype=str, na_filter=False)
    if 'rows' not in kwargs:
        return table
    row_limit = kwargs['rows']
    return table.head(row_limit).copy(deep=True)


def look_up_alternative_institutional_ids(ror_iri: str) -> dict:
    """Look up alternative institutional IDs for a ROR ID using the Wikidata Query Service.

    Only the first row of the query results is used.

    Args:
        ror_iri: The ROR ID for the institution in IRI form.

    Returns:
        dict: A dictionary with the keys 'label', 'ror', 'ringgold', 'grid', and 'qid'. Identifiers that
        Wikidata does not have are empty strings. If the query returns no rows, or the response could not
        be parsed, every value (including 'ror') is an empty string.
    """
    # Extract the ROR ID (local name) from the IRI
    ror_id = ror_iri.split('/')[-1]

    query_string = '''SELECT DISTINCT ?qid ?ringgold ?grid ?label WHERE {
    ?qid wdt:P6782 "''' + ror_id + '''".
    ?qid rdfs:label ?label.
    FILTER (LANG(?label) = "en")
    OPTIONAL { ?qid wdt:P3500 ?ringgold. }
    OPTIONAL { ?qid wdt:P2427 ?grid. }
    }
'''

    # put your own script name and email address here
    user_agent = 'id_lookup/0.1 (mailto:steve.baskauf@vanderbilt.edu)'
    wdqs = Sparqler(useragent=user_agent)
    data = wdqs.query(query_string)
    #print(json.dumps(data, indent=2))

    # Handle the case where no results are returned. The query method returns None (not an empty list)
    # when the response cannot be parsed as JSON, so test for both.
    if not data:
        return {
            'label': '',
            'ror': '',
            'ringgold': '',
            'grid': '',
            'qid': ''
        }

    first_row = data[0]

    def bound_value(variable: str) -> str:
        """Return the value bound to a query variable in the first row, or '' if it is unbound."""
        if variable in first_row:
            return first_row[variable]['value']
        return ''

    return {
        'label': bound_value('label'),
        'ror': ror_iri,
        'ringgold': bound_value('ringgold'),
        'grid': bound_value('grid'),
        # The qid is returned as an IRI; keep only the local name (Q ID).
        'qid': bound_value('qid').split('/')[-1]
    }

# ----------------
# ROR API functions
# ----------------

# ORCID supports Ringgold, GRID, and ROR identifiers.
# https://www.ringgold.com/ Limited to 10 searches per day
# https://www.grid.ac/institutes GRID discontinued public releases at the end of 2021
# https://ror.org/ ROR is now the principal identifier for organizations

# ROR documentation: https://ror.readme.io/
# ROR API documentation: https://ror.readme.io/docs/rest-api
# ROR API endpoint URL: https://api.ror.org/organizations


def search_for_institution_id(institution: str, query_type: str) -> List[Dict]:
    """Search for the ROR ID for an institution using the ROR API.

    Args:
        institution: The name of the institution to search for.
        query_type: The type of query to perform. Must be one of 'query' or 'affiliation'.

    Returns:
        A list of dictionaries for possible institution matches, each with the keys 'name' and 'id'.
        An empty list is returned (after printing a message) if the query type is unknown or the
        API does not return a 200 status code.
    """
    # ROR API endpoint
    ror_api_endpoint = 'https://api.ror.org/organizations'

    if query_type not in ('query', 'affiliation'):
        print(f'Error: Unknown query type: {query_type}')
        return []

    # The query type is used directly as the name of the query parameter.
    ror_api_params = {query_type: institution}

    # Send the request to the ROR API
    ror_api_response = requests.get(ror_api_endpoint, params=ror_api_params)

    status_code = ror_api_response.status_code
    if status_code != 200:
        print('Error: ROR API returned status code', status_code)
        return []

    # Get the list of matched items from the JSON response
    items = ror_api_response.json().get('items', [])

    results = []
    for item in items:
        # Affiliation searches wrap each organization record in an 'organization' key alongside the
        # match metadata; generic 'query' searches return the organization records directly.
        record = item.get('organization', item)
        # NOTE(review): this expects the record to carry a top-level 'name'. Newer ROR schema versions
        # are reported to use a 'names' list instead -- confirm which version the endpoint serves.
        results.append({'name': record['name'], 'id': record['id']})
    return results


def fuzzy_match_institutions(institution_name: str, search_results: List[Dict]) -> Tuple:
    """Pick the search result whose name best matches an institution name.

    Every search result is scored with two fuzzy matching algorithms (WRatio and token_set_ratio).
    The best WRatio result is the one returned; the token_set_ratio winner is only used as a cross-check.
    When the two algorithms disagree about the best result, the disagreement is printed and logged.

    Args:
        institution_name: The name of the institution to match.
        search_results: A list of dictionaries with the name and id of the institution.

    Returns:
        A tuple with the top WRatio match dictionary, its score, and a flag that is True when the
        two algorithms picked different results. With no search results this is ({}, 0, False).
    """
    best_by_w_ratio, best_w_score = {}, 0
    best_by_token_set, best_token_score = {}, 0

    for candidate in search_results:
        candidate_name = candidate['name']

        # Score the candidate with both algorithms
        w_score = fuzz.WRatio(institution_name, candidate_name)
        token_score = fuzz.token_set_ratio(institution_name, candidate_name)

        # Strict comparisons: on a tie the earlier candidate is kept.
        if w_score > best_w_score:
            best_by_w_ratio, best_w_score = candidate, w_score
        if token_score > best_token_score:
            best_by_token_set, best_token_score = candidate, token_score

    # Nothing to report when both algorithms agree on the winner.
    if best_by_w_ratio == best_by_token_set:
        return best_by_w_ratio, best_w_score, False

    # The algorithms disagree: report to the console and to the warnings log.
    print('Warning: Top w_ratio match is not the top token_set_ratio match for', institution_name)
    logging.warning(
        'Top w_ratio match is not the top token_set_ratio match for ' + institution_name)
    print('Top w_ratio match:', best_w_score, best_by_w_ratio)
    logging.warning(f'Top w_ratio match: {best_w_score} {best_by_w_ratio}')
    print('Top token_set_ratio match:', best_token_score, best_by_token_set)
    logging.warning(
        f'Top token_set_ratio match: {best_token_score} {best_by_token_set}')
    logging.warning('')

    # The WRatio winner is returned even when flagged.
    return best_by_w_ratio, best_w_score, True

# ----------------
# Elsevier API functions
# ----------------

def find_author_at_elsevier(query_string: str) -> Optional[str]:
    """Find an author's SCOPUS ID using the Elsevier Author Search API.

    Args:
        query_string: An author search query, for example 'ORCID(0000-0000-0000-0000)'.

    Returns:
        The SCOPUS ID as a string when exactly one author is found. None when the API reports an
        error, no author is found, or more than one author is found. An empty string when the
        API key or institutional token could not be loaded (the search is not performed).
    """
    # API specification landing page: https://dev.elsevier.com/api_docs.html
    # For author search info, see https://dev.elsevier.com/documentation/AuthorSearchAPI.wadl
    # General search tips are at: https://dev.elsevier.com/sc_author_search_tips.html
    # ORCID is one of the possible field restrictions.

    # Endpoint for author search
    endpoint_resource_url = 'https://api.elsevier.com/content/search/author'

    # NOTE: I was able to generate the API key myself using the Elsevier Developer Portal. However, for some reason it gave me access to the
    # Scopus API, but not the Author Search API. I had to request access to the Author Search API from Elsevier support and they gave me
    # an institutional token that is an additional requirement for me to get the Author Search API to work.

    api_key = load_credential('elsevier-api-key.txt', 'home')
    inst_token = load_credential('elsevier-inst-token.txt', 'home')
    if api_key == '' or inst_token == '':
        print('Error: API key or institutional token not found. Search not performed.')
        return ''

    header_parameters = {
        'Accept': 'application/json',
        'X-ELS-APIKey': api_key,
        'X-ELS-Insttoken': inst_token
    }
    # See https://dev.elsevier.com/sc_author_search_views.html for the list of possible fields
    query_parameters = {
        'query': query_string,
        'field': 'dc:identifier,affiliation-current,preferred-name'
    }

    response = requests.get(endpoint_resource_url, headers=header_parameters, params=query_parameters)

    status_code = response.status_code
    if status_code != 200:
        print('Error: API returned status code', status_code)
        print(response.text)
        return None

    results_list = response.json().get('search-results', {}).get('entry', [])
    #print(json.dumps(results_list, indent=2))

    # Check for error conditions
    if len(results_list) == 0:  # This doesn't actually happen, since a single result with an error is returned where there are no hits.
        return None
    if len(results_list) > 1:
        # This function only knows the query string (the caller may not have searched by ORCID),
        # so that is what is reported.
        print('Error: Multiple results found for query', query_string)
        print(json.dumps(results_list, indent=2))
        return None

    entry = results_list[0]

    # A single entry carrying an 'error' key is how the API reports an empty result set.
    if 'error' in entry:
        if entry['error'] != 'Result set was empty':
            # An unexpected error entry has no identifier to extract, so report it rather than crash.
            print('Error: API reported', entry['error'])
        return None

    # Extract the SCOPUS ID from the result; the identifier has a prefix ending in a colon.
    return entry['dc:identifier'].split(':')[-1]

def find_author_at_elsevier_by_orcid(orcid: str) -> Optional[str]:
    """Look up an author's SCOPUS ID at Elsevier, searching by the author's ORCID.

    The ORCID is wrapped in the ORCID() field restriction and handed to find_author_at_elsevier,
    whose result is returned unchanged.
    """
    # Field restrictions are listed at https://dev.elsevier.com/sc_author_search_tips.html
    return find_author_at_elsevier('ORCID(' + orcid + ')')

def get_metrics_from_elsevier_author_api(scopus_author_id: str) -> Optional[str]:
    """Retrieve an author's h-index from the Elsevier SciVal author metrics API.

    Args:
        scopus_author_id: The SCOPUS ID of the author.

    Returns:
        The 'value' of the first metric whose index type is 'h-index'. An empty string is returned
        when the credentials cannot be loaded or the API returns an empty results list. None is
        returned when the API does not return a 200 status code or no h-index metric is present.
    """
    # API specification landing page: https://dev.elsevier.com/api_docs.html
    # For author metrics info, see https://dev.elsevier.com/documentation/SciValAuthorAPI.wadl
    metrics_url = 'https://api.elsevier.com/analytics/scival/author/metrics'

    # NOTE: The API key can be generated in the Elsevier Developer Portal, but in the author's experience
    # an institutional token obtained from Elsevier support was also required for the author APIs to work.
    api_key = load_credential('elsevier-api-key.txt', 'home')
    inst_token = load_credential('elsevier-inst-token.txt', 'home')
    if '' in (api_key, inst_token):
        print('Error: API key or institutional token not found. Search not performed.')
        return ''

    request_headers = {
        'Accept': 'application/json',
        'X-ELS-APIKey': api_key,
        'X-ELS-Insttoken': inst_token
    }
    request_params = {
        'metricTypes': 'HIndices',
        'byYear': False,
        'authors': scopus_author_id
    }
    response = requests.get(metrics_url, headers=request_headers, params=request_params)

    if response.status_code != 200:
        print('Error: API returned status code', response.status_code)
        print(response.text)
        return None

    author_results = response.json()['results']
    #print(json.dumps(author_results, indent=2))

    if len(author_results) == 0:
        print('Error: No author metrics found for Scopus author ID', scopus_author_id)
        return ''

    # Only the first author in the results is examined; return as soon as the h-index is seen.
    for metric in author_results[0]['metrics']:
        if metric['indexType'] == 'h-index':
            return metric['value']

    print('Error: No h-index found for Scopus author ID', scopus_author_id)
    return None

# ----------------
# ORCID API functions
# ----------------


def query_orcid_api(given_name_string: str, family_name_string: str, **kwargs) -> List:
    """Query the ORCID API for a person's ORCID ID.

    The search requires both names to match and, when any institutional keyword arguments are
    given, at least one of the institutional criteria to match (they are ORed together).

    Args:
        given_name_string: The person's given name(s).
        family_name_string: The person's family name.

    Keyword arguments (all optional):
        ror -- ROR ID of the institution in full IRI form.
        ringgold -- Ringgold ID of the institution.
        grid -- GRID ID of the institution.
        email -- email domain name (the part after the @).
        name -- institution name.
        text -- list of institution name strings that are ORed together. An empty list is ignored.
        Unknown keywords are reported and ignored.

    Returns:
        A list of ORCID IDs for the matching records. The list is empty if there are no hits or if
        the API does not return a 200 status code.
    """
    # ORCID API search information: https://info.orcid.org/documentation/api-tutorials/api-tutorial-searching-the-orcid-registry/
    # ORCID FAQ on finding record holders: https://info.orcid.org/ufaqs/how-do-i-find-orcid-record-holders-at-my-institution/
    # ORCID API information on organization identifiers https://info.orcid.org/documentation/integration-guide/working-with-organization-identifiers/#Determining_your_Identifier
    # Limit is 1000 results per call, so paging is required. I did that on vb1_process_department.ipynb

    # Solr searches are supported: https://solr.apache.org/guide/6_6/the-standard-query-parser.html

    # Construct the institution part of the search string: one clause per recognized keyword argument.
    institution_clauses = []
    for key, value in kwargs.items():
        #print(key, value)

        if key == 'ror':
            # Contrary to the example, the ROR ID is the full IRI, not just the local name. It must be enclosed in quotes because it has a colon.
            clause = 'ror-org-id:"' + value + '"'
        elif key == 'ringgold':
            clause = 'ringgold-org-id:' + value
        elif key == 'grid':
            clause = 'grid-org-id:' + value
        elif key == 'email':
            clause = 'email:*@' + value

        # If quotes are not used around the search string, it does an OR search. So searching for Vanderbilt University returns 3095010 results.
        # while searching with quotes returns 7475 results. It is not clear to me what the parentheses accomplish.
        elif key == 'name':  # Documentation says exact match with name.
            clause = 'affiliation-org-name:("' + value + '")'
        # The text keyword argument has a list value. It is a list of strings that are ORed together.
        elif key == 'text':
            if len(value) == 0:
                # An empty list contributes nothing. Without this check no clause would be defined
                # for this key, and the previous key's clause would be reused (or an error raised).
                continue
            clause = ' OR '.join(
                'affiliation-org-name:"' + text + '"' for text in value)
        else:
            print('Error: unknown key', key)
            print('Not included in search string')
            continue

        institution_clauses.append(clause)

    # Construct the name part of the search string
    names_string = 'given-names:' + given_name_string + \
        ' AND family-name:' + family_name_string
    if institution_clauses:
        built_search_string = names_string + \
            ' AND (' + ' OR '.join(institution_clauses) + ')'
    else:
        built_search_string = names_string
    #print('Search string:', built_search_string)

    # Search endpoint
    endpoint_url = 'https://pub.orcid.org/v3.0/search/'

    # Header parameters
    header_parameters = {
        'Accept': 'application/json'
        # 'Accept': 'application/vnd.orcid+xml'
    }

    # Try to load an authorization token from a file in the user's home directory. If none are loaded, the query will be unauthenticated.
    # Determined the form of this parameter by setting up Bearer Token Authentication in Postman and then looking at the request headers.
    # I think this is correct because if an invalid token is sent, I get a 401 status code.

    # NOTE: As of 2023-04-06 there are no errors for a valid but unauthenticated search. All of the instructions show how to authenticate.
    # So at some point in the future authentication may be required.
    access_token = load_credential('orcid_access_token.txt', 'home')
    if access_token != '':
        header_parameters['Authorization'] = 'Bearer ' + access_token

    # Query parameters
    # Example name search q=family-name:Haak+AND+given-names:Laurel+AND+digital-object-ids:%2210.1087/20120404%22
    # The 'fl' field specification parameter seems to only work for CSV format. For JSON, only the ORCID ID is returned.
    query_parameters = {
        # 'fl': 'orcid-identifier,given-names,family-name',
        'q': built_search_string
    }

    # Make the request
    response = requests.get(
        endpoint_url, headers=header_parameters, params=query_parameters)
    # print(response.url)

    status_code = response.status_code
    if status_code != 200:
        print('Error: API returned status code', status_code)
        print(response.text)
        return []

    data = response.json()
    #print(json.dumps(data, indent=2))

    # Extract the number of hits
    num_hits = data['num-found']
    #print('Number of hits:', num_hits)
    if num_hits == 0:
        return []

    # Extract the ORCID IDs
    return [result['orcid-identifier']['path'] for result in data['result']]

# ----------------
# Wikidata Query Service functions
# ----------------


class Sparqler:
    """Build SPARQL queries of various sorts

    Parameters
    -----------
    method: str
        Possible values are "post" (default) or "get". Use "get" if read-only query endpoint.
        Must be "post" for update endpoint.
    endpoint: URL
        Defaults to Wikidata Query Service if not provided.
    useragent : str
        Required if using the Wikidata Query Service, otherwise optional.
        Use the form: appname/v.v (URL; mailto:email@domain.com)
        See https://meta.wikimedia.org/wiki/User-Agent_policy
    session: requests.Session
        If provided, the session will be used for all queries. Note: required for the Commons Query Service.
        If not provided, a generic requests method (get or post) will be used.
        NOTE: Currently only implemented for the .query() method since I don't have any way to test the methods that write.
    sleep: float
        Number of seconds to wait between queries. Defaults to 0.1

    Raises
    ------
    KeyboardInterrupt
        If no useragent is given when the endpoint is the Wikidata Query Service.

    Required modules:
    -------------
    requests, datetime, time
    """

    def __init__(self, method='post', endpoint='https://query.wikidata.org/sparql', useragent=None, session=None, sleep=0.1):
        # attributes for all methods
        self.http_method = method
        self.endpoint = endpoint
        if useragent is None and self.endpoint == 'https://query.wikidata.org/sparql':
            print(
                'You must provide a value for the useragent argument when using the Wikidata Query Service.')
            print()
            # Use keyboard interrupt instead of sys.exit() because it works in Jupyter notebooks
            raise KeyboardInterrupt
        self.session = session
        self.sleep = sleep

        # Headers sent with every request; the Accept header is added when a query is made.
        self.requestheader = {}
        if useragent:
            self.requestheader['User-Agent'] = useragent

        if self.http_method == 'post':
            self.requestheader['Content-Type'] = 'application/x-www-form-urlencoded'

    def query(self, query_string, form='select', verbose=False, **kwargs):
        """Sends a SPARQL query to the endpoint.

        Parameters
        ----------
        query_string : str
            The text of the SPARQL query.
        form : str
            The SPARQL query form.
            Possible values are: "select" (default), "ask", "construct", and "describe".
        mediatype: str
            The response media type (MIME type) of the query results.
            Some possible values for "select" and "ask" are: "application/sparql-results+json" (default) and "application/sparql-results+xml".
            Some possible values for "construct" and "describe" are: "text/turtle" (default) and "application/rdf+xml".
            See https://docs.aws.amazon.com/neptune/latest/userguide/sparql-media-type-support.html#sparql-serialization-formats-neptune-output
            for response serializations supported by Neptune.
        verbose: bool
            Prints status when True. Defaults to False.
        default: list of str
            The graphs to be merged to form the default graph. List items must be URIs in string form.
            If omitted, no graphs will be specified and default graph composition will be controlled by FROM clauses
            in the query itself. 
            See https://www.w3.org/TR/sparql11-query/#namedGraphs and https://www.w3.org/TR/sparql11-protocol/#dataset
            for details.
        named: list of str
            Graphs that may be specified by IRI in a query. List items must be URIs in string form.
            If omitted, named graphs will be specified by FROM NAMED clauses in the query itself.

        Returns
        -------
        If the form is "select" and mediatype is "application/sparql-results+json", a list of dictionaries containing the data.
        If the form is "ask" and mediatype is "application/sparql-results+json", a boolean is returned.
        If the mediatype is "application/sparql-results+json" and the response cannot be parsed as JSON, None is returned.
        For other forms and mediatypes, the raw output is returned.
        The raw text of the most recent response is also stored in the response attribute.

        Notes
        -----
        To get UTF-8 text in the SPARQL queries to work properly, send URL-encoded text rather than raw text.
        That is done automatically by the requests module for GET. I guess it also does it for POST when the
        data are sent as a dict with the urlencoded header. 
        See SPARQL 1.1 protocol notes at https://www.w3.org/TR/sparql11-protocol/#query-operation        
        """
        query_form = form
        if 'mediatype' in kwargs:
            media_type = kwargs['mediatype']
        elif query_form == 'construct' or query_form == 'describe':
            media_type = 'text/turtle'
        else:
            # default for SELECT and ASK query forms
            media_type = 'application/sparql-results+json'
        self.requestheader['Accept'] = media_type

        # Build the payload dictionary (query and graph data) to be sent to the endpoint
        payload = {'query': query_string}
        if 'default' in kwargs:
            payload['default-graph-uri'] = kwargs['default']

        if 'named' in kwargs:
            payload['named-graph-uri'] = kwargs['named']

        if verbose:
            print('querying SPARQL endpoint')

        # Use the session if one was provided, otherwise the generic requests functions.
        http_client = requests if self.session is None else self.session

        start_time = datetime.datetime.now()
        if self.http_method == 'post':
            response = http_client.post(
                self.endpoint, data=payload, headers=self.requestheader)
        else:
            response = http_client.get(
                self.endpoint, params=payload, headers=self.requestheader)
        elapsed_time = (datetime.datetime.now() - start_time).total_seconds()
        self.response = response.text
        # Throttle as a courtesy to avoid hitting the endpoint too fast.
        time.sleep(self.sleep)

        if verbose:
            print('done retrieving data in', int(elapsed_time), 's')

        if query_form == 'construct' or query_form == 'describe':
            return response.text
        if media_type != 'application/sparql-results+json':
            return response.text

        try:
            data = response.json()
        except ValueError:
            # JSON decoding errors are ValueError subclasses. Only those are treated as
            # "no result" so that interrupts and unrelated errors are not silently swallowed.
            return None  # Returns no value if an error.

        if query_form == 'select':
            # Extract the values from the response JSON
            return data['results']['bindings']
        # True or False result from ASK query
        return data['boolean']


## Script to retrieve institutional identifiers from ROR

In [None]:
# Load the raw data from the CSV file
raw_data = csv_read('oto_network_analysis.csv')
# raw_data = csv_read('oto_network_analysis.csv', rows=15) # Use this for testing

# Pull the unique values from the INSTITUTION column, leaving out blank cells
# (csv_read loads blank cells as empty strings).
institutions = [
    name for name in raw_data['INSTITUTION'].unique() if name != '']
# print(institutions)

# Results are accumulated as a list of row dictionaries and the data frame is rebuilt from the list,
# because DataFrame.append is not available in pandas 2.0 and later.
result_columns = ['match_score', 'flagged', 'name', 'ror_label', 'ror_id']
result_rows = []
results_df = pd.DataFrame(columns=result_columns)

# Loop through the institutions and search for the ROR ID
for institution in institutions:
    institution_name = institution.strip()
    print(institution_name)

    # Search for the institution
    institution_search_results = search_for_institution_id(
        institution, 'affiliation')
    #print(json.dumps(institution_search_results, indent=2))

    # Start from a "no match" record; it is replaced only if a good enough match is found.
    results_dict = {
        'match_score': 0,
        'name': institution_name,
        'ror_label': '',
        'ror_id': '',
        'flagged': 'no match'
    }

    if len(institution_search_results) != 0:
        # Fuzzy match the institution name to the search results
        id_match, score, flagged_mismatch = fuzzy_match_institutions(
            institution, institution_search_results)
        # print(id_match)

        if score >= INSTITUTION_NO_MATCH_CUTOFF:  # Score high enough to be a match
            if flagged_mismatch:  # w_ratio match disagrees with token_set_ratio match
                flag = 'mismatch'
            elif score < INSTITUTION_REVIEW_CUTOFF:  # Score too low to be accepted without review
                flag = 'review'
            else:
                flag = ''
            results_dict = {
                'match_score': score,
                'name': institution_name,
                'ror_label': id_match['name'],
                'ror_id': id_match['id'],
                'flagged': flag
            }

    # Add the results to the data frame
    result_rows.append(results_dict)
    results_df = pd.DataFrame(result_rows, columns=result_columns)
    print()

    # Save the results to a CSV file after each institution in case the script crashes
    results_df.to_csv('ror_id_search_results.csv', index=False)

# Read the warnings log
# For some reason, the log is considered a binary file. So when it is read in as text,
# it contains many null characters. So they are removed from the string read from the file.
with open('warnings.log', 'rt') as file_object:
    warnings_text = file_object.read().replace('\0', '')

# Append the harvested warnings to the plain text log file. The with statement closes the
# file even if writing fails.
with open('log_error.txt', 'at', encoding='utf-8') as error_log_object:
    if warnings_text == '':
        print('No errors occurred.', file=error_log_object)
    else:
        print(warnings_text, file=error_log_object)
    print('', file=error_log_object)

print('done')


## Look up alternative identifiers in Wikidata

In [None]:
# Load the institutions data from the CSV file
institutions_df = csv_read('ror_id_search_results.csv')
# institutions_df = csv_read('ror_id_search_results.csv', rows=1) # Use this for testing

# Create the identifier columns up front so that they exist (as empty strings) even for
# institutions without a ROR ID, and even if no institution at all has one.
for column_name in ['wikidata_label', 'qid', 'ringgold', 'grid']:
    institutions_df[column_name] = ''

# Loop through the institutions and look up the alternative IDs
for index, row in institutions_df.iterrows():
    print(row['name'])
    if row['ror_id'] == '':
        # Nothing to look up for institutions that were not matched to a ROR ID.
        continue

    alt_ids = look_up_alternative_institutional_ids(row['ror_id'])
    # print(alt_ids)
    institutions_df.at[index, 'wikidata_label'] = alt_ids['label']
    institutions_df.at[index, 'qid'] = alt_ids['qid']
    institutions_df.at[index, 'ringgold'] = alt_ids['ringgold']
    institutions_df.at[index, 'grid'] = alt_ids['grid']
    # Delay to avoid hitting the API too fast.
    time.sleep(0.1)  # Delay for 0.1 seconds

# Save the results to a CSV file
institutions_df.to_csv('institutional_identifiers.csv', index=False)
print('done')


## Wrangle author spreadsheet

The author spreadsheet is not tidy data: each institution name appears on its own row, ahead of the rows listing the authors from that institution.

In [None]:
# Load the raw data from the CSV file
raw_data = csv_read('oto_network_analysis.csv')

# Institution that applies to the author rows currently being processed.
# It stays None until the first institution heading row has been seen.
next_institution = None

# Index labels of the institution heading rows. They are removed in a single step after
# the loop so that the data frame is not shrunk while it is being iterated over.
heading_rows = []

# Loop through each row to see if it has the name of an institution
# NOTE(review): the test against '' assumes csv_read returns empty cells as empty strings - confirm.
for index, row in raw_data.iterrows():
    if row['INSTITUTION'] != '':
        # Heading row: remember the institution name and mark the row for removal
        next_institution = row['INSTITUTION'].strip()
        print(next_institution)
        heading_rows.append(index)
    else:
        # Author row: it belongs to the most recently seen institution
        if next_institution is None:
            raise ValueError('Author row ' + str(index) + ' appears before any institution row.')
        raw_data.at[index, 'INSTITUTION'] = next_institution

# Remove the institution heading rows, leaving only author rows
raw_data.drop(heading_rows, inplace=True)

# Save the results to a CSV file
raw_data.to_csv('authors.csv', index=False)

print('done')

## Script to search ORCID for an author's ORCID ID

Note: this does not perform a search for the person without specifying any institutional information. This will cause some people to be missed. However, without the institutional affiliation, the probability of getting the wrong person goes way up. Since the point of this part of the search process is to get reliable globally unique identifiers for the person, if ORCID doesn't have the institutional affiliation for the person being searched for, it's probably better to just do the search with affiliation when searching the Elsevier API to try to find their Scopus ID.

In [None]:
# Read the institution identifiers and the author list from their CSV files
institutions_df = csv_read('institutional_identifiers.csv')
authors_df = csv_read('authors.csv')

# New column that receives the ORCID whenever exactly one match is found
authors_df['ORCID'] = ''

# Pairs of (keyword argument for the ORCID query, institutions column holding its value)
identifier_columns = (('ringgold', 'ringgold'), ('ror', 'ror_id'), ('grid', 'grid'))

# Step through the institutions
for index, row in institutions_df.iterrows():
    # Use this for testing (Baylor only)
    if index != 3:
        continue
    print(row['name'])

    # Build the kwargs for the ORCID API query function; identifiers that are empty are left out
    kwargs = {}
    for keyword, column in identifier_columns:
        if row[column] != '':
            kwargs[keyword] = row[column]

    # The ROR label is always searched for. The Wikidata label is added only when it
    # exists and is not the same as the ROR label.
    label_list = [row['ror_label']]
    wikidata_label = row['wikidata_label']
    if not (wikidata_label == '' or wikidata_label == row['ror_label']):
        label_list.append(wikidata_label)
    kwargs['text'] = label_list
    # print(kwargs)

    # Handle every author whose institution is the one currently being processed
    for author_index, author_row in authors_df.iterrows():
        if author_row['INSTITUTION'] != row['name']:
            continue

        # Split out the parts of the author's name
        first_name = author_row['FIRST'].strip()
        middle_name = author_row['MIDDLE'].strip()
        last_name = author_row['LAST'].strip()

        # First attempt
        # NOTE: The wildcard lets the search match the first name plus any middle names or longer versions of the first name
        orcid_results = query_orcid_api(first_name + '*', last_name, **kwargs)
        match_count = len(orcid_results)

        # No results found: nothing to record for this author
        if match_count == 0:
            continue

        # Single result found: record it in the authors data
        if match_count == 1:
            authors_df.at[author_index, 'ORCID'] = orcid_results[0]
            continue

        # Several results found: query again with the first name and middle name
        orcid_results_middle = query_orcid_api(first_name + ' ' + middle_name, last_name, **kwargs)
        middle_match_count = len(orcid_results_middle)

        # The middle name narrowed it down to a single result, so record it
        if middle_match_count == 1:
            authors_df.at[author_index, 'ORCID'] = orcid_results_middle[0]
            continue

        # Manual disambiguation is needed. Show the results of the second query if it found
        # several; if it found none, fall back to showing the results without the middle name.
        if middle_match_count > 1:
            ambiguous_results = orcid_results_middle
        else:
            ambiguous_results = orcid_results
        print(first_name, middle_name, last_name)
        print('Multiple results found. Need to disambiguate manually.')
        print(ambiguous_results)
        print()

# Write the authors with any ORCIDs found to a CSV file
authors_df.to_csv('authors_with_orcids.csv', index=False)

print('done')

## Search for an author in the Elsevier API

If the author is found, use the author metrics API to get the h-index for the author.

In [None]:
# Example ORCIDs for trying out the lookup; switch which line is commented out to change the case
orcid = '0000-0003-4365-3135' # Steve Baskauf, produces results
#orcid = '0000-0002-1393-4174' # Meha Fox, produces no results

author_id = find_author_at_elsevier_by_orcid(orcid)
print('Scopus author ID:', author_id)

# Metrics are only requested when the lookup returned an author ID
if author_id is not None:
    author_data = get_metrics_from_elsevier_author_api(author_id)
    metrics_json = json.dumps(author_data, indent=2)
    print(metrics_json)


In [None]:

# Request author metrics directly for a hard-coded author ID, without first looking the
# author up by ORCID (presumably a Scopus author ID - confirm).
author_id = '57194519977'
author_data = get_metrics_from_elsevier_author_api(author_id)
# Pretty-print the data returned by the metrics function
print(json.dumps(author_data, indent=2))


In [None]:
def find_author_at_elsevier_by_names(family_name: str, given_name: str, affiliation: str, middle_name: Optional[str] = None) -> Optional[str]:
    """Find an author SCOPUS ID at Elsevier using name and affiliation.

    Args:
        family_name: The author's family (last) name, used in the AUTHLASTNAME() clause.
        given_name: The author's given (first) name, used in the AUTHFIRST() clause.
        affiliation: The name of the author's institution, used in the AFFIL() clause.
        middle_name: Optional middle name. When it contains text, it is appended to the given name
            with a single space. None, an empty string, or only whitespace is treated as no middle name.

    Returns:
        The value returned by find_author_at_elsevier for the constructed query string.
    """
    # General search tips are at: https://dev.elsevier.com/sc_author_search_tips.html

    # Author data loaded from a spreadsheet has an empty string when there is no middle name.
    # Skip it so that the AUTHFIRST() clause does not end with a stray space.
    if middle_name is not None and middle_name.strip() != '':
        given_name = given_name + ' ' + middle_name.strip()

    query_string = f'AFFIL({affiliation}) AND AUTHLASTNAME({family_name}) AND AUTHFIRST({given_name})'
    return find_author_at_elsevier(query_string)

# Examples of names with multiple results:
# Meha Fox, Baylor College of Medicine
# Douglas Appling, Baylor College of Medicine and W. Douglas Appling, Baylor College of Medicine
# N. Liou, Baylor College of Medicine produces 3 results
# Clifford B. Anderson, Vanderbilt University
# Carl H* Johnson, Vanderbilt University

# Name parts and affiliation for the trial search; a middle_name of None leaves it out of the query
given_name, middle_name, family_name = 'N.', None, 'Liou'
affiliation = 'Baylor College of Medicine'

scopus_author_id = find_author_at_elsevier_by_names(
    family_name,
    given_name,
    affiliation,
    middle_name=middle_name,
)
print(scopus_author_id)

## Generate an ORCID access token from credentials 

In [None]:
# !!!!!!!!!!!!!!!!!!!!!!!
# RUN THIS CELL ONLY ONCE
# !!!!!!!!!!!!!!!!!!!!!!!

# Get an access token
# See https://info.orcid.org/documentation/api-tutorials/api-tutorial-searching-the-orcid-registry/
# "Obtain a search token" section for details.

# NOTE: The access token is long-lived (approximately 20 years). It can be used for multiple queries.
# I suppose there is a way to revoke it if it is compromised, but I don't know how to do that yet.

# The credentials file should be plain text with the client ID on the first line and the client secret on the second line.
filename = 'orcid_client_credentials.txt'
# gets path to home directory; works for both Win and Mac
home = str(Path.home())
credential_path = Path(home) / filename

# Only a missing file is reported as "not found". Other problems (e.g. a permissions error)
# are allowed to surface with their own error message rather than a misleading one.
try:
    with open(credential_path, 'rt', encoding='utf-8') as file_object:
        cred = file_object.read()
except FileNotFoundError:
    print(filename + ' file not found - is it in your home directory?')
    sys.exit()

# splitlines() copes with both Unix and Windows line endings, and stripping removes any
# stray whitespace that would otherwise be sent as part of the ID or secret.
lines = [line.strip() for line in cred.splitlines()]
if len(lines) < 2 or lines[0] == '' or lines[1] == '':
    print(filename + ' must have the client ID on the first line and the client secret on the second line.')
    sys.exit()
client_id = lines[0]
client_secret = lines[1]

url = 'https://orcid.org/oauth/token'
headers = {
    'Accept': 'application/json',
    'Content-Type': 'application/x-www-form-urlencoded'
}
data = {
    'client_id': client_id,
    'client_secret': client_secret,
    'grant_type': 'client_credentials',
    'scope': '/read-public'
}

# The HTTP cache installed at the top of the notebook also caches POST requests. Bypass it here
# so that the response containing the access token is not written to the cache database
# in the script directory.
with requests_cache.disabled():
    response = requests.post(url, headers=headers, data=data, timeout=30)
print(response.status_code)
# print(response.text) # If you uncomment this, make sure that you don't upload this notebook to GitHub without clearing the output.

# Stop with a clear HTTP error if the token request was rejected, instead of failing
# later when the access token is missing from the response.
response.raise_for_status()

data = response.json()
access_token = data['access_token']
# print(access_token)

# Save the access token to a file in the home directory.
with open(Path(home) / 'orcid_access_token.txt', 'wt', encoding='utf-8') as file_object:
    file_object.write(access_token)

print('access token saved')
