# Scripts associated with Otolaryngology network analysis

In [None]:
# (c) 2023 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf

# ----------------
# Module imports
# ----------------

from typing import List, Dict, Tuple, Any, Optional
import yaml
import sys
#import csv
from datetime import datetime
from time import sleep
import json
import pandas as pd
import numpy as np
import requests
import requests_cache
from fuzzywuzzy import fuzz # fuzzy logic matching
import re # regex
import logging # See https://docs.python.org/3/howto/logging.html

# Cache HTTP responses (GET and POST) in a local SQLite store named 'http_cache'.
# expire_after=300 keeps each cached response for 300 seconds.
requests_cache.install_cache(
    'http_cache',
    backend='sqlite',
    expire_after=300,
    allowable_methods=['GET', 'POST'],
)

# Route warnings to 'warnings.log', starting with a fresh file on every run (filemode='w').
# That file is awkward to inspect directly, so at the end of the script its contents
# are read back and appended to a plain text log file.
logging.basicConfig(
    filename='warnings.log',
    filemode='w',
    format='%(message)s',
    level=logging.WARNING,
)

# Fuzzy match score thresholds for institution names.
# Scores below this cutoff are treated as no match at all.
INSTITUTION_NO_MATCH_CUTOFF = 80
# Scores at or above the no-match cutoff but below this one are accepted as a match
# that still needs human review.
INSTITUTION_REVIEW_CUTOFF = 90

# ----------------
# Functions
# ----------------

# ORCID supports Ringgold, GRID, and ROR identifiers.
# https://www.ringgold.com/ Limited to 10 searches per day
# https://www.grid.ac/institutes GRID discontinued public releases at the end of 2021
# https://ror.org/ ROR is now the principal identifier for organizations

# ROR documentation: https://ror.readme.io/
# ROR API documentation: https://ror.readme.io/docs/rest-api
# ROR API endpoint URL: https://api.ror.org/organizations

def search_for_institution_id(institution: str, query_type: str) -> List[Dict]:
    """Search the ROR API for organizations matching an institution name.

    Args:
        institution: The name of the institution to search for.
        query_type: The type of query to perform. Must be one of 'query' or 'affiliation'.

    Returns:
        A list of dictionaries, one per candidate organization, each with the keys
        'name' and 'id'. The list is empty when the query type is unknown, when the
        API responds with a status code other than 200, or when nothing matched.
    """
    # ROR API endpoint
    ror_api_endpoint = 'https://api.ror.org/organizations'

    # Only these two query parameters are supported by this function.
    if query_type not in ('query', 'affiliation'):
        print(f'Error: Unknown query type: {query_type}')
        # Return an empty list (not an empty string) so the result always matches
        # the declared return type.
        return []

    # Send the request to the ROR API; the query type doubles as the parameter name.
    ror_api_response = requests.get(ror_api_endpoint, params={query_type: institution})

    status_code = ror_api_response.status_code
    if status_code != 200:
        print(f'Error: ROR API returned status code {status_code}')
        return []

    results = []
    # A response without an 'items' key is treated the same as no matches.
    for item in ror_api_response.json().get('items', []):
        # NOTE(review): per the ROR API documentation, 'affiliation' results wrap each
        # record in an 'organization' key while 'query' results are the record itself.
        # Falling back to the item covers both shapes - confirm against the API version in use.
        organization = item.get('organization', item)
        results.append({
            'name': organization['name'],
            'id': organization['id'],
        })
    return results

def fuzzy_match_institutions(institution_name: str, search_results: List[Dict]) -> Tuple:
    """Pick the search result whose name best matches an institution name.

    Each candidate name is scored against institution_name with two fuzzywuzzy
    scorers, WRatio and token_set_ratio, and the best candidate under each scorer
    is tracked separately. When the two scorers pick different candidates, the
    disagreement is printed and written to the warnings log.

    Args:
        institution_name: The name of the institution to match.
        search_results: A list of dictionaries, each with at least a 'name' key.

    Returns:
        A tuple of (match, score, flagged): the candidate with the highest WRatio
        score, that score, and a boolean that is True when the token_set_ratio
        scorer preferred a different candidate. The match is an empty dictionary
        and the score is 0 when no candidate scored above 0.
    """
    best_w_candidate = {}
    best_w_score = 0
    best_token_candidate = {}
    best_token_score = 0

    for candidate in search_results:
        candidate_name = candidate['name']
        candidate_w_score = fuzz.WRatio(institution_name, candidate_name)
        candidate_token_score = fuzz.token_set_ratio(institution_name, candidate_name)

        # Strict comparisons: the earliest candidate wins when scores tie.
        if candidate_w_score > best_w_score:
            best_w_candidate, best_w_score = candidate, candidate_w_score
        if candidate_token_score > best_token_score:
            best_token_candidate, best_token_score = candidate, candidate_token_score

    # Both scorers agree (or nothing scored above 0): nothing to report.
    if best_w_candidate == best_token_candidate:
        return best_w_candidate, best_w_score, False

    # The scorers disagree: report both picks on screen and in the warnings log.
    print('Warning: Top w_ratio match is not the top token_set_ratio match for', institution_name)
    logging.warning('Top w_ratio match is not the top token_set_ratio match for ' + institution_name)

    w_report = 'Top w_ratio match: ' + str(best_w_score) + ' ' + str(best_w_candidate)
    print(w_report)
    logging.warning(w_report)

    token_report = 'Top token_set_ratio match: ' + str(best_token_score) + ' ' + str(best_token_candidate)
    print(token_report)
    logging.warning(token_report)

    # Blank line separates this report from the next one in the log.
    logging.warning('')

    # The WRatio pick is the one returned, flagged for review.
    return best_w_candidate, best_w_score, True



In [None]:
# Load the data from the CSV file
raw_data = pd.read_csv('oto_network_analysis.csv', dtype=str)

# Pull the unique values from the INSTITUTION column. Missing values are dropped
# up front with dropna() rather than being removed from the list while iterating over it.
institutions = list(raw_data['INSTITUTION'].dropna().unique())
#print(institutions)

# For testing, limit the number of institutions to 15
#institutions = institutions[:15]

# Column order of the results table and of the CSV file written from it.
RESULT_COLUMNS = ['match_score', 'flagged', 'name', 'ror_label', 'ror_id']

# Rows are accumulated in a plain list and the data frame is rebuilt from it,
# because DataFrame.append() was removed in pandas 2.0.
result_rows = []
results_df = pd.DataFrame(columns=RESULT_COLUMNS)

# Loop through the institutions and search for the ROR ID
for institution in institutions:
    institution_name = institution.strip()
    print(institution_name)

    # Search for the institution
    institution_search_results = search_for_institution_id(institution, 'affiliation')
    #print(json.dumps(institution_search_results, indent=2))

    # Start from the "no match" row; it is replaced only if a good enough match is found.
    results_dict = {
        'match_score': 0,
        'name': institution_name,
        'ror_label': '',
        'ror_id': '',
        'flagged': 'no match'
    }

    if institution_search_results:
        # Fuzzy match the institution name to the search results
        id_match, score, flagged_mismatch = fuzzy_match_institutions(institution, institution_search_results)
        #print(id_match)

        if score >= INSTITUTION_NO_MATCH_CUTOFF: # Score high enough to be a match
            if flagged_mismatch: # w_ratio match disagrees with token_set_ratio match
                flag = 'mismatch'
            elif score < INSTITUTION_REVIEW_CUTOFF: # Score too low to be accepted without review
                flag = 'review'
            else:
                flag = ''

            results_dict = {
                'match_score': score,
                'name': institution_name,
                'ror_label': id_match['name'],
                'ror_id': id_match['id'],
                'flagged': flag
            }

    # Add the results to the data frame
    result_rows.append(results_dict)
    results_df = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)
    print()

    # Save the results to a CSV file after each institution in case the script crashes
    results_df.to_csv('ror_id_search_results.csv', index=False)

# Append the harvested warnings to the plain text log file. The context managers
# close both files even if reading the warnings log fails.
with open('log_error.txt', 'at', encoding='utf-8') as error_log_object:
    # Read the warnings log
    # For some reason, the log is considered a binary file. So when it is read in as text,
    # it contains many null characters. So they are removed from the string read from the file.
    with open('warnings.log', 'rt') as file_object:
        warnings_text = file_object.read().replace('\0', '')

    if warnings_text == '':
        print('No errors occurred.', file=error_log_object)
    else:
        print(warnings_text, file=error_log_object)
    print('', file=error_log_object)

print('done')
