## Interrater comparison

In [45]:
import os
import pandas as pd 
import numpy as np  
#import text2term

import ast
import csv
import json

from collections import Counter, deque

In [3]:
def clean_curation_results(topic_list):
    if topic_list == 'None':
        clean_list = []
    else:
        temp_list = topic_list.replace('[','').replace(']','')
        dirty_list = temp_list.split('|')
        clean_list = [x.strip().strip('{').strip('}') for x in dirty_list]
    clean_set = set(clean_list)
    return clean_set

In [26]:
script_path = os.getcwd()
data_path = os.path.join(script_path,'data')
janet_ratings = pd.read_excel(os.path.join(data_path,'GPT categorization validation.xlsx'), 'janet_rating', engine='openpyxl')
janet_ratings.fillna('None',inplace=True)
janet_ratings['Curator_1'] = janet_ratings.apply(lambda row: clean_curation_results(row['Topics']), axis=1)
curator1 = janet_ratings[['Data Repository','Name','Description','Curator_1']].copy()
ginger_ratings = pd.read_excel(os.path.join(data_path,'GPT categorization validation.xlsx'), 'ginger_rating', engine='openpyxl')
ginger_ratings.fillna('None',inplace=True)
ginger_ratings['Curator_2'] = ginger_ratings.apply(lambda row: clean_curation_results(row['Topics']), axis=1)
curator2 = ginger_ratings[['Data Repository','Name','Description','Curator_2']].copy()


In [27]:
data_df = curator1.merge(curator2,on=['Data Repository','Name','Description'],how='inner')

## Jaccard Similarity

In [6]:
def jaccard_similarity(row):
    set1, set2 = row['Curator_1'], row['Curator_2']
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2)) 
    return intersection / union

## only used for alternative weighted similarity calculation (alt_sim)
def count_matches(row):
    set1, set2 = row['Curator_1'], row['Curator_2']
    intersection = len(set1.intersection(set2))    
    return intersection

def count_terms(row):
    set1, set2 = row['Curator_1'], row['Curator_2']
    union = len(set1.union(set2))
    return union

In [28]:
data_df['Jaccard Similarity'] = data_df.apply(jaccard_similarity, axis=1)

## only used for alternative weighted similarity calculation (alt sim)
data_df['match_count'] = data_df.apply(count_matches, axis=1)
data_df['term_count'] = data_df.apply(count_terms, axis=1)

## Gather more metrics

Precision, Recall, Jaccard Similarity

In [29]:
def calculate_precision_recall_per_row(ground_truth, predictions):
    precision_per_row = []
    recall_per_row = []

    for truth_labels, predicted_labels in zip(ground_truth, predictions):

        true_positives = len(truth_labels.intersection(predicted_labels))
        false_positives = len(predicted_labels - truth_labels)
        false_negatives = len(truth_labels - predicted_labels)

        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) != 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) != 0 else 0

        precision_per_row.append(precision)
        recall_per_row.append(recall)

    return precision_per_row, recall_per_row


precision, recall = calculate_precision_recall_per_row(data_df['Curator_1'],data_df['Curator_2'])

#print(f'Precision: {precision}')
#print(f'Recall: {recall}')
data_df['Precision'] = precision
data_df['Recall'] = recall

In [None]:
# Remove any EDAM not in the list
with open('EDAM/edam_topics.txt', 'r') as edam_file:
    full_edam_topics = edam_file.readlines()

full_edam_topics = [topic.strip().replace('\"', '') for topic in full_edam_topics]

## New Scoring Technique

1. Calculate J-similarity, precision, recall and then remove the terms in common

2. Check if a prediction term is within the same tree as a ground truth term

3. If two terms are within the same tree:

- Treat the two terms as a match
- Determine which term is closer to the root ([http://edamontology.org/topic\_0003](http://edamontology.org/topic_0003))
  - Calculate a weight to apply depending on how close the closest term is to the root (Weight 1)
    - If closest term is one step away from root it should have a lower score than a closest term that is 2 steps away. This is to lower the weight of excessively generic terms (use: 1-(1/(# of steps to closest term))
  - Calculate a weight to apply depending on the number of steps between the two 'matching' terms (Weight 2)
    - If the closest term to root is the ground truth term, use: (1/(# of steps between the terms))
    - If the closest term to root is the prediction term, use: -(1/(# of steps between the terms)): It is negative only to ensure we will later be able to inspect the directionality
- The overall weight should be a combination of the two, for example:
- Overall weight = Weight 1 + ABS(Weight 2)
- Weighted similarity: Add the overall weight for all matches and divide by the total number of matches, then add the j-sim (since we previously removed exact matches)
  * Note, investigated this as an alternative, but it doesn't account for the fact that when you do pairwise matches, it's possible that terms will match more than once.:
    - alt sim: (Overall weight)/(len(union(terms))) + J-sim, or (overall weight + match count)/len(union(terms))) 

**Adjusting for prediction number biases:**

- The weighted similarity will be advantageous to ChatGPT 4 due to its tendency to dump every relevant term
- To account for that, we can penalize it for making excessive guesses by multiplying the weighted similarity against the ratio of (# of gold standard terms)/max(# of gold standard terms),(# of predicted terms)).
  * If LLM guesses the same number of terms as # of gold standard, this ratio = 1: No penalty
  * If LLM guesses more terms than # of gold standard, this ratio >1: penalty, greater number, greater penalty

**Evaluating whether the prediction has a tendency to be more specific or less specific than the gold standard**

- Broadness evaluation: (# of positive Weight 2 values)/(# of negative Weight 2 values)
  - Number of times broader term is ground truth / Number of times broader term is predicted term
- If broadness evaluation is \>1, LLM model predictions are more specific than Ground truth/gold standard terms
  - If broadness evaluation is \<1, LLM model predictions are less specific than ground truth/gold standard terms

### Prep the topic library

In [9]:
import pandas as pd

edam_data = pd.read_csv("EDAM/EDAM.csv")
edam_data = edam_data[edam_data['Class ID'].str.startswith("http://edamontology.org/topic_")].sort_values(by='Preferred Label')

In [10]:
edam_data[~edam_data['Parents'].str.contains("http://edamontology.org/topic_")].head()[['Class ID', 'Preferred Label', 'Parents']]

Unnamed: 0,Class ID,Preferred Label,Parents
2996,http://edamontology.org/topic_3521,2D PAGE experiment,http://www.w3.org/2002/07/owl#DeprecatedClass
3002,http://edamontology.org/topic_0174,Ab initio structure prediction,http://www.w3.org/2002/07/owl#DeprecatedClass
3329,http://edamontology.org/topic_0083,Alignment,http://www.w3.org/2002/07/owl#DeprecatedClass
1690,http://edamontology.org/topic_0786,Arabidopsis,http://www.w3.org/2002/07/owl#DeprecatedClass
2435,http://edamontology.org/topic_3075,Biological system modelling,http://www.w3.org/2002/07/owl#DeprecatedClass


In [11]:
edam_data = edam_data[edam_data['Parents'].str.contains("http://edamontology.org/topic_")]

In [12]:
edam_data['Parents #'] = edam_data['Parents'].str.extractall(r'topic_(\d+)').groupby(level=0).agg(lambda parents: parents.tolist())
edam_data['Topic #'] = edam_data['Class ID'].apply(lambda url: url.split('topic_')[1])

In [13]:
from collections import defaultdict

topic_dict = defaultdict(list)

for index, row in edam_data.iterrows():
    topic = row['Topic #']
    parents = row['Parents #']
    
    topic_dict[topic].extend(parents)

In [14]:
plabel_topic_dict = dict(zip(edam_data['Preferred Label'], edam_data['Topic #']))

In [15]:
def shortest_distance(topic_dict, topic, verbose=False):
    
    queue = [(topic, 0, [topic])]
    visited = set()
    
    while queue:
        current_topic, distance, path = queue.pop(0)
        
        if current_topic == '0003':
            if verbose:
                print('Path to root: ', path)
            return distance
        
        visited.add(current_topic)
        
        if current_topic in topic_dict:
            parents = topic_dict[current_topic]
            
            for parent in parents:
                if parent not in visited:
                    queue.append((parent, distance + 1, path + [parent]))
    
    return -1  # If the root topic is not found

dist = shortest_distance(topic_dict, '4030')

print(dist)


2


In [16]:
parent_topic = '0003'
subtree_dict = {}

for topic, parents in topic_dict.items():
    if parent_topic in parents:
        subtree_dict[topic] = []

def get_children_topics(parent_id):
    # children_ids = edam_data[edam_data['Parents'].str.contains(parent_id)]['Class ID'].apply(lambda url: url.split('topic_')[1]).to_list()
    children_ids = edam_data[edam_data['Parents #'].apply(\
        lambda parent_ids: parent_id in parent_ids)]['Topic #'].to_list()
    
    if not len(children_ids):
        return []
    
    # print(parent_id, children_ids)
    
    grandchildren = []
    for child_id in children_ids:
        grandchildren.append(get_children_topics(child_id))
    
    children_ids.append(grandchildren)
    return children_ids

for parent_topic in subtree_dict.keys():
    subtree_dict[parent_topic] = get_children_topics(parent_topic)
# We technically also have the topics which we removed from the EDAM list (laboratory techniques, etc.) but they are inconsequential here.

def flatten_list(nested_list):
    flattened = []
    for item in nested_list:
        if isinstance(item, list):
            flattened.extend(flatten_list(item))
        else:
            flattened.append(item)
    return flattened

# Flatten each value in the dictionary
subtree_dict = {key: flatten_list(value) for key, value in subtree_dict.items()}

# Print the flattened dictionary
print()
for key, value in subtree_dict.items():
    print(f"{key}: {value}")


4019: ['3070', '3344', '3307', '0610', '3303', '3391', '3810', '3292', '3360', '3306', '3297', '2229', '3369', '3064', '3299', '3573', '3053', '2815', '3387', '3301', '0621', '3047', '0780', '1317', '3895', '2259', '0781', '3500', '4030', '3398', '3368', '3912', '3576', '3939', '4030', '3895', '3065', '3945', '3298', '0084', '0637', '3944', '3293', '3295', '3912', '0625', '3574', '3930', '3321', '3056', '3173', '4037', '3974', '3945', '3298', '3055', '3959', '0623', '0203', '0114', '0199', '2830', '0204', '3941', '3320', '0749', '3308', '3941', '4027', '0659', '3512', '0798', '2533', '2885', '3175', '3958', '4013', '3697', '4038', '0611', '0593', '3448', '0122', '2828', '4017', '3067', '0804', '3386', '3376', '3304', '3390', '3302', '0202', '3395', '3277', '3930', '3948', '2830', '3679', '3374', '3373', '3336', '3375', '3379', '3393', '3394', '3377', '3966', '3343', '0209', '3378', '2840', '0208', '3337', '3340', '3341', '3339', '3338', '3892', '1775', '3321', '0602', '0077', '0084', 

In [17]:
## Get the top-level non-root classes to treat as root
parentlist = list(subtree_dict.keys())
print(parentlist)

['4019', '3314', '3316', '3071', '3855', '3678', '0605', '3361', '3068', '3315', '4010', '3318']


In [18]:
topic_subtree_dict = {}

for index, row in edam_data.iterrows():
    topic = row['Topic #']
    subtrees = []
    
    for subtree, topics in subtree_dict.items():
        if topic in topics:
            subtrees.append(subtree)
    
    topic_subtree_dict[topic] = subtrees

topic_subtree_dict

{'4029': ['3318'],
 '3810': ['4019'],
 '3400': ['4019'],
 '3402': ['4019'],
 '3370': ['3314'],
 '3067': ['4019'],
 '3679': ['4019', '3678'],
 '4013': ['4019'],
 '3569': ['3315'],
 '3337': ['4019'],
 '3292': ['4019', '3314'],
 '3050': ['4019', '3855'],
 '3398': ['4019'],
 '3383': ['3361'],
 '0091': ['0605'],
 '3070': ['4019'],
 '3360': ['4019'],
 '3368': ['4019'],
 '3344': ['4019'],
 '3892': ['4019'],
 '3306': ['4019', '3318'],
 '4019': [],
 '3297': ['4019'],
 '3374': ['4019'],
 '0152': ['4019'],
 '4020': ['4019', '3314', '3855', '3318'],
 '3335': ['4019'],
 '2229': ['4019'],
 '3340': ['4019'],
 '3179': ['3361'],
 '3169': ['3361'],
 '3369': ['4019', '3314'],
 '2258': ['0605'],
 '3314': [],
 '3931': ['0605'],
 '3940': ['3361'],
 '3944': ['4019'],
 '3341': ['4019'],
 '0797': ['4019'],
 '3423': ['4019'],
 '3343': ['4019', '3314'],
 '3307': ['4019'],
 '3332': ['3314', '3316'],
 '3316': [],
 '3958': ['4019'],
 '3403': ['4019'],
 '4017': ['4019', '3361'],
 '3959': ['4019'],
 '3934': ['3361'],

In [19]:
def shortest_path(topic_dict, topic):
    
    queue = [(topic, 0, [topic])]
    visited = set()
    
    while queue:
        current_topic, distance, path = queue.pop(0)
        
        if current_topic == '0003':
        #if current_topic in parentlist:
            return path
        
        visited.add(current_topic)
        
        if current_topic in topic_dict:
            parents = topic_dict[current_topic]
            
            for parent in parents:
                if parent not in visited:
                    queue.append((parent, distance + 1, path + [parent]))
    
    return []  # If the root topic is not found

shortest_paths = defaultdict(list)

for topic in topic_subtree_dict.keys():
    shortest_paths[topic] = shortest_path(topic_dict, topic)

shortest_paths

defaultdict(list,
            {'4029': ['4029', '3318', '0003'],
             '3810': ['3810', '3070', '4019', '0003'],
             '3400': ['3400', '3303', '4019', '0003'],
             '3402': ['3402', '3303', '4019', '0003'],
             '3370': ['3370', '3314', '0003'],
             '3067': ['3067', '3344', '4019', '0003'],
             '3679': ['3679', '3678', '0003'],
             '4013': ['4013', '3301', '3070', '4019', '0003'],
             '3569': ['3569', '3315', '0003'],
             '3337': ['3337', '3277', '3344', '4019', '0003'],
             '3292': ['3292', '3314', '0003'],
             '3050': ['3050', '0610', '4019', '0003'],
             '3398': ['3398', '3297', '3070', '4019', '0003'],
             '3383': ['3383', '3382', '3361', '0003'],
             '0091': ['0091', '0605', '0003'],
             '3070': ['3070', '4019', '0003'],
             '3360': ['3360', '3070', '4019', '0003'],
             '3368': ['3368', '3297', '3070', '4019', '0003'],
             '33

In [20]:
microfluidics = shortest_distance(topic_dict, '4030')
microfluidics_path = shortest_path(topic_dict, '4030')
biotechnology = shortest_distance(topic_dict, '3297')
biotechnology_path = shortest_path(topic_dict, '3297')
biology = shortest_distance(topic_dict, '3070')
biology_path = shortest_path(topic_dict, '3070')
biosciences = shortest_distance(topic_dict, '4019')
biosciences_path = shortest_path(topic_dict, '4019')
print(microfluidics, microfluidics_path, 
      biotechnology, biotechnology_path, 
      biology, biology_path, 
      biosciences, biosciences_path)

2 ['4030', '3318', '0003'] 3 ['3297', '3070', '4019', '0003'] 2 ['3070', '4019', '0003'] 1 ['4019', '0003']


In [21]:
steps_to_term1 = shortest_distance(topic_dict, '4030')
steps_to_term2 = shortest_distance(topic_dict, '4019')
print(steps_to_term1, steps_to_term2)

2 1


### Calculate conceptual similarity scores

In [24]:
# Calculate weights for each row
def calculate_weights(tree, term1, term2, paths=shortest_paths):

    # Weight 1: Distance from root
    steps_to_term1 = shortest_distance(tree, term1)
    steps_to_term2 = shortest_distance(tree, term2)

    # determine the steps between the two terms
    def steps_between_terms(paths, topic1, topic2):
        path1 = paths[topic1]
        path2 = paths[topic2]

        index1 = index2 = 0

        for i, (n1, n2) in enumerate(zip(path1, path2)):
            if n1 != n2:
                break

            index1 = i + 1
            index2 = i + 1

        return len(path1) - index1 + len(path2) - index2

    steps_between = steps_between_terms(paths, term1, term2)

    # Assuming term1 is ground truth and term2 is prediction
    if steps_to_term1 < steps_to_term2:
        weight1 = 1 - (1 / steps_to_term1)
        weight2 = (1 / steps_between)
    else:
        weight1 = 1 - (1 / steps_to_term2)
        weight2 = -1 / steps_between
    
    weight = weight1 + abs(weight2)

    return weight, (weight1, weight2)

In [30]:
reviewer1 = data_df['Curator_1']
reviewer2 = data_df['Curator_2']

exclusive_reviewer1 = []
exclusive_reviewer2 = []

for reviewer1, reviewer2 in zip(reviewer1, reviewer2):
    exclusive_reviewer1.append(set([label for label in reviewer1 if label not in reviewer2]))
    exclusive_reviewer2.append(set([label for label in reviewer2 if label not in reviewer1]))

In [31]:
data_df['Exclusive Curator 1'] = exclusive_reviewer1
data_df['Exclusive Curator 2'] = exclusive_reviewer2

In [32]:
for idx, row in data_df.iterrows():
    ground_truth = row['Exclusive Curator 1']
    prediction = row['Exclusive Curator 2']

    weights = []
    num_positive_w2 = num_negative_w2 = 0
    for truth_label in ground_truth:
        for pred_label in prediction:
            truth_topic, pred_topic = plabel_topic_dict[truth_label], plabel_topic_dict[pred_label]
            # If labels are not in the same subtree
            if not set(topic_subtree_dict[pred_topic]) & set(topic_subtree_dict[truth_topic]):
                continue
            total_weight, (w1, w2) = calculate_weights(topic_dict, truth_topic, pred_topic)
            weights.append(total_weight)

            if w2 >= 0:
                num_positive_w2 += 1
            elif w2 < 0:
                num_negative_w2 += 1
    
    data_df.loc[idx, 'Weight'] = sum(weights)/max(1,(len(weights))) + row['Jaccard Similarity'] ## use 1 if no weights
    # data.loc[idx, 'Weight'] = sum(weights) + row['Jaccard Similarity'] ## use 1 if no weights
    # data.loc[idx, 'Weight Sum'] = sum(weights) ## alt sim
    # data.loc[idx, 'Weight'] = (sum(weights) + row['match_count'])/row['term_count'] ## alternate sim
    # data.loc[idx, 'Broadness Score'] = num_positive_w2 / (num_positive_w2 + num_negative_w2
    try:
        data_df.loc[idx, 'Broadness Score'] = num_positive_w2 / (num_negative_w2)
    except:
        data_df.loc[idx, 'Broadness Score'] = 0

In [33]:
# Adjust weights for prediction number bias, penalized overprediction without incentivizing underprediction
data_df['overprediction penalty'] = data_df['Curator_1'].apply(len) / data_df['Curator_2'].apply(len)
data_df['penalty'] = data_df['overprediction penalty'].apply(lambda x: 1 if x>1 else x)
data_df['Adjusted Weights'] = data_df['Weight'] * data_df['penalty']
print(data_df.tail(n=2))

   Data Repository                                               Name  \
73           Qiita  Bacterial density rather than diversity correl...   
74           Qiita                        Baum asphalt 1st submission   

                                          Description  \
73  Bacterial communities within avian nests are c...   
74  Different asphalt sites as well as surrounding...   

                                            Curator_1  \
73  {Microbial collection, Metagenomics, Environme...   
74  {Microbial collection, Metagenomics, Environme...   

                                            Curator_2  Jaccard Similarity  \
73           {Microbial ecology, Zoology, Embryology}                 0.0   
74  {Microbial ecology, Metagenomics, Environmenta...                 0.5   

    match_count  term_count  Precision    Recall  \
73            0           6   0.000000  0.000000   
74            2           4   0.666667  0.666667   

                                  Exclusive Cur

In [36]:
data_df.to_csv(os.path.join('result','interrater_results.tsv'), sep='\t', header=True, index=False)

In [48]:
summary = {
    "Mean Jaccard similarity": data_df['Jaccard Similarity'].mean(),
    "Mean Adjusted Weight": data_df['Adjusted Weights'].mean(),
    "# of records Curator 1 could not curate": len(data_df.loc[data_df['Curator_1']==set()]),
    "# of records Curator 2 could not curate": len(data_df.loc[data_df['Curator_2']==set()]),
    "Total # of records attempted": len(data_df)
}

print(summary)
with open(os.path.join('result','interrater_summary.json'),'w') as outwrite:
    outwrite.write(json.dumps(summary))

{'Mean Jaccard similarity': 0.246, 'Mean Adjusted Weight': 0.7172056978515313, '# of records Curator 1 could not curate': 2, '# of records Curator 2 could not curate': 3, 'Total # of records attempted': 75}
