### RMIT READ-BioMed-Version-2.0
#### Milindi Kodikara : : Karin Verspoor
Date : : 16th Sept 2024

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Default every figure to a 14x10 canvas and enable matplotlib's automatic layout.
plt.rcParams.update({
    "figure.figsize": (14, 10),
    "figure.autolayout": True,
})

# Primary colour palette (hex codes) shared by the charts in this notebook.
opal, ruby, topaz = '#275c4d', '#af221d', '#c59103'

In [None]:
# Load the tab-separated dataset summary; the file's first row supplies the column names.
dataset_details = pd.read_csv(
    './results/dataset_details/dataset_details.tsv',
    header=0,
    sep='\t',
)

# Leaving the frame as the cell's last expression renders it in the notebook.
dataset_details

In [None]:
# Number of rows in the dataset summary table.
dataset_details.shape[0]

In [None]:
# Load the tab-separated gold annotation type counts; the first row is the header.
gold_annotation_types = pd.read_csv(
    './results/dataset_details/gold_annotation_type_count.tsv',
    header=0,
    sep='\t',
)

# Leaving the frame as the cell's last expression renders it in the notebook.
gold_annotation_types

In [None]:
# Number of rows in the gold annotation type table.
gold_annotation_types.shape[0]

In [None]:
# Load the evaluation log (tab-separated, header in the first row) and order the
# rows by prompt id; the charts below plot rows in this order.
# The separator is spelled as the '\t' escape rather than a raw tab character:
# a raw tab is invisible in source and is easily converted to spaces by an editor.
evaluation_df = pd.read_csv('./results/eval_log.tsv', sep='\t', header=0).sort_values(by=['prompt_id'])

# Leaving the frame as the cell's last expression renders it in the notebook.
evaluation_df

In [None]:
# Number of rows in the evaluation log.
evaluation_df.shape[0]

In [None]:
def generic_chart(title, x_label, y_label):
    """Apply the shared decoration to the current matplotlib figure and show it.

    Args:
        title: Text placed above the chart.
        x_label: Caption for the x-axis.
        y_label: Caption for the y-axis.
    """
    # Turn the x tick labels vertical (90 degrees).
    plt.xticks(rotation=90)
    plt.ylabel(y_label)
    plt.xlabel(x_label)
    plt.title(title)
    plt.show()

In [None]:
# Grouped bar chart: precision, recall and F1 side by side for each prompt
def generate_evaluation_metrics(task_type, all_tasks=False):
    """Plot precision, recall and F1 per prompt from ``evaluation_df``.

    Args:
        task_type: Value matched against the ``task`` column; it is also
            inserted into the chart title, even when ``all_tasks`` is True.
        all_tasks: When True, skip the task filter and plot every row.
    """
    if all_tasks:
        rows = evaluation_df
    else:
        rows = evaluation_df[evaluation_df['task'] == task_type]

    metric_columns = ["precision", "recall", "f1"]
    metric_colours = [opal, topaz, ruby]
    rows.plot(x="prompt_id", y=metric_columns, kind="bar", color=metric_colours)
    generic_chart(f'Evaluation metrics for {task_type}', 'Prompts', 'Value')
    

In [None]:
# Stacked bar chart of instances vs prompts
# for NER -> tuple count; RE, NERRE -> correct triplet count
def generate_stacked_entity_graph(task_type, all_tasks=False):
    """Plot extracted and hallucinated instance counts per prompt as stacked bars.

    Args:
        task_type: Value matched against the ``task`` column; it is also
            inserted into the chart title, even when ``all_tasks`` is True.
        all_tasks: When True, skip the task filter and plot every row.
    """
    if all_tasks:
        rows = evaluation_df
    else:
        rows = evaluation_df[evaluation_df['task'] == task_type]

    prompt_ids = rows["prompt_id"].tolist()
    extracted = rows["extracted_tuples_or_triplets_per_prompt"].tolist()
    hallucinated = rows["tuple_or_triplet_hallucinations_per_prompt"].tolist()

    # The hallucinated bars start where the extracted bars end, forming the stack.
    plt.bar(prompt_ids, extracted, color=ruby)
    plt.bar(prompt_ids, hallucinated, bottom=extracted, color=topaz)
    plt.legend(["Extracted instances", "Hallucinated instances"])

    chart_title = f'Instances (entities and relation tuples or triplets) against prompts for {task_type}'
    generic_chart(chart_title, 'prompts', 'Instances')

In [None]:
def generate_entity_count_graph(task_type, column_name, column_label, all_tasks=False):
    """Plot one column of ``evaluation_df`` against prompt ids as a bar chart.

    Args:
        task_type: Value matched against the ``task`` column; it is also
            inserted into the chart title, even when ``all_tasks`` is True.
        column_name: Column of ``evaluation_df`` that supplies the bar heights.
        column_label: Readable name used in the title and as the y-axis label.
        all_tasks: When True, skip the task filter and plot every row.
    """
    if all_tasks:
        rows = evaluation_df
    else:
        rows = evaluation_df[evaluation_df['task'] == task_type]

    prompt_ids = rows["prompt_id"].tolist()
    heights = rows[column_name].tolist()
    plt.bar(prompt_ids, heights, color=topaz)

    generic_chart(f'{column_label} vs prompts for {task_type}', 'prompts', column_label)

In [None]:
def simple_bar_graph(task_type, x_col, y_col, x_label, y_label, all_tasks=False):
    """Plot ``y_col`` against ``x_col`` from ``evaluation_df`` as a bar chart.

    Args:
        task_type: Value matched against the ``task`` column; it is also
            inserted into the chart title, even when ``all_tasks`` is True.
        x_col: Column that supplies the bar positions.
        y_col: Column that supplies the bar heights.
        x_label: Caption for the x-axis, also used in the title.
        y_label: Caption for the y-axis, also used in the title.
        all_tasks: When True, skip the task filter and plot every row.
    """
    if all_tasks:
        rows = evaluation_df
    else:
        rows = evaluation_df[evaluation_df['task'] == task_type]

    positions = rows[x_col].tolist()
    heights = rows[y_col].tolist()
    plt.bar(positions, heights, color=opal)

    generic_chart(f'{y_label} vs {x_label} for {task_type}', x_label, y_label)

### Analysis

In [None]:
# Precision, recall and F1 charts, one per task: NER, RE, NERRE
for task_name in ('NER', 'RE', 'NERRE'):
    generate_evaluation_metrics(task_name)

In [None]:
# Extracted vs hallucinated instances, one stacked chart per task: NER, RE, NERRE
for task_name in ('NER', 'RE', 'NERRE'):
    generate_stacked_entity_graph(task_name)

In [None]:
# Per-task count charts: true positives, false positives and false negatives,
# plus false positive / false negative relations for the relation tasks.
# Secondary colours used alongside the main palette
ruby_low, topaz_low = '#EACBBB', '#CBB17B'
opal_low, opal_lower = '#AE9800', '#787122'

# NER rows only
temp_df = evaluation_df.loc[evaluation_df['task'] == 'NER']

temp_df.plot(
    x="prompt_id",
    y=["true_positive", "false_positive", "false_negative"],
    kind="bar",
    color=[ruby, ruby_low, topaz_low],
)
generic_chart('Entity counts for NER', 'Prompts', 'Entity count')


In [None]:
# RE rows only: entity-level counts plus relation-level false positives/negatives
temp_df = evaluation_df.loc[evaluation_df['task'] == 'RE']

temp_df.plot(
    x="prompt_id",
    y=[
        "true_positive",
        "false_positive",
        "false_negative",
        "false_positive_relations",
        "false_negative_relations",
    ],
    kind="bar",
    color=[ruby, ruby_low, opal_low, opal_lower, topaz_low],
)
generic_chart('Entity and Relation counts for RE', 'Prompts', 'Entity/relation count')

In [None]:
# NERRE rows only: entity-level counts plus relation-level false positives/negatives
temp_df = evaluation_df.loc[evaluation_df['task'] == 'NERRE']

temp_df.plot(
    x="prompt_id",
    y=[
        "true_positive",
        "false_positive",
        "false_negative",
        "false_positive_relations",
        "false_negative_relations",
    ],
    kind="bar",
    color=[ruby, ruby_low, opal_low, opal_lower, topaz_low],
)
generic_chart('Entity and Relation counts for NERRE', 'Prompts', 'Entity/relation count')