In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import math
%matplotlib inline

# Default figure size (width, height in inches) and automatic tight layout for every plot
plt.rcParams["figure.figsize"] = (14, 10)
plt.rcParams["figure.autolayout"] = True

# Shared colour palette (hex codes) used by the charts in this notebook
opal = '#275c4d'
ruby = '#af221d'
topaz = '#c59103'

In [None]:
def generic_chart(title, x_label, y_label):
    """Label the current pyplot axes and render the figure.

    Args:
        title: Text shown above the plot.
        x_label: Caption for the x-axis.
        y_label: Caption for the y-axis.
    """
    labellers = (
        (plt.title, title),
        (plt.xlabel, x_label),
        (plt.ylabel, y_label),
    )
    for apply_label, text in labellers:
        apply_label(text)
    # Called without arguments, so this only queries the current x ticks.
    plt.xticks()
    plt.show()

In [None]:
# Read the dataset summaries and the evaluation log (all tab-separated files).
dataset_details = pd.read_csv('./exact_results/dataset_details/dataset_details.tsv', sep='\t', header=0)
gold_annotation_types = pd.read_csv('./exact_results/dataset_details/gold_annotation_type_count.tsv', sep='\t', header=0)
# The separator is written as the '\t' escape (not a literal tab character) so it
# survives editors that convert tabs to spaces. Rows are ordered by prompt id.
evaluation_df = pd.read_csv('./exact_results/eval_log.tsv', sep='\t', header=0).sort_values(by=['prompt_id'])

def merge_hallucinations(task):
    """Load every hallucination TSV of a task into a single DataFrame.

    Args:
        task: Name of the sub-directory of ``./exact_results/hallucinations``
            whose ``*.tsv`` files are read.

    Returns:
        The rows of all matching files stacked in file-name order, each file
        keeping its own row index (the index is not renumbered). An empty
        DataFrame when no file matches.
    """
    # glob returns paths in arbitrary order; sort them so the row order is reproducible.
    tsv_files = sorted(glob.glob(f'./exact_results/hallucinations/{task}/*.tsv'))

    frames = [pd.read_csv(tsv_file, sep='\t', header=0) for tsv_file in tsv_files]
    if not frames:
        # pd.concat raises on an empty list, so keep returning an empty frame.
        return pd.DataFrame()

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(frames)

# One merged hallucination frame per task, loaded in the order NER, RE, NERRE.
ner_hallucinations_df, re_hallucinations_df, nerre_hallucinations_df = map(
    merge_hallucinations, ('NER', 'RE', 'NERRE')
)

In [None]:
# Add new column for the language
def add_language_col(df):
    """Tag each row with the language of its prompt.

    A row is labelled "Spanish prompt" when its ``prompt_id`` contains the
    substring ``_es`` and "English prompt" otherwise.

    Args:
        df: DataFrame whose rows expose a string ``prompt_id``. It is modified
            in place: a ``language`` column is added or overwritten.

    Returns:
        The same DataFrame object that was passed in.
    """
    # itertuples yields lightweight namedtuples, avoiding the per-row Series
    # that iterrows builds just to read a single field.
    df['language'] = [
        "Spanish prompt" if '_es' in row.prompt_id else 'English prompt'
        for row in df.itertuples(index=False)
    ]

    return df

# add_language_col adds the column to the frame it receives and returns that same
# frame, so each reassignment below rebinds the name to the same object.
evaluation_df = add_language_col(evaluation_df)
ner_hallucinations_df = add_language_col(ner_hallucinations_df)
re_hallucinations_df = add_language_col(re_hallucinations_df)
nerre_hallucinations_df = add_language_col(nerre_hallucinations_df)

In [None]:
# Add formatted prompt id
def add_prompt_name(prompt_id):
    """Map a raw prompt id to a human-readable shot-count label.

    The keywords are tried in the order "zero", "one", "five", "ten"; the
    first one found as a substring of ``prompt_id`` decides the label.

    Args:
        prompt_id: Raw prompt identifier string.

    Returns:
        "Zero shot", "One shot", "Five shot" or "Ten shot", or ``prompt_id``
        itself when none of the keywords occurs in it.
    """
    keyword_labels = (
        ("zero", "Zero shot"),
        ("one", "One shot"),
        ("five", "Five shot"),
        ("ten", "Ten shot"),
    )
    for keyword, label in keyword_labels:
        if keyword in prompt_id:
            return label
    return prompt_id

# Derive the display label of every row from its prompt id. itertuples is used
# instead of iterrows because it does not build a Series per row.
evaluation_df['formatted_prompt_id'] = [add_prompt_name(row.prompt_id) for row in evaluation_df.itertuples(index=False)]

ner_hallucinations_df['formatted_prompt_id'] = [add_prompt_name(row.prompt_id) for row in ner_hallucinations_df.itertuples(index=False)]

re_hallucinations_df['formatted_prompt_id'] = [add_prompt_name(row.prompt_id) for row in re_hallucinations_df.itertuples(index=False)]

nerre_hallucinations_df['formatted_prompt_id'] = [add_prompt_name(row.prompt_id) for row in nerre_hallucinations_df.itertuples(index=False)]


#### Performance of the gen LLM based on prompts

In [None]:
# 1.1/2.0 NER performance against different prompts grouped by language
temp_df = evaluation_df[evaluation_df['task'] == 'NER']
# kind='stable' keeps the existing row order inside each language group; the
# default sort algorithm does not guarantee that.
temp_df = (
    temp_df.loc[:, ['formatted_prompt_id', 'precision', 'recall', 'f1', 'language']]
    .sort_values(by=['language'], kind='stable')
    .reset_index(drop=True)
)

eng_temp_df = temp_df[temp_df['language'] == 'English prompt']
esp_temp_df = temp_df[temp_df['language'] == 'Spanish prompt']

# One 'Average' row per language, computed from the per-language slices taken
# above, i.e. before any average row is appended.
average_eng_val_row = ['Average', eng_temp_df['precision'].mean(), eng_temp_df['recall'].mean(), eng_temp_df['f1'].mean(), 'English prompt']
average_esp_val_row = ['Average', esp_temp_df['precision'].mean(), esp_temp_df['recall'].mean(), esp_temp_df['f1'].mean(), 'Spanish prompt']

temp_df.loc[len(temp_df)] = average_eng_val_row
temp_df.loc[len(temp_df)] = average_esp_val_row

# Stable sort again: the average rows were appended last, so each one ends up
# as the last bar of its language group.
temp_df = temp_df.sort_values(by=['language'], kind='stable').reset_index(drop=True)

# NOTE: DataFrame.plot returns a matplotlib Axes; the name `fig` is kept as is.
fig = temp_df.plot(x="formatted_prompt_id", kind="bar", color=[opal, topaz, ruby])

# Centre each group label under its bars. Bars sit at positions 0..n-1, so the
# centre of a group is the mean position of its rows (instead of hard-coded ticks).
language_group_labels = {'English prompt': 'English prompts', 'Spanish prompt': 'Spanish prompts'}
language_group_ticks = [
    np.flatnonzero((temp_df['language'] == language).to_numpy()).mean()
    for language in language_group_labels
]

sec = fig.secondary_xaxis(location=0)
sec.set_xticks(language_group_ticks, labels=list(language_group_labels.values()))
sec.tick_params('x', length=80, width=0)

generic_chart('Evaluation metrics for NER grouped by prompt language', 'Prompts', 'Score')

In [None]:
# 1.2, 1.3 F1, precision, recall for NER, RE, NERRE separated by prompts, add overall average of metrics

def generate_evalutation_metrics(task_type):
    """Draw a grouped bar chart of precision, recall and F1 per prompt for one task.

    The rows of ``evaluation_df`` whose ``task`` equals ``task_type`` are
    plotted, followed by an extra 'Average' bar group holding the mean of each
    metric over those rows.

    Args:
        task_type: Value of the ``task`` column to select (e.g. 'RE').
    """
    metric_cols = ['precision', 'recall', 'f1']
    is_task = evaluation_df['task'] == task_type
    scores = evaluation_df[is_task].loc[:, ['formatted_prompt_id'] + metric_cols].reset_index(drop=True)

    # The means are evaluated before the row is written, so they cover the prompts only.
    scores.loc[len(scores)] = ['Average'] + [scores[col].mean() for col in metric_cols]

    scores.plot(
        x="formatted_prompt_id",
        y=["precision", "recall", "f1"],
        kind="bar",
        color=[opal, topaz, ruby],
    )
    generic_chart(f'Evaluation metrics for {task_type}', 'Prompts', 'Score')

# Metric charts for the RE and NERRE tasks.
generate_evalutation_metrics('RE')
generate_evalutation_metrics('NERRE')

#### Exploration of the hallucinations

In [None]:
# 3. Per task --> Hallucinations per prompt stacked with entities extracted

# NER based on prompt language
temp_ner_hall_df = evaluation_df[evaluation_df['task'] == 'NER']

# kind='stable' keeps the existing row order inside each language group; the
# default sort algorithm does not guarantee that.
temp_ner_hall_df = (
    temp_ner_hall_df.loc[:, ['formatted_prompt_id', 'extracted_tuples_or_triplets_per_prompt', 'tuple_or_triplet_hallucinations_per_prompt', 'language']]
    .sort_values(by=['language'], kind='stable')
    .reset_index(drop=True)
)

eng_ner_hall_temp_df = temp_ner_hall_df[temp_ner_hall_df['language'] == 'English prompt']
esp_ner_hall_temp_df = temp_ner_hall_df[temp_ner_hall_df['language'] == 'Spanish prompt']

# One 'Average' row per language, rounded up to whole instances and computed
# from the per-language slices taken above (before any average row is appended).
ner_en_hall_average_val_row = ['Average', math.ceil(eng_ner_hall_temp_df['extracted_tuples_or_triplets_per_prompt'].mean()), math.ceil(eng_ner_hall_temp_df['tuple_or_triplet_hallucinations_per_prompt'].mean()), 'English prompt']
ner_es_hall_average_val_row = ['Average', math.ceil(esp_ner_hall_temp_df['extracted_tuples_or_triplets_per_prompt'].mean()), math.ceil(esp_ner_hall_temp_df['tuple_or_triplet_hallucinations_per_prompt'].mean()), 'Spanish prompt']

temp_ner_hall_df.loc[len(temp_ner_hall_df)] = ner_en_hall_average_val_row
temp_ner_hall_df.loc[len(temp_ner_hall_df)] = ner_es_hall_average_val_row

# Stable sort again: the average rows were appended last, so each one ends up
# as the last bar of its language group.
temp_ner_hall_df = temp_ner_hall_df.sort_values(by=['language'], kind='stable').reset_index(drop=True)

# NOTE: DataFrame.plot returns a matplotlib Axes; the name `fig` is kept as is.
fig = temp_ner_hall_df.plot(x="formatted_prompt_id", kind="bar", stacked=True, color=[ruby, topaz])

# Centre each group label under its bars. Bars sit at positions 0..n-1, so the
# centre of a group is the mean position of its rows (instead of hard-coded ticks).
ner_hall_group_labels = {'English prompt': 'English prompts', 'Spanish prompt': 'Spanish prompts'}
ner_hall_group_ticks = [
    np.flatnonzero((temp_ner_hall_df['language'] == language).to_numpy()).mean()
    for language in ner_hall_group_labels
]

sec = fig.secondary_xaxis(location=0)
sec.set_xticks(ner_hall_group_ticks, labels=list(ner_hall_group_labels.values()))
sec.tick_params('x', length=80, width=0)

plt.legend(["Extracted instances", "Hallucinated instances"])
generic_chart('Instances for NER', 'Prompts', 'Instances')

In [None]:
def generate_stacked_entity_graph(task_type):
    """Draw stacked bars of extracted and hallucinated instances per prompt for one task.

    The rows of ``evaluation_df`` whose ``task`` equals ``task_type`` are
    plotted, followed by an 'Average' bar holding the mean of both counts over
    those rows. The hallucinated count is stacked on top of the extracted count.

    Args:
        task_type: Value of the ``task`` column to select (e.g. 'RE').
    """
    extracted_col = 'extracted_tuples_or_triplets_per_prompt'
    hallucinated_col = 'tuple_or_triplet_hallucinations_per_prompt'

    is_task = evaluation_df['task'] == task_type
    counts = evaluation_df[is_task].loc[:, ['formatted_prompt_id', extracted_col, hallucinated_col]].reset_index(drop=True)

    # The means are evaluated before the row is written, so they cover the prompts only.
    counts.loc[len(counts)] = ['Average', counts[extracted_col].mean(), counts[hallucinated_col].mean()]

    prompt_names = counts["formatted_prompt_id"].tolist()
    extracted = counts[extracted_col].tolist()
    hallucinated = counts[hallucinated_col].tolist()

    # Second series starts where the first one ends, producing the stack.
    plt.bar(prompt_names, extracted, color=ruby)
    plt.bar(prompt_names, hallucinated, bottom=extracted, color=topaz)

    plt.legend(["Extracted instances", "Hallucinated instances"])
    generic_chart(f'Instances for {task_type}', 'Prompts', 'Instances')

# TODO: Add a note specifying that NER shows span-label tuple instances while RE and NERRE show span-label-relation triplet instances
generate_stacked_entity_graph('RE')
generate_stacked_entity_graph('NERRE')

In [None]:
# 4. Hallucinations broken down by type of the hallucination (basically looking at the over generation of the found instances and fabrication)

# stacked -1 and -2
def _count_hallucination_types(df, offset_col_name):
    """Count over-generated (-2) and fabricated (-1) rows of ``df`` per prompt id."""
    # Plain arrays keep the three columns positionally aligned even when the
    # index of ``df`` contains duplicates.
    type_flags = pd.DataFrame({
        'prompt_id': df['prompt_id'].to_numpy(),
        'Over generated': (df[offset_col_name] == -2).to_numpy(),
        'Fabrications': (df[offset_col_name] == -1).to_numpy(),
    })
    # Rows with any other offset value are neither type and are left out.
    is_counted = type_flags['Over generated'] | type_flags['Fabrications']
    # Summing the boolean flags per prompt gives both counts in one row per
    # prompt id, sorted by prompt id, with 0 where a type does not occur.
    return (
        type_flags[is_counted]
        .groupby('prompt_id', as_index=False)[['Over generated', 'Fabrications']]
        .sum()
    )


def order_hallucinations_by_type(task_type, df):
    """Plot, per prompt, how many hallucinations were over-generated vs fabricated.

    Rows of ``df`` whose offset column equals -1 are counted as fabrications
    and rows where it equals -2 as over-generated instances; both counts are
    drawn as one stacked bar per prompt. For 'NER' the bars are additionally
    ordered and labelled by prompt language.

    The two counts are paired by ``prompt_id``, so a prompt that has only one
    of the two types still gets a correctly labelled bar (the other count is 0).

    Args:
        task_type: Task name. For 'RE' and 'NERRE' the offset is read from the
            ``offset1_start`` column, otherwise from ``offset1``.
        df: Hallucination rows with a ``prompt_id`` column and the offset
            column that applies to ``task_type``.
    """
    offset_col_name = 'offset1_start' if task_type in ('RE', 'NERRE') else 'offset1'
    is_ner = task_type == 'NER'

    counts_by_type_df = _count_hallucination_types(df, offset_col_name)

    if is_ner:
        counts_by_type_df = add_language_col(counts_by_type_df)

    counts_by_type_df['formatted_prompt_id'] = [add_prompt_name(prompt_id) for prompt_id in counts_by_type_df['prompt_id']]

    if is_ner:
        # Stable sort keeps the prompt-id order inside each language group.
        counts_by_type_df = counts_by_type_df.sort_values(by='language', kind='stable')

    counts_by_type_df = counts_by_type_df.reset_index(drop=True)

    ax = counts_by_type_df.plot(
        x="formatted_prompt_id",
        y=['Over generated', 'Fabrications'],
        kind="bar",
        stacked=True,
        color=[ruby, topaz],
    )

    if is_ner:
        # Centre each group label under its bars. Bars sit at positions 0..n-1,
        # so the centre of a group is the mean position of its rows.
        group_labels = {'English prompt': 'English prompts', 'Spanish prompt': 'Spanish prompts'}
        ticks, labels = [], []
        for language, label in group_labels.items():
            positions = np.flatnonzero((counts_by_type_df['language'] == language).to_numpy())
            if positions.size:  # skip a language that has no bars
                ticks.append(positions.mean())
                labels.append(label)

        sec = ax.secondary_xaxis(location=0)
        sec.set_xticks(ticks, labels=labels)
        sec.tick_params('x', length=80, width=0)

    plt.legend(["Over generated instances", "Fabricated instances"])
    generic_chart(f'Hallucinations by type for {task_type}', 'Prompts', 'Hallucinated instances')

In [None]:
# One hallucination-type chart per task, each built from that task's hallucination frame.
order_hallucinations_by_type('NER', ner_hallucinations_df)
order_hallucinations_by_type('RE', re_hallucinations_df)
order_hallucinations_by_type('NERRE', nerre_hallucinations_df)

In [None]:
# TODO: 5. Hallucinations broken down by the type of the entities and relations
# Number of NER hallucination rows per (prompt_id, label) pair. Naming the count
# column explicitly gives the columns prompt_id / label / count regardless of
# how the installed pandas version names the value_counts result.
hallucinations_by_label_type_df = ner_hallucinations_df.groupby('prompt_id')['label'].value_counts().reset_index(name='count')

hallucinations_by_label_type_df

In [None]:
# Display the merged NER hallucination frame for inspection.
ner_hallucinations_df

#### Classification of the extracted entities and identified relations

In [None]:
# TODO: 6. true positives, false negatives, false positives, false positive relations, false negative relations (stacked) vs total extracted entities for each task for exact match vs relaxed match

In [None]:
# TODO: 7. Breakdown of the gold entities and relations vs extracted entities and relations for each task