### Analysis

#### Set up

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import glob
import math
import random

import os
from dotenv import load_dotenv

# Load environment variables (presumably from a local .env file) before they are read below.
load_dotenv()

# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline

# Global figure defaults shared by every chart in this notebook.
plt.rcParams["figure.figsize"] = (14, 10)
plt.rcParams["figure.autolayout"] = True
plt.rcParams.update({'font.size': 20})

# Shared 12-colour palette; the plotting functions below pick entries or slices by position.
colours = ['#ff0000', '#ff8700', '#ffd300', '#deff0a', '#a1ff0a', '#0aff99', '#0aefff', '#147df5', '#580aff', '#be0aff',
           '#54478c', '#240046']

In [None]:
# Root folder that holds the `results/` tree read and written by this notebook.
# Read from the RESULT-FOLDER-PATH environment variable; raises KeyError when it is not set.
result_folder_path = os.environ["RESULT-FOLDER-PATH"]

In [None]:
def generic_chart(title, x_label, y_label):
    """Finish the current matplotlib figure, save it as a PNG and display it.

    Args:
        title: Chart title; also used verbatim as the PNG file name.
        x_label: Label of the x axis.
        y_label: Label of the y axis.
    """
    plt.title(title)
    # The large pad presumably leaves room for the secondary group axis some callers
    # draw underneath the rotated tick labels.
    plt.xlabel(x_label, labelpad=55)
    plt.ylabel(y_label)
    plt.xticks(rotation=90)

    figures_dir = f'{result_folder_path}/results/figures'
    # savefig does not create missing folders, so make sure the target exists first.
    os.makedirs(figures_dir, exist_ok=True)
    plt.savefig(f'{figures_dir}/{title}.png')
    plt.show()

In [None]:
def merge_hallucinations(task):
    """Load every hallucination TSV recorded for ``task`` into one DataFrame.

    Args:
        task: Name of the sub-folder under ``results/hallucinations`` to read.

    Returns:
        The rows of all ``*.tsv`` files in that folder stacked in the order glob
        returned them (original row indices are kept), or an empty DataFrame
        when the folder contains no TSV file.
    """
    tsv_files = glob.glob(f'{result_folder_path}/results/hallucinations/{task}/*.tsv')

    # Read everything first and concatenate once: concatenating inside the loop
    # re-copies the accumulated rows on every iteration.
    frames = [pd.read_csv(tsv_file, sep='\t', header=0) for tsv_file in tsv_files]
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames)

In [None]:
# Add new column for the language
def add_language_col(df):
    """Tag each row with the language of its prompt and return the same frame.

    A prompt id containing ``_es`` is labelled "Spanish prompt"; every other id is
    labelled "English prompt". The ``language`` column is written into ``df`` in place.
    """
    labels = []
    for prompt_id in df['prompt_id']:
        if '_es' in prompt_id:
            labels.append("Spanish prompt")
        else:
            labels.append('English prompt')

    df['language'] = labels
    return df

In [None]:
# Add formatted prompt id
def add_prompt_name(prompt_id):
    """Build the short display label for a raw prompt id.

    The label starts with the shot count ("0", "1", "5" or "10", taken from the first
    of "zero", "one", "five", "ten" found in the id; the raw id is kept when none is
    found), followed by a suffix for each of the markers "output", "guideline" and
    "all" present in the id, and ends with "_es" or "_en" for the prompt language.
    """
    shot_words = (("zero", "0"), ("one", "1"), ("five", "5"), ("ten", "10"))
    label = next((count for word, count in shot_words if word in prompt_id), prompt_id)

    marker_suffixes = (
        ("output", "_output"),
        ("guideline", "_guideline"),
        ("all", "_output_guideline"),
    )
    for marker, suffix in marker_suffixes:
        if marker in prompt_id:
            label += suffix

    label += "_es" if '_es' in prompt_id else "_en"
    return label

In [None]:
def add_new_cols(df):
    """Derive ``shots``, ``guideline`` and ``output`` columns from ``formatted_prompt_id``.

    ``shots`` is the integer before the first underscore of the label; the other two
    columns say whether the label contains "guideline" / "output". The columns are
    written into ``df`` in place and the same frame is returned.
    """
    prompt_labels = df['formatted_prompt_id'].tolist()

    shots = []
    guideline_flags = []
    output_flags = []
    for label in prompt_labels:
        shots.append(int(label.split("_")[0]))
        guideline_flags.append("Guideline defined" if "guideline" in label else "Guideline undefined")
        output_flags.append("Output defined" if "output" in label else "Output undefined")

    df['shots'] = shots
    df['guideline'] = guideline_flags
    df['output'] = output_flags

    return df

In [None]:
# 1.1/2.0 NER performance against different prompts grouped by language
def generate_cross_linguistic_evaluation_metrics(task_type):
    """Plot precision, recall and F1 per prompt for ``task_type``, grouped by prompt language.

    Reads the notebook-level ``evaluation_df``, appends one 'Avg' row per language,
    orders the bars by language and shot count, and hands the figure to ``generic_chart``.
    """
    metric_cols = ['formatted_prompt_id', 'precision', 'recall', 'f1', 'language']

    task_rows = evaluation_df[evaluation_df['task'] == task_type]
    metrics_df = task_rows.loc[:, metric_cols].sort_values(by=['language']).reset_index(drop=True)

    english_rows = metrics_df[metrics_df['language'] == 'English prompt']
    spanish_rows = metrics_df[metrics_df['language'] == 'Spanish prompt']

    # Both averages come from the slices taken above, so the appended rows never feed into them.
    english_avg = ['Avg'] + [english_rows[metric].mean() for metric in ('precision', 'recall', 'f1')] + ['English prompt']
    spanish_avg = ['Avg'] + [spanish_rows[metric].mean() for metric in ('precision', 'recall', 'f1')] + ['Spanish prompt']

    metrics_df.loc[len(metrics_df)] = english_avg
    metrics_df.loc[len(metrics_df) + 1] = spanish_avg

    # Sort key: the leading shot count of the label; the 'Avg' rows get 20 so they sort last.
    shot_keys = []
    for label in metrics_df['formatted_prompt_id']:
        prefix = label.split("_")[0]
        shot_keys.append(20 if prefix == "Avg" else int(prefix))
    metrics_df['shots'] = shot_keys

    # The language is shown by the group axis, so drop it from the tick labels.
    metrics_df['formatted_prompt_id'] = [
        label.replace("_es", "").replace("_en", "") for label in metrics_df['formatted_prompt_id']
    ]

    metrics_df = metrics_df.sort_values(
        by=['language', 'shots', 'formatted_prompt_id'], ascending=[True, True, True]
    ).reset_index(drop=True)
    metrics_df = metrics_df.loc[:, metric_cols]

    fig = metrics_df.plot(x="formatted_prompt_id", kind="bar", color=colours[0:3])

    # Second x axis carrying one caption per language group.
    sec = fig.secondary_xaxis(location=0)
    sec.set_xticks([8.5, 25.5], labels=['English prompt (shots)', 'Spanish prompt (shots)'])
    sec.tick_params('x', length=275, width=0)

    generic_chart('Results for varying number of shots for NER, grouped by prompt language', 'Prompts', 'Score')

In [None]:
def generate_cross_linguistic_f1_shots(task_type):
    """Plot the F1 score per prompt for ``task_type``, grouped by prompt language.

    Reads the notebook-level ``evaluation_df``, appends one 'Avg' bar per language,
    colours the bars by language, saves/shows the chart and prints the plotted table.
    """
    task_rows = evaluation_df[evaluation_df['task'] == task_type]
    f1_df = task_rows.loc[:, ['formatted_prompt_id', 'f1', 'language']].sort_values(
        by=['language']).reset_index(drop=True)

    # Both means are taken before any summary row is appended.
    english_mean = f1_df[f1_df['language'] == 'English prompt']['f1'].mean()
    spanish_mean = f1_df[f1_df['language'] == 'Spanish prompt']['f1'].mean()

    f1_df.loc[len(f1_df)] = ['Avg', english_mean, 'English prompt']
    f1_df.loc[len(f1_df) + 1] = ['Avg', spanish_mean, 'Spanish prompt']

    def shot_key(label):
        # Leading shot count of the label; 'Avg' rows get 20 so they sort last.
        prefix = label.split("_")[0]
        return 20 if prefix == "Avg" else int(prefix)

    f1_df['shots'] = [shot_key(label) for label in f1_df['formatted_prompt_id']]

    f1_df = f1_df.sort_values(by=['language', 'shots', 'formatted_prompt_id'],
                              ascending=[True, True, True]).reset_index(drop=True)
    f1_df = f1_df.loc[:, ['formatted_prompt_id', 'f1', 'language']]

    # Colours are chosen while the labels still carry their language suffix.
    column_color = []
    for label in f1_df['formatted_prompt_id']:
        if label == 'Avg':
            column_color.append(colours[0])
        elif '_es' in label:
            column_color.append(colours[10])
        else:
            column_color.append(colours[11])

    f1_df['formatted_prompt_id'] = [
        label.replace("_es", "").replace("_en", "") for label in f1_df['formatted_prompt_id']
    ]

    fig = f1_df.plot(x="formatted_prompt_id", y="f1", kind="bar", color=column_color, legend=False)

    # Second x axis carrying one caption per language group.
    sec = fig.secondary_xaxis(location=0)
    sec.set_xticks([8.5, 25.5], labels=['English prompt (shots)', 'Spanish prompt (shots)'])
    sec.tick_params('x', length=275, width=0)

    generic_chart('F1 for varying number of shots for NER, grouped by prompt language', 'Prompts', 'F1 Score')

    print(f1_df)

In [None]:
def generate_cross_linguistic_f1_guideline(task_type):
    """Plot the F1 score per prompt for ``task_type``, grouped by guideline availability.

    Reads the notebook-level ``evaluation_df``, appends one 'Avg' bar per group
    ("Guideline defined" / "Guideline undefined"), saves/shows the chart and prints
    the plotted table.
    """
    # add_new_cols writes columns in place; give it a private copy so it does not
    # operate on a filtered slice of evaluation_df (SettingWithCopyWarning).
    task_rows = add_new_cols(evaluation_df[evaluation_df['task'] == task_type].copy())
    f1_df = task_rows.loc[:, ['formatted_prompt_id', 'f1', 'guideline']].sort_values(
        by=['guideline']).reset_index(drop=True)

    with_guideline = f1_df[f1_df['guideline'] == 'Guideline defined']
    without_guideline = f1_df[f1_df['guideline'] == 'Guideline undefined']

    # Both means come from the slices above, so the appended rows never feed into them.
    f1_df.loc[len(f1_df)] = ['Avg', with_guideline['f1'].mean(), 'Guideline defined']
    f1_df.loc[len(f1_df)] = ['Avg', without_guideline['f1'].mean(), 'Guideline undefined']

    # Sort key: leading shot count of the label; 'Avg' rows get 20 so they sort last.
    prefixes = [label.split("_")[0] for label in f1_df['formatted_prompt_id']]
    f1_df['shots'] = [20 if prefix == "Avg" else int(prefix) for prefix in prefixes]

    f1_df = f1_df.sort_values(by=['guideline', 'shots', 'formatted_prompt_id'],
                              ascending=[True, True, True]).reset_index(drop=True)
    f1_df = f1_df.loc[:, ['formatted_prompt_id', 'f1', 'guideline']]

    # The guideline flag is shown by the group axis, so drop it from the tick labels.
    f1_df['formatted_prompt_id'] = [label.replace("_guideline", "") for label in f1_df['formatted_prompt_id']]

    # The language suffix is still part of the label here and selects the bar colour.
    column_color = []
    for label in f1_df['formatted_prompt_id']:
        if label == 'Avg':
            column_color.append(colours[0])
        elif '_es' in label:
            column_color.append(colours[8])
        else:
            column_color.append(colours[9])

    fig = f1_df.plot(x="formatted_prompt_id", y="f1", kind="bar", color=column_color, legend=False)

    # Second x axis carrying one caption per guideline group.
    sec = fig.secondary_xaxis(location=0)
    sec.set_xticks([8.5, 25.5], labels=['Guideline defined', 'Guideline undefined'])
    sec.tick_params('x', length=275, width=0)

    generic_chart('F1 for NER, grouped by the conditional provision of the annotation guideline', 'Prompts', 'F1 Score')
    print(f1_df)

In [None]:
def generate_cross_linguistic_f1_output(task_type):
    """Plot the F1 score per prompt for ``task_type``, grouped by output-structure availability.

    Reads the notebook-level ``evaluation_df``, appends one 'Avg' bar per group
    ("Output defined" / "Output undefined"), saves/shows the chart and prints the
    plotted table.
    """
    # add_new_cols writes columns in place; give it a private copy so it does not
    # operate on a filtered slice of evaluation_df (SettingWithCopyWarning).
    task_rows = add_new_cols(evaluation_df[evaluation_df['task'] == task_type].copy())
    f1_df = task_rows.loc[:, ['formatted_prompt_id', 'f1', 'output']].sort_values(
        by=['output']).reset_index(drop=True)

    with_output = f1_df[f1_df['output'] == 'Output defined']
    without_output = f1_df[f1_df['output'] == 'Output undefined']

    # Both means come from the slices above, so the appended rows never feed into them.
    f1_df.loc[len(f1_df)] = ['Avg', with_output['f1'].mean(), 'Output defined']
    f1_df.loc[len(f1_df)] = ['Avg', without_output['f1'].mean(), 'Output undefined']

    # Sort key: leading shot count of the label; 'Avg' rows get 20 so they sort last.
    prefixes = [label.split("_")[0] for label in f1_df['formatted_prompt_id']]
    f1_df['shots'] = [20 if prefix == "Avg" else int(prefix) for prefix in prefixes]

    f1_df = f1_df.sort_values(by=['output', 'shots', 'formatted_prompt_id'],
                              ascending=[True, True, True]).reset_index(drop=True)

    # The output flag is shown by the group axis, so drop it from the tick labels.
    f1_df['formatted_prompt_id'] = [label.replace("_output", "") for label in f1_df['formatted_prompt_id']]

    f1_df = f1_df.loc[:, ['formatted_prompt_id', 'f1', 'output']]

    # The language suffix is still part of the label here and selects the bar colour.
    column_color = []
    for label in f1_df['formatted_prompt_id']:
        if label == 'Avg':
            column_color.append(colours[0])
        elif '_es' in label:
            column_color.append(colours[6])
        else:
            column_color.append(colours[7])

    fig = f1_df.plot(x="formatted_prompt_id", y="f1", kind="bar", color=column_color, legend=False)

    # Second x axis carrying one caption per output group.
    sec = fig.secondary_xaxis(location=0)
    sec.set_xticks([8.5, 25.5], labels=['Output defined', 'Output undefined'])
    sec.tick_params('x', length=275, width=0)

    generic_chart('F1 for NER, grouped by the conditional provision of the output structure', 'Prompts', 'F1 Score')

    print(f1_df)


In [None]:
# 1.2, 1.3 F1, precision, recall for NER, RE, NERRE separated by prompts, add overall average of metrics
def generate_evalutation_metrics(task_type):
    """Plot precision, recall and F1 per prompt for ``task_type`` plus an overall 'Avg' group.

    Reads the notebook-level ``evaluation_df`` and hands the figure to ``generic_chart``.
    """
    metric_names = ["precision", "recall", "f1"]

    task_rows = evaluation_df[evaluation_df['task'] == task_type]
    scores_df = task_rows.loc[:, ['formatted_prompt_id'] + metric_names].reset_index(drop=True)

    # Summary row: the mean of every metric over all prompts of the task.
    overall_avg = ['Avg'] + [scores_df[metric].mean() for metric in metric_names]
    scores_df.loc[len(scores_df)] = overall_avg

    scores_df.plot(x="formatted_prompt_id", y=metric_names, kind="bar", color=colours[0:3])
    generic_chart(f'Results for varying the number of shots for {task_type}', 'Prompts', 'Score')

In [None]:
# 3. Per task --> Hallucinations per prompt stacked with entities extracted
import matplotlib.lines as mlines


# NER based on prompt language
def generate_cross_linguistic_stacked_entity_graph(task_type):
    """Plot extracted vs. hallucinated instances per prompt as stacked bars, grouped by prompt language.

    Reads the notebook-level ``evaluation_df``, appends one rounded-up average row per
    language ('Avg_en' / 'Avg_es'), colours the bars by language and hands the figure
    to ``generic_chart``.
    """
    temp_ner_hall_df = evaluation_df[evaluation_df['task'] == task_type]

    temp_ner_hall_df = temp_ner_hall_df.loc[:, ['formatted_prompt_id', 'extracted_tuples_or_triplets_per_prompt',
                                                'tuple_or_triplet_hallucinations_per_prompt', 'language']].sort_values(
        by=['language']).reset_index(drop=True)

    eng_ner_hall_temp_df = temp_ner_hall_df[temp_ner_hall_df['language'] == 'English prompt']
    esp_ner_hall_temp_df = temp_ner_hall_df[temp_ner_hall_df['language'] == 'Spanish prompt']

    # NOTE(review): math.ceil raises on NaN, which .mean() yields for an empty group —
    # confirm both languages always have rows for the task.
    ner_en_hall_average_val_row = ['Avg_en',
                                   math.ceil(eng_ner_hall_temp_df['extracted_tuples_or_triplets_per_prompt'].mean()),
                                   math.ceil(eng_ner_hall_temp_df['tuple_or_triplet_hallucinations_per_prompt'].mean()),
                                   'English prompt']

    temp_ner_hall_df.loc[len(temp_ner_hall_df)] = ner_en_hall_average_val_row

    ner_es_hall_average_val_row = ['Avg_es',
                                   math.ceil(esp_ner_hall_temp_df['extracted_tuples_or_triplets_per_prompt'].mean()),
                                   math.ceil(esp_ner_hall_temp_df['tuple_or_triplet_hallucinations_per_prompt'].mean()),
                                   'Spanish prompt']

    # The row label skips one position; the index is rebuilt by reset_index below.
    temp_ner_hall_df.loc[len(temp_ner_hall_df) + 1] = ner_es_hall_average_val_row

    # Sort key: leading shot count of the label; the average rows get 20 so they sort last.
    temp_ner_hall_df['shots'] = [
        20 if row.formatted_prompt_id.split("_")[0] == "Avg" else int(row.formatted_prompt_id.split("_")[0]) for _, row
        in temp_ner_hall_df.iterrows()]

    temp_ner_hall_df = temp_ner_hall_df.sort_values(by=['language', 'shots', 'formatted_prompt_id'],
                                                    ascending=[True, True, True]).reset_index(drop=True)

    # Per-bar colours, one list per plotted column. They must be built here, while the
    # labels still carry the '_es' / '_en' suffix that is stripped just below.
    stacked_colours = {
        "extracted_tuples_or_triplets_per_prompt": [(colours[4] if '_es' in row.formatted_prompt_id else colours[5]) for
                                                    _, row in temp_ner_hall_df.iterrows()],
        "tuple_or_triplet_hallucinations_per_prompt": [(
            colours[6] if '_es' in row.formatted_prompt_id else colours[7]) for _, row in temp_ner_hall_df.iterrows()]
    }

    temp_ner_hall_df['formatted_prompt_id'] = [row.formatted_prompt_id.replace("_es", "") for _, row in temp_ner_hall_df.iterrows()]

    temp_ner_hall_df['formatted_prompt_id'] = [row.formatted_prompt_id.replace("_en", "") for _, row in temp_ner_hall_df.iterrows()]

    # Drop the helper 'shots' column so it is not drawn as a bar series.
    temp_ner_hall_df = temp_ner_hall_df.loc[:, ['formatted_prompt_id', 'extracted_tuples_or_triplets_per_prompt',
                                                'tuple_or_triplet_hallucinations_per_prompt', 'language']]

    fig = temp_ner_hall_df.plot(x="formatted_prompt_id", kind="bar", stacked=True, color=stacked_colours)

    # Second x axis carrying one caption per language group.
    sec = fig.secondary_xaxis(location=0)
    sec.set_xticks([8.5, 25.5], labels=['English prompt (shots)', 'Spanish prompt (shots)'])
    sec.tick_params('x', length=245, width=0)

    # Hand-built legend: one entry per (series, language) colour used above.
    plt.legend(handles=[
        mlines.Line2D([], [], color=colours[5], label="Extracted instances (en)", linewidth=10),
        mlines.Line2D([], [], color=colours[7], label="Hallucinated instances (en)", linewidth=10),
        mlines.Line2D([], [], color=colours[4], label="Extracted instances (es)", linewidth=10),
        mlines.Line2D([], [], color=colours[6], label="Hallucinated instances (es)", linewidth=10)], handlelength=0.5)
    generic_chart(f'Instances for NER', 'Prompts', 'Instances')

In [None]:
def generate_stacked_entity_graph(task_type):
    """Plot extracted vs. hallucinated instances per prompt for ``task_type`` as stacked bars.

    Reads the notebook-level ``evaluation_df``, appends an overall 'Avg' bar and hands
    the figure to ``generic_chart``.
    """
    extracted_col = 'extracted_tuples_or_triplets_per_prompt'
    hallucinated_col = 'tuple_or_triplet_hallucinations_per_prompt'

    task_rows = evaluation_df[evaluation_df['task'] == task_type]
    counts_df = task_rows.loc[:, ['formatted_prompt_id', extracted_col, hallucinated_col]].reset_index(drop=True)

    # Summary row: the mean of both counts over all prompts of the task.
    counts_df.loc[len(counts_df)] = ['Avg', counts_df[extracted_col].mean(), counts_df[hallucinated_col].mean()]

    prompt_labels = counts_df["formatted_prompt_id"].tolist()
    extracted = counts_df[extracted_col].tolist()
    hallucinated = counts_df[hallucinated_col].tolist()

    # The hallucinated counts are stacked on top of the extracted ones.
    plt.bar(prompt_labels, extracted, color=colours[4])
    plt.bar(prompt_labels, hallucinated, bottom=extracted, color=colours[5])

    plt.legend(["Extracted instances", "Hallucinated instances"])
    generic_chart(f'Instances for {task_type}', 'Prompts', 'Instances')

In [None]:
# 4. Hallucinations broken down by type of the hallucination (basically looking at the over generation of the found instances and fabrication)
# stacked -1 and -2
def order_hallucinations_by_type(task_type, df):
    """Plot, per prompt, how many hallucinations are fabrications and how many are over-generations.

    Rows whose first offset column equals -1 are counted as "Fabrications" and rows
    where it equals -2 as "Over generated". For 'NER' the bars are additionally
    grouped and coloured by prompt language.

    Args:
        task_type: 'NER', 'RE' or 'NERRE'; selects the offset column and the chart layout.
        df: Hallucination rows with a ``prompt_id`` column and the offset column.
    """
    offset_col_name = 'offset1_start' if task_type in ('RE', 'NERRE') else 'offset1'

    def count_per_prompt(marker, column_name):
        """Count, per prompt, the rows of ``df`` whose offset column equals ``marker``."""
        flagged = df[df[offset_col_name] == marker]
        return flagged.groupby('prompt_id').size().rename(column_name)

    # Align the two tallies on prompt_id. Pairing them by row position attributes counts
    # to the wrong prompt as soon as a prompt lacks one of the two hallucination types.
    counts_df = pd.concat(
        [count_per_prompt(-1, 'Fabrications'), count_per_prompt(-2, 'Over generated')],
        axis=1,
    ).fillna(0).astype(int)
    counts_df = counts_df.sort_index().rename_axis('prompt_id').reset_index()

    if task_type == 'NER':
        counts_df = add_language_col(counts_df)

    counts_df['formatted_prompt_id'] = [add_prompt_name(prompt_id) for prompt_id in counts_df['prompt_id']]

    stacked_colours = colours[6:8]
    if task_type == 'NER':
        # Sort key: leading shot count of the label ('Avg' labels would sort last with 20).
        prefixes = [label.split("_")[0] for label in counts_df['formatted_prompt_id']]
        counts_df['shots'] = [20 if prefix == "Avg" else int(prefix) for prefix in prefixes]

        counts_df = counts_df.sort_values(
            by=['language', 'shots', 'formatted_prompt_id'], ascending=[True, True, True]).reset_index(drop=True)

        # Per-bar colours, built while the labels still carry their language suffix.
        sorted_labels = counts_df['formatted_prompt_id'].tolist()
        stacked_colours = {
            "Fabrications": [colours[6] if '_es' in label else colours[7] for label in sorted_labels],
            "Over generated": [colours[8] if '_es' in label else colours[9] for label in sorted_labels],
        }

        # The language is shown by the group axis, so drop it from the tick labels.
        counts_df['formatted_prompt_id'] = [
            label.replace("_es", "").replace("_en", "") for label in sorted_labels
        ]

    counts_df = counts_df.loc[:, ['prompt_id', 'formatted_prompt_id', 'Over generated', 'Fabrications']]
    fig = counts_df.plot(x="formatted_prompt_id", kind="bar", stacked=True, color=stacked_colours)

    if task_type == 'NER':
        # Second x axis carrying one caption per language group.
        sec = fig.secondary_xaxis(location=0)
        sec.set_xticks([6.5, 21.5], labels=['English prompt (shots)', 'Spanish prompt (shots)'])
        sec.tick_params('x', length=245, width=0)
        # Hand-built legend: one entry per (series, language) colour used above.
        plt.legend(handles=[
            mlines.Line2D([], [], color=colours[7], label="Fabricated instances (en)", linewidth=10),
            mlines.Line2D([], [], color=colours[9], label="Over generated instances (en)", linewidth=10),
            mlines.Line2D([], [], color=colours[6], label="Fabricated instances (es)", linewidth=10),
            mlines.Line2D([], [], color=colours[8], label="Over generated instances (es)", linewidth=10)],
            handlelength=0.5)
    else:
        plt.legend(["Over generated instances", "Fabricated instances"])
    generic_chart(f'Hallucinations by type for {task_type}', 'Prompts', 'Hallucinated instances')

In [None]:
# 5. Hallucinations broken down by the type of the entities and relations
def broken_by_entity_type_ner(task_type, df, df_type):
    """Plot, per prompt, how many rows of ``df`` carry each entity label (stacked bars).

    Args:
        task_type: Task name used in the chart title; 'NER' additionally groups the
            bars by prompt language.
        df: Rows with ``prompt_id`` and ``label`` columns.
        df_type: Free-text description of ``df`` used at the start of the chart title.
    """
    # NOTE(review): the lookups below read a column named 'count' — assumes the pandas
    # version in use gives the value_counts() tally that name; confirm.
    breakdown_by_entity_type_df = pd.DataFrame(df.groupby('prompt_id')['label'].value_counts()).reset_index()

    unique_labels = breakdown_by_entity_type_df['label'].unique().tolist()
    unique_prompts = breakdown_by_entity_type_df['prompt_id'].unique().tolist()

    new_cols = ['prompt_id'] + unique_labels

    # Build a wide table: one row per prompt, one column per label, 0 where a label never occurs.
    new_breakdown_by_entity_type_df = pd.DataFrame(columns=new_cols)
    for prompt in unique_prompts:
        df_row = [prompt]
        for label in unique_labels:
            # find the count from the df 
            value = 0
            value_tuple = breakdown_by_entity_type_df[
                (breakdown_by_entity_type_df['prompt_id'] == prompt) & (breakdown_by_entity_type_df['label'] == label)][
                'count']

            if not value_tuple.empty:
                value = value_tuple.item()

            df_row = df_row + [value]

        new_breakdown_by_entity_type_df.loc[len(new_breakdown_by_entity_type_df)] = df_row

    if task_type == 'NER':
        new_breakdown_by_entity_type_df = add_language_col(new_breakdown_by_entity_type_df)

    new_breakdown_by_entity_type_df['formatted_prompt_id'] = [add_prompt_name(row.prompt_id) for _, row in
                                                              new_breakdown_by_entity_type_df.iterrows()]

    if task_type == 'NER':
        # Sort key: leading shot count of the label ('Avg' labels would sort last with 20).
        new_breakdown_by_entity_type_df['shots'] = [
            20 if row.formatted_prompt_id.split("_")[0] == "Avg" else int(row.formatted_prompt_id.split("_")[0]) for
            _, row
            in new_breakdown_by_entity_type_df.iterrows()]

        new_breakdown_by_entity_type_df = new_breakdown_by_entity_type_df.sort_values(
            by=['language', 'shots', 'formatted_prompt_id'],
            ascending=[True, True, True]).reset_index(drop=True)

    new_breakdown_by_entity_type_df = new_breakdown_by_entity_type_df.reset_index(drop=True)

    # Keep only the tick label and the per-label counts so helper columns are not plotted.
    new_cols = ['formatted_prompt_id'] + unique_labels
    new_breakdown_by_entity_type_df = new_breakdown_by_entity_type_df.loc[:, new_cols]
    fig = new_breakdown_by_entity_type_df.plot(x="formatted_prompt_id", kind="bar", stacked=True, color=colours)

    if task_type == 'NER':
        # Second x axis carrying one caption per language group.
        sec = fig.secondary_xaxis(location=0)
        sec.set_xticks([6.5, 21.5], labels=['English prompt (shots)', 'Spanish prompt (shots)'])
        sec.tick_params('x', length=265, width=0)

    plt.legend(unique_labels)
    generic_chart(f'{df_type} by entity type for {task_type}', 'Prompts', 'Entity count')

In [None]:
def broken_by_entity_type_re(task_type, df, df_type):
    """Plot, per prompt, how often each entity label occurs in either slot of a relation.

    The count of a label is the number of rows where it appears in ``label1`` plus the
    number of rows where it appears in ``label2``.

    Args:
        task_type: Task name used in the chart title.
        df: Rows with ``prompt_id``, ``label1`` and ``label2`` columns.
        df_type: Free-text description of ``df`` used at the start of the chart title.
    """
    # De-duplicate in first-seen order. Going through a set made the column order, and
    # with it the colour and legend order of the chart, vary from run to run.
    unique_labels = list(dict.fromkeys(df['label1'].unique().tolist() + df['label2'].unique().tolist()))
    unique_prompts = df['prompt_id'].unique().tolist()

    label1 = pd.DataFrame(df.groupby('prompt_id')['label1'].value_counts()).reset_index()
    label2 = pd.DataFrame(df.groupby('prompt_id')['label2'].value_counts()).reset_index()

    def occurrences(tally_df, label_col, prompt, label):
        """Return the tallied count for (prompt, label), or 0 when the pair never occurs."""
        hits = tally_df[(tally_df['prompt_id'] == prompt) & (tally_df[label_col] == label)]['count']
        return 0 if hits.empty else hits.item()

    # Wide table: one row per prompt, one column per label.
    counts_df = pd.DataFrame(columns=['prompt_id'] + unique_labels)
    for prompt in unique_prompts:
        totals = [
            occurrences(label1, 'label1', prompt, label) + occurrences(label2, 'label2', prompt, label)
            for label in unique_labels
        ]
        counts_df.loc[len(counts_df)] = [prompt] + totals

    counts_df['formatted_prompt_id'] = [add_prompt_name(prompt_id) for prompt_id in counts_df['prompt_id']]

    counts_df = counts_df.sort_values(by='prompt_id').reset_index(drop=True)

    # Keep only the tick label and the per-label counts for plotting.
    counts_df = counts_df.loc[:, ['formatted_prompt_id'] + unique_labels]
    counts_df.plot(x="formatted_prompt_id", kind="bar", stacked=True, color=colours)

    plt.legend(unique_labels)
    generic_chart(f'{df_type} by entity type for {task_type}', 'Prompts', 'Entity count')

In [None]:
def broken_by_relation_type(task_type, df, df_type):
    """Plot, per prompt, how many rows of ``df`` carry each relation type (stacked bars).

    Rows without a relation type are counted under the label 'NA'.

    Args:
        task_type: Task name used in the chart title.
        df: Rows with ``prompt_id`` and ``relation_type`` columns.
        df_type: Free-text description of ``df`` used at the start of the chart title.
    """
    # A missing value never compares equal to itself, so an equality lookup can never
    # find its count. Give missing relation types an explicit label on a private copy.
    relations = df[['prompt_id', 'relation_type']].copy()
    relations['relation_type'] = relations['relation_type'].fillna('NA')

    # First-seen order decides the column, colour and legend order.
    unique_relations = relations['relation_type'].unique().tolist()

    # Wide table: one row per prompt (sorted by prompt_id), one column per relation type.
    counts_df = relations.groupby(['prompt_id', 'relation_type']).size().unstack(fill_value=0)
    counts_df = counts_df.reindex(columns=unique_relations)
    counts_df.columns.name = None
    counts_df = counts_df.reset_index()

    counts_df['formatted_prompt_id'] = [add_prompt_name(prompt_id) for prompt_id in counts_df['prompt_id']]

    # Keep only the tick label and the per-relation counts for plotting.
    counts_df = counts_df.loc[:, ['formatted_prompt_id'] + unique_relations]
    counts_df.plot(x="formatted_prompt_id", kind="bar", stacked=True, color=colours)

    plt.legend(unique_relations)
    generic_chart(f'{df_type} by relation type for {task_type}', 'Prompts', 'Relation count')

In [None]:
# 6. true positives, false negatives, false positives, false positive relations, false negative relations (stacked) vs total extracted entities for each task for exact match vs relaxed match

# for ner
def cross_linguistic_entity_division(task_type):
    """Plot true-positive, false-positive and false-negative counts per prompt, grouped by prompt language.

    Reads the notebook-level ``evaluation_df``, appends one 'Avg' row per language,
    orders the bars by language and shot count, and hands the figure to ``generic_chart``.
    """
    count_cols = ['true_positive', 'false_positive', 'false_negative']

    task_rows = evaluation_df[evaluation_df['task'] == task_type]
    division_df = task_rows.loc[:, ['formatted_prompt_id'] + count_cols + ['language']].sort_values(
        by=['language']).reset_index(drop=True)

    english_rows = division_df[division_df['language'] == 'English prompt']
    spanish_rows = division_df[division_df['language'] == 'Spanish prompt']

    # Both averages come from the slices above, so the appended rows never feed into them.
    division_df.loc[len(division_df)] = ['Avg'] + [english_rows[col].mean() for col in count_cols] + ['English prompt']
    division_df.loc[len(division_df)] = ['Avg'] + [spanish_rows[col].mean() for col in count_cols] + ['Spanish prompt']

    # Sort key: leading shot count of the label; 'Avg' rows get 20 so they sort last.
    prefixes = [label.split("_")[0] for label in division_df['formatted_prompt_id']]
    division_df['shots'] = [20 if prefix == "Avg" else int(prefix) for prefix in prefixes]

    division_df = division_df.sort_values(
        by=['language', 'shots', 'formatted_prompt_id'],
        ascending=[True, True, True]).reset_index(drop=True)

    # Plot the three count columns only: the numeric 'shots' helper would otherwise be
    # drawn as a fourth bar series that the three-entry legend below does not cover.
    fig = division_df.plot(x="formatted_prompt_id", y=count_cols, kind="bar", color=colours[5:8])

    # Second x axis carrying one caption per language group.
    sec = fig.secondary_xaxis(location=0)
    sec.set_xticks([8.5, 25.5], labels=['English prompts', 'Spanish prompts'])
    sec.tick_params('x', length=275, width=0)

    plt.legend(['true_positive', 'false_positive', 'false_negative'])

    generic_chart('Entity division for NER grouped by prompt language', 'Prompts', 'Count')

In [None]:
def entity_division_with_relations(task):
    """Plot entity and relation outcome counts per prompt for ``task`` plus an overall 'Avg' group.

    Reads the notebook-level ``evaluation_df`` and hands the figure to ``generic_chart``.
    """
    count_cols = ['true_positive', 'false_positive', 'false_negative',
                  'false_positive_relations', 'false_negative_relations']

    task_rows = evaluation_df[evaluation_df['task'] == task]
    division_df = task_rows.loc[:, ['formatted_prompt_id'] + count_cols].reset_index(drop=True)

    # Summary row: the mean of every count over all prompts of the task.
    overall_avg = ['Avg'] + [division_df[col].mean() for col in count_cols]
    division_df.loc[len(division_df)] = overall_avg

    division_df.plot(x="formatted_prompt_id", kind="bar", color=colours[5:10])

    plt.legend(count_cols)

    generic_chart(f'Entity and relation division for {task}', 'Prompts', 'Count')

In [None]:
def gold_broken_by_entity_type_ner(task_type, df, df_type):
    """Plot how often each entity label occurs in ``df`` as a bar chart.

    ``df_type`` and ``task_type`` are only used to build the chart title.
    """
    label_counts = df['label'].value_counts()
    label_counts.plot(x="label", kind="bar", color=colours)

    chart_title = f'{df_type} types for {task_type}'
    generic_chart(chart_title, 'Type', 'Entity count')

In [None]:
def gold_broken_by_entity_type_re(task_type, df, df_type):
    """Plot how often each entity label occurs in either slot of the relations in ``df``.

    The count of a label is the number of rows where it appears in ``label1`` plus the
    number of rows where it appears in ``label2``. ``df_type`` and ``task_type`` are
    only used to build the chart title.
    """
    # De-duplicate in first-seen order. Going through a set made the column order, and
    # with it the colour and legend order of the chart, vary from run to run.
    unique_labels = list(dict.fromkeys(df['label1'].unique().tolist() + df['label2'].unique().tolist()))

    label1_counts = df['label1'].value_counts().to_dict()
    label2_counts = df['label2'].value_counts().to_dict()

    # A label missing from one of the two slots simply contributes 0 for that slot.
    totals = [label1_counts.get(label, 0) + label2_counts.get(label, 0) for label in unique_labels]

    # Single-row table: one column (and therefore one bar) per label.
    breakdown_by_entity_type = pd.DataFrame([totals], columns=unique_labels)
    breakdown_by_entity_type.plot(kind="bar", color=colours)

    generic_chart(f'{df_type} types for {task_type}', 'Entity type', 'Entity count')

In [None]:
def gold_broken_by_relation_type(task_type, df, df_type):
    """Plot how often each relation type occurs in a relation annotation table.

    Counts the values of the ``relation_type`` column of ``df`` (missing values
    are kept as their own category), draws them as a bar chart and lets
    ``generic_chart`` title, save and display the figure.

    Args:
        task_type: Task name interpolated into the chart title.
        df: DataFrame with a ``relation_type`` column.
        df_type: Description of the charted data, interpolated into the title.
    """
    # dropna=False keeps rows without a relation type visible in the chart.
    relation_counts = df['relation_type'].value_counts(dropna=False)
    relation_counts.plot(x="relation_type", kind="bar", color=colours)

    chart_title = f'{df_type} types for {task_type}'
    generic_chart(chart_title, 'Type', 'Relation count')

In [None]:
# Read the dataset summary tables and the evaluation log (all tab-separated).
dataset_details = pd.read_csv(f'{result_folder_path}/results/dataset_details/dataset_details.tsv', sep='\t', header=0)
gold_annotation_types = pd.read_csv(f'{result_folder_path}/results/dataset_details/gold_annotation_type_count.tsv', sep='\t', header=0)
# Use the '\t' escape, as the other reads do, rather than a raw tab character
# that an editor can silently convert to spaces.
evaluation_df = pd.read_csv(f'{result_folder_path}/results/eval_log.tsv', sep='\t', header=0).sort_values(by=['prompt_id'])
evaluation_df = add_language_col(evaluation_df)
# Iterate the column directly; iterrows() would build a Series per row just to
# read a single field.
evaluation_df['formatted_prompt_id'] = [add_prompt_name(prompt_id) for prompt_id in evaluation_df['prompt_id']]

In [None]:
# Show the evaluation log as the cell output: sorted by prompt_id, with the
# added 'language' and 'formatted_prompt_id' columns.
evaluation_df

In [None]:
# Tasks that appear in the evaluation log; each flag below gates the cells
# that load and chart the data for that task.
tasks = evaluation_df['task'].unique()

ner_data_exists, re_data_exists, nerre_data_exists = [
    task_name in tasks for task_name in ('NER', 'RE', 'NERRE')
]

In [None]:
if ner_data_exists:
    # Entity tables for the NER task: model output and gold annotations.
    ner_extracted_entities = pd.read_csv(
        f'{result_folder_path}/results/entities/NER/results.tsv', header=0, sep='\t'
    )
    ner_gold_entities = pd.read_csv(
        f'{result_folder_path}/results/entities/NER/gold.tsv', header=0, sep='\t'
    )

    # All NER hallucination TSVs merged into one frame, tagged with the prompt
    # language and the formatted prompt id.
    ner_hallucinations_df = add_language_col(merge_hallucinations('NER'))
    ner_hallucinations_df['formatted_prompt_id'] = [
        add_prompt_name(record.prompt_id)
        for record in ner_hallucinations_df.itertuples(index=False)
    ]

In [None]:
if re_data_exists:
    # Entity/relation tables for the RE task. Missing cells become the string
    # 'NA' (presumably to restore literal "NA" values that read_csv parses as
    # missing by default; confirm against the TSV contents).
    re_extracted_entities = pd.read_csv(
        f'{result_folder_path}/results/entities/RE/results.tsv', header=0, sep='\t'
    ).fillna('NA')
    re_gold_entities = pd.read_csv(
        f'{result_folder_path}/results/entities/RE/gold.tsv', header=0, sep='\t'
    ).fillna('NA')

    # All RE hallucination TSVs merged into one frame, with the same 'NA' fill,
    # then tagged with the prompt language and the formatted prompt id.
    re_hallucinations_df = add_language_col(merge_hallucinations('RE').fillna('NA'))
    re_hallucinations_df['formatted_prompt_id'] = [
        add_prompt_name(record.prompt_id)
        for record in re_hallucinations_df.itertuples(index=False)
    ]

In [None]:
if nerre_data_exists:
    # Entity/relation tables for the NERRE task, with missing cells filled
    # with the string 'NA'.
    nerre_extracted_entities = pd.read_csv(
        f'{result_folder_path}/results/entities/NERRE/results.tsv', header=0, sep='\t'
    ).fillna('NA')
    # Rows logged under exactly 'p1_one_shot' are relabelled 'p2_one_shot'.
    nerre_extracted_entities['prompt_id'] = nerre_extracted_entities['prompt_id'].replace(
        to_replace='p1_one_shot', value='p2_one_shot'
    )

    nerre_gold_entities = pd.read_csv(
        f'{result_folder_path}/results/entities/NERRE/gold.tsv', header=0, sep='\t'
    ).fillna('NA')

    # All NERRE hallucination TSVs merged into one frame. The language column
    # is derived before the prompt id is relabelled, and the formatted prompt
    # id is derived after it.
    nerre_hallucinations_df = add_language_col(merge_hallucinations('NERRE').fillna('NA'))
    nerre_hallucinations_df['prompt_id'] = nerre_hallucinations_df['prompt_id'].replace(
        to_replace='p1_one_shot', value='p2_one_shot'
    )
    nerre_hallucinations_df['formatted_prompt_id'] = [
        add_prompt_name(record.prompt_id)
        for record in nerre_hallucinations_df.itertuples(index=False)
    ]

#### Performance of the gen LLM based on prompts

In [None]:
# NER goes through the four cross-linguistic chart functions, in this order.
if ner_data_exists:
    for draw_ner_chart in (
        generate_cross_linguistic_evaluation_metrics,
        generate_cross_linguistic_f1_shots,
        generate_cross_linguistic_f1_guideline,
        generate_cross_linguistic_f1_output,
    ):
        draw_ner_chart('NER')

# RE and NERRE each get a single evaluation-metrics chart
# ("evalutation" is the spelling of the function name used in this call).
if re_data_exists:
    generate_evalutation_metrics('RE')

if nerre_data_exists:
    generate_evalutation_metrics('NERRE')

#### Exploration of the hallucinations

In [None]:
# Hallucination charts for NER. The guard matches the one on the loading cell:
# ner_hallucinations_df is only defined when the evaluation log has NER rows.
if ner_data_exists:
    generate_cross_linguistic_stacked_entity_graph('NER')
    order_hallucinations_by_type('NER', ner_hallucinations_df)
    broken_by_entity_type_ner('NER', ner_hallucinations_df, 'Hallucinations')

In [None]:
# Hallucination charts for RE. The guard matches the one on the loading cell:
# re_hallucinations_df is only defined when the evaluation log has RE rows.
if re_data_exists:
    generate_stacked_entity_graph('RE')
    order_hallucinations_by_type('RE', re_hallucinations_df)
    # NOTE(review): both calls below pass the same task and 'Hallucinations'.
    # generic_chart names the saved PNG after the chart title, so if these two
    # helpers build their titles the same way the second figure overwrites the
    # first on disk. Confirm against their definitions.
    broken_by_entity_type_re('RE', re_hallucinations_df, 'Hallucinations')
    broken_by_relation_type('RE', re_hallucinations_df, 'Hallucinations')

In [None]:
# Hallucination charts for NERRE. The guard matches the one on the loading
# cell: nerre_hallucinations_df is only defined when the log has NERRE rows.
if nerre_data_exists:
    generate_stacked_entity_graph('NERRE')
    order_hallucinations_by_type('NERRE', nerre_hallucinations_df)
    # The relation-type chart is drawn before the entity-type chart here,
    # the reverse of the order used in the RE cell.
    # NOTE(review): both calls pass the same task and 'Hallucinations'; if the
    # two helpers build identical titles, the saved PNGs collide. Confirm.
    broken_by_relation_type('NERRE', nerre_hallucinations_df, 'Hallucinations')
    broken_by_entity_type_re('NERRE', nerre_hallucinations_df, 'Hallucinations')

#### Classification of the extracted entities and identified relations

In [None]:
# 7. Breakdown of the gold entities and relations vs extracted entities and relations for each task
# NER: skipped when the evaluation log has no NER rows, because the entity
# tables used below are only loaded in that case.
if ner_data_exists:
    cross_linguistic_entity_division('NER')
    
    # Same kind of chart for the model output and for the gold annotations;
    # the gold variant counts the 'label' column and plots it as bars.
    broken_by_entity_type_ner('NER', ner_extracted_entities, 'Extracted entities')
    gold_broken_by_entity_type_ner('NER', ner_gold_entities, 'Gold entity')

In [None]:
# RE: skipped when the evaluation log has no RE rows, because the tables used
# below are only loaded in that case.
if re_data_exists:
    entity_division_with_relations('RE')
    
    # NOTE(review): both calls pass 'Extracted entities', including the
    # relation-type one. generic_chart names the saved PNG after the chart
    # title, so if both helpers build the title as '{df_type} types for {task}'
    # (as the gold_* functions do) the second figure overwrites the first.
    # Confirm against their definitions.
    broken_by_relation_type('RE', re_extracted_entities, 'Extracted entities')
    broken_by_entity_type_re('RE', re_extracted_entities, 'Extracted entities')
    
    # The gold charts use distinct df_type values, so their titles and file
    # names differ.
    gold_broken_by_entity_type_re('RE', re_gold_entities, 'Gold entity')
    gold_broken_by_relation_type('RE', re_gold_entities, 'Gold relation')

In [None]:
# NERRE: skipped when the evaluation log has no NERRE rows, because the tables
# used below are only loaded in that case.
if nerre_data_exists:
    entity_division_with_relations('NERRE')
    
    # NOTE(review): both calls pass 'Extracted entities', including the
    # relation-type one. generic_chart names the saved PNG after the chart
    # title, so if both helpers build the title as '{df_type} types for {task}'
    # (as the gold_* functions do) the second figure overwrites the first.
    # Confirm against their definitions.
    broken_by_relation_type('NERRE', nerre_extracted_entities, 'Extracted entities')
    broken_by_entity_type_re('NERRE', nerre_extracted_entities, 'Extracted entities')
    
    # The gold charts use distinct df_type values, so their titles and file
    # names differ.
    gold_broken_by_entity_type_re('NERRE', nerre_gold_entities, 'Gold entity')
    gold_broken_by_relation_type('NERRE', nerre_gold_entities, 'Gold relation')