In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import glob
import math
import random
%matplotlib inline

# Notebook-wide matplotlib defaults: figure size and automatic layout adjustment.
plt.rcParams.update({
    "figure.figsize": (14, 10),
    "figure.autolayout": True,
})

# Shared palette; the plotting cells pass slices of it (or the whole list) as bar colours.
colours = [
    '#ff0000', '#ff8700', '#ffd300', '#deff0a', '#a1ff0a', '#0aff99',
    '#0aefff', '#147df5', '#580aff', '#be0aff', '#54478c', '#240046',
]

In [None]:
def generic_chart(title, x_label, y_label):
    """Label the current pyplot figure, save it as a PNG and display it.

    Args:
        title: Chart title; also used verbatim as the PNG file name.
        x_label: Text for the x axis.
        y_label: Text for the y axis.

    Side effects:
        Writes ./exact_results/figures/<title>.png (creating the directory
        when it is missing) and then shows the figure.
    """
    import os  # local import so this cell does not depend on the import cell changing

    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.xticks()

    # savefig does not create missing directories; make sure the target exists
    # so a fresh checkout can run the notebook end to end.
    figures_dir = './exact_results/figures'
    os.makedirs(figures_dir, exist_ok=True)

    plt.savefig(f'{figures_dir}/{title}.png')
    plt.show()

In [None]:
# read the files
# Every file is tab-separated. The delimiter is always written as the escape
# sequence '\t' so it stays visible and cannot be turned into spaces by an editor.
dataset_details = pd.read_csv('./exact_results/dataset_details/dataset_details.tsv', sep='\t', header=0)
gold_annotation_types = pd.read_csv('./exact_results/dataset_details/gold_annotation_type_count.tsv', sep='\t', header=0)
# The evaluation log is kept ordered by prompt id for the charts below.
evaluation_df = pd.read_csv('./exact_results/eval_log.tsv', sep='\t', header=0).sort_values(by=['prompt_id'])

def merge_hallucinations(task):
    """Load every hallucination TSV of one task into a single DataFrame.

    Args:
        task: Sub-directory name under ./exact_results/hallucinations/
            (this notebook passes 'NER', 'RE' and 'NERRE').

    Returns:
        The row-wise concatenation of all matching files, each file keeping
        its own row index, or an empty DataFrame when no file matches.
    """
    # glob yields paths in arbitrary order; sort so the row order is reproducible.
    tsv_files = sorted(glob.glob(f'./exact_results/hallucinations/{task}/*.tsv'))

    if not tsv_files:
        return pd.DataFrame()

    # Read everything first and concatenate once, instead of re-copying the
    # accumulated frame on every iteration.
    frames = [pd.read_csv(tsv_file, sep='\t', header=0) for tsv_file in tsv_files]
    return pd.concat(frames)

# hallucinations: one merged frame per task. Missing values in the RE and NERRE
# frames are replaced by the string 'NA'.
ner_hallucinations_df = merge_hallucinations('NER')
re_hallucinations_df = merge_hallucinations('RE').fillna('NA')
nerre_hallucinations_df = merge_hallucinations('NERRE').fillna('NA')
# NOTE(review): NERRE rows logged as 'p1_one_shot' are relabelled 'p2_one_shot'
# - presumably to match the ids used in the evaluation log; confirm.
nerre_hallucinations_df['prompt_id'] = nerre_hallucinations_df['prompt_id'].replace({'p1_one_shot': 'p2_one_shot'})

# extracted entities
ner_extracted_entities = pd.read_csv('./exact_results/entities/NER/results.tsv', sep='\t', header=0)
re_extracted_entities = pd.read_csv('./exact_results/entities/RE/results.tsv', sep='\t', header=0).fillna('NA')
nerre_extracted_entities = pd.read_csv('./exact_results/entities/NERRE/results.tsv', sep='\t', header=0).fillna('NA')
nerre_extracted_entities['prompt_id'] = nerre_extracted_entities['prompt_id'].replace({'p1_one_shot': 'p2_one_shot'})

# gold
ner_gold_entities = pd.read_csv('./exact_results/entities/NER/gold.tsv', sep='\t', header=0)
re_gold_entities = pd.read_csv('./exact_results/entities/RE/gold.tsv', sep='\t', header=0).fillna('NA')
nerre_gold_entities = pd.read_csv('./exact_results/entities/NERRE/gold.tsv', sep='\t', header=0).fillna('NA')

In [None]:
# Add new column for the language
def add_language_col(df):
    """Tag every row with the language of its prompt.

    A row is labelled "Spanish prompt" when its prompt_id contains '_es',
    otherwise "English prompt".

    Args:
        df: DataFrame with a 'prompt_id' column of strings. It is modified
            in place.

    Returns:
        The same DataFrame object, now carrying a 'language' column.

    Raises:
        KeyError: if a non-empty frame has no 'prompt_id' column.
    """
    # A frame with no rows and no prompt_id column (e.g. no result files were
    # found) has nothing to classify; just create the empty column.
    if df.empty and 'prompt_id' not in df.columns:
        df['language'] = []
        return df

    # Iterate over the column itself rather than over rows: same values,
    # without materialising a Series for every row.
    df['language'] = [
        'Spanish prompt' if '_es' in prompt_id else 'English prompt'
        for prompt_id in df['prompt_id']
    ]
    return df

# Add the language column to the evaluation log and to each hallucination frame.
evaluation_df, ner_hallucinations_df, re_hallucinations_df, nerre_hallucinations_df = [
    add_language_col(frame)
    for frame in (evaluation_df, ner_hallucinations_df, re_hallucinations_df, nerre_hallucinations_df)
]

In [None]:
# Add formatted prompt id
def add_prompt_name(prompt_id):
    """Translate a raw prompt id into the display name used on the charts.

    The id is matched by substring, checked in the order zero, one, five,
    ten; the first hit wins.

    Args:
        prompt_id: Raw prompt identifier string.

    Returns:
        "Zero shot", "One shot", "Five shot" or "Ten shot", or the id
        unchanged when none of the keywords occurs in it.
    """
    shot_names = (
        ("zero", "Zero shot"),
        ("one", "One shot"),
        ("five", "Five shot"),
        ("ten", "Ten shot"),
    )
    for keyword, display_name in shot_names:
        if keyword in prompt_id:
            return display_name
    return prompt_id

# Attach the display name to the evaluation log and to each hallucination frame.
for frame in (evaluation_df, ner_hallucinations_df, re_hallucinations_df, nerre_hallucinations_df):
    frame['formatted_prompt_id'] = [add_prompt_name(row.prompt_id) for _, row in frame.iterrows()]


#### Performance of the generative LLM based on prompts

In [None]:
# 1.1/2.0 NER performance against different prompts grouped by language
# Stable sorts keep the existing row order inside each language group, so each
# 'Average' row (appended last) ends up at the end of its own group.
temp_df = evaluation_df[evaluation_df['task'] == 'NER']
temp_df = (
    temp_df.loc[:, ['formatted_prompt_id', 'precision', 'recall', 'f1', 'language']]
    .sort_values(by=['language'], kind='mergesort')
    .reset_index(drop=True)
)

# The per-language subsets are taken before the average rows are appended, so
# the means only cover the real prompts.
eng_temp_df = temp_df[temp_df['language'] == 'English prompt']
esp_temp_df = temp_df[temp_df['language'] == 'Spanish prompt']

average_eng_val_row = ['Average', eng_temp_df['precision'].mean(), eng_temp_df['recall'].mean(), eng_temp_df['f1'].mean(), 'English prompt']
average_esp_val_row = ['Average', esp_temp_df['precision'].mean(), esp_temp_df['recall'].mean(), esp_temp_df['f1'].mean(), 'Spanish prompt']

temp_df.loc[len(temp_df)] = average_eng_val_row
temp_df.loc[len(temp_df)] = average_esp_val_row

temp_df = temp_df.sort_values(by=['language'], kind='mergesort').reset_index(drop=True)

fig = temp_df.plot(x="formatted_prompt_id", kind="bar", color=colours[0:3])

# Centre each language caption under its own run of bars. The positions are
# derived from the data so the captions stay aligned if the number of prompts changes.
language_captions = {'English prompt': 'English prompts', 'Spanish prompt': 'Spanish prompts'}
bar_positions = temp_df.groupby('language').indices
sec = fig.secondary_xaxis(location=0)
sec.set_xticks(
    [positions.mean() for positions in bar_positions.values()],
    labels=[language_captions.get(language, language) for language in bar_positions],
)
sec.tick_params('x', length=80, width=0)

generic_chart('Evaluation metrics for NER grouped by prompt language', 'Prompts', 'Score')

In [None]:
# 1.2, 1.3 F1, precision, recall for NER, RE, NERRE separated by prompts, add overall average of metrics

def generate_evalutation_metrics(task_type):
    """Draw and save a grouped bar chart of precision, recall and F1 per prompt.

    Rows of the evaluation log whose 'task' equals task_type are plotted,
    followed by one extra 'Average' group holding the column means.

    Args:
        task_type: Value of the 'task' column to select.
    """
    metric_names = ["precision", "recall", "f1"]

    task_rows = evaluation_df[evaluation_df['task'] == task_type]
    metrics_df = task_rows.loc[:, ['formatted_prompt_id'] + metric_names].reset_index(drop=True)

    # The means are computed before the summary row is appended.
    summary_row = ['Average'] + [metrics_df[name].mean() for name in metric_names]
    metrics_df.loc[len(metrics_df)] = summary_row

    metrics_df.plot(x="formatted_prompt_id", y=metric_names, kind="bar", color=colours[0:3])
    generic_chart(f'Evaluation metrics for {task_type}', 'Prompts', 'Score')

generate_evalutation_metrics('RE')
generate_evalutation_metrics('NERRE')

#### Exploration of the hallucinations

In [None]:
# 3. Per task --> Hallucinations per prompt stacked with entities extracted

# NER based on prompt language
# Stable sorts keep the existing row order inside each language group, so each
# 'Average' row (appended last) ends up at the end of its own group.
temp_ner_hall_df = evaluation_df[evaluation_df['task'] == 'NER']
temp_ner_hall_df = (
    temp_ner_hall_df.loc[:, ['formatted_prompt_id', 'extracted_tuples_or_triplets_per_prompt', 'tuple_or_triplet_hallucinations_per_prompt', 'language']]
    .sort_values(by=['language'], kind='mergesort')
    .reset_index(drop=True)
)

# Subsets are taken before the average rows are appended, so the means only
# cover the real prompts.
eng_ner_hall_temp_df = temp_ner_hall_df[temp_ner_hall_df['language'] == 'English prompt']
esp_ner_hall_temp_df = temp_ner_hall_df[temp_ner_hall_df['language'] == 'Spanish prompt']

# Averages are rounded up to whole instances.
ner_en_hall_average_val_row = ['Average', math.ceil(eng_ner_hall_temp_df['extracted_tuples_or_triplets_per_prompt'].mean()), math.ceil(eng_ner_hall_temp_df['tuple_or_triplet_hallucinations_per_prompt'].mean()), 'English prompt']
ner_es_hall_average_val_row = ['Average', math.ceil(esp_ner_hall_temp_df['extracted_tuples_or_triplets_per_prompt'].mean()), math.ceil(esp_ner_hall_temp_df['tuple_or_triplet_hallucinations_per_prompt'].mean()), 'Spanish prompt']

temp_ner_hall_df.loc[len(temp_ner_hall_df)] = ner_en_hall_average_val_row
temp_ner_hall_df.loc[len(temp_ner_hall_df)] = ner_es_hall_average_val_row

temp_ner_hall_df = temp_ner_hall_df.sort_values(by=['language'], kind='mergesort').reset_index(drop=True)

fig = temp_ner_hall_df.plot(x="formatted_prompt_id", kind="bar", stacked=True, color=colours[4:6])

# Centre each language caption under its own run of bars. The positions are
# derived from the data so the captions stay aligned if the number of prompts changes.
language_captions = {'English prompt': 'English prompts', 'Spanish prompt': 'Spanish prompts'}
bar_positions = temp_ner_hall_df.groupby('language').indices
sec = fig.secondary_xaxis(location=0)
sec.set_xticks(
    [positions.mean() for positions in bar_positions.values()],
    labels=[language_captions.get(language, language) for language in bar_positions],
)
sec.tick_params('x', length=80, width=0)

plt.legend(["Extracted instances", "Hallucinated instances"])
generic_chart('Instances for NER', 'Prompts', 'Instances')

In [None]:
def generate_stacked_entity_graph(task_type):
    """Draw and save a stacked bar chart of extracted vs hallucinated instances.

    One bar per prompt of the selected task, plus an 'Average' bar holding the
    column means. Hallucinated counts are stacked on top of extracted counts.

    Args:
        task_type: Value of the 'task' column of the evaluation log to select.
    """
    extracted_col = 'extracted_tuples_or_triplets_per_prompt'
    hallucinated_col = 'tuple_or_triplet_hallucinations_per_prompt'

    task_rows = evaluation_df[evaluation_df['task'] == task_type]
    counts_df = task_rows.loc[:, ['formatted_prompt_id', extracted_col, hallucinated_col]].reset_index(drop=True)

    # The means are computed before the summary row is appended.
    summary_row = ['Average', counts_df[extracted_col].mean(), counts_df[hallucinated_col].mean()]
    counts_df.loc[len(counts_df)] = summary_row

    prompts = counts_df["formatted_prompt_id"].tolist()
    extracted = counts_df[extracted_col].tolist()
    hallucinated = counts_df[hallucinated_col].tolist()

    # Second series starts where the first one ends, which gives the stacking.
    plt.bar(prompts, extracted, color=colours[4])
    plt.bar(prompts, hallucinated, bottom=extracted, color=colours[5])

    plt.legend(["Extracted instances", "Hallucinated instances"])
    generic_chart(f'Instances for {task_type}', 'Prompts', 'Instances')

# TODO: Add a note specifying that NER shows the span-label tuple instances while RE and NERRE shows spans-labels-relation triplet instances
generate_stacked_entity_graph('RE')
generate_stacked_entity_graph('NERRE')

In [None]:
# 4. Hallucinations broken down by type of the hallucination (basically looking at the over generation of the found instances and fabrication)

# stacked -1 and -2
def order_hallucinations_by_type(task_type, df):
    """Draw and save a stacked bar chart of hallucination types per prompt.

    Rows whose offset column equals -1 are counted as fabrications and rows
    equal to -2 as over-generated instances. For 'NER' the bars are grouped by
    prompt language and captioned accordingly.

    Args:
        task_type: 'NER', 'RE' or 'NERRE'; selects the offset column
            ('offset1' for NER, 'offset1_start' otherwise).
        df: Hallucination frame with 'prompt_id' and the offset column.
    """
    offset_col_name = 'offset1_start' if task_type in ('RE', 'NERRE') else 'offset1'

    counts = df.groupby('prompt_id')[offset_col_name].value_counts().reset_index(name='count')

    def counts_for(offset_code, column_name):
        # Per-prompt count of one hallucination type, as (prompt_id, <column_name>).
        selected = counts.loc[counts[offset_col_name] == offset_code, ['prompt_id', 'count']]
        return selected.rename(columns={'count': column_name})

    fabrications = counts_for(-1, 'Fabrications')
    over_generated = counts_for(-2, 'Over generated')

    # Join on prompt_id rather than on row position: a prompt that has only one
    # of the two types must not shift the other prompts' counts. A missing type
    # counts as 0.
    by_type = (
        pd.merge(fabrications, over_generated, on='prompt_id', how='outer')
        .fillna({'Fabrications': 0, 'Over generated': 0})
        .sort_values(by='prompt_id', kind='mergesort')
        .reset_index(drop=True)
    )

    by_type['formatted_prompt_id'] = [add_prompt_name(prompt_id) for prompt_id in by_type['prompt_id']]

    if task_type == 'NER':
        by_type = add_language_col(by_type)
        # Stable sort keeps the prompt order inside each language group.
        by_type = by_type.sort_values(by='language', kind='mergesort').reset_index(drop=True)

    plot_df = by_type.loc[:, ['prompt_id', 'formatted_prompt_id', 'Over generated', 'Fabrications']]
    fig = plot_df.plot(x="formatted_prompt_id", kind="bar", stacked=True, color=colours[6:8])

    if task_type == 'NER':
        # Centre each language caption under its own run of bars, using the
        # actual bar positions rather than fixed coordinates.
        language_captions = {'English prompt': 'English prompts', 'Spanish prompt': 'Spanish prompts'}
        bar_positions = by_type.groupby('language').indices
        sec = fig.secondary_xaxis(location=0)
        sec.set_xticks(
            [positions.mean() for positions in bar_positions.values()],
            labels=[language_captions.get(language, language) for language in bar_positions],
        )
        sec.tick_params('x', length=80, width=0)

    plt.legend(["Over generated instances", "Fabricated instances"])
    generic_chart(f'Hallucinations by type for {task_type}', 'Prompts', 'Hallucinated instances')

In [None]:
# One chart per task, each drawn from that task's merged hallucination frame.
for task_type, hallucinations_df in (
    ('NER', ner_hallucinations_df),
    ('RE', re_hallucinations_df),
    ('NERRE', nerre_hallucinations_df),
):
    order_hallucinations_by_type(task_type, hallucinations_df)

In [None]:
# 5. Hallucinations broken down by the type of the entities and relations
def broken_by_entity_type_ner(task_type, df, df_type):
    """Draw and save a stacked bar chart of entity-label counts per prompt.

    For 'NER' the bars are grouped by prompt language and captioned accordingly.

    Args:
        task_type: Task name used in the chart title; 'NER' enables the
            language grouping.
        df: Frame with 'prompt_id' and 'label' columns.
        df_type: Leading words of the chart title (e.g. 'Hallucinations').
    """
    label_counts = df.groupby('prompt_id')['label'].value_counts()

    # Labels in the order they first appear in the grouped counts; this fixes
    # the column order and therefore the colour/legend assignment.
    unique_labels = label_counts.index.get_level_values('label').unique().tolist()

    # One row per prompt, one column per label; combinations that never occur
    # become 0. Built in a single pivot instead of growing a frame row by row.
    per_prompt = (
        label_counts.unstack(fill_value=0)
        .reindex(columns=unique_labels, fill_value=0)
        .rename_axis(columns=None)
        .reset_index()
    )

    per_prompt['formatted_prompt_id'] = [add_prompt_name(prompt_id) for prompt_id in per_prompt['prompt_id']]

    if task_type == 'NER':
        per_prompt = add_language_col(per_prompt)
        # Stable sort keeps the prompt order inside each language group.
        per_prompt = per_prompt.sort_values(by='language', kind='mergesort').reset_index(drop=True)

    plot_df = per_prompt.loc[:, ['formatted_prompt_id'] + unique_labels]
    fig = plot_df.plot(x="formatted_prompt_id", kind="bar", stacked=True, color=colours)

    if task_type == 'NER':
        # Centre each language caption under its own run of bars, using the
        # actual bar positions rather than fixed coordinates.
        language_captions = {'English prompt': 'English prompts', 'Spanish prompt': 'Spanish prompts'}
        bar_positions = per_prompt.groupby('language').indices
        sec = fig.secondary_xaxis(location=0)
        sec.set_xticks(
            [positions.mean() for positions in bar_positions.values()],
            labels=[language_captions.get(language, language) for language in bar_positions],
        )
        sec.tick_params('x', length=80, width=0)

    plt.legend(unique_labels)
    generic_chart(f'{df_type} by entity type for {task_type}', 'Prompts', 'Entity count')

broken_by_entity_type_ner('NER', ner_hallucinations_df, 'Hallucinations')


In [None]:
def broken_by_entity_type_re(task_type, df, df_type):
    """Draw and save a stacked bar chart of entity-label counts per prompt.

    Each row of df contributes two labels ('label1' and 'label2'); both are
    counted towards the same per-prompt total for that label.

    Args:
        task_type: Task name used in the chart title.
        df: Frame with 'prompt_id', 'label1' and 'label2' columns.
        df_type: Leading words of the chart title (e.g. 'Hallucinations').
    """
    # Stack both label columns into one long (prompt_id, label) table.
    entity_labels = pd.concat(
        [
            df[['prompt_id', slot]].rename(columns={slot: 'label'})
            for slot in ('label1', 'label2')
        ],
        ignore_index=True,
    )

    # Sorted so that the label -> colour assignment is identical on every run;
    # the iteration order of a set of strings can change between sessions.
    unique_labels = sorted(entity_labels['label'].dropna().unique(), key=str)
    prompt_ids = sorted(df['prompt_id'].dropna().unique())

    # One row per prompt (ordered by prompt_id), one column per label; missing
    # combinations become 0.
    per_prompt = (
        pd.crosstab(entity_labels['prompt_id'], entity_labels['label'])
        .reindex(index=prompt_ids, columns=unique_labels, fill_value=0)
        .rename_axis(index='prompt_id', columns=None)
        .reset_index()
    )

    per_prompt['formatted_prompt_id'] = [add_prompt_name(prompt_id) for prompt_id in per_prompt['prompt_id']]

    plot_df = per_prompt.loc[:, ['formatted_prompt_id'] + unique_labels]
    plot_df.plot(x="formatted_prompt_id", kind="bar", stacked=True, color=colours)

    plt.legend(unique_labels)
    generic_chart(f'{df_type} by entity type for {task_type}', 'Prompts', 'Entity count')

broken_by_entity_type_re('RE', re_hallucinations_df, 'Hallucinations')
broken_by_entity_type_re('NERRE', nerre_hallucinations_df, 'Hallucinations')

In [None]:
def broken_by_relation_type(task_type, df, df_type):
    """Draw and save a stacked bar chart of relation-type counts per prompt.

    Args:
        task_type: Task name used in the chart title.
        df: Frame with 'prompt_id' and 'relation_type' columns.
        df_type: Leading words of the chart title (e.g. 'Hallucinations').
    """
    # Relation types in order of first appearance; this fixes the column order
    # and therefore the colour/legend assignment.
    unique_relations = df['relation_type'].unique().tolist()
    prompt_ids = sorted(df['prompt_id'].dropna().unique())

    # One row per prompt (ordered by prompt_id), one column per relation type;
    # missing combinations become 0. Built in one pivot instead of growing a
    # frame row by row.
    # NOTE(review): a missing (NaN) relation_type still gets a column but its
    # count stays 0, as before - the frames passed in this notebook have their
    # NaNs replaced by 'NA' beforehand.
    per_prompt = (
        pd.crosstab(df['prompt_id'], df['relation_type'])
        .reindex(index=prompt_ids, columns=unique_relations, fill_value=0)
        .rename_axis(index='prompt_id', columns=None)
        .reset_index()
    )

    per_prompt['formatted_prompt_id'] = [add_prompt_name(prompt_id) for prompt_id in per_prompt['prompt_id']]

    plot_df = per_prompt.loc[:, ['formatted_prompt_id'] + unique_relations]
    plot_df.plot(x="formatted_prompt_id", kind="bar", stacked=True, color=colours)

    plt.legend(unique_relations)
    generic_chart(f'{df_type} by relation type for {task_type}', 'Prompts', 'Relation count')

broken_by_relation_type('RE', re_hallucinations_df, 'Hallucinations')
broken_by_relation_type('NERRE', nerre_hallucinations_df, 'Hallucinations')

#### Classification of the extracted entities and identified relations

In [None]:
# 6. true positives, false negatives, false positives, false positive relations, false negative relations (stacked) vs total extracted entities for each task for exact match vs relaxed match

# for ner
# Stable sorts keep the existing row order inside each language group, so each
# 'Average' row (appended last) ends up at the end of its own group.
entity_division_ner_df = evaluation_df[evaluation_df['task'] == 'NER']
entity_division_ner_df = (
    entity_division_ner_df.loc[:, ['formatted_prompt_id', 'true_positive', 'false_positive', 'false_negative', 'language']]
    .sort_values(by=['language'], kind='mergesort')
    .reset_index(drop=True)
)

# Subsets are taken before the average rows are appended, so the means only
# cover the real prompts.
eng_entity_division_ner_df = entity_division_ner_df[entity_division_ner_df['language'] == 'English prompt']
esp_entity_division_ner_df = entity_division_ner_df[entity_division_ner_df['language'] == 'Spanish prompt']

average_eng_val_row = ['Average', eng_entity_division_ner_df['true_positive'].mean(), eng_entity_division_ner_df['false_positive'].mean(), eng_entity_division_ner_df['false_negative'].mean(), 'English prompt']
average_esp_val_row = ['Average', esp_entity_division_ner_df['true_positive'].mean(), esp_entity_division_ner_df['false_positive'].mean(), esp_entity_division_ner_df['false_negative'].mean(), 'Spanish prompt']

entity_division_ner_df.loc[len(entity_division_ner_df)] = average_eng_val_row
entity_division_ner_df.loc[len(entity_division_ner_df)] = average_esp_val_row

entity_division_ner_df = entity_division_ner_df.sort_values(by=['language'], kind='mergesort').reset_index(drop=True)

fig = entity_division_ner_df.plot(x="formatted_prompt_id", kind="bar", color=colours[5:8])

# Centre each language caption under its own run of bars. The positions come
# from the data, so the extra 'Average' bar of each group is taken into account.
language_captions = {'English prompt': 'English prompts', 'Spanish prompt': 'Spanish prompts'}
bar_positions = entity_division_ner_df.groupby('language').indices
sec = fig.secondary_xaxis(location=0)
sec.set_xticks(
    [positions.mean() for positions in bar_positions.values()],
    labels=[language_captions.get(language, language) for language in bar_positions],
)
sec.tick_params('x', length=80, width=0)

plt.legend(['true_positive', 'false_positive', 'false_negative'])

generic_chart('Entity division for NER grouped by prompt language', 'Prompts', 'Count')

In [None]:
# for relation triplets
def entity_division_with_relations(task):
    """Draw and save a grouped bar chart of the classification counts per prompt.

    Plots true/false positives and negatives (entities and relations) for the
    rows of the evaluation log whose 'task' equals task, followed by an
    'Average' group holding the column means.

    Args:
        task: Value of the 'task' column to select.
    """
    count_cols = ['true_positive', 'false_positive', 'false_negative', 'false_positive_relations', 'false_negative_relations']

    task_rows = evaluation_df[evaluation_df['task'] == task]
    division_df = task_rows.loc[:, ['formatted_prompt_id'] + count_cols].reset_index(drop=True)

    # The means are computed before the summary row is appended.
    average_row = ['Average'] + [division_df[col].mean() for col in count_cols]
    division_df.loc[len(division_df)] = average_row

    division_df.plot(x="formatted_prompt_id", kind="bar", color=colours[5:10])

    plt.legend(count_cols)

    generic_chart(f'Entity and relation division for {task} grouped by prompt language', 'Prompts', 'Count')

entity_division_with_relations('RE')
entity_division_with_relations('NERRE')

In [None]:
# 7. Breakdown of the gold entities and relations vs extracted entities and relations for each task

# NER frames carry a single 'label' column; RE and NERRE frames carry two labels per row.
broken_by_entity_type_ner('NER', ner_extracted_entities, 'Extracted entities')
for task_type, extracted_df in (('RE', re_extracted_entities), ('NERRE', nerre_extracted_entities)):
    broken_by_entity_type_re(task_type, extracted_df, 'Extracted entities')

In [None]:
# Relation-type breakdown of the extracted results for the two relation tasks.
for task_type, extracted_df in (('RE', re_extracted_entities), ('NERRE', nerre_extracted_entities)):
    broken_by_relation_type(task_type, extracted_df, 'Extracted entities')

In [None]:
def gold_broken_by_entity_type_ner(task_type, df, df_type):
    """Draw and save a bar chart of how often each entity label occurs.

    Args:
        task_type: Task name used in the chart title.
        df: Frame with a 'label' column.
        df_type: Leading words of the chart title (e.g. 'Gold entity').
    """
    label_counts = df['label'].value_counts()
    label_counts.plot(x="label", kind="bar", color=colours)

    chart_title = f'{df_type} types for {task_type}'
    generic_chart(chart_title, 'Type', 'Entity count')

gold_broken_by_entity_type_ner('NER', ner_gold_entities, 'Gold entity')

In [None]:
def gold_broken_by_entity_type_re(task_type, df, df_type):
    """Draw and save a bar chart of how often each entity label occurs.

    Each row of df contributes two labels ('label1' and 'label2'); both are
    counted towards the same total for that label. The chart has one bar (and
    one legend entry) per label.

    Args:
        task_type: Task name used in the chart title.
        df: Frame with 'label1' and 'label2' columns.
        df_type: Leading words of the chart title (e.g. 'Gold entity').
    """
    # Count every label over both columns at once.
    label_counts = pd.concat([df['label1'], df['label2']], ignore_index=True).value_counts()

    # Sorted so that the label -> colour assignment is identical on every run;
    # the iteration order of a set of strings can change between sessions.
    unique_labels = sorted(label_counts.index, key=str)

    # Single-row frame: one column per label, so each label is drawn as its own bar.
    breakdown_by_entity_type = pd.DataFrame(
        [label_counts.reindex(unique_labels).tolist()],
        columns=unique_labels,
    )

    breakdown_by_entity_type.plot(kind="bar", color=colours)

    generic_chart(f'{df_type} types for {task_type}', 'Entity type', 'Entity count')

gold_broken_by_entity_type_re('RE', re_gold_entities, 'Gold entity')
gold_broken_by_entity_type_re('NERRE', nerre_gold_entities, 'Gold entity')

In [None]:
def gold_broken_by_relation_type(task_type, df, df_type):
    """Draw and save a bar chart of how often each relation type occurs.

    Missing relation types are counted as their own category.

    Args:
        task_type: Task name used in the chart title.
        df: Frame with a 'relation_type' column.
        df_type: Leading words of the chart title (e.g. 'Gold relation').
    """
    relation_counts = df['relation_type'].value_counts(dropna=False)
    relation_counts.plot(x="relation_type", kind="bar", color=colours)

    chart_title = f'{df_type} types for {task_type}'
    generic_chart(chart_title, 'Type', 'Relation count')

gold_broken_by_relation_type('RE', re_gold_entities, 'Gold relation')
gold_broken_by_relation_type('NERRE', nerre_gold_entities, 'Gold relation')