## Building an MMLU Eval

This notebook shows how to:
- Build and run an eval
- Load the results into a Pandas DataFrame

We use the `evals.elsuite.basic.match:Match` Eval class here to check whether new completions match the correct answer. Under the hood, it will generate a completion with the chosen model for each prompt, check whether the completion matches the true answer, and then log a result.

In [None]:
# Install, and download MMLU if you haven't already
#%pip install -e .

# Fetch the MMLU archive and unpack it into the current working directory.
!curl -O https://people.eecs.berkeley.edu/~hendrycks/data.tar
!tar -xf data.tar
# NOTE(review): this path points at a MultiCoNER 2023 directory, not at the
# archive extracted above, and `data_pth` is not referenced again in the
# visible cells — confirm whether it is still needed.
data_pth = "../../../multiconer2023/EN-English"

In [None]:
%pip install openai
%pip install natsort

In [None]:
import pandas as pd
import os
import numpy as np

# Assuming this notebook is in examples/
# The path is resolved relative to the kernel's current working directory, so
# it changes if the notebook is started from somewhere else.
registry_pth = os.path.join(os.getcwd(), "../evals/registry")

## Code for my attempt

In [None]:
# Build the prompts using Chat format. We support converting Chat conversations to text for non-Chat models

# Placeholder text: replace it with the real system prompt. The value is sent
# as the "system" message of every test prompt built by create_chat_prompt.
sys_msg = """Fill in the prompt you want to use to send the input to GPT-3"""






def create_chat_prompt(sys_msg, sentence):
    """Build the chat prompt for one test sentence.

    Returns a two-element list of chat messages: the system message carrying
    ``sys_msg`` and a user message holding the sentence on its own line
    followed by an ``Answer:`` cue.
    """
    system_message = {"role": "system", "content": sys_msg}
    user_message = {"role": "user", "content": "\n{}\nAnswer:".format(sentence)}
    return [system_message, user_message]

def create_chat_example(sentence, correct_answer):
    """
    Build one few-shot example as a pair of named system messages.

    The first message (name ``example_user``) holds the sentence followed by
    an ``Answer:`` cue; the second (name ``example_assistant``) holds the
    expected answer. Format reference:
    https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting
    """
    example_question = {
        "role": "system",
        "content": "\n{}\nAnswer:".format(sentence),
        "name": "example_user",
    }
    example_answer = {
        "role": "system",
        "content": correct_answer,
        "name": "example_assistant",
    }
    return [example_question, example_answer]

In [None]:
import yaml

# Load in the test data and the few-shot dataset
# (replace the '...' placeholders with the CSV paths; the code below reads the
# columns 'sentence' and 'tags' from the test set and 'sentence' and
# 'fewshot_gpt' from the few-shot set)
test_df = pd.read_csv('...')
dev_df = pd.read_csv('...')

registry_yaml = {}

# Make sure the target directories exist; DataFrame.to_json and open(..., "w")
# do not create missing parent directories.
data_dir = os.path.join(registry_pth, 'data', 'thesis_test')
evals_dir = os.path.join(registry_pth, 'evals')
os.makedirs(data_dir, exist_ok=True)
os.makedirs(evals_dir, exist_ok=True)

# Create few-shot prompts
dev_df['sample'] = dev_df.apply(lambda x: create_chat_example(x['sentence'], x['fewshot_gpt']), axis=1)
few_shot_pth = os.path.join(data_dir, "few_shot.jsonl")
dev_df[["sample"]].to_json(few_shot_pth, lines=True, orient="records")

# Create test prompts and ideal completions
test_df['input'] = test_df.apply(lambda x: create_chat_prompt(sys_msg, x['sentence']), axis=1)
test_df['ideal'] = test_df.tags
samples_pth = os.path.join(data_dir, "samples.jsonl")
test_df[["input", "ideal"]].to_json(samples_pth, lines=True, orient="records")

eval_id = "match_mmlu_thesis"

# Top-level registry entry: maps the eval name to its versioned id.
registry_yaml[eval_id] = {
    "id": f"{eval_id}.test.v1",
    "metrics": ['accuracy']
}

# Adjust the "num_few_shot" parameter to the number of few-shot samples you load in
registry_yaml[f"{eval_id}.test.v1"] = {
    "class": "evals.elsuite.basic.match:Match",
    "args": {
        "samples_jsonl": samples_pth,
        "few_shot_jsonl": few_shot_pth,
        "num_few_shot": 10,
    }
}

with open(os.path.join(evals_dir, "thesis.yaml"), "w") as f:
    yaml.dump(registry_yaml, f)

In [None]:
# This will generate a JSONL which will record samples and logs and store it in /tmp/evallogs
# The eval name must match the `eval_id` written to the registry YAML in the cell above.
!oaieval gpt-3.5-turbo match_mmlu_thesis

In [None]:
# How to process the log events generated by oaieval
# Assign the path of the JSONL log file containing the model's outputs to the variable "events"
events = "..."

# The log is read as JSON Lines (one JSON object per line).
with open(events, "r", encoding='utf-8') as f:
    events_df = pd.read_json(f, lines=True, encoding='utf-8')
    
events_df

In [None]:
# Inspect samples
# Only events of type "sampling" are shown; their `data` payload is flattened
# so that its `prompt` and `sampled` fields become columns.
for i, r in pd.json_normalize(events_df[events_df.type == "sampling"].data).iterrows():
    print(f"Prompt: {r.prompt}")
    print(f"Sampled: {r.sampled}")
    print("-" * 25)

In [None]:
# Keep only the events that carry a sample_id.
# This rebinds `events_df`, so reload the log before re-running earlier cells
# that expect the unfiltered frame.
events_df = events_df[~events_df['sample_id'].isnull()]

In [None]:
from natsort import index_natsorted

# Work on an explicit copy: `events_df` was produced by boolean filtering in an
# earlier cell, and adding a column to such a slice triggers pandas'
# SettingWithCopyWarning.
events_df = events_df.copy()

# The numeric sample position is whatever follows the first 23 characters of
# sample_id.
# NOTE(review): 23 presumably equals len("match_mmlu_thesis.test.") — confirm
# against the ids in the log if the eval name changes.
events_df['order'] = events_df['sample_id'].apply(lambda x: x[23:]).astype(int)

# get the index of the sorted values using natural sort
index = index_natsorted(events_df['order'])
events_df = events_df.iloc[index]
events_df = events_df[events_df.type == "sampling"]
# Keep one sampling event per sample (the first one in sorted order). The
# result is rebound rather than modified in place so the frame is never
# mutated through a filtered view.
events_df = events_df.drop_duplicates(subset='order', keep='first')

In [None]:
# Widen the column display so that long completions are shown in full.
pd.set_option('display.max_colwidth', 5000)

# For every remaining event, take the first element of its `sampled` field.
predictions = [
    event.sampled[0]
    for _, event in pd.json_normalize(events_df.data).iterrows()
]

predictions_df = pd.DataFrame(predictions, columns=['predictions'])

predictions_df

In [None]:
# Shrink the column display width again after showing the full predictions.
pd.set_option('display.max_colwidth', 50)

In [None]:
# Put the test data and the model predictions side by side.
# concat(axis=1) aligns rows on the index, so this presumably relies on
# predictions_df having the same order and length as test_df — TODO confirm.
comp_df = pd.concat([test_df, predictions_df], axis=1)
comp_df

In [None]:
def wrong_output(df):
    """Report predictions that are not valid Python expressions and guess why.

    Every value in ``df['predictions']`` is passed to ``eval``. Values that
    fail to evaluate are printed together with their sentence and are matched
    against a set of marker substrings to count likely causes (one prediction
    can count towards several causes). A summary is printed at the end.

    Args:
        df: DataFrame with the columns ``sentence`` and ``predictions``. Rows
            are looked up with the integer labels ``0 .. len(df) - 1``, as in
            the original code.

    Returns:
        List of the row numbers whose prediction could not be evaluated.
    """
    # Substrings that identify each kind of free-text (non-list) answer.
    category_markers = {
        'no_entities': ['There are no named entities in this sentence.', 'does not contain any recognizable entities'],
        'more_context': ['context', 'more information', 'incomplete'],
        'typo': ['typo', 'spelling mistake', 'not grammatically correct', 'error'],
        'no_entities_from_list': ['match the given tags', 'from the list provided', 'match the specified tags', 'named entities from the given list'],
        'language': ['English'],
        'policy': ["against OpenAI's use case policy"],
    }
    category_counts = dict.fromkeys(category_markers, 0)
    counter = 0
    index_list = []

    for i in range(len(df)):
        prediction = df['predictions'][i]
        try:
            # NOTE(review): eval() executes arbitrary code and the input is raw
            # model output; ast.literal_eval would be the safe replacement if
            # only literals are expected.
            eval(prediction)
        except Exception:
            # Only real errors mark the output as wrong; KeyboardInterrupt and
            # SystemExit are no longer swallowed as they were by a bare except.
            pass
        else:
            continue

        counter += 1
        print('-' * 30, '\n',
              df['sentence'][i], '\n',
              prediction, '\n',
              '-' * 30)

        # A missing prediction (e.g. NaN) is not a string; convert it so the
        # substring checks below cannot raise.
        text = prediction if isinstance(prediction, str) else str(prediction)
        for category, markers in category_markers.items():
            if any(marker in text for marker in markers):
                category_counts[category] += 1
        index_list.append(i)

    print('There were', counter, 'wrong outputs')
    print(category_counts['no_entities_from_list'], 'were due to there not being entities in the sentence that corresponded to a label in the provided list')
    print(category_counts['no_entities'], 'were due to GPT-3 stating there were no NEs in the provided sentence')
    print(category_counts['more_context'], 'were due to GPT-3 requesting more context')
    print(category_counts['typo'], 'were due to GPT-3 stating there was a spelling or grammar mistake in the sentence')
    print(category_counts['language'], 'were due to the sentence provided not being English')
    # This category was counted before but never reported.
    print(category_counts['policy'], "were due to GPT-3 stating the request was against OpenAI's use case policy")

    return index_list

In [None]:
# Row numbers of the predictions that could not be evaluated as Python expressions.
wrong_index_list = wrong_output(comp_df)

In [None]:
import re

def allign_predictions(df):
    """Turn each predicted entity list into one tag per whitespace token.

    Each prediction string is expected to evaluate to a list of items shaped
    like ``"Tag (entity one, entity two)"``. For every token of the sentence
    the first prediction that contains the token and carries a known tag
    decides the token's tag; the long tag names in ``elab_tags`` are mapped to
    the dataset names at the same position in ``actual_tags``. Tokens without
    a match get ``'O'``. If anything goes wrong while processing the
    predictions for a token, processing for that token stops and the token
    keeps whatever was decided so far.

    Args:
        df: DataFrame with the columns ``sentence``, ``predictions`` (string)
            and ``tags`` (string form of the gold tag list).

    Returns:
        DataFrame with the columns ``sentence``, ``tags`` (gold tags as
        lists), ``prediction`` and ``prediction_list`` (aligned tags).

    Side effects:
        Prints three counters: predictions visited, predictions parsed, and
        tokens for which an exception was raised.
    """
    prediction_tags = []
    gold_tags = []

    actual_tags = ['Facility', 'OtherLOC', 'HumanSettlement', 'Station',
                   'VisualWork', 'MusicalWork', 'WrittenWork', 'ArtWork',
                   'Software', 'MusicalGRP', 'PublicCorp', 'PrivateCorp',
                   'AerospaceManufacturer', 'SportsGRP', 'CarManufacturer',
                   'ORG', 'Scientist', 'Artist', 'Athlete', 'Politician',
                   'Cleric', 'SportsManager', 'OtherPER', 'Clothing',
                   'Vehicle', 'Food', 'Drink', 'OtherPROD', 'Medication/Vaccine',
                   'MedicalProcedure', 'AnatomicalStructure', 'Symptom', 'Disease']

    elab_tags = ['Facility', 'OtherLocation', 'HumanSettlement', 'Station',
                 'VisualWork', 'MusicalWork', 'WrittenWork', 'ArtWork', 'Software',
                 'MusicalGroup', 'PublicCorporation', 'PrivateCorporation',
                 'AerospaceManufacturer', 'SportsGroup', 'CarManufacturer',
                 'Organization', 'Scientist', 'Artist', 'Athlete', 'Politician',
                 'Cleric', 'SportsManager', 'OtherPerson', 'Clothing',
                 'Vehicle', 'Food', 'Drink', 'OtherProduct', 'Medication/Vaccine',
                 'MedicalProcedure', 'AnatomicalStructure', 'Symptom', 'Disease']

    # Raw string: "\s" and "\(" in a normal string literal are invalid escape
    # sequences. The pattern itself is unchanged and is compiled once.
    prediction_pattern = re.compile(r'^([^ ]+)(?:\s\((.+)\))?$')

    normal_counter = 0
    group_counter = 0
    except_counter = 0

    for _, row in df.iterrows():
        tag_list = []
        previous_words = []

        for word in row['sentence'].split(' '):
            track = False
            previous_words.append(word)
            pred_entities = []
            try:
                # NOTE(review): eval() runs arbitrary code on raw model output;
                # consider ast.literal_eval if only list literals are expected.
                for prediction in eval(row['predictions']):
                    entity_tokens = []
                    normal_counter += 1
                    groups = prediction_pattern.match(prediction)
                    # A prediction without "(...)" has no group 2; the resulting
                    # AttributeError is handled below, as before.
                    named_entities = groups.group(2).split(', ')
                    predicted_tag = groups.group(1)

                    for named_entity in named_entities:
                        tokens = named_entity.split(' ')
                        entity_tokens.extend(tokens)
                        pred_entities.extend(tokens)

                    group_counter += 1
                    if word in entity_tokens:
                        # The count comparison ties the n-th occurrence of a
                        # word in the sentence to the n-th occurrence among the
                        # predicted entity tokens seen so far.
                        if (predicted_tag in elab_tags) and (track == False) and (previous_words.count(word) == pred_entities.count(word)):
                            tag_index = elab_tags.index(predicted_tag)
                            tag_list.append(str(actual_tags[tag_index]))
                            track = True
            except Exception:
                # Best effort: unparseable output leaves the token untagged.
                except_counter += 1

            if track == False:
                tag_list.append('O')
        prediction_tags.append(tag_list)

        # Cast the strings in tags to actual lists
        gold_tags.append(eval(row['tags']))

    final_df = pd.DataFrame({'sentence': df['sentence'], 'tags': gold_tags,
                             'prediction': df['predictions'], 'prediction_list': prediction_tags})

    print(normal_counter, group_counter, except_counter)

    return final_df

In [None]:
# Token-level alignment of gold tags and predictions with the fine-grained labels.
final_df = allign_predictions(comp_df)

In [None]:
def add_or_increment_key(dictionary, key):
    """Add one to ``dictionary[key]`` in place, starting from zero if the key is absent."""
    dictionary[key] = dictionary.get(key, 0) + 1

def check_correctness_wrongs(dataframe):
    """Print, per gold label, how many gold-tagged tokens were predicted as 'O'.

    For every token whose gold tag is not ``'O'`` the label's token count is
    increased; if the prediction at the same position is ``'O'`` the label's
    miss count is increased as well. One line is printed per label, sorted by
    miss fraction (highest first): label, token count, miss count, fraction.

    Args:
        dataframe: DataFrame with the list-valued columns ``tags`` and
            ``prediction_list``.

    Note:
        A later cell in this notebook defines a function with the same name
        that counts entity spans instead of tokens; whichever cell ran last
        is the one in effect.
    """
    label_recog_err = {}
    label_count = {}

    for index in range(len(dataframe)):
        row = dataframe.iloc[index]
        gold_tags = row['tags']
        pred_tags = row['prediction_list']
        for ind, gold in enumerate(gold_tags):
            if gold == 'O':
                continue
            label_count[gold] = label_count.get(gold, 0) + 1
            if pred_tags[ind] == 'O':
                label_recog_err[gold] = label_recog_err.get(gold, 0) + 1

    # Labels that were never missed have no entry in label_recog_err; treat
    # them as zero instead of raising KeyError.
    label_frac = {
        label: label_recog_err.get(label, 0) / count
        for label, count in label_count.items()
    }

    for label in sorted(label_frac, key=label_frac.get, reverse=True):
        print(label, ' ', label_count[label], ' ', label_recog_err.get(label, 0), ' ', "{:.3f}".format(label_frac[label]))


# Token-level report on the fine-grained labels.
check_correctness_wrongs(final_df)

In [None]:
def add_or_increment_key(dictionary, key):
    """Count one more occurrence of ``key`` in ``dictionary`` (modified in place).

    This repeats, with identical behaviour, the helper defined in an earlier
    cell of this notebook.
    """
    current = dictionary.get(key, 0)
    dictionary[key] = current + 1

def check_correctness_wrongs(dataframe):
    """Print, per gold label, how many entity spans contain a wrong token.

    A span is a run of consecutive tokens that share the same non-``'O'`` gold
    tag. A span counts as wrong when at least one of its tokens has a
    prediction that differs from the gold tag. One line is printed per label,
    sorted by error fraction (highest first): label, span count, wrong span
    count, fraction.

    Args:
        dataframe: DataFrame with the list-valued columns ``tags`` and
            ``prediction_list``.

    Note:
        This replaces the token-level function of the same name defined in an
        earlier cell of this notebook.
    """
    label_recog_err = {}
    label_count = {}

    for index in range(len(dataframe)):
        row = dataframe.iloc[index]
        gold_tags = row['tags']
        pred_tags = row['prediction_list']
        previous_label = ''
        tracker = False  # True once the current span has a mismatching token
        last = len(gold_tags) - 1

        for ind in range(len(gold_tags)):
            gold = gold_tags[ind]
            if gold == 'O':
                previous_label = ''
            elif gold != previous_label:
                # A new span starts here.
                tracker = False
                previous_label = gold
            if gold != 'O' and gold != pred_tags[ind]:
                tracker = True

            # '' marks the end of the sentence (explicit bounds check instead
            # of catching every exception).
            next_label = gold_tags[ind + 1] if ind < last else ''

            # The span ends where the next gold tag differs.
            if gold != 'O' and gold != next_label:
                label_count[gold] = label_count.get(gold, 0) + 1
            if gold != next_label and tracker:
                label_recog_err[gold] = label_recog_err.get(gold, 0) + 1
                tracker = False

    # Labels whose spans were all correct have no entry in label_recog_err;
    # treat them as zero instead of raising KeyError.
    label_frac = {
        label: label_recog_err.get(label, 0) / count
        for label, count in label_count.items()
    }

    for label in sorted(label_frac, key=label_frac.get, reverse=True):
        print(label, ' ', label_count[label], ' ', label_recog_err.get(label, 0), ' ', "{:.3f}".format(label_frac[label]))


# Span-level report on the fine-grained labels.
check_correctness_wrongs(final_df)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report

# Create a set of unique tags
unique_tags = set().union(*final_df['tags'], *final_df['prediction_list'])

# Fit the LabelEncoder on the unique tags
encoder = LabelEncoder()
encoder.fit(list(unique_tags))

# Convert tags to numerical labels
final_df['tags_encoded'] = final_df['tags'].apply(lambda x: encoder.transform(x))
final_df['prediction_encoded'] = final_df['prediction_list'].apply(lambda x: encoder.transform(x))

# Flatten encoded tag sequences
tags_flat = [tag for sublist in final_df['tags_encoded'] for tag in sublist]
prediction_flat = [tag for sublist in final_df['prediction_encoded'] for tag in sublist]

# Drop every position whose gold tag is 'O'. The code of 'O' is looked up in
# the fitted encoder: the previously hard-coded 17 is only correct when all 33
# entity labels occur in the data, because LabelEncoder numbers its classes in
# sorted order.
if 'O' in encoder.classes_:
    outside_code = encoder.transform(['O'])[0]
    keep = [idx for idx, tag in enumerate(tags_flat) if tag != outside_code]
    tags_flat = [tags_flat[idx] for idx in keep]
    prediction_flat = [prediction_flat[idx] for idx in keep]

# Calculate macro F1 score
f1_macro = f1_score(tags_flat, prediction_flat, average='macro', zero_division = 1)
f1_weighted = f1_score(tags_flat, prediction_flat, average='weighted', zero_division = 1)

# Convert back to original tag labels
tags_original = encoder.inverse_transform(tags_flat)
prediction_original = encoder.inverse_transform(prediction_flat)

# Calculate per-label metrics
classification_rep = classification_report(tags_original, prediction_original, zero_division = 1)

print("Macro F1 Score:", f1_macro)
print('Weighted F1 Score:', f1_weighted)
print("Per-label Metrics:")
print(classification_rep)


In [None]:
import re

def allign_predictions_class(df):
    """Align predictions with sentence tokens using the six coarse classes.

    Works like the fine-grained alignment, but both the predicted tags (long
    names such as ``OtherLocation``) and the gold tags (dataset names such as
    ``OtherLOC``) are replaced by their class: Location, Creative Work, Group,
    Person, Product or Medical. Tokens without a matching prediction get
    ``'O'``.

    Args:
        df: DataFrame with the columns ``sentence``, ``predictions`` (string)
            and ``tags`` (string form of the gold tag list).

    Returns:
        DataFrame with the columns ``sentence``, ``tags`` (gold classes),
        ``prediction`` and ``prediction_list`` (predicted classes).

    Side effects:
        Prints three counters: predictions visited, predictions parsed, and
        tokens for which an exception was raised.
    """
    # Gold (dataset) tag names grouped by class.
    gold_groups = {
        'Location': ['Facility', 'OtherLOC', 'HumanSettlement', 'Station'],
        'Creative Work': ['VisualWork', 'MusicalWork', 'WrittenWork', 'ArtWork', 'Software'],
        'Group': ['MusicalGRP', 'PublicCorp', 'PrivateCorp', 'AerospaceManufacturer',
                  'SportsGRP', 'CarManufacturer', 'ORG'],
        'Person': ['Scientist', 'Artist', 'Athlete', 'Politician', 'Cleric', 'SportsManager', 'OtherPER'],
        'Product': ['Clothing', 'Vehicle', 'Food', 'Drink', 'OtherPROD'],
        'Medical': ['Medication/Vaccine', 'MedicalProcedure', 'AnatomicalStructure', 'Symptom', 'Disease'],
    }
    # Long tag names, as they appear in the predictions, grouped by class.
    predicted_groups = {
        'Location': ['Facility', 'OtherLocation', 'HumanSettlement', 'Station'],
        'Creative Work': ['VisualWork', 'MusicalWork', 'WrittenWork', 'ArtWork', 'Software'],
        'Group': ['MusicalGroup', 'PublicCorporation', 'PrivateCorporation',
                  'AerospaceManufacturer', 'SportsGroup', 'CarManufacturer',
                  'Organization'],
        'Person': ['Scientist', 'Artist', 'Athlete', 'Politician',
                   'Cleric', 'SportsManager', 'OtherPerson'],
        'Product': ['Clothing', 'Vehicle', 'Food', 'Drink', 'OtherProduct'],
        'Medical': ['Medication/Vaccine', 'MedicalProcedure', 'AnatomicalStructure', 'Symptom', 'Disease'],
    }
    # Each tag belongs to exactly one group, so a flat tag -> class lookup is
    # equivalent to testing the groups one after another.
    gold_class = {tag: name for name, tags in gold_groups.items() for tag in tags}
    predicted_class = {tag: name for name, tags in predicted_groups.items() for tag in tags}

    # Raw string: "\s" and "\(" in a normal string literal are invalid escape
    # sequences. The pattern itself is unchanged and is compiled once.
    prediction_pattern = re.compile(r'^([^ ]+)(?:\s\((.+)\))?$')

    prediction_tags = []
    gold_tags = []
    normal_counter = 0
    group_counter = 0
    except_counter = 0

    for _, row in df.iterrows():
        tag_list = []
        previous_words = []
        gold_list = []

        for word in row['sentence'].split(' '):
            track = False
            previous_words.append(word)
            pred_entities = []
            try:
                # NOTE(review): eval() runs arbitrary code on raw model output;
                # consider ast.literal_eval if only list literals are expected.
                for prediction in eval(row['predictions']):
                    entity_tokens = []
                    normal_counter += 1
                    groups = prediction_pattern.match(prediction)
                    # A prediction without "(...)" has no group 2; the resulting
                    # AttributeError is handled below, as before.
                    named_entities = groups.group(2).split(', ')
                    predicted_tag = groups.group(1)

                    for named_entity in named_entities:
                        tokens = named_entity.split(' ')
                        entity_tokens.extend(tokens)
                        pred_entities.extend(tokens)

                    group_counter += 1
                    if word in entity_tokens:
                        class_name = predicted_class.get(predicted_tag)
                        # The count comparison ties the n-th occurrence of a
                        # word in the sentence to the n-th occurrence among the
                        # predicted entity tokens seen so far.
                        if (class_name is not None) and (track == False) and (previous_words.count(word) == pred_entities.count(word)):
                            tag_list.append(class_name)
                            track = True
            except Exception:
                # Best effort: unparseable output leaves the token untagged.
                except_counter += 1

            if track == False:
                tag_list.append('O')
        prediction_tags.append(tag_list)

        # Cast the tag string to a list and map every tag to its class.
        # NOTE(review): a gold tag that is neither 'O' nor in any group is
        # skipped (as before), which would make the gold list shorter than the
        # sentence — confirm that this cannot happen with the dataset.
        for tag in eval(row['tags']):
            if tag == 'O':
                gold_list.append(tag)
            elif tag in gold_class:
                gold_list.append(gold_class[tag])

        gold_tags.append(gold_list)

    final_df = pd.DataFrame({'sentence': df['sentence'], 'tags': gold_tags,
                             'prediction': df['predictions'], 'prediction_list': prediction_tags})

    print(normal_counter, group_counter, except_counter)

    return final_df

In [None]:
# Token-level alignment with tags collapsed into the six coarse classes.
class_final_df = allign_predictions_class(comp_df)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report

# Create a set of unique tags
unique_tags = set().union(*class_final_df['tags'], *class_final_df['prediction_list'])

# Fit the LabelEncoder on the unique tags
encoder = LabelEncoder()
encoder.fit(list(unique_tags))

# Convert tags to numerical labels
class_final_df['tags_encoded'] = class_final_df['tags'].apply(lambda x: encoder.transform(x))
class_final_df['prediction_encoded'] = class_final_df['prediction_list'].apply(lambda x: encoder.transform(x))

# Flatten encoded tag sequences
tags_flat = [tag for sublist in class_final_df['tags_encoded'] for tag in sublist]
prediction_flat = [tag for sublist in class_final_df['prediction_encoded'] for tag in sublist]

# Drop every position whose gold tag is 'O'. The code of 'O' is looked up in
# the fitted encoder: the previously hard-coded 4 is only correct when the
# four classes that sort before 'O' all occur in the data.
if 'O' in encoder.classes_:
    outside_code = encoder.transform(['O'])[0]
    keep = [idx for idx, tag in enumerate(tags_flat) if tag != outside_code]
    tags_flat = [tags_flat[idx] for idx in keep]
    prediction_flat = [prediction_flat[idx] for idx in keep]

# Calculate macro F1 score
# NOTE(review): unlike the fine-grained metrics cell, no zero_division argument
# is passed here — confirm whether the two cells are meant to differ.
f1_macro = f1_score(tags_flat, prediction_flat, average='macro')
f1_weighted = f1_score(tags_flat, prediction_flat, average='weighted')

# Convert back to original tag labels
tags_original = encoder.inverse_transform(tags_flat)
prediction_original = encoder.inverse_transform(prediction_flat)

# Calculate per-label metrics
classification_rep = classification_report(tags_original, prediction_original)

print("Class Macro F1 Score:", f1_macro)
print("Class Weighted F1 Score:", f1_weighted)
print("Per-label Metrics:")
print(classification_rep)


In [None]:
# Error report on the coarse classes; uses whichever definition of
# check_correctness_wrongs was executed last.
check_correctness_wrongs(class_final_df)

In [None]:
def find_most_frequent_wrong_predictions_no_o(dataframe):
    """Print, per gold label, the two labels it is most often confused with.

    A span is a run of consecutive tokens sharing the same non-``'O'`` gold
    tag. For every span with at least one mismatching token, each distinct
    wrong predicted label other than ``'O'`` is counted once. One line is
    printed per gold label, sorted by the fraction of wrong spans (highest
    first): label, span count, most frequent wrong label and its count, second
    most frequent wrong label and its count. ``none`` / ``0`` fill the slots
    for which there is no wrong label.

    Args:
        dataframe: DataFrame with the list-valued columns ``tags`` and
            ``prediction_list``.

    Note:
        Later cells of this notebook define functions with the same name but a
        different signature; whichever cell ran last is the one in effect.
    """
    label_count = {}
    wrong_class_count = {}
    wrong_count = {}

    for index in range(len(dataframe)):
        row = dataframe.iloc[index]
        gold_tags = row['tags']
        pred_tags = row['prediction_list']
        previous_label = ''
        tracker = False      # True once the current span has a mismatching token
        wrong_labels = []    # distinct wrong labels seen in the current span
        last = len(gold_tags) - 1

        for ind in range(len(gold_tags)):
            gold = gold_tags[ind]
            if gold == 'O':
                previous_label = ''
            elif gold != previous_label:
                # A new span starts here.
                tracker = False
                previous_label = gold
                wrong_labels = []
            if gold != 'O' and gold != pred_tags[ind]:
                tracker = True
                pred = pred_tags[ind]
                # Missed tokens (predicted 'O') mark the span as wrong but are
                # not recorded as a confusion.
                if pred != 'O' and pred not in wrong_labels:
                    wrong_labels.append(pred)

            # '' marks the end of the sentence.
            next_label = gold_tags[ind + 1] if ind < last else ''

            # The span ends where the next gold tag differs.
            if gold != 'O' and gold != next_label:
                label_count[gold] = label_count.get(gold, 0) + 1
            if gold != next_label and tracker:
                wrong_count[gold] = wrong_count.get(gold, 0) + 1
                for label in wrong_labels:
                    per_gold = wrong_class_count.setdefault(gold, {})
                    per_gold[label] = per_gold.get(label, 0) + 1
                tracker = False
                wrong_labels = []

    # Labels without any wrong span are missing from wrong_count; treat them
    # as zero instead of raising KeyError.
    label_frac = {
        label: wrong_count.get(label, 0) / count
        for label, count in label_count.items()
    }

    for label in sorted(label_frac, key=label_frac.get, reverse=True):
        # Labels without recorded confusions get an empty ranking instead of a
        # KeyError / IndexError.
        ranked = sorted(wrong_class_count.get(label, {}).items(), key=lambda x: x[1], reverse=True)[:2]
        ranked += [('none', 0)] * (2 - len(ranked))
        (first_key, first_value), (second_key, second_value) = ranked
        print(label, " ", label_count[label], " ", first_key, " ", first_value, " ", second_key, " ", second_value)

# Most frequent confusions on the fine-grained labels (ignoring 'O' predictions),
# followed by the error report on the coarse classes.
# A later cell redefines find_most_frequent_wrong_predictions_no_o with a
# different signature, so this call only works if the definition cell directly
# above was the last one executed.
find_most_frequent_wrong_predictions_no_o(final_df)
check_correctness_wrongs(class_final_df)

In [None]:
def find_most_frequent_wrong_predictions(dataframe):
    """Print, per gold label, the two predictions it is most often replaced by.

    A span is a run of consecutive tokens sharing the same non-``'O'`` gold
    tag. For every span with at least one mismatching token, each distinct
    wrong predicted label (``'O'`` included) is counted once. One line is
    printed per gold label, sorted by the fraction of wrong spans (highest
    first): label, span count, most frequent wrong label and its count, second
    most frequent wrong label and its count. ``none`` / ``0`` fill the slots
    for which there is no wrong label.

    Args:
        dataframe: DataFrame with the list-valued columns ``tags`` and
            ``prediction_list``.

    Note:
        A later cell of this notebook defines a function with the same name
        but a different signature; whichever cell ran last is the one in
        effect.
    """
    label_count = {}
    wrong_class_count = {}
    wrong_count = {}

    for index in range(len(dataframe)):
        row = dataframe.iloc[index]
        gold_tags = row['tags']
        pred_tags = row['prediction_list']
        previous_label = ''
        tracker = False      # True once the current span has a mismatching token
        wrong_labels = []    # distinct wrong labels seen in the current span
        last = len(gold_tags) - 1

        for ind in range(len(gold_tags)):
            gold = gold_tags[ind]
            if gold == 'O':
                previous_label = ''
            elif gold != previous_label:
                # A new span starts here.
                tracker = False
                previous_label = gold
                wrong_labels = []
            if gold != 'O' and gold != pred_tags[ind]:
                tracker = True
                if pred_tags[ind] not in wrong_labels:
                    wrong_labels.append(pred_tags[ind])

            # '' marks the end of the sentence.
            next_label = gold_tags[ind + 1] if ind < last else ''

            # The span ends where the next gold tag differs.
            if gold != 'O' and gold != next_label:
                label_count[gold] = label_count.get(gold, 0) + 1
            if gold != next_label and tracker:
                wrong_count[gold] = wrong_count.get(gold, 0) + 1
                for label in wrong_labels:
                    per_gold = wrong_class_count.setdefault(gold, {})
                    per_gold[label] = per_gold.get(label, 0) + 1
                tracker = False
                wrong_labels = []

    # Labels without any wrong span are missing from wrong_count; treat them
    # as zero instead of raising KeyError.
    label_frac = {
        label: wrong_count.get(label, 0) / count
        for label, count in label_count.items()
    }

    for label in sorted(label_frac, key=label_frac.get, reverse=True):
        # Labels without recorded confusions get an empty ranking instead of a
        # KeyError.
        ranked = sorted(wrong_class_count.get(label, {}).items(), key=lambda x: x[1], reverse=True)[:2]
        ranked += [('none', 0)] * (2 - len(ranked))
        (first_key, first_value), (second_key, second_value) = ranked
        print(label, " ", label_count[label], " ", first_key, " ", first_value, " ", second_key, " ", second_value)

# Most frequent confusions on the fine-grained labels ('O' predictions included),
# followed by the error report on the coarse classes.
# A later cell redefines find_most_frequent_wrong_predictions with a different
# signature, so this call only works if the definition cell directly above was
# the last one executed.
find_most_frequent_wrong_predictions(final_df)
check_correctness_wrongs(class_final_df)

In [None]:
def find_most_frequent_wrong_predictions_no_o(gold_labels, predictions):
    """Count token-level errors per gold label, ignoring 'O' as a confusion.

    Args:
        gold_labels: Iterable of gold labels, one per token.
        predictions: Iterable of predicted labels, paired with ``gold_labels``
            by position.

    Returns:
        A 4-tuple ``(wrong_count, most_frequent_wrong_predictions,
        gold_label_counts, class_counts)``:

        * ``wrong_count``: gold label -> number of wrong predictions
          (``'O'`` predictions included).
        * ``most_frequent_wrong_predictions``: gold label -> most frequent
          wrong prediction other than ``'O'``. Labels whose wrong predictions
          were all ``'O'`` are left out.
        * ``gold_label_counts``: gold label -> number of occurrences.
        * ``class_counts``: gold label -> {wrong prediction -> count},
          without ``'O'``. Present (possibly empty) for every gold label with
          at least one wrong prediction.

    Note:
        This replaces the DataFrame-based function of the same name defined in
        an earlier cell of this notebook.
    """
    class_counts = {}
    gold_label_counts = {}
    wrong_count = {}

    for gold, pred in zip(gold_labels, predictions):
        gold_label_counts[gold] = gold_label_counts.get(gold, 0) + 1

        if gold == pred:
            continue

        wrong_count[gold] = wrong_count.get(gold, 0) + 1
        confusions = class_counts.setdefault(gold, {})
        if pred != 'O':
            confusions[pred] = confusions.get(pred, 0) + 1

    # max() raises ValueError on an empty mapping, which happens when every
    # wrong prediction for a label was 'O'; such labels are skipped.
    most_frequent_wrong_predictions = {
        gold: max(confusions, key=confusions.get)
        for gold, confusions in class_counts.items()
        if confusions
    }

    return wrong_count, most_frequent_wrong_predictions, gold_label_counts, class_counts


In [None]:
# Token-level confusion statistics without 'O' predictions.
# `tags_original` / `prediction_original` come from whichever metrics cell ran
# last (fine-grained or class level).
wrong_count, common_wrong_class, gold_count, class_count = find_most_frequent_wrong_predictions_no_o(tags_original, prediction_original)
print("wrong count looks like:", wrong_count)
print("common_wrong_class looks like", common_wrong_class)
print("gold_count looks like:", gold_count)
print("class_count looks like:", class_count)

In [None]:
common_wrong = {}
wrong_fraction = {}
top_two_by_gold = {}

# Fraction of wrong predictions per gold label. Labels that were never
# mispredicted are missing from wrong_count; treat them as zero instead of
# raising KeyError.
for gold in gold_count.keys():
    wrong_fraction[gold] = wrong_count.get(gold, 0) / gold_count[gold]

wrong_fraction = dict(sorted(wrong_fraction.items(), key=lambda x: x[1], reverse=True))

# Two most frequent wrong predictions per gold label. Labels without recorded
# confusions (never wrong, or only ever predicted as 'O') get ('none', 0)
# fillers instead of raising KeyError / IndexError.
for gold in gold_count.keys():
    inner_dict = class_count.get(gold, {})
    sorted_items = sorted(inner_dict.items(), key=lambda x: x[1], reverse = True)
    top_two = sorted_items[:2]
    top_two += [('none', 0)] * (2 - len(top_two))
    top_two_by_gold[gold] = top_two
    common_wrong[gold] = dict(top_two)

# One line per gold label, highest error fraction first.
for gold in wrong_fraction.keys():
    (first_key, first_value), (second_key, second_value) = top_two_by_gold[gold]
    print(gold, " ", gold_count[gold], " ", first_key, " ", first_value, " ", second_key, " ", second_value)

In [None]:
def find_most_frequent_wrong_predictions(gold_labels, predictions):
    """Count token-level errors per gold label, with 'O' counted as a confusion.

    ``gold_labels`` and ``predictions`` are paired by position. Returns the
    4-tuple ``(wrong_count, most_frequent_wrong_predictions,
    gold_label_counts, class_counts)``: wrong predictions per gold label, the
    most frequent wrong prediction per gold label, occurrences per gold label,
    and the full gold label -> {wrong prediction -> count} table.
    """
    wrong_count = {}
    gold_label_counts = {}
    class_counts = {}

    for gold, pred in zip(gold_labels, predictions):
        # Every token counts towards its gold label, right or wrong.
        gold_label_counts[gold] = gold_label_counts.get(gold, 0) + 1

        if gold == pred:
            continue

        add_or_increment_key(wrong_count, gold)
        confusions = class_counts.setdefault(gold, {})
        confusions[pred] = confusions.get(pred, 0) + 1

    most_frequent_wrong_predictions = {
        gold: max(confusions, key=confusions.get)
        for gold, confusions in class_counts.items()
    }

    return wrong_count, most_frequent_wrong_predictions, gold_label_counts, class_counts


In [None]:
# Token-level confusion statistics with 'O' predictions included.
# `tags_original` / `prediction_original` come from whichever metrics cell ran
# last (fine-grained or class level).
wrong_count, common_wrong_class, gold_count, class_count = find_most_frequent_wrong_predictions(tags_original, prediction_original)
print("wrong count looks like:", wrong_count)
print("common_wrong_class looks like", common_wrong_class)
print("gold_count looks like:", gold_count)
print("class_count looks like:", class_count)

In [None]:
common_wrong = {}
wrong_fraction = {}
top_two_by_gold = {}

# Fraction of wrong predictions per gold label. Labels that were never
# mispredicted are missing from wrong_count; treat them as zero instead of
# raising KeyError.
for gold in gold_count.keys():
    wrong_fraction[gold] = wrong_count.get(gold, 0) / gold_count[gold]

wrong_fraction = dict(sorted(wrong_fraction.items(), key=lambda x: x[1], reverse=True))

# Two most frequent wrong predictions per gold label. Labels without recorded
# confusions get ('none', 0) fillers instead of raising KeyError / IndexError.
for gold in gold_count.keys():
    inner_dict = class_count.get(gold, {})
    sorted_items = sorted(inner_dict.items(), key=lambda x: x[1], reverse = True)
    top_two = sorted_items[:2]
    top_two += [('none', 0)] * (2 - len(top_two))
    top_two_by_gold[gold] = top_two
    common_wrong[gold] = dict(top_two)

# One line per gold label, highest error fraction first.
for gold in wrong_fraction.keys():
    (first_key, first_value), (second_key, second_value) = top_two_by_gold[gold]
    print(gold, " ", gold_count[gold], " ", first_key, " ", first_value, " ", second_key, " ", second_value)

In [None]:
def find_indexes(lst, target):
    """Return the positions in ``lst`` whose element equals ``target``.

    Positions are reported in ascending order; an empty list is returned when
    nothing matches.
    """
    return [position for position, element in enumerate(lst) if element == target]

def find_predictions(dataframe, label):
    """Print sentence, gold tags and predictions for every row mentioning ``label``.

    A row is selected when ``label in row['tags']`` is true. The predictions are
    read from the ``prediction_list`` column when the frame has one, otherwise
    from ``predictions``. Each selected row is followed by a 25-dash separator.

    Args:
        dataframe: frame with ``sentence`` and ``tags`` columns plus either a
            ``prediction_list`` or a ``predictions`` column.
        label: value to look for inside each row's ``tags``.
    """
    # Decide the prediction column once, explicitly, rather than relying on a
    # bare ``except`` that would also hide unrelated errors.
    if 'prediction_list' in dataframe.columns:
        prediction_column = 'prediction_list'
    else:
        prediction_column = 'predictions'

    for _, row in dataframe.iterrows():
        if label not in row['tags']:
            continue
        print(row['sentence'])
        print(row['tags'])
        print(row[prediction_column])

        print('-' * 25)

In [None]:
# Print every row of comp_df whose tags contain 'OtherPER', alongside its predictions.
find_predictions(comp_df, 'OtherPER')

In [None]:
def find_most_frequent_wrong_predictions_no_o(gold_labels, predictions):
    """Tally wrong predictions per gold label, ignoring predictions equal to "O".

    Args:
        gold_labels: iterable of gold labels.
        predictions: iterable of predicted labels, paired positionally with
            ``gold_labels``.

    Returns:
        A 3-tuple ``(top_confusion, occurrences, confusions)`` where
        ``occurrences`` maps each gold label to how often it appears (every
        pair counts, including those predicted "O"), ``confusions`` maps a gold
        label to ``{wrong_prediction: count}`` for wrong non-"O" predictions,
        and ``top_confusion`` maps a gold label to its most frequent such
        wrong prediction.
    """
    occurrences = {}
    confusions = {}

    for truth, guess in zip(gold_labels, predictions):
        occurrences[truth] = occurrences.get(truth, 0) + 1

        # Only a non-"O" prediction that disagrees with the gold label is tallied.
        if guess == "O" or guess == truth:
            continue

        per_truth = confusions.setdefault(truth, {})
        per_truth[guess] = per_truth.get(guess, 0) + 1

    top_confusion = {
        truth: max(tally, key=tally.get) for truth, tally in confusions.items()
    }

    return top_confusion, occurrences, confusions


In [None]:
# NOTE: despite its name, wrong_count here receives the first return value of
# find_most_frequent_wrong_predictions_no_o, i.e. the most frequent wrong
# (non-"O") prediction per gold label, not a count.
wrong_count, gold_count, class_count = find_most_frequent_wrong_predictions_no_o(tags_original, prediction_original)
print("wrong count looks like:", wrong_count)
print("gold_count looks like:", gold_count)
print("class_count looks like:", class_count)

common_wrong = {}

# For every gold label keep its single most frequent wrong prediction and count.
# Labels with no wrong non-"O" prediction are absent from class_count and get
# the {'none': 0} placeholder; this is checked explicitly so that unrelated
# errors are no longer swallowed by a bare ``except``.
for gold in gold_count.keys():
    inner_dict = class_count.get(gold)
    if inner_dict:
        max_key = max(inner_dict, key=inner_dict.get)
        max_value = inner_dict[max_key]
        common_wrong[gold] = {max_key: max_value}
    else:
        common_wrong[gold] = {'none': 0}

print(common_wrong)

for gold in gold_count.keys():
    print("Label", gold, "occured", gold_count[gold], "and was mislabelled as", list(common_wrong[gold].keys())[0], list(common_wrong[gold].values())[0], "times")

In [None]:
def span_error(dataframe):
    """Flag, per sentence, whether any gold entity token was predicted as 'O'.

    NOTE(review): a later cell defines another ``span_error``; when the notebook
    is run top to bottom that later definition replaces this one.

    Args:
        dataframe: frame with ``sentence``, ``tags`` and ``prediction_list``
            columns, where ``tags`` and ``prediction_list`` hold per-token
            label sequences of matching length.

    Returns:
        A new DataFrame with the three input columns plus ``correct_span``:
        0 when at least one token has a non-'O' gold label but an 'O'
        prediction, 1 otherwise (including for an empty tag sequence).
    """
    correct_span = []

    # Iterate positionally over the two columns so the function also works on
    # frames whose index is not 0..n-1 (e.g. after filtering).
    for gold_labels, pred_labels in zip(dataframe['tags'], dataframe['prediction_list']):
        missed_entity_token = any(
            gold_labels[ind] != 'O' and pred_labels[ind] == 'O'
            for ind in range(len(gold_labels))
        )
        # Exactly one flag per sentence, so the column length always matches.
        correct_span.append(0 if missed_entity_token else 1)

    new_df = pd.DataFrame({'sentence': dataframe['sentence'], 'tags': dataframe['tags'], 'prediction_list': dataframe['prediction_list'], 'correct_span': correct_span})

    return new_df

In [None]:
def span_error(dataframe):
    """Compute per-sentence span counts and error counts.

    A gold span is a run of identical consecutive non-'O' gold labels. Tokens
    inside a span that are followed by the same gold label are skipped; the
    span is counted, and compared against the prediction, at its last token.
    'O' gold tokens are compared against the prediction individually.

    Args:
        dataframe: frame with ``sentence``, ``tags`` and ``prediction_list``
            columns, where ``tags`` and ``prediction_list`` hold per-token
            label sequences of matching length.

    Returns:
        A new DataFrame with the three input columns plus:
        ``number_of_spans`` (gold spans in the sentence),
        ``number_wrong_entities`` (checked entity tokens predicted as a
        different non-'O' label), ``number_non_entities`` ('O' tokens predicted
        as an entity), ``number_missed_entities`` (checked entity tokens
        predicted 'O') and ``correct_span`` (1 when no checked token
        mismatched, else 0).
    """
    correct_span = []
    number_of_spans = []
    number_missed_entity = []
    number_wrong_entity = []
    number_non_entity = []

    # Iterate positionally so frames with a non-default index work as well.
    for gold_labels, pred_labels in zip(dataframe['tags'], dataframe['prediction_list']):
        correct_span_sentence = True
        span_counter = 0
        wrong_entity = 0
        non_entity = 0
        missed_entity = 0
        last_ind = len(gold_labels) - 1
        for ind in range(len(gold_labels)):
            gold = gold_labels[ind]
            if gold != 'O':
                # Still inside the span: wait for its last token.
                if ind < last_ind and gold_labels[ind + 1] == gold:
                    continue
                # Last token of a span. This includes a span that ends the
                # sentence, which must fall through to the mismatch check below
                # just like a span that ends mid-sentence.
                span_counter += 1

            if gold != pred_labels[ind]:
                correct_span_sentence = False
                if gold == 'O':
                    non_entity += 1
                elif pred_labels[ind] == 'O':
                    missed_entity += 1
                else:
                    wrong_entity += 1

        correct_span.append(1 if correct_span_sentence else 0)
        number_of_spans.append(span_counter)
        number_missed_entity.append(missed_entity)
        number_wrong_entity.append(wrong_entity)
        number_non_entity.append(non_entity)

    new_df = pd.DataFrame({'sentence': dataframe['sentence'], 'tags': dataframe['tags'], 'prediction_list': dataframe['prediction_list'], 'number_of_spans': number_of_spans, 'number_wrong_entities': number_wrong_entity, 'number_non_entities': number_non_entity, 'number_missed_entities': number_missed_entity, 'correct_span': correct_span})

    return new_df

In [None]:
# Build the per-sentence span-error table for final_df.
some_df = span_error(final_df)

In [None]:
# Display the rows whose correct_span flag is 0.
some_df[some_df['correct_span'] == 0]

In [None]:
# How many sentences have each span count, split by the correct_span flag.
wrong_values = some_df[some_df['correct_span'] == 0]['number_of_spans'].value_counts()
correct_values = some_df[some_df['correct_span'] == 1]['number_of_spans'].value_counts()

# Report every span count that occurs among the wrong sentences. A span count
# that never occurs among the correct sentences is absent from correct_values,
# so read it with a default of 0 instead of raising KeyError.
for span_count in wrong_values.keys():
    n_wrong = wrong_values[span_count]
    n_correct = correct_values.get(span_count, 0)
    print('Total of span count', span_count, ':', n_wrong + n_correct)
    print(n_correct, 'correct')
    print(n_wrong, 'wrong')
    print('proportion correct is:', n_correct / (n_wrong + n_correct))
    print('-' * 25)
    

In [None]:
# Mean number_of_spans for sentences flagged wrong (0) and correct (1).
mean_span_wrong = some_df.loc[some_df['correct_span'].eq(0), 'number_of_spans'].mean()
mean_span_correct = some_df.loc[some_df['correct_span'].eq(1), 'number_of_spans'].mean()

print(mean_span_wrong, mean_span_correct, sep='\n')

In [None]:
# Sentence length measured as the number of pieces after splitting on a single
# space, compared between sentences flagged wrong (0) and correct (1).
wrong_values = some_df.loc[some_df['correct_span'].eq(0), 'sentence'].apply(lambda sentence: len(sentence.split(' ')))
correct_values = some_df.loc[some_df['correct_span'].eq(1), 'sentence'].apply(lambda sentence: len(sentence.split(' ')))

print('the average incorrect sentence length is', wrong_values.mean())
print('the average correct sentence length is', correct_values.mean())

In [None]:
# Error-type totals over the sentences flagged with correct_span == 0.
wrong_values = some_df.loc[some_df['correct_span'].eq(0)]

print('for the wrong predicted sentences:')
print(
    'number of wrong predictions:',
    wrong_values['number_wrong_entities'].sum(),
)
print(
    'number of false predicted entities:',
    wrong_values['number_non_entities'].sum(),
)
print(
    'number of missed entities:',
    wrong_values['number_missed_entities'].sum(),
)

In [None]:
%pip install textstat

In [None]:
from textstat import flesch_reading_ease

In [None]:
# Function to compute the text complexity for each sentence
def text_complexity(sentence_df):
    """Return a copy of ``sentence_df`` with readability and correctness columns.

    Adds ``text_complexity`` (the ``flesch_reading_ease`` score of each row's
    ``sentence``) and ``correct`` (1 when the row's ``tags`` equal its
    ``prediction_list``, else 0). The input frame is not modified.
    """
    readability = [flesch_reading_ease(sentence) for sentence in sentence_df['sentence']]
    exact_match = [
        1 if gold == predicted else 0
        for gold, predicted in zip(sentence_df['tags'], sentence_df['prediction_list'])
    ]

    return sentence_df.assign(text_complexity=readability, correct=exact_match)

In [None]:
# Add the text_complexity and correct columns to final_df and preview the first rows.
final_df_tc = text_complexity(final_df)
final_df_tc.head()

In [None]:
# Mean text_complexity over the rows with correct == 0.
final_df_tc[final_df_tc['correct'] == 0]['text_complexity'].mean()

In [None]:
# Mean text_complexity over the rows with correct == 1.
final_df_tc[final_df_tc['correct'] == 1]['text_complexity'].mean()

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the sentences using the vectorizer
tfidf_matrix = tfidf_vectorizer.fit_transform(some_df['sentence'])

# Calculate the sum of TF-IDF scores for each sentence.
# Summing a scipy sparse matrix along axis=1 yields an (n, 1) numpy matrix, not
# a flat vector; flatten it to a 1-D array so it maps cleanly onto one column.
sentence_tfidf_sum = np.asarray(tfidf_matrix.sum(axis=1)).ravel()

# Create a new column in the dataframe to store the sum of TF-IDF scores
# (this adds the column to some_df in place).
some_df['tfidf_sum'] = sentence_tfidf_sum

# Print the dataframe with the sum of TF-IDF scores
print(some_df)


In [None]:
# Mean summed TF-IDF score for sentences flagged wrong (0), then correct (1).
print(some_df.loc[some_df['correct_span'].eq(0), 'tfidf_sum'].mean())
print(some_df.loc[some_df['correct_span'].eq(1), 'tfidf_sum'].mean())

In [None]:
def number_wrong_missed(dataframe):
    """Print entity-level totals: entities, wrongly labelled, and missed.

    An entity is a run of identical consecutive non-'O' gold tags within one
    sentence. It counts as "wrong" when at least one of its tokens was predicted
    as a different non-'O' label, and as "missed" when at least one of its
    tokens was predicted 'O'; a single entity can count as both.

    Args:
        dataframe: frame with ``tags`` and ``prediction_list`` columns holding
            per-token label sequences of matching length.

    Returns:
        None. The three totals are printed.
    """
    count = 0
    wrong = 0
    missed = 0

    for index in range(len(dataframe)):
        previous_label = ''
        wrong_tracker = False
        miss_tracker = False
        row = dataframe.iloc[index]
        gold_tags = row['tags']
        pred_tags = row['prediction_list']
        for ind in range(len(gold_tags)):
            if gold_tags[ind] == 'O':
                previous_label = ''
            # A new entity starts here: reset the per-entity error flags.
            if gold_tags[ind] != 'O' and gold_tags[ind] != previous_label:
                wrong_tracker = False
                miss_tracker = False
                previous_label = gold_tags[ind]
            if gold_tags[ind] != 'O' and gold_tags[ind] != pred_tags[ind] and pred_tags[ind] != 'O':
                wrong_tracker = True
            if gold_tags[ind] != 'O' and pred_tags[ind] == 'O':
                miss_tracker = True

            # Look one token ahead; past the end of the sentence use '' so the
            # last token always closes whatever entity is open. An explicit
            # bounds check replaces the former bare ``except``.
            next_label = gold_tags[ind + 1] if ind + 1 < len(gold_tags) else ''

            # The entity ends here: tally it and flush its error flags.
            if gold_tags[ind] != 'O' and gold_tags[ind] != next_label:
                count += 1
            if gold_tags[ind] != next_label and wrong_tracker:
                wrong += 1
                wrong_tracker = False
            if gold_tags[ind] != next_label and miss_tracker:
                missed += 1
                miss_tracker = False

    print('There are', count, 'entities. \n', 'The model labeled', wrong, ' wrong. \n', 'The model missed', missed)


# Print the entity / wrong / missed totals for final_df.
number_wrong_missed(final_df)

In [None]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# Per-class classification report pasted as text (columns: class name,
# precision, recall, f1-score, support). It has no accuracy / average footer.
classification_rep = """
                         precision    recall  f1-score   support

B-AerospaceManufacturer       0.53      0.81      0.64        48
  B-AnatomicalStructure       0.86      0.78      0.82       285
              B-ArtWork       0.64      0.66      0.65        35
               B-Artist       0.74      0.77      0.75      1770
              B-Athlete       0.76      0.72      0.74       921
      B-CarManufacturer       0.69      0.64      0.66       103
               B-Cleric       0.75      0.46      0.57       166
             B-Clothing       0.71      0.62      0.67        88
              B-Disease       0.91      0.84      0.88       256
                B-Drink       0.84      0.44      0.57        71
             B-Facility       0.72      0.64      0.68       511
                 B-Food       0.77      0.57      0.66       184
      B-HumanSettlement       0.92      0.89      0.90      1576
     B-MedicalProcedure       0.91      0.76      0.83       199
   B-Medication/Vaccine       0.91      0.78      0.84       242
           B-MusicalGRP       0.74      0.66      0.70       348
          B-MusicalWork       0.82      0.71      0.76       436
                  B-ORG       0.68      0.59      0.63       748
             B-OtherLOC       0.57      0.42      0.48       191
             B-OtherPER       0.46      0.52      0.49       797
            B-OtherPROD       0.73      0.61      0.67       409
           B-Politician       0.59      0.48      0.53       584
          B-PrivateCorp       0.19      0.43      0.26        21
           B-PublicCorp       0.64      0.51      0.57       268
            B-Scientist       0.44      0.58      0.50       158
             B-Software       0.81      0.62      0.71       293
            B-SportsGRP       0.88      0.87      0.88       455
        B-SportsManager       0.53      0.54      0.54       167
              B-Station       0.87      0.80      0.84       205
              B-Symptom       0.58      0.53      0.55        53
              B-Vehicle       0.66      0.57      0.62       195
           B-VisualWork       0.75      0.69      0.72       508
          B-WrittenWork       0.79      0.65      0.71       398
I-AerospaceManufacturer       0.68      0.75      0.71        28
  I-AnatomicalStructure       0.80      0.57      0.67        75
              I-ArtWork       0.86      0.48      0.62        77
               I-Artist       0.76      0.79      0.77      2038
              I-Athlete       0.78      0.75      0.76      1124
      I-CarManufacturer       0.41      0.43      0.42        28
               I-Cleric       0.65      0.48      0.55       205
             I-Clothing       0.40      0.25      0.31        16
              I-Disease       0.90      0.77      0.83       126
                I-Drink       0.74      0.56      0.64        25
             I-Facility       0.76      0.73      0.75       569
                 I-Food       0.85      0.53      0.65        62
      I-HumanSettlement       0.91      0.89      0.90       577
     I-MedicalProcedure       0.88      0.59      0.71       108
   I-Medication/Vaccine       0.92      0.55      0.69        40
           I-MusicalGRP       0.81      0.68      0.74       316
          I-MusicalWork       0.86      0.78      0.82       656
                  I-ORG       0.74      0.68      0.71       883
             I-OtherLOC       0.73      0.59      0.65       275
             I-OtherPER       0.45      0.57      0.50      1047
            I-OtherPROD       0.70      0.52      0.60       204
           I-Politician       0.59      0.47      0.52       733
          I-PrivateCorp       0.32      0.50      0.39        18
           I-PublicCorp       0.58      0.38      0.46       129
            I-Scientist       0.46      0.61      0.53       204
             I-Software       0.87      0.61      0.72       258
            I-SportsGRP       0.93      0.85      0.89       567
        I-SportsManager       0.56      0.55      0.55       197
              I-Station       0.89      0.85      0.87       205
              I-Symptom       0.47      0.33      0.39        27
              I-Vehicle       0.72      0.53      0.61       176
           I-VisualWork       0.78      0.71      0.74       740
          I-WrittenWork       0.87      0.68      0.76       554
                      O       0.00      1.00      0.00         0
"""

# Extracting f1-scores and support counts from the classification report.
# Per-class rows are recognised by shape (exactly five whitespace-separated
# fields) instead of a fixed slice: the previous lines[2:-5] assumed a footer
# that this report does not have and silently dropped the last three classes.
# The four-field header and blank lines are excluded by the same test.
lines = classification_rep.split('\n')
data = [line for line in lines if len(line.split()) == 5]

f1_scores = []
supports = []

for line in data:
    tokens = line.split()
    # The 'O' row has zero support and is not an entity class; leave it out.
    if tokens[0] != 'O':
        f1_scores.append(float(tokens[3]))
        supports.append(int(tokens[4]))

# One row per class: its support count and its F1-score.
df = pd.DataFrame({'Support Count': supports, 'F1-Score': f1_scores})

# Pearson correlation between support and F1, with its p-value.
correlation, p_value = stats.pearsonr(df['Support Count'], df['F1-Score'])

# Scatter plot of F1 against support, annotated with the correlation result.
sns.scatterplot(x='Support Count', y='F1-Score', data=df)
plt.gca().set_xlabel('Support Count')
plt.gca().set_ylabel('F1-Score')
plt.gca().set_title('F1-Score vs Support Count')
plt.gca().text(
    0.4,
    0.01,
    f'Correlation: {correlation:.2f} (p-value: {p_value:.4f})',
    transform=plt.gca().transAxes,
    fontsize=12,
)
plt.show()



