In [None]:
import csv, json
from csv_jsonl import JSONLinesDictWriter
import re
from evaluate import load
import torch
import pandas as pd
import os
from collections import Counter
import seaborn as sns
# import pyplot
import matplotlib.pyplot as plt

In [None]:
def read_and_prepare_data(path):
    """Load a JSON Lines file into a flat DataFrame.

    Every non-blank line of the file is parsed as one JSON document and the
    parsed objects are flattened with ``pd.json_normalize``, so nested keys
    become dotted column names (e.g. ``passage.text``).

    Args:
        path: Location of the ``.jsonl`` file; it is read as UTF-8.

    Returns:
        A DataFrame with one row per non-blank line of the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        # Iterate over the file instead of using str.splitlines(): splitlines()
        # also breaks on characters such as U+2028 that may legally appear
        # unescaped inside a JSON string, which would cut a record in half.
        # Each line is parsed exactly once and blank lines are ignored.
        records = [json.loads(line) for line in f if line.strip()]
    return pd.json_normalize(records)

In [None]:
def read_and_prepare_multirc_data(df):
    """Expand a MultiRC frame to one row per answer.

    The ``passage.questions`` column is exploded to one row per question, the
    question objects are flattened into their own columns (columns of the
    incoming frame that clash with them get the ``_original`` suffix), and the
    resulting ``answers`` column is exploded again to one row per answer.
    A ``label`` column is then filled via ``map_label_multirc``.

    Args:
        df: Frame containing a ``passage.questions`` column.

    Returns:
        A new frame with one row per answer and an added ``label`` column.
    """
    per_question = df.explode("passage.questions").reset_index(drop=True)
    question_fields = pd.json_normalize(per_question["passage.questions"])
    per_question = per_question.join(question_fields, lsuffix="_original")
    per_question = per_question.drop(columns=['passage.questions'])

    per_answer = per_question.explode("answers").reset_index(drop=True)
    per_answer["label"] = per_answer.apply(map_label_multirc, axis=1)
    return per_answer

In [None]:
def read_predictions(path, dataset):
    """Collect the stored predictions of every model found under ``path``.

    ``path`` is expected to contain one sub-directory per model. The first
    file (in sorted order) of each sub-directory is loaded with ``torch.load``
    and read as ``loaded[dataset]['preds']``; when ``dataset`` is not a key of
    the loaded object, ``loaded[f'{dataset}_mk']['preds']`` is used instead.

    Args:
        path: Directory holding one sub-directory per model.
        dataset: Dataset key to look up in each loaded prediction object.

    Returns:
        Dict mapping each sub-directory name to its list of predictions,
        inserted in sorted name order.

    Raises:
        IndexError: If a model sub-directory contains no files.
        KeyError: If neither ``dataset`` nor ``f'{dataset}_mk'`` is present.
    """
    best_models_predictions = {}
    # os.listdir() returns entries in arbitrary order; sorting makes both the
    # model order and the chosen prediction file reproducible across machines.
    for filename in sorted(os.listdir(path)):
        model_dir = os.path.join(path, filename)
        if not os.path.isdir(model_dir):
            # Stray files next to the model folders are not models.
            continue
        test_pred_file = sorted(os.listdir(model_dir))[0]
        # NOTE(review): torch.load may unpickle arbitrary objects — only point
        # this at prediction files you trust.
        test_preds = torch.load(os.path.join(model_dir, test_pred_file))

        key = f'{dataset}' if f'{dataset}' in test_preds else f'{dataset}_mk'
        best_models_predictions[filename] = list(test_preds[key]['preds'])
    return best_models_predictions

In [None]:
def merge_data(original_data_path, predictions_path, dataset):
    """Attach every model's predictions to the original dataset rows.

    The dataset is loaded with ``read_and_prepare_data`` (and, for
    ``dataset == "multirc"``, expanded with ``read_and_prepare_multirc_data``).
    Each entry returned by ``read_predictions`` is then added as a column
    named after the model.

    Args:
        original_data_path: Path of the dataset ``.jsonl`` file.
        predictions_path: Directory passed on to ``read_predictions``.
        dataset: Dataset name; also selects the MultiRC-specific expansion.

    Returns:
        Tuple ``(frame, model_names)`` where ``model_names`` lists the added
        prediction columns in insertion order.
    """
    merged = read_and_prepare_data(original_data_path)
    if dataset == "multirc":
        merged = read_and_prepare_multirc_data(merged)

    predictions_by_model = read_predictions(predictions_path, dataset)
    model_names = []
    for name in predictions_by_model:
        # Column assignment fails if the prediction count differs from the row count.
        merged[name] = predictions_by_model[name]
        model_names.append(name)

    return merged, model_names

In [None]:
def map_predictions_boolq(elt):
    """Convert a BoolQ prediction to a boolean: ``False`` for 0, ``True`` otherwise."""
    return False if elt == 0 else True

In [None]:
def map_label_multirc(row):
    """Return the ``label`` entry of the row's ``answers`` mapping."""
    answer = row["answers"]
    return answer["label"]

In [None]:
def count_total(row, dataset):
    """Count how many model predictions in ``row`` equal the gold label.

    Prediction columns are located by position: everything from position 6
    onwards for ``dataset == "copa"`` and from position 7 onwards otherwise.
    NOTE(review): these offsets assume the dataset's own fields occupy exactly
    the first 6 / 7 columns — verify against the merged frame.

    The derived ``total_correct`` / ``total_correct_percentage`` columns that
    ``add_counts`` appends are ignored, so calling ``add_counts`` a second time
    on the same frame does not count them as if they were predictions.

    Args:
        row: A DataFrame row (Series) with a ``label`` entry.
        dataset: Dataset name, used only to pick the positional offset.

    Returns:
        Number of prediction values equal to ``row["label"]``.
    """
    label = row["label"]
    first_prediction = 6 if dataset == "copa" else 7
    predictions = row.iloc[first_prediction:].drop(
        labels=["total_correct", "total_correct_percentage"], errors="ignore"
    )
    counter = Counter(predictions.tolist())
    return counter[label]

def count_total_percentage(row, dataset):
    """Express ``row["total_correct"]`` as a percentage, rounded to 2 decimals.

    The denominator is fixed: 16 for ``dataset == "copa"`` and 15 otherwise.
    NOTE(review): presumably the number of evaluated models per dataset —
    confirm it matches the number of prediction columns actually loaded.
    """
    denominator = 16 if dataset == "copa" else 15
    return round(row["total_correct"] / denominator * 100, 2)

def add_counts(df, dataset):
    """Add per-row correctness statistics to ``df`` and sort it.

    Two columns are written onto the frame that was passed in:
    ``total_correct`` (from ``count_total``) and ``total_correct_percentage``
    (from ``count_total_percentage``). The frame is then sorted in place by
    ``total_correct_percentage`` and ``idx``.

    Args:
        df: Merged frame of dataset fields and model predictions; modified in place.
        dataset: Dataset name forwarded to both helper functions.

    Returns:
        The same (mutated) frame.
    """
    df["total_correct"] = df.apply(count_total, axis=1, args=(dataset,))
    df["total_correct_percentage"] = df.apply(count_total_percentage, axis=1, args=(dataset,))
    df.sort_values(by=["total_correct_percentage", "idx"], inplace=True)
    return df

## BoolQ

In [None]:
# Number of examples in each BoolQ split (one JSON object per line of the file).
boolq_dataset_distribution = {}
for elt in ("train", "val", "test"):
    df = read_and_prepare_data("BoolQ/" + elt + ".jsonl")
    boolq_dataset_distribution[elt] = df.shape[0]

# Two-column table (split name, size) used as input for the bar chart.
df_boolq_distribution = pd.DataFrame(
    {
        'Data sets': ["train", "val", "test"],
        'Size': [*boolq_dataset_distribution.values()],
    }
)

In [None]:
# Bar chart: number of examples per BoolQ split.
# The seaborn style is configured BEFORE the axes are created: styles are
# applied through matplotlib rcParams and only affect axes created afterwards,
# so setting it later makes the figure's look depend on which cells ran before.
sns.set_color_codes("pastel")
sns.set_style("whitegrid", {"grid.linestyle": ":"})
fig, ax = plt.subplots(figsize=(8,6))
# Draw on the explicit axes rather than on pyplot's implicit current axes.
sns.barplot(y="Size", width=0.5, x="Data sets", data=df_boolq_distribution, palette='pastel', orient='v', ax=ax)
ax.bar_label(ax.containers[0], padding=5)
for side in ('top', 'bottom', 'left', 'right'):
    ax.spines[side].set_visible(False)
ax.set_ylabel('Number of samples', rotation=90, labelpad=12, fontsize=15)
ax.set_xlabel('Data set', rotation=0, labelpad=25, fontsize=15)
ax.margins(y=0.01)
ax.set_title('BoolQ', fontsize=20, pad=20)
fig.tight_layout()
# Both outputs are written from the same figure with the same bounding box.
fig.savefig("boolq_distribution.png", bbox_inches='tight')
fig.savefig('boolq_distribution.pdf', bbox_inches='tight')

## COPA

In [None]:
# Evaluation setup for COPA: which split to analyse and where the original
# data and the stored model predictions live.
dtype="test"
dataset="copa"
predictions_path = "Best Models/COPA"
original_dataset_dest = f"Best Models/COPA data/{dtype}.jsonl"
# df: dataset rows plus one prediction column per model; models: those column names.
df, models = merge_data(original_dataset_dest, predictions_path, dataset)

In [None]:
# Add total_correct / total_correct_percentage per example and sort by them.
df = add_counts(df, dataset)

In [None]:
# Per-model accuracy on COPA, split by question type ("effect" vs. "cause").
# The question-type masks do not depend on the model, so they are built once.
is_effect = df["question"] == "effect"
is_cause = df["question"] == "cause"
total_effect = int(is_effect.sum())
total_cause = int(is_cause.sum())
models_lst = []
effect_p = []
cause_p = []

print(f"Total effect: {total_effect}\tTotal cause:{total_cause}\n")
for model in models:
    is_correct = df["label"] == df[model]
    counter_effect = int((is_correct & is_effect).sum())
    counter_cause = int((is_correct & is_cause).sum())
    # Name the model first; otherwise the printed percentages cannot be
    # attributed to any model.
    print(model)
    print(f"\tCount_effect % = {(counter_effect/total_effect)*100:0.2f}%, \tCount_cause % = {(counter_cause/total_cause)*100:0.2f}%\n")

    models_lst.append(model)
    effect_p.append(round(counter_effect/total_effect*100,2))
    cause_p.append(round(counter_cause/total_cause*100,2))

In [None]:
# One row per model with its accuracy (%) on effect and on cause questions.
df_copa_results = pd.DataFrame({'Models': models_lst, 'effect': effect_p, 'cause': cause_p})

In [None]:
# Preview the first rows of the per-model results table.
df_copa_results.head()

In [None]:
# Long format: one row per (model, choice type) with the accuracy in `value`.
df2 = pd.melt(df_copa_results, id_vars=['Models'], value_vars=['effect','cause'], var_name='Choice type')

In [None]:
# Grouped horizontal bar chart: per-model accuracy on effect vs. cause questions.
# The seaborn style is configured BEFORE the axes are created: styles are
# applied through matplotlib rcParams and only affect axes created afterwards.
sns.set_color_codes("pastel")
sns.set_style("whitegrid", {"grid.linestyle": ":"})
fig, ax = plt.subplots(figsize=(10,10))
# Draw on the explicit axes rather than on pyplot's implicit current axes.
sns.barplot(x="value", y="Models", hue="Choice type", data=df2, palette='pastel', ax=ax)
# Label the first two bar groups (the plot has two hue levels) with their percentage.
for container_index in (0, 1):
    ax.bar_label(ax.containers[container_index], fmt='%.2f%%', padding=5)
for side in ('top', 'bottom', 'left', 'right'):
    ax.spines[side].set_visible(False)
ax.set_xlim(0,101)
ax.set_xlabel('Percentage of correct answers', rotation=0, labelpad=12, fontsize=15)
ax.set_ylabel('Model', rotation=90, labelpad=25, fontsize=15)
ax.margins(y=0.01)
fig.tight_layout()
# Both outputs are written from the same figure with the same bounding box.
fig.savefig("copa.png", bbox_inches='tight')
fig.savefig('copa.pdf', bbox_inches='tight')

In [None]:
# Count "effect" and "cause" questions in every COPA split.
copa_dataset_distribution = {"train": 400, "val": 100, "test": 500}
copa_type_distribution = {}
effects = []
causes = []
for elt in ["train", "val", "test"]:
    # A dedicated name is used for the split so the merged evaluation frame
    # `df` (dataset rows + model predictions) is not overwritten; the earlier
    # COPA cells can then still be re-run after this one.
    df_split = read_and_prepare_data(f"COPA/{elt}.jsonl")
    total_effect = int((df_split["question"] == "effect").sum())
    total_cause = int((df_split["question"] == "cause").sum())
    effects.append(total_effect)
    causes.append(total_cause)
    copa_type_distribution[elt] = [total_effect, total_cause]

In [None]:
# Wide table: one row per split with its effect / cause question counts.
df_copa_distribution = pd.DataFrame(
    {
        'Data sets': ["train", "val", "test"],
        'effect': effects,
        'cause': causes,
    }
)
# Long table: one row per (split, choice type), as needed for a grouped bar plot.
df_copa_distribution_2 = df_copa_distribution.melt(
    id_vars=['Data sets'],
    value_vars=['effect','cause'],
    var_name='Choice type',
)

In [None]:
# Grouped bar chart: effect / cause question counts per COPA split.
# The seaborn style is configured BEFORE the axes are created: styles are
# applied through matplotlib rcParams and only affect axes created afterwards.
sns.set_color_codes("pastel")
sns.set_style("whitegrid", {"grid.linestyle": ":"})
fig, ax = plt.subplots(figsize=(9,6))
# Draw on the explicit axes rather than on pyplot's implicit current axes.
sns.barplot(y="value", x="Data sets", hue="Choice type", data=df_copa_distribution_2, palette='pastel', orient='v', ax=ax)
# Label the first two bar groups (the plot has two hue levels) with their counts.
for container_index in (0, 1):
    ax.bar_label(ax.containers[container_index], padding=5)
for side in ('top', 'bottom', 'left', 'right'):
    ax.spines[side].set_visible(False)
ax.set_ylim(0,280)
ax.set_ylabel('Number of samples', rotation=90, labelpad=12, fontsize=15)
ax.set_xlabel('Data set', rotation=0, labelpad=25, fontsize=15)
ax.margins(y=0.01)
ax.set_title('COPA', fontsize=20, pad=20)
# The legend is anchored outside the axes, to the right of the bars.
ax.legend(loc='upper right',title='Choice type',bbox_to_anchor=(1.2,0.5))
fig.tight_layout()
# bbox_inches='tight' on both outputs so the outside legend is kept in the PDF too.
fig.savefig("copa_distribution.png", bbox_inches='tight')
fig.savefig('copa_distribution.pdf', bbox_inches='tight')

## MultiRC

In [None]:
# Evaluation setup for MultiRC: which split to analyse and where the original
# data and the stored model predictions live.
dataset="multirc"
dtype="test"
predictions_path = "Best Models/MultiRC"
original_dataset_dest = f"Best Models/MultiRC data/{dtype}.jsonl"
# df: one row per answer plus one prediction column per model; models: those column names.
df, models = merge_data(original_dataset_dest, predictions_path, dataset)

In [None]:
# Preview the first rows of the merged MultiRC frame.
df.head()

In [None]:
# Add total_correct / total_correct_percentage per answer row and sort by them.
df = add_counts(df, dataset)

In [None]:
# Persist the per-answer evaluation table as UTF-8 CSV.
df.to_csv("Best Models/multirc_evaluation.csv", encoding="utf-8")

### Dataset distribution

In [None]:
# Passages and questions are counted separately for every MultiRC split.
multirc_dataset_distribution = {}
multirc_questions_distribution = {}
for elt in ["train", "val", "test"]:
    # A dedicated name is used for the split so the merged evaluation frame
    # `df` (rows + model predictions) is not overwritten by this loop.
    df_split = read_and_prepare_data(f"MultiRC/{elt}.jsonl")
    # One row per line of the file, i.e. per passage.
    multirc_dataset_distribution[elt] = len(df_split)
    # Exploding the question list yields one row per question.
    multirc_questions_distribution[elt] = len(df_split.explode("passage.questions"))

In [None]:
# Two-column tables (split name, size) feeding the passage and question bar charts.
df_multirc_distribution = pd.DataFrame(
    {
        'Data sets': ["train", "val", "test"],
        'Size': [*multirc_dataset_distribution.values()],
    }
)
df_multirc_questions = pd.DataFrame(
    {
        'Data sets': ["train", "val", "test"],
        'Size': [*multirc_questions_distribution.values()],
    }
)


In [None]:
# Bar chart: number of passages per MultiRC split.
# The seaborn style is configured BEFORE the axes are created: styles are
# applied through matplotlib rcParams and only affect axes created afterwards.
sns.set_color_codes("pastel")
sns.set_style("whitegrid", {"grid.linestyle": ":"})
fig, ax = plt.subplots(figsize=(8,6))
# Draw on the explicit axes rather than on pyplot's implicit current axes.
sns.barplot(y="Size", width=0.5, x="Data sets", data=df_multirc_distribution, palette='pastel', orient='v', ax=ax)
ax.bar_label(ax.containers[0], padding=5)
for side in ('top', 'bottom', 'left', 'right'):
    ax.spines[side].set_visible(False)
ax.set_ylabel('Number of paragraphs', rotation=90, labelpad=12, fontsize=15)
ax.set_xlabel('Data set', rotation=0, labelpad=25, fontsize=15)
ax.margins(y=0.01)
ax.set_title('MultiRC', fontsize=20, pad=20)
fig.tight_layout()
# Both outputs are written from the same figure with the same bounding box.
fig.savefig("multirc_passages.png", bbox_inches='tight')
fig.savefig('multirc_passages.pdf', bbox_inches='tight')

In [None]:
# Bar chart: number of questions per MultiRC split.
# The seaborn style is configured BEFORE the axes are created: styles are
# applied through matplotlib rcParams and only affect axes created afterwards.
sns.set_color_codes("pastel")
sns.set_style("whitegrid", {"grid.linestyle": ":"})
fig, ax = plt.subplots(figsize=(8,6))
# Draw on the explicit axes rather than on pyplot's implicit current axes.
sns.barplot(y="Size", width=0.5, x="Data sets", data=df_multirc_questions, palette='pastel', orient='v', ax=ax)
ax.bar_label(ax.containers[0], padding=5)
for side in ('top', 'bottom', 'left', 'right'):
    ax.spines[side].set_visible(False)
ax.set_ylabel('Number of questions', rotation=90, labelpad=12, fontsize=15)
ax.set_xlabel('Data set', rotation=0, labelpad=25, fontsize=15)
ax.margins(y=0.01)
ax.set_title('MultiRC', fontsize=20, pad=20)
fig.tight_layout()
# Both outputs are written from the same figure with the same bounding box.
fig.savefig("multirc_questions.png", bbox_inches='tight')
fig.savefig('multirc_questions.pdf', bbox_inches='tight')

In [None]:
## Answers distribution
# One histogram per MultiRC split showing how many answers each question has.
multirc_answer_distribution = {}
# Loop-invariant settings are defined once. The seaborn style is set before any
# axes are created because styles only affect axes created afterwards.
colors = {"train": "#abc9ea", "val": "#efb792", "test":"#98daa7"}
sns.set_color_codes("pastel")
sns.set_style("whitegrid", {"grid.linestyle": ":"})
for elt in ["train", "val", "test"]:
    # NOTE(review): `df` and `df2` are reused here and overwrite the frames of
    # the same name created by earlier cells; the names are kept in case later
    # cells read them.
    df = read_and_prepare_data(f"MultiRC/{elt}.jsonl")
    df = df.explode("passage.questions").reset_index(drop=True)
    df = df.join(pd.json_normalize(df["passage.questions"]), lsuffix="_original").drop(columns=['passage.questions'])
    df = df.explode("answers").reset_index(drop=True)
    df = df[['question', 'answers']]
    # NOTE(review): grouping by question text merges identically worded
    # questions from different passages — confirm this is intended.
    df2 = df.groupby('question').count()

    # Human-readable split name for the axis label and file names; kept in its
    # own variable instead of reassigning the loop variable.
    split_label = "validation" if elt == "val" else elt

    fig, ax = plt.subplots(figsize=(8,6))
    # Draw on the explicit axes rather than on pyplot's implicit current axes.
    sns.histplot(data=df2, x="answers", binwidth=1, color=colors[elt], ax=ax)
    for side in ('top', 'bottom', 'left', 'right'):
        ax.spines[side].set_visible(False)
    ax.set_ylabel('Frequency', rotation=90, labelpad=12, fontsize=15)
    ax.set_xlabel(f'Number of answers per question in {split_label} set', rotation=0, labelpad=25, fontsize=15)
    ax.margins(y=0.01)
    ax.set_title('MultiRC', fontsize=20, pad=20)
    fig.tight_layout()
    fig.savefig(f"multirc_histogram_{split_label}.png", bbox_inches='tight')
    # The PDF name previously lacked the underscore used by the PNG
    # ("multirc_histogram{split}.pdf"); both files now share the same stem.
    fig.savefig(f'multirc_histogram_{split_label}.pdf', bbox_inches='tight')