In [1]:
import os
import json
import numpy as np
import pandas as pd
import itertools
from constants.rave_constants import *
from constants.metrics_constants import *
from utils.utility import load_rave_dataset
from experiments.running_experiments import AttributePredictionResult
from evaluation.attributes_prediction_evaluation import evaluate_predictions
from sklearn.metrics import precision_score, recall_score, jaccard_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


In [50]:
# Column names used by the result/evaluation DataFrames built in this notebook.

# -- identity of a single prediction row
SAMPLE_ID_KEY = "sample_id"
EXPERIMENT_NAME_KEY = "experiment_name"
DATASET_VERSION_KEY = "dataset_version"
FOLD_NUMBER_KEY = "fold_number"

# -- description of the approach that produced the prediction
CLASSIFIER_KEY = "classifier"
VECTORIZER_KEY = "vectorizer"
MODEL_SIZE_KEY = "model_size"
EXAMPLE_COUNT_KEY = "example_count"

# -- predicted vs. ground-truth attributes (raw lists and their binary encodings)
PREDICTION_KEY = "prediction"
PREDICTION_BINARY_KEY = "prediction_binary"
ACTUAL_KEY = "actual"
ACTUAL_BINARY_KEY = "actual_binary"

In [51]:
def parse_llm_predictior_info(experiment_name: str) -> dict:
    """Parse the metadata encoded in an LLM experiment name.

    The name is split on ``_`` and read positionally as
    ``<model>_<size>b_<dataset version>_<fold>_<k>shot``.

    Args:
        experiment_name: Experiment (file base) name of an LLM run.

    Returns:
        A dict with the vectorizer (always ``""`` for LLM runs), classifier
        (the model name), dataset version, fold number, a display name of the
        form ``<model>:<size>b-<k>shots``, the example count ``k`` and the
        model size as a float.

    Raises:
        IndexError: If the name has fewer than five ``_``-separated parts.
        ValueError: If the size, fold or example-count token is not numeric
            once its ``b`` / ``shot`` suffix has been removed.
    """
    splitted = experiment_name.split('_')
    # removesuffix() drops exactly the trailing unit. str.strip('shot') would
    # instead strip any of the characters s/h/o/t from BOTH ends, silently
    # mangling a malformed token rather than failing on it.
    model_size = float(splitted[1].removesuffix('b'))
    examples = int(splitted[4].removesuffix('shot'))
    return {
        VECTORIZER_KEY: "",
        CLASSIFIER_KEY: splitted[0],
        DATASET_VERSION_KEY: splitted[2],
        FOLD_NUMBER_KEY: int(splitted[3]),
        # "<k>shot" + "s" -> "<k>shots" in the display name.
        EXPERIMENT_NAME_KEY: f"{splitted[0]}:{splitted[1]}-{splitted[4]}s",
        EXAMPLE_COUNT_KEY: examples,
        MODEL_SIZE_KEY: model_size
    }

def parse_classifier_predictior_info(experiment_name: str) -> dict:
    """Parse the metadata encoded in a vectorizer + classifier experiment name.

    The name is split on ``_`` and read positionally as
    ``<vectorizer>_<classifier>_<dataset version>_<fold>``.

    Args:
        experiment_name: Experiment (file base) name of a classifier run.

    Returns:
        A dict with vectorizer, classifier, dataset version, fold number and a
        display name ``<vectorizer>_<classifier>``. Example count and model
        size are both reported as ``0``.

    Raises:
        IndexError: If the name has fewer than four ``_``-separated parts.
        ValueError: If the fold token is not an integer.
    """
    parts = experiment_name.split('_')
    vectorizer = parts[0]
    classifier = parts[1]
    dataset_version = parts[2]
    fold_number = int(parts[3])

    info = {
        VECTORIZER_KEY: vectorizer,
        CLASSIFIER_KEY: classifier,
        DATASET_VERSION_KEY: dataset_version,
        FOLD_NUMBER_KEY: fold_number,
        EXPERIMENT_NAME_KEY: f"{vectorizer}_{classifier}",
        EXAMPLE_COUNT_KEY: 0,
        MODEL_SIZE_KEY: 0,
    }
    return info

def parse_base_name(experiment_name: str) -> dict:
    """Dispatch an experiment name to the matching metadata parser.

    Names ending in ``"shot"`` are parsed as LLM runs; every other name is
    parsed as a vectorizer + classifier run.
    """
    if experiment_name.endswith("shot"):
        return parse_llm_predictior_info(experiment_name)
    return parse_classifier_predictior_info(experiment_name)

def get_transformed_rows(base_name: str, prediction_list: list[dict]) -> list[dict]:
    """Turn the predictions of one experiment into flat row dicts.

    Args:
        base_name: Experiment name; parsed once with ``parse_base_name`` and
            its fields are repeated on every row.
        prediction_list: Per-sample prediction dicts, indexed with
            ``AttributePredictionResult.ID`` and
            ``AttributePredictionResult.PREDICTED_ATTRIBUTES``.

    Returns:
        One dict per sample: the experiment metadata plus the sample id and
        the predicted attributes.
    """
    shared_fields = parse_base_name(base_name)
    return [
        shared_fields | {
            SAMPLE_ID_KEY: entry[AttributePredictionResult.ID],
            PREDICTION_KEY: entry[AttributePredictionResult.PREDICTED_ATTRIBUTES],
        }
        for entry in prediction_list
    ]

def create_binary_vector(ordered_attributes: list[str], provided_attributes: list[str]):
    """Encode which of ``ordered_attributes`` occur in ``provided_attributes``.

    Matching is case-insensitive. Entries of ``provided_attributes`` that are
    not strings are ignored.

    Returns:
        A list of ``0``/``1`` ints, one per entry of ``ordered_attributes`` and
        in the same order.
    """
    present = set()
    for candidate in provided_attributes:
        if isinstance(candidate, str):
            present.add(candidate.lower())
    return [int(name.lower() in present) for name in ordered_attributes]

In [52]:
# Dataset version to evaluate; selects both the attribute list and the dataset file.
used_version = "v2"
attribute_names_version = {
    "v1": ALL_SELECTED_ATTRIBUTES,
    "v2": ALL_SELECTED_ATTRIBUTES_V2,
}
TRANSFORMED_RESULT_FOLDER_PATH = './attribute_prediction_results'

SELECTED_ATTRIBUTES = attribute_names_version[used_version]
# Fixed attribute order so every binary vector uses the same positions.
SORTED_ATTRIBUTES = sorted(SELECTED_ATTRIBUTES)

dataset_path = f"./dataset/rave_dataset_{used_version}.json"
rave_dataset = load_rave_dataset(dataset_path)
for sample in rave_dataset:
    # Presumably drops text attributes outside SELECTED_ATTRIBUTES - TODO confirm against the helper.
    sample.keep_selected_text_attributes(SELECTED_ATTRIBUTES)

# Ground truth: sample id -> names of the text attributes that sample has.
sample_attributes_dict = { sample.id : list(sample.text_attributes.keys()) for sample in rave_dataset }

predicted_attributes_results: dict[str, list[dict]] = {}
all_data_rows: list[dict] = []
# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the row order reproducible.
for filename in sorted(os.listdir(TRANSFORMED_RESULT_FOLDER_PATH)):
    file_path = os.path.join(TRANSFORMED_RESULT_FOLDER_PATH, filename)
    # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints); open() would raise on them.
    if not os.path.isfile(file_path):
        continue
    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            data = json.load(file)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            # Unreadable files are reported and skipped instead of aborting the whole load.
            print(f"Error decoding JSON from file {filename}: {e}")
            continue
    # The file name without its extension is the experiment name parsed by parse_base_name.
    experiment_name = os.path.splitext(os.path.basename(file_path))[0]
    all_data_rows += get_transformed_rows(experiment_name, data)

In [53]:
# One row per (experiment, sample) with the predicted attribute list.
results_df = pd.DataFrame(all_data_rows)
# Attach the ground-truth attribute list of each sample; unknown ids map to NaN.
results_df[ACTUAL_KEY] = results_df[SAMPLE_ID_KEY].map(sample_attributes_dict)

# Fail fast with a readable message: a NaN ground truth would otherwise surface
# as an opaque "'float' object is not iterable" inside create_binary_vector.
missing_actual_mask = results_df[ACTUAL_KEY].isna()
if missing_actual_mask.any():
    unknown_sample_ids = results_df.loc[missing_actual_mask, SAMPLE_ID_KEY].unique().tolist()
    raise KeyError(
        f"{len(unknown_sample_ids)} predicted sample id(s) are not present in the dataset, "
        f"e.g. {unknown_sample_ids[:5]}"
    )

# Encode both attribute lists as 0/1 vectors over SORTED_ATTRIBUTES.
results_df[ACTUAL_BINARY_KEY] = results_df[ACTUAL_KEY].map(lambda x: create_binary_vector(SORTED_ATTRIBUTES, x))
results_df[PREDICTION_BINARY_KEY] = results_df[PREDICTION_KEY].map(lambda x: create_binary_vector(SORTED_ATTRIBUTES, x))

In [54]:
# Evaluate every (approach, fold) group separately; one result row per group.
grouping_keys = [
    CLASSIFIER_KEY,
    MODEL_SIZE_KEY,
    VECTORIZER_KEY,
    EXPERIMENT_NAME_KEY,
    FOLD_NUMBER_KEY,
    EXAMPLE_COUNT_KEY,
]
approach_evaluation_results: list[dict] = []

for group_key, group in results_df.groupby(grouping_keys):
    classifier_name, model_size, vectorizer_name, experiment_name, fold_number, example_count = group_key

    # Stack the per-sample binary vectors into 2-D arrays (samples x attributes).
    y_true = np.array(group[ACTUAL_BINARY_KEY].tolist())
    y_pred = np.array(group[PREDICTION_BINARY_KEY].tolist())

    group_identity = {
        CLASSIFIER_KEY: classifier_name,
        MODEL_SIZE_KEY: model_size,
        VECTORIZER_KEY: vectorizer_name,
        EXPERIMENT_NAME_KEY: experiment_name,
        FOLD_NUMBER_KEY: int(fold_number),
        EXAMPLE_COUNT_KEY: int(example_count),
    }
    # evaluate_predictions returns a mapping that is merged into the row
    # (presumably metric name -> score; see its definition).
    res = group_identity | evaluate_predictions(y_true, y_pred)
    approach_evaluation_results.append(res)

evaluation_df = pd.DataFrame(approach_evaluation_results)

In [55]:
# Metric columns that are aggregated over folds and reported.
metrics_used = [
    MICRO_PRECISION_KEY,
    MICRO_RECALL_KEY,
    MICRO_F1_KEY,
    MACRO_PRECISION_KEY,
    MACRO_RECALL_KEY,
    MACRO_F1_KEY,
    WEIGHTED_PRECISION_KEY,
    WEIGHTED_RECALL_KEY,
    WEIGHTED_F1_KEY,
]

In [56]:
# Average every metric over the folds of each approach.
summary_group_keys = [
    EXPERIMENT_NAME_KEY,
    CLASSIFIER_KEY,
    VECTORIZER_KEY,
    MODEL_SIZE_KEY,
    EXAMPLE_COUNT_KEY,
]
summary_df = evaluation_df.groupby(summary_group_keys)[metrics_used].agg(['mean'])
# Flatten the (metric, statistic) column MultiIndex to "<metric>_<statistic>".
summary_df.columns = ['_'.join(level_names).strip() for level_names in summary_df.columns]
summary_df = summary_df.reset_index()
summary_df

Unnamed: 0,experiment_name,classifier,vectorizer,model_size,example_count,micro_precision_mean,micro_recall_mean,micro_f1_mean,macro_precision_mean,macro_recall_mean,macro_f1_mean,weighted_precision_mean,weighted_recall_mean,weighted_f1_mean
0,bow_1-NN,1-NN,bow,0.0,0,0.840281,0.755435,0.795533,0.619431,0.512001,0.542897,0.845951,0.755435,0.789906
1,bow_3-NN,3-NN,bow,0.0,0,0.892161,0.709368,0.789938,0.605083,0.416288,0.471898,0.866973,0.709368,0.767955
2,bow_5-NN,5-NN,bow,0.0,0,0.905749,0.688513,0.782098,0.581452,0.373656,0.430073,0.869676,0.688513,0.753023
3,bow_7-NN,7-NN,bow,0.0,0,0.913102,0.672665,0.774421,0.571110,0.353890,0.411692,0.871262,0.672665,0.740892
4,bow_LogReg,LogReg,bow,0.0,0,0.961286,0.859675,0.907557,0.719894,0.552869,0.598662,0.939055,0.859675,0.887146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,tfidf_5-NN,5-NN,tfidf,0.0,0,0.913512,0.649804,0.759250,0.680397,0.394260,0.473476,0.894345,0.649804,0.734791
71,tfidf_7-NN,7-NN,tfidf,0.0,0,0.928273,0.637479,0.755834,0.617985,0.366747,0.439467,0.885784,0.637479,0.724127
72,tfidf_LogReg,LogReg,tfidf,0.0,0,0.973817,0.688323,0.806387,0.553438,0.336092,0.393874,0.900536,0.688323,0.763944
73,tfidf_SVM,SVM,tfidf,0.0,0,0.965413,0.851199,0.904616,0.699871,0.535948,0.576992,0.934089,0.851199,0.877367


In [57]:
# Display order of the approaches; used as ordered categories so sorting
# follows this order instead of the alphabetical one.
classifier_order = [
    "LogReg", "SVM", "XGBoost",
    "1-NN", "3-NN", "5-NN", "7-NN",
    "gemma", "gemma2", "llama3.1", "llama3.2", "mistral", "qwen2", "qwen2.5",
]
vectorizer_order = ["bow", "tfidf", "embeddings", ""]

# NOTE: values that are not listed in the given order become NaN in the categorical column.
for column_key, category_order in ((CLASSIFIER_KEY, classifier_order), (VECTORIZER_KEY, vectorizer_order)):
    summary_df[column_key] = pd.Categorical(summary_df[column_key], categories=category_order, ordered=True)


In [68]:
def to_max_as_percent(col: pd.Series) -> pd.Series:
    """Format fractions as percentage strings and bold the column maximum.

    Every value is multiplied by 100, rounded to two decimals and rendered
    with two decimals (no percent sign is appended). Each entry equal to the
    maximum of the rounded column is wrapped in ``\\textbf{...}``.

    Returns:
        A new Series of strings with a fresh default integer index; the index
        of ``col`` is not carried over.
    """
    percentages = (col * 100).round(2)
    best = percentages.max()
    formatted = [
        f"\\textbf{{{value:.2f}}}" if value == best else f"{value:.2f}"
        for value in percentages
    ]
    return pd.Series(formatted)

def transform_names(col: pd.Series) -> pd.Series:
    """Turn experiment names into bold LaTeX labels.

    A leading vectorizer token is replaced by its display form
    (``bow`` -> ``BoW``, ``tfidf`` -> ``TF-IDF``, ``embeddings`` -> ``Emb``);
    only the prefix is rewritten, the rest of the name is kept verbatim.
    The result is wrapped in ``\\textbf{...}``.

    Args:
        col: Series of experiment-name strings.

    Returns:
        A new Series of strings with a fresh default integer index; the index
        of ``col`` is not carried over.
    """
    # NOTE(review): underscores in the names are emitted unescaped; LaTeX text
    # mode normally needs "\_" - confirm how the tables are post-processed.
    prefix_labels = (("bow", "BoW"), ("tfidf", "TF-IDF"), ("embeddings", "Emb"))
    result: list[str] = []
    for val in col:
        for prefix, label in prefix_labels:
            if val.startswith(prefix):
                # Slice instead of str.replace so a later occurrence of the
                # same token inside the name is left untouched.
                val = label + val[len(prefix):]
                break
        result.append(f"\\textbf{{{val}}}")
    return pd.Series(result)

def transform_to_latex_df(df: pd.DataFrame) -> pd.DataFrame:
    """Build the string-valued table that is later exported to LaTeX.

    Args:
        df: Summary frame holding the experiment-name column and one
            ``<metric>_mean`` column per reported metric.

    Returns:
        A new DataFrame with a ``"Model"`` column (names formatted by
        ``transform_names``) followed by one column per metric, formatted by
        ``to_max_as_percent`` so the best value of each column is bold.
    """
    metric_keys = (
        MICRO_PRECISION_KEY, MICRO_RECALL_KEY, MICRO_F1_KEY,
        MACRO_PRECISION_KEY, MACRO_RECALL_KEY, MACRO_F1_KEY,
        WEIGHTED_PRECISION_KEY, WEIGHTED_RECALL_KEY, WEIGHTED_F1_KEY,
    )
    table = pd.DataFrame()
    table["Model"] = transform_names(df[EXPERIMENT_NAME_KEY])
    for key in metric_keys:
        mean_column = key + '_mean'
        table[key] = to_max_as_percent(df[mean_column])
    return table


# NOTE(review): despite its name this is a plain set of metric keys, not an
# alias mapping, and no visible cell reads it.
metrics_aliases = {
    MICRO_PRECISION_KEY, MICRO_RECALL_KEY, MICRO_F1_KEY,
    MACRO_PRECISION_KEY, MACRO_RECALL_KEY, MACRO_F1_KEY,
    WEIGHTED_PRECISION_KEY, WEIGHTED_RECALL_KEY, WEIGHTED_F1_KEY,
}

# Order rows for presentation: vectorizer, classifier, model size, example count.
sort_keys = [VECTORIZER_KEY, CLASSIFIER_KEY, MODEL_SIZE_KEY, EXAMPLE_COUNT_KEY]
summary_sorted_df = summary_df.sort_values(by=sort_keys)

# Rows with an empty vectorizer are the LLM runs; all other rows form the two-phase table.
is_llm_row = summary_sorted_df[VECTORIZER_KEY] == ""
two_phase_df = summary_sorted_df.loc[~is_llm_row]
llm_df = summary_sorted_df.loc[is_llm_row]

tp_latex_df = transform_to_latex_df(two_phase_df)
llm_latex_df = transform_to_latex_df(llm_df)

In [69]:
# Emit the table built from the non-LLM (vectorizer + classifier) rows as LaTeX source, without the row index.
print(tp_latex_df.to_latex(index=False))

\begin{tabular}{llllllllll}
\toprule
Model & micro_precision & micro_recall & micro_f1 & macro_precision & macro_recall & macro_f1 & weighted_precision & weighted_recall & weighted_f1 \\
\midrule
\textbf{BoW_LogReg} & 96.13 & 85.97 & 90.76 & 71.99 & 55.29 & 59.87 & 93.91 & 85.97 & 88.71 \\
\textbf{BoW_SVM} & 94.51 & \textbf{90.60} & 92.51 & \textbf{79.29} & \textbf{69.77} & \textbf{72.43} & 94.32 & \textbf{90.60} & \textbf{91.93} \\
\textbf{BoW_XGBoost} & 95.36 & 90.39 & \textbf{92.79} & 71.90 & 65.36 & 67.32 & 93.72 & 90.39 & 91.57 \\
\textbf{BoW_1-NN} & 84.03 & 75.54 & 79.55 & 61.94 & 51.20 & 54.29 & 84.60 & 75.54 & 78.99 \\
\textbf{BoW_3-NN} & 89.22 & 70.94 & 78.99 & 60.51 & 41.63 & 47.19 & 86.70 & 70.94 & 76.80 \\
\textbf{BoW_5-NN} & 90.57 & 68.85 & 78.21 & 58.15 & 37.37 & 43.01 & 86.97 & 68.85 & 75.30 \\
\textbf{BoW_7-NN} & 91.31 & 67.27 & 77.44 & 57.11 & 35.39 & 41.17 & 87.13 & 67.27 & 74.09 \\
\textbf{TF-IDF_LogReg} & 97.38 & 68.83 & 80.64 & 55.34 & 33.61 & 39.39 & 90.05 & 68.83

In [71]:
# Emit the table built from the LLM rows (empty vectorizer) as LaTeX source, without the row index.
print(llm_latex_df.to_latex(index=False))

\begin{tabular}{llllllllll}
\toprule
Model & micro_precision & micro_recall & micro_f1 & macro_precision & macro_recall & macro_f1 & weighted_precision & weighted_recall & weighted_f1 \\
\midrule
\textbf{gemma:2b-0shots} & 23.39 & 95.22 & 37.56 & 23.34 & 91.90 & 30.84 & 55.12 & 95.22 & 64.60 \\
\textbf{gemma:2b-1shots} & 42.34 & 55.79 & 48.11 & 23.55 & 32.36 & 25.38 & 54.66 & 55.79 & 54.60 \\
\textbf{gemma:2b-2shots} & 37.95 & 54.42 & 44.72 & 24.21 & 35.16 & 25.78 & 56.32 & 54.42 & 54.17 \\
\textbf{gemma:7b-0shots} & 48.58 & 44.55 & 46.46 & 39.63 & 43.66 & 33.33 & 73.10 & 44.55 & 50.04 \\
\textbf{gemma:7b-1shots} & 64.69 & 56.92 & 60.53 & 40.83 & 40.77 & 37.36 & 72.17 & 56.92 & 61.54 \\
\textbf{gemma:7b-2shots} & 66.88 & 59.69 & 63.06 & 40.78 & 35.64 & 35.01 & 70.21 & 59.69 & 62.79 \\
\textbf{gemma2:2b-0shots} & 26.86 & 78.04 & 39.95 & 25.58 & 77.30 & 31.23 & 57.81 & 78.04 & 61.19 \\
\textbf{gemma2:2b-1shots} & 41.40 & 63.99 & 50.24 & 30.08 & 53.18 & 32.69 & 63.57 & 63.99 & 60.52 \\
\t