# Checking results

The objective of this notebook is simply to place side by side, for each label (AG, AI, etc.), the column of true values and the column of predicted values. It is a manual checking mechanism.

In [None]:
import pandas as pd
import json
import re
import numpy as np

In [None]:
def extract_text_and_values(json_file, desired_label=None, aggregation="mean"):
    """
    Extracts the text of each item and the numeric value annotated for it in a JSON file.

    Every annotation of an item whose labels contain ``desired_label`` (or every
    annotation when ``desired_label`` is None) is scanned for numbers written like
    "12", "12.5", "12,5" or "12 %". When several numbers are found for one item they
    are reduced to a single value according to ``aggregation``. Items for which no
    number is found are left out of the result.

    Args:
        json_file (str): Path to the JSON file.
        desired_label (str): The label to filter (optional).
        aggregation (str): How several values of one item are combined: "mean"
            (default), "median" or "random" (one of the values picked at random).
    Returns:
        pd.DataFrame: A DataFrame with "text" and "values" columns, "values" holding
            one float per row rounded to 2 decimals.
    Raises:
        ValueError: If aggregation is not one of the supported names.
    """
    aggregators = {
        "mean": lambda found: sum(found) / len(found),
        "median": lambda found: float(np.median(found)),
        "random": lambda found: float(np.random.choice(found)),
    }
    if aggregation not in aggregators:
        raise ValueError(
            f"Unknown aggregation {aggregation!r}, expected one of {sorted(aggregators)}"
        )
    aggregate = aggregators[aggregation]

    # The optional "%" is matched but not captured, so only the number itself is returned
    number_pattern = re.compile(r'(\d+(?:[\.,]\d+)?)\s*%?')

    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    text_and_values = []

    for item in data:
        values = []

        # "label" can be missing or null for an item without annotation
        for label_info in item.get("label") or []:
            label_text = label_info.get("text")
            label_labels = label_info.get("labels") or []

            if desired_label is not None and desired_label not in label_labels:
                continue
            if not isinstance(label_text, str):
                # an annotation without text cannot hold a number
                continue

            # decimal commas are turned into dots before the conversion to float
            values.extend(
                float(match.replace(',', '.')) for match in number_pattern.findall(label_text)
            )

        if values:
            text_and_values.append([item.get("text"), round(aggregate(values), 2)])

    return pd.DataFrame(text_and_values, columns=["text", "values"])

For the predicted data

In [None]:
def extract_pred_column(df_path:str, column):
    """
    Reads a CSV file and keeps only the "text" column and the requested column.

    Args:
        df_path (str): Path to the CSV file holding the predictions.
        column (str): The column to extract.
    Returns:
        pd.DataFrame: A DataFrame with the "text" column followed by the requested column.
    """
    predictions = pd.read_csv(df_path)
    wanted_columns = ["text", column]
    return predictions[wanted_columns]

def merge_dfs(df_true, df_pred, column):
    """
    Joins the true and the predicted values on the rows that share the same text.

    Only the texts present in both DataFrames are kept.

    Args:
        df_true (pd.DataFrame): DataFrame with the true values in a "values" column.
        df_pred (pd.DataFrame): DataFrame with the predicted values.
        column (str): Name of the column of df_pred that holds the predictions.
    Returns:
        pd.DataFrame: A DataFrame with "text", "true" and "pred" columns.
    """
    true_side = df_true.rename(columns={"values": "true"})
    pred_side = df_pred.rename(columns={column: "pred"})
    return true_side.merge(pred_side, on="text")

def clean_df(df):
    """
    Drops the rows whose "pred" value is NaN and renumbers the remaining rows from 0.

    Args:
        df (pd.DataFrame): DataFrame with a "pred" column.
    Returns:
        pd.DataFrame: A new DataFrame without the rows that have no prediction.
    """
    return df.dropna(subset=["pred"]).reset_index(drop=True)

def calculate_diff(df, save_to_csv=False, save_path=None):
    """
    Adds a "diff" column holding the absolute difference between the true and predicted values.

    The column is added to the DataFrame that is passed in (it is modified in place)
    and that same DataFrame is returned.

    Args:
        df (pd.DataFrame): DataFrame with the true and predicted values.
        save_to_csv (bool): Whether to save the DataFrame to a csv file.
        save_path (str): Path to save the DataFrame.
    Returns:
        pd.DataFrame: The input DataFrame with the additional "diff" column.
    Raises:
        ValueError: If save_to_csv is True but no save_path is given.
    """
    # Without a path, DataFrame.to_csv returns the csv content as a string instead of
    # writing a file, so the results would silently not be saved.
    if save_to_csv and save_path is None:
        raise ValueError("save_path must be provided when save_to_csv is True")

    df["diff"] = (df["true"] - df["pred"]).abs()

    if save_to_csv:
        df.to_csv(save_path, index=False)
    return df

def print_diff(df):
    """
    Calculates summary statistics of the "diff", "pred" and "true" columns and returns them.

    Despite its name, the function does not print anything. NaN values are ignored in
    every statistic. As a side effect, a "variance" column filled with the variance of
    the differences is added to the DataFrame.

    Args:
        df (pd.DataFrame): DataFrame with the "true", "pred" and "diff" columns.
    Returns:
        tuple: (mean_diff, mean_pred, mean_true, variance, median_diff, median_pred,
            median_true), where variance is the population variance of "diff"
            (divided by the number of values, not by that number minus one).
    """
    diff_values = df["diff"]
    pred_values = df["pred"]
    true_values = df["true"]

    # Population variance of the differences (ddof=0 divides by the number of values).
    # pandas leaves the NaN rows out of both the sum and the count, so the variance is
    # centred on the same mean as mean_diff even when some predictions are missing.
    variance = diff_values.var(ddof=0)
    df["variance"] = variance

    return (
        diff_values.mean(),
        pred_values.mean(),
        true_values.mean(),
        variance,
        diff_values.median(),
        pred_values.median(),
        true_values.median(),
    )

Obtaining the results

In [None]:
# For every label, the true values are compared with the predicted ones and a report is written
columns = pd.read_csv(r"../data/processed/final_dataset.csv").columns.tolist()
columns.remove("text")

for column in columns:
    # the true values come from the json file, the predicted ones from the csv file
    df_true = extract_text_and_values(r"../data/raw/data449.json", desired_label=column)
    df_pred = extract_pred_column(r"../data/processed/final_dataset.csv", column)

    # both are joined on the text, then the rows without prediction are removed
    df = merge_dfs(df_true, df_pred, column)
    df = clean_df(df) #activate or deactivate to see the difference

    # the absolute differences are computed and stored next to the values
    df = calculate_diff(
        df,
        save_to_csv=True,
        save_path=rf"../data/intermediate/eval_results_{column}_without_cleaning.csv",
    )
    mean_diff, mean_pred, mean_true, variance, median_diff, median_pred, median_true = print_diff(df)

    # the report of the label: a header, then the means, the variance and the medians
    report_lines = [
        "=====================================\n",
        f"Results for {column}\n",
        "=====================================\n",
        "\n",
        f"Mean difference {column}: {mean_diff}\n",
        f"Mean true {column}: {mean_true}\n",
        f"Mean pred {column}: {mean_pred}\n",
        "\n",
        f"Variance difference {column}: {variance}\n",
        "\n",
        f"Median difference {column}: {median_diff}\n",
        f"Median true {column}: {median_true}\n",
        f"Median pred {column}: {median_pred}\n",
    ]
    with open(rf"../reports/eval_results_{column}_without_cleaning.txt", "w") as f:
        f.writelines(report_lines)

Producing a summary of the difference per label

In [None]:
# The labels are all the columns of the final dataset except the text
columns = pd.read_csv(r"../data/processed/final_dataset.csv").columns.tolist()
columns.remove("text")

# Mean of the stored differences of each label, in the same order as the labels
means = []
for column in columns:
    means.append(
        pd.read_csv(rf"../data/intermediate/eval_results_{column}_without_cleaning.csv")["diff"].mean()
    )

# One row per label with its mean difference
df = pd.DataFrame({"label": columns, "mean_diff": means})

# Only the first 7 labels are aggregated, to avoid taking into account the PPVm whose value is absolute and not a percentage
first_labels = df["mean_diff"].iloc[:7]
mean_diff = first_labels.mean()
median_diff = first_labels.median()
min_diff = first_labels.min()
max_diff = first_labels.max()
print(f"Mean difference: {mean_diff}")
print(f"Median difference: {median_diff}")
print(f"Min difference: {min_diff}")
print(f"Max difference: {max_diff}")

df.to_csv(r"../data/processed/mean_diff_without_cleaning.csv", index=False)

We count the missing results at the final stage

In [None]:
# The labels are all the columns of the final dataset except the text
columns = pd.read_csv(r"../data/processed/final_dataset.csv").columns.tolist()
columns.remove("text")

for column in columns:
    df = pd.read_csv(rf"../data/intermediate/eval_results_{column}_without_cleaning.csv")

    # rows are split according to the presence of the predicted / true value
    pred_present = df["pred"].notna()
    full_pred_cell = df.loc[pred_present]
    full_true_cell = df.loc[df["true"].notna()]
    empty_pred_cell = df.loc[~pred_present]

    # the counts are appended to the report of the label
    count_lines = [
        "\n",
        "Comparison with and without cleaning\n",
        f"Full cells pred {column}: {len(full_pred_cell)}\n",
        f"Full cells true {column}: {len(full_true_cell)}\n",
        f"Empty cells {column}: {len(empty_pred_cell)}\n",
    ]
    with open(rf"../reports/eval_results_{column}_without_cleaning.txt", "a") as f:
        f.writelines(count_lines)

    # print(f"Full cells pred {column}: {len(full_pred_cell)}")
    # print(f"Full cells true {column}: {len(full_true_cell)}")
    # print(f"Empty cells {column}: {len(empty_pred_cell)}")

We merge the data frames that summarize the efficiency of the mean, median and random choice approaches (with and without cleaning)

In [None]:
# The summaries of the different runs are loaded
df_mean_cleaning = pd.read_csv(r"../data/processed/mean_diff.csv")
df_mean_no_cleaning = pd.read_csv(r"../data/processed/mean_diff_without_cleaning.csv")
df_mean_no_cleaning_median = pd.read_csv(r"../data/processed/mean_diff_without_cleaning_median.csv")
df_mean_no_cleaning_random = pd.read_csv(r"../data/processed/mean_diff_without_cleaning_random.csv")

# The summary obtained with cleaning is the base, the others are joined to it one after the other
df_summary = df_mean_cleaning

dfs_to_merge = [df_mean_no_cleaning, df_mean_no_cleaning_median, df_mean_no_cleaning_random]
list_of_suffixes = ["_no_cleaning", "_no_cleaning_median", "_no_cleaning_random"]

for df, suffix in zip(dfs_to_merge, list_of_suffixes):
    # every column except "label" receives the suffix so that the names stay unique
    df = df.rename(columns=lambda col: col if col == "label" else col + suffix)
    df_summary = df_summary.merge(df, on="label")

# The values are rounded to 3 decimals, the labels are left untouched
df_summary = df_summary.apply(lambda col: col if col.name == "label" else col.round(3))

# The merged summary is saved
df_summary.to_csv(r"../data/processed/summary.csv", index=False)