# Checking results

The objective of this notebook is to place side by side, for each label column (AG, AI, etc.), the true values and the predicted values. It is a manual checking mechanism.

In [12]:
import pandas as pd
import json
import re

In [13]:
def extract_text_and_values(json_file, desired_label=None):
    """
    Extracts, for each item of a JSON file, its text and the mean of the numbers
    found in its annotations.

    The JSON file must contain a list of items. Each item is read through its
    "text" field and its "label" list, whose entries carry a "text" span and a
    "labels" list. All the numbers found in the spans matching ``desired_label``
    are collected and averaged into a single float per item.

    Annotations without a string "text", and items whose "label" (or
    annotations whose "labels") is missing or null, are skipped instead of
    raising.

    Args:
        json_file (str): Path to the JSON file.
        desired_label (str): The label to filter (optional). When None, the
            spans of every label are used.
    Returns:
        pd.DataFrame: A DataFrame with "text" and "values" columns, with one row
            per item for which at least one number was found. "values" is the
            mean of the numbers found for that item.
    """
    # Integer or decimal number, "." or "," as decimal separator. The optional
    # trailing "%" is consumed but is not part of the captured group.
    number_pattern = re.compile(r'(\d+(?:[\.,]\d+)?)\s*%?')

    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    rows = []
    for item in data:
        values = []

        # `or []` also covers an explicit null, which .get(key, []) does not
        for label_info in item.get("label") or []:
            if desired_label is not None and desired_label not in (label_info.get("labels") or []):
                continue

            label_text = label_info.get("text")
            if not isinstance(label_text, str):
                # nothing to parse; re.findall would raise a TypeError here
                continue

            values.extend(
                float(match.replace(',', '.'))
                for match in number_pattern.findall(label_text)
            )

        # items without any number are left out of the result
        if values:
            rows.append([item.get("text"), sum(values) / len(values)])

    return pd.DataFrame(rows, columns=["text", "values"])

For the predicted data

In [14]:
def extract_pred_column(df_path:str, column):
    """
    Reads a CSV file and keeps only the "text" column and the requested column.

    Args:
        df_path (str): Path to the CSV file.
        column (str): The column to keep next to "text".
    Returns:
        pd.DataFrame: A DataFrame with the "text" and ``column`` columns, in that order.
    """
    kept_columns = ["text", column]
    return pd.read_csv(df_path)[kept_columns]

def merge_dfs(df_true, df_pred, column):
    """
    Joins the true and the predicted values on their shared "text" column.

    The "values" column of ``df_true`` becomes "true" and the ``column`` column
    of ``df_pred`` becomes "pred". Only the texts present in both DataFrames are
    kept (default inner join of ``merge``).

    Args:
        df_true (pd.DataFrame): DataFrame with the true values.
        df_pred (pd.DataFrame): DataFrame with the predicted values.
        column (str): The column of ``df_pred`` to rename to "pred".
    Returns:
        pd.DataFrame: A DataFrame with "text", "true" and "pred" columns.
    """
    truth = df_true.rename(columns={"values": "true"})
    predictions = df_pred.rename(columns={column: "pred"})
    return truth.merge(predictions, on="text")

def clean_df(df):
    """
    Drops the rows whose "pred" value is NaN and renumbers the index from 0.

    NaN values in other columns do not cause a row to be dropped.

    Args:
        df (pd.DataFrame): DataFrame with a "pred" column.
    Returns:
        pd.DataFrame: A new DataFrame without the rows lacking a prediction.
    """
    return df.dropna(subset=["pred"]).reset_index(drop=True)

def calculate_diff(df, save_to_csv=False, save_path=None):
    """
    Calculates the absolute difference between the true and predicted values
    and returns it as a "diff" column.

    The input DataFrame is left untouched: the result is a new DataFrame, so
    re-running a cell that calls this function does not alter its input.

    Args:
        df (pd.DataFrame): DataFrame with the "true" and "pred" columns.
        save_to_csv (bool): Whether to save the resulting DataFrame to a csv file.
        save_path (str): Path to save the DataFrame. Required when
            ``save_to_csv`` is True.
    Returns:
        pd.DataFrame: A copy of ``df`` with the "diff" column added (or
            replaced if it already existed).
    Raises:
        ValueError: If ``save_to_csv`` is True and ``save_path`` is None.
    """
    # DataFrame.to_csv(None) only returns the CSV text, so without this check
    # a missing path would silently save nothing.
    if save_to_csv and save_path is None:
        raise ValueError("save_path must be provided when save_to_csv is True")

    result = df.assign(diff=(df["true"] - df["pred"]).abs())

    if save_to_csv:
        result.to_csv(save_path, index=False)
    return result

def print_diff(df):
    """
    Computes the mean of the "diff", "pred" and "true" columns.

    Despite its name, this function prints nothing: the three means are
    returned to the caller.

    Args:
        df (pd.DataFrame): DataFrame with the "diff", "pred" and "true" columns.
    Returns:
        tuple: (mean difference, mean predicted value, mean true value), in that order.
    """
    return tuple(df[name].mean() for name in ("diff", "pred", "true"))

Obtaining the results

In [15]:
# For every label, we compare the true values with the predicted ones and store the results.
# The labels are the columns of the processed dataset, except "text".
columns = pd.read_csv(r"../data/processed/final_dataset.csv").columns.tolist()
columns.remove("text")

for column in columns:
    # true values come from the json file, predicted values from the csv file
    df_true = extract_text_and_values(r"../data/raw/data449.json", desired_label=column)
    df_pred = extract_pred_column(r"../data/processed/final_dataset.csv", column)

    # align both on "text", then drop the rows without a prediction
    df = clean_df(merge_dfs(df_true, df_pred, column))

    # per-row absolute difference, saved to one csv file per label
    df = calculate_diff(
        df,
        save_to_csv=True,
        save_path=rf"../data/intermediate/eval_results_{column}.csv",
    )
    mean_diff, mean_pred, mean_true = print_diff(df)

    # one small text report per label
    with open(rf"../reports/eval_results_{column}.txt", "w") as f:
        f.writelines([
            f"Mean difference {column}: {mean_diff}\n",
            f"Mean true {column}: {mean_true}\n",
            f"Mean pred {column}: {mean_pred}\n",
        ])

Producing a summary of the difference per label

In [27]:
# The labels are the columns of the processed dataset, except "text"
columns = pd.read_csv(r"../data/processed/final_dataset.csv").columns.tolist()
columns.remove("text")

# Mean of the "diff" column of each per-label result file, in the order of `columns`
means = []
for column in columns:
    means.append(pd.read_csv(rf"../data/intermediate/eval_results_{column}.csv")["diff"].mean())

# One row per label with its mean difference
df = pd.DataFrame({"label": columns, "mean_diff": means})

# Only the first 7 labels are summarised, to avoid taking into account the PPVm,
# whose value is absolute and not a percentage
percent_diffs = df["mean_diff"].iloc[:7]
mean_diff = percent_diffs.mean()
median_diff = percent_diffs.median()
min_diff = percent_diffs.min()
max_diff = percent_diffs.max()
print(f"Mean difference: {mean_diff}")
print(f"Median difference: {median_diff}")
print(f"Min difference: {min_diff}")
print(f"Max difference: {max_diff}")

# df.to_csv(r"../data/processed/mean_diff.csv", index=False)

Mean difference: 0.2167310989183082
Median difference: 0.1515123456790123
Min difference: 0.07753086419753086
Max difference: 0.6394722222222222
