In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

from src.load_data import load_dataframe_from_s3
from src.helpers import load_config_from_yaml

In [None]:
# Load the notebook's inputs: read the local YAML config, then pull the three
# data sets it names from S3.
conf_file_path = "./config/local.yaml"
yaml_conf = load_config_from_yaml(file_path=conf_file_path)

# Settings are looked up with .get(), so a key missing from the YAML yields None here.
(
    bucket_name,
    id_col,
    data_name,
    human_labels,
    pipeline_outputs,
) = (
    yaml_conf.get(setting)
    for setting in ("BUCKET_NAME", "ID_COL", "THE_DATA", "HUMAN", "FINAL_RESULTS")
)

# Score columns handed to the loader as its third argument
# (presumably the columns to parse as floats - confirm against src.load_data).
float_cols_1 = ['ER_SCORE_1', 'PR_SCORE_1']   # ground-truth / human naming
float_cols_2 = ['er_score_p', 'pr_score_p']   # pipeline-output naming

results_df = load_dataframe_from_s3(bucket_name, pipeline_outputs, float_cols_1 + float_cols_2)
# Optional: restrict to a single batch if required, e.g.
#   results_df = results_df[results_df['BATCH'] == 5]
ground_truth_data = load_dataframe_from_s3(bucket_name, data_name, float_cols_1)
human_extraction = load_dataframe_from_s3(bucket_name, human_labels, float_cols_1)

In [None]:
def annotate_bars(ax, fmt="{:.1f}%", offset=1):
    """Write each bar's height as a bold text label just above the bar.

    Parameters
    ----------
    ax : object
        Axes-like object exposing ``patches`` (the bars) and ``text``;
        the notebook passes the axes returned by ``sns.barplot``.
    fmt : str
        ``str.format`` template applied to the bar height to build the label.
    offset : float
        Vertical gap, in data units, between the top of the bar and the
        label. The default of 1 suits the 0-110 percentage axes used in this
        notebook; pass a smaller value for plots on a smaller scale.

    Notes
    -----
    Bars whose height is not strictly positive (zero, negative or NaN) are
    left unlabelled.
    """
    for bar in ax.patches:
        height = bar.get_height()
        # "not height > 0" (rather than "height <= 0") also skips NaN heights.
        if not height > 0:
            continue
        ax.text(
            x=bar.get_x() + bar.get_width() / 2,  # horizontal centre of the bar
            y=height + offset,
            s=fmt.format(height),
            ha='center',
            va='bottom',
            color='black',
            fontsize=10,
            fontweight='bold',
        )

## Results for multiple tumour flagging

In [None]:
# Columns holding the actual multi-tumour flags, and the single extracted
# flag column they are compared with in the cells below.
actual_cols = ['Multiple Tumours', 'Multiple Tumours New']
extracted_cols = ['multi_tumour']

# Narrow the results to the id column plus the columns being compared.
multi_results_df = results_df.loc[:, [id_col, *actual_cols, *extracted_cols]]

In [None]:
# Accuracy of the extracted multi-tumour flag against each actual column.
metric_accuracy = {}

# Every actual column is scored against the same extracted column(s), so the
# extracted list is repeated once per actual column. Deriving the repeat count
# from len(actual_cols) (instead of a hard-coded "* 2") keeps zip() from
# silently dropping columns if the list of actual columns grows.
for ent, pred in zip(actual_cols, extracted_cols * len(actual_cols)):
    # Percentage of rows where the two columns hold equal values. Note that a
    # value missing in both columns compares as unequal (NaN != NaN).
    metric_accuracy[ent] = (multi_results_df[ent] == multi_results_df[pred]).mean() * 100

acc_series = pd.Series(metric_accuracy)

plt.figure(figsize=(8, 5))
ax1 = sns.barplot(x=acc_series.index, y=acc_series.values)
plt.ylabel("Accuracy (%)")
plt.xlabel("Extracted Metrics")
plt.title("Accuracy per Entity")
plt.ylim(0, 110)  # headroom above 100% for the bar labels
annotate_bars(ax1)
plt.show()

In [None]:
# Confusion matrix and per-class precision / recall / F1 for each actual column.

# Class labels are the same for every column, so they are defined once outside
# the loop. They fix the row/column order of the matrix and of the metrics table.
labels = [0, 1]

# The extracted list is repeated once per actual column (instead of a
# hard-coded "* 2") so zip() never silently drops an actual column.
for ent, pred in zip(actual_cols, extracted_cols * len(actual_cols)):
    cm = confusion_matrix(multi_results_df[ent], multi_results_df[pred], labels=labels)

    plt.figure(figsize=(7, 5))
    ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                     xticklabels=labels, yticklabels=labels,
                     linewidths=0.1, linecolor='grey')
    plt.xlabel("Extracted")
    plt.ylabel("Actual")
    plt.title(f"Confusion Matrix for {ent}")

    # Blank out the "0" annotations so only non-empty cells show a count.
    for text in ax.texts:
        if text.get_text() == "0":
            text.set_text("")
    plt.show()

    # zero_division=0 reports 0 (rather than warning) for a class with no
    # predicted or no actual samples.
    precision, recall, f1, support = precision_recall_fscore_support(
        multi_results_df[ent],
        multi_results_df[pred],
        labels=labels,
        zero_division=0
    )

    prf_table = pd.DataFrame({
        "Class": labels,
        "Precision": np.round(precision, 4),
        "Recall": np.round(recall, 4),
        "F1 Score": np.round(f1, 4),
        "Support": support
    })

    prf_table = prf_table.set_index("Class")

    print(f"\n=== Precision, Recall, F1 for {ent} ===")
    print(prf_table)

## Results for metric extraction

In [None]:
# Setup: actual_cols[i] is compared with extracted_cols[i] in the cells below,
# so the two lists must stay in the same order.
actual_cols = ['ER_STATUS_1', 'ER_SCORE_1', 'PR_STATUS_1', 'PR_SCORE_1', 'HER2_STATUS_1']
extracted_cols = ['er_status_p', 'er_score_p', 'pr_status_p', 'pr_score_p', 'her2_status_p']

# Keep only the pipeline rows whose multi_tumour flag is 0, reduced to the id
# column plus the extracted columns. Missing values become the literal 'blank'.
llm_extraction = (
    results_df
    .loc[results_df['multi_tumour'] == 0, [id_col] + extracted_cols]
    .fillna('blank')
)

# Same column reduction and 'blank' fill for the ground truth and the human labels.
reference_columns = [id_col] + actual_cols
ground_truth_data = ground_truth_data[reference_columns].fillna('blank')
human_extraction = human_extraction[reference_columns].fillna('blank')

# Inner joins on the id column: only ids present on both sides are compared.
gt_vs_llm = pd.merge(ground_truth_data, llm_extraction, how='inner', on=id_col)
# Both frames share the actual_cols names; the human copies get a '_hum' suffix.
gt_vs_hum = pd.merge(
    ground_truth_data,
    human_extraction,
    how='inner',
    on=id_col,
    suffixes=(None, '_hum'),
)

In [None]:
# Per-document number of entities whose extracted value equals the ground truth.
# The comparison is positional: actual_cols[i] is checked against extracted_cols[i].
matches = gt_vs_llm[actual_cols].values == gt_vs_llm[extracted_cols].values
gt_vs_llm["num_correct"] = matches.sum(axis=1)

# Percentage of documents for each possible count, 0 .. len(actual_cols).
# The buckets are derived from the number of entities rather than hard-coded,
# and a count that never occurs is reported as 0% instead of NaN.
correct_buckets = list(range(len(actual_cols) + 1))
correct_dist = (
    gt_vs_llm["num_correct"]
    .value_counts(normalize=True)
    .reindex(correct_buckets, fill_value=0.0)
    * 100
)

plt.figure(figsize=(8, 5))
ax1 = sns.barplot(x=[str(bucket) for bucket in correct_buckets], y=correct_dist.values)
plt.xlabel("Number of correctly extracted entities")
plt.ylabel("Percentage of documents (%)")
plt.title("Distribution of Documents by Number of Correct Entity Extractions")
plt.ylim(0, 110)  # headroom above 100% for the bar labels
annotate_bars(ax1)
plt.show()

In [None]:
# Per-entity accuracy: percentage of rows where the extracted value equals the
# ground-truth value, drawn as one bar per entity.
metric_accuracy = {
    ent: gt_vs_llm[ent].eq(gt_vs_llm[pred]).mean() * 100
    for ent, pred in zip(actual_cols, extracted_cols)
}
acc_series = pd.Series(metric_accuracy)

plt.figure(figsize=(8, 5))
ax1 = sns.barplot(x=acc_series.index, y=acc_series.values)
plt.title("Accuracy per Entity")
plt.xlabel("Extracted Metrics")
plt.ylabel("Accuracy (%)")
plt.ylim(0, 110)  # headroom above 100% for the bar labels
annotate_bars(ax1)
plt.show()

In [None]:
# Tabulate the per-entity accuracy (same calculation as the bar chart above,
# printed as a table indexed by entity name).
metric_accuracy = {
    ent: gt_vs_llm[ent].eq(gt_vs_llm[pred]).mean() * 100
    for ent, pred in zip(actual_cols, extracted_cols)
}

accuracy_table = pd.DataFrame(
    {"Accuracy (%)": [metric_accuracy[entity] for entity in actual_cols]},
    index=pd.Index(actual_cols, name="Entity"),
)

print("=== Entity Extraction Accuracy Table ===")
print(accuracy_table)


In [None]:
# Confusion matrix and per-class precision / recall / F1 for every entity.

# Label sets, in the order they should appear on the matrix axes.
status_labels = ['positive','negative','not performed','blank']
score_labels = ['0','2','3','4','5','6','7','8','blank']
fallback_labels = ['negative (unknown)','negative (0)','negative (1+)','borderline (2+)','positive (3+)','not performed','blank']

labels_by_entity = {
    'ER_STATUS_1': status_labels,
    'PR_STATUS_1': status_labels,
    'ER_SCORE_1': score_labels,
    'PR_SCORE_1': score_labels,
}

for ent, pred in zip(actual_cols, extracted_cols):
    # Any entity that is not an ER/PR status or score (here: HER2_STATUS_1)
    # uses the fallback label set.
    labels = labels_by_entity.get(ent, fallback_labels)

    y_actual = gt_vs_llm[ent]
    y_extracted = gt_vs_llm[pred]

    # --- CONFUSION MATRIX ---
    cm = confusion_matrix(y_actual, y_extracted, labels=labels)

    plt.figure(figsize=(7, 5))
    heatmap_options = dict(
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
        linewidths=0.1,
        linecolor='grey',
    )
    ax = sns.heatmap(cm, **heatmap_options)
    plt.title(f"Confusion Matrix for {ent}")
    plt.xlabel("Extracted")
    plt.ylabel("Actual")

    # Blank out the "0" annotations so only non-empty cells show a count.
    for cell_text in ax.texts:
        if cell_text.get_text() == "0":
            cell_text.set_text("")

    plt.show()

    # --- PRECISION / RECALL / F1 ---
    precision, recall, f1, support = precision_recall_fscore_support(
        y_actual,
        y_extracted,
        labels=labels,
        zero_division=0,
    )

    prf_table = pd.DataFrame(
        {
            "Precision": np.round(precision, 4),
            "Recall": np.round(recall, 4),
            "F1 Score": np.round(f1, 4),
            "Support": support,
        },
        index=pd.Index(labels, name="Class"),
    )

    print(f"\n=== Precision, Recall, F1 for {ent} ===")
    print(prf_table)


In [None]:
# LLM vs human accuracy, each measured against the ground truth.
# NOTE(review): LLM accuracy is computed on gt_vs_llm and human accuracy on
# gt_vs_hum; the two merged frames may not contain the same set of rows.
hum_cols = [f"{c}_hum" for c in actual_cols]
column_triples = list(zip(actual_cols, extracted_cols, hum_cols))

llm_accuracy = {
    act: gt_vs_llm[act].eq(gt_vs_llm[ext]).mean() * 100
    for act, ext, _ in column_triples
}
human_accuracy = {
    act: gt_vs_hum[act].eq(gt_vs_hum[hum]).mean() * 100
    for act, _, hum in column_triples
}

comparison_df = pd.DataFrame({
    "Entity": actual_cols,
    "LLM Accuracy": [llm_accuracy[entity] for entity in actual_cols],
    "Human Accuracy": [human_accuracy[entity] for entity in actual_cols],
})

# Long format (one row per entity/source pair) so seaborn can group bars by source.
comparison_long = pd.melt(
    comparison_df,
    id_vars="Entity",
    var_name="Source",
    value_name="Accuracy",
)

plt.figure(figsize=(10, 6))
ax4 = sns.barplot(data=comparison_long, x="Entity", y="Accuracy", hue="Source")
plt.ylabel("Accuracy (%)")
plt.title("Human vs LLM Accuracy Comparison")
plt.ylim(0, 110)  # headroom above 100% for the bar labels
annotate_bars(ax4)

# Park the legend outside the plotting area, to the right of the axes.
legend_options = dict(
    loc='upper left',
    bbox_to_anchor=(1.02, 1),
    borderaxespad=0,
    frameon=True,
)
ax4.legend(**legend_options)

plt.tight_layout()
plt.show()