In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

from src.helpers import load_config_from_yaml
from src.text_preprocessor import TextPreprocessor
from src.prompt_builder import PromptBuilder
from src.model_request import ModelRequest
from src.extractor_pipeline import ExtractorPipeline
from src.post_processor import PostProcessor
from src.custom_logging import setup_logging
from src.cme_evaluator import CMEEvaluator
from src.load_data import load_dataframe_from_s3, save_dataframe_to_s3
import config.pipeline_config as conf
from config.validation_config import ValidSchema, MultiSchema

In [None]:
# Populate the process environment from a local .env file before anything else
# in this cell runs, so the config and S3 calls below already see those values.
# NOTE(review): assumes the S3 load may rely on .env values (e.g. credentials) — confirm.
load_dotenv()

conf_file_path = "./config/local.yaml"
yaml_conf = load_config_from_yaml(file_path=conf_file_path)

# Data location and record identifier.
bucket_name = yaml_conf.get("BUCKET_NAME")
id_col = yaml_conf.get("ID_COL")
data_name = yaml_conf.get("THE_DATA")

# Model and prompt-management settings.
model_id = yaml_conf.get("MODEL_ID")
model_args = yaml_conf.get("MODEL_ARGS")
output_folder = yaml_conf.get("YOUR_S3_FOLDER")
cme_prompt_id = yaml_conf.get("PROMPT_MANAGEMENT_ID")
cme_prompt_name = yaml_conf.get("PROMPT_MANAGEMENT_NAME")

cme_multi_prompt_id = yaml_conf.get("MULTI_TUMOUR_PROMPT_MANAGEMENT_ID")
cme_multi_prompt_name = yaml_conf.get("MULTI_TUMOUR_PROMPT_MANAGEMENT_NAME")

# Load in the records; ER_SCORE_1..4 and PR_SCORE_1..4 are passed as the float columns.
float_columns = [f"ER_SCORE_{i+1}" for i in range(4)] + [f"PR_SCORE_{i+1}" for i in range(4)]
records = load_dataframe_from_s3(bucket_name, data_name, float_columns)

print(f"Loaded {records.shape[0]} records")

In [None]:
post_processor = PostProcessor(records, conf.multi_tumour_accepted_values.keys(), conf.multi_tumour_accepted_values)

# Both reference columns get the same treatment: missing values become "0",
# then the {"y": "1"} mapping is applied row by row through the post-processor.
for flag_col in ("Multiple Tumours", "Multiple Tumours New"):
    records[flag_col] = records[flag_col].fillna("0")
    records = records.apply(
        post_processor.apply_general_mapping,
        mapping={"y": "1"},
        cols_to_map=[flag_col],
        axis=1,
    )

## Flag Multiple Tumours

In [None]:
# Log to file only; console logging is switched off for this run.
setup_logging(
    enable_console=False,
    enable_file=True,
    console_log_level=conf.console_log_level,
    log_dir=conf.log_dir,
)

preprocessor = TextPreprocessor()

# Prompt builder configured with the multi-tumour system prompt and layout.
prompter = PromptBuilder(
    model_id=model_id,
    system_prompt=conf.multi_tumour_system_prompt,
    prompt_layout=conf.multi_tumour_prompt,
)

requester = ModelRequest(model_id, model_args, prompter)

# Pipeline validated against MultiSchema and the multi-tumour accepted values.
extractor_pipeline = ExtractorPipeline(
    config_file_path=conf_file_path,
    preprocessor=preprocessor,
    model_request=requester,
    valid_structure=MultiSchema,
    accepted_values=conf.multi_tumour_accepted_values,
)

multi_tumour_output_df = extractor_pipeline.run(df=records)

In [None]:
# Split the pipeline output by the extracted flag ("1" = multiple tumours, "0" = single).
# Explicit copies are taken so later stages can add or modify columns on these
# frames without touching (or warning about) a slice of multi_tumour_output_df.
# NOTE(review): rows whose 'multi_tumour' value is neither "0" nor "1" end up in
# neither frame — confirm that is intended.
multi_tumour_reports = multi_tumour_output_df[multi_tumour_output_df['multi_tumour'] == "1"].copy()
single_tumour_reports = multi_tumour_output_df[multi_tumour_output_df['multi_tumour'] == "0"].copy()

In [None]:
# Number of reports the model flagged as containing multiple tumours.
flagged_count = len(multi_tumour_reports)
print(f"{flagged_count} reports were flagged as containing multiple tumours")

## Multi Tumour Results

There are two comparison columns for multiple tumours. 'Multiple Tumours' is based on the definitions in the breast cancer registration guidelines. 'Multiple Tumours New' uses the same definition but also includes reports that mention things like 'Tumour 1' and 'Tumour 2' or 'bilateral'.

In [None]:
# Both ground-truth columns are compared against the same extracted flag column.
multi_compare_cols = dict.fromkeys(
    ("Multiple Tumours", "Multiple Tumours New"),
    "multi_tumour",
)
# Column passed to the evaluator's status summary for the multi-tumour run.
status_column = "status"

In [None]:
# Join the reference records to the multi-tumour extraction output on the ID column.
multi_eval_df = pd.merge(records, multi_tumour_output_df, on=id_col)

evaluator = CMEEvaluator(
    df=multi_eval_df,
    id_col=id_col,
    accepted_values=conf.multi_tumour_accepted_values,
    comparison_dict=multi_compare_cols,
)

In [None]:
def annotate_bars(ax, fmt="{:.1f}%", offset=1):
    """Write each bar's height as a text label just above the bar.

    Args:
        ax: Axes-like object exposing ``patches`` (the bars) and ``text``.
        fmt: Format string applied to the bar height to build the label.
        offset: Vertical gap, in data units, between the top of the bar and
            the label. Defaults to 1, the value that used to be hard-coded.

    Bars whose height is not greater than zero (including NaN heights) are
    left without a label.
    """
    for bar in ax.patches:
        height = bar.get_height()
        # `not height > 0` is also True for NaN, so missing values are skipped.
        if not height > 0:
            continue
        ax.text(
            x=bar.get_x() + bar.get_width() / 2,  # horizontal centre of the bar
            y=height + offset,
            s=fmt.format(height),
            ha='center',
            va='bottom',
            color='black',
            fontsize=10,
            fontweight='bold'
        )

In [None]:
# Ask the evaluator for its summary of the status column; as the last expression
# in the cell, whatever it returns is shown as the cell output.
evaluator.get_status_summary(status_column)

In [None]:
# Ground-truth columns, each paired position-by-position with the extracted flag column.
actual_cols = ["Multiple Tumours", "Multiple Tumours New"]
extracted_cols = ["multi_tumour"] * len(actual_cols)

In [None]:
# Accuracy per metric: share of rows (in %) where the ground-truth column
# equals the extracted column.
metric_accuracy = {
    ent: (evaluator.df[ent] == evaluator.df[pred]).mean() * 100
    for ent, pred in zip(actual_cols, extracted_cols)
}
acc_series = pd.Series(metric_accuracy)

fig, ax1 = plt.subplots(figsize=(8, 5))
sns.barplot(x=acc_series.index, y=acc_series.values, ax=ax1)
ax1.set_ylabel("Accuracy (%)")
ax1.set_xlabel("Extracted Metrics")
ax1.set_title("Accuracy per Entity")
ax1.set_ylim(0, 110)  # headroom above 100% for the bar labels
annotate_bars(ax1)
plt.show()

In [None]:
# Confusion matrix for each metric
# (confusion_matrix is imported in the notebook's import cell.)

# The flag only takes these two values, so the label order is fixed for every plot.
labels = ['0', '1']

for ent, pred in zip(actual_cols, extracted_cols):
    cm = confusion_matrix(evaluator.df[ent], evaluator.df[pred], labels=labels)

    fig, ax = plt.subplots(figsize=(7, 5))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
        linewidths=0.1,
        linecolor='grey',
        ax=ax,
    )
    ax.set_xlabel("Extracted")
    ax.set_ylabel("Actual")
    ax.set_title(f"Confusion Matrix for {ent}")

    # Blank out zero-count cells so only populated cells carry a number.
    for text in ax.texts:
        if text.get_text() == "0":
            text.set_text("")
    plt.show()

## Run Extraction

In [None]:
preprocessor = TextPreprocessor()

# Prompt builder configured with the extraction system prompt, the gpt_oss
# prompt layout and the accepted values from the pipeline config.
prompter = PromptBuilder(
    model_id=model_id,
    system_prompt=conf.extraction_system_prompt,
    prompt_layout=conf.gpt_oss_prompt,
    accepted_values=conf.accepted_values,
)

requester = ModelRequest(model_id, model_args, prompter)

# Pipeline validated against ValidSchema and the extraction accepted values.
extractor_pipeline = ExtractorPipeline(
    config_file_path=conf_file_path,
    preprocessor=preprocessor,
    model_request=requester,
    valid_structure=ValidSchema,
    accepted_values=conf.accepted_values,
)

# Only the reports flagged as single-tumour are sent through metric extraction.
metrics_output_df = extractor_pipeline.run(df=single_tumour_reports)

In [None]:
post_processor = PostProcessor(metrics_output_df, conf.accepted_values.keys(), conf.accepted_values)

# Post-processing steps, keyed by the name of the PostProcessor method that implements them.
step_names = (
    "map_two_part_scores",
    "map_score",
    "score_to_status",
    "apply_general_mapping",
)
functions = {step: getattr(post_processor, step) for step in step_names}

# Value mappings handed to apply_general_mapping.
her2_label_mapping = {
    "0": "negative (0)",
    "1+": "negative (1+)",
    "2+": "borderline (2+)",
    "3+": "positive (3+)",
}
null_mapping = {"null": np.nan}

# Per-step switch plus the list of keyword-argument sets passed along for that step.
settings = {
    "map_two_part_scores": {
        "enabled": True,
        "args": [{"cols_to_map": ["er_score", "pr_score"]}],
    },
    "map_score": {
        "enabled": True,
        "args": [{"cols_to_map": ["er_score", "pr_score"]}],
    },
    "score_to_status": {
        "enabled": True,
        "args": [{"pairs": [("er_score", "er_status"), ("pr_score", "pr_status")]}],
    },
    "apply_general_mapping": {
        "enabled": True,
        "args": [
            {"mapping": her2_label_mapping, "cols_to_map": ["her2_status"]},
            {
                "mapping": null_mapping,
                "cols_to_map": ["er_status", "er_score", "pr_status", "pr_score", "her2_status"],
            },
        ],
    },
}

metrics_output_df_processed = post_processor.run(functions, settings)

## Results for Metric Extraction

In [None]:
# Ground-truth column "<METRIC>_1" is compared against processed column "<metric>_p".
metric_compare_cols = {
    f"{metric}_1": f"{metric.lower()}_p"
    for metric in ("ER_STATUS", "ER_SCORE", "PR_STATUS", "PR_SCORE", "HER2_STATUS")
}
# Column passed to the evaluator's status summary for the metric extraction run.
status_column = "status_processed"

In [None]:
# Join the reference records to the post-processed extraction output on the ID column.
metric_eval_df = pd.merge(records, metrics_output_df_processed, on=id_col)

evaluator = CMEEvaluator(
    df=metric_eval_df,
    id_col=id_col,
    accepted_values=conf.final_accepted_values,
    comparison_dict=metric_compare_cols,
)

In [None]:
# Ask the evaluator for its summary of the status column; as the last expression
# in the cell, whatever it returns is shown as the cell output.
evaluator.get_status_summary(status_column)

In [None]:
# Display the evaluator's non-accepted-values summary across all compared columns.
# NOTE(review): exact contents depend on CMEEvaluator — presumably values outside
# the accepted lists; confirm against its implementation.
evaluator.get_non_accepted_summary_all()

In [None]:
# Ground-truth columns and the extracted columns they are compared with (same order).
actual_cols = ['ER_STATUS_1', 'ER_SCORE_1', 'PR_STATUS_1', 'PR_SCORE_1', 'HER2_STATUS_1']
extracted_cols = ['er_status_p', 'er_score_p', 'pr_status_p', 'pr_score_p', 'her2_status_p']
float_cols_1 = ['ER_SCORE_1', 'PR_SCORE_1']

# Extracted values: work on a copy of the evaluator's frame; missing values become 'blank'.
results_df = evaluator.df.copy()
llm_extraction = results_df[[id_col, *extracted_cols]].fillna('blank')

# Ground truth: reloaded from S3 and reduced to the ID plus the compared columns.
ground_truth_data = load_dataframe_from_s3(bucket_name, data_name, float_cols_1)
ground_truth_data = ground_truth_data[[id_col, *actual_cols]].fillna('blank')

# Keep only the IDs present on both sides.
gt_vs_llm = pd.merge(ground_truth_data, llm_extraction, on=id_col, how='inner')

In [None]:
# Compute per-document number of correct predictions: element-wise comparison of
# the ground-truth columns with the extracted columns, summed across each row.
gt_vs_llm["num_correct"] = (gt_vs_llm[actual_cols].values == gt_vs_llm[extracted_cols].values).sum(axis=1)

# Possible outcomes run from 0 to the number of compared entities. Counts that
# never occur are filled with 0 rather than NaN so every category is plotted
# with a well-defined height.
n_entities = len(actual_cols)
possible_counts = list(range(n_entities + 1))
correct_dist = (
    gt_vs_llm["num_correct"]
    .value_counts(normalize=True)
    .reindex(possible_counts, fill_value=0)
    * 100
)

fig, ax1 = plt.subplots(figsize=(8, 5))
sns.barplot(x=[str(count) for count in possible_counts], y=correct_dist.values, ax=ax1)
ax1.set_xlabel("Number of correctly extracted entities")
ax1.set_ylabel("Percentage of documents (%)")
ax1.set_title("Distribution of Documents by Number of Correct Entity Extractions")
ax1.set_ylim(0, 110)  # headroom above 100% for the bar labels
annotate_bars(ax1)
plt.show()

In [None]:
# Accuracy per metric: share of rows (in %) where the ground-truth column
# equals the extracted column.
metric_accuracy = {
    ent: (gt_vs_llm[ent] == gt_vs_llm[pred]).mean() * 100
    for ent, pred in zip(actual_cols, extracted_cols)
}
acc_series = pd.Series(metric_accuracy)

fig, ax1 = plt.subplots(figsize=(8, 5))
sns.barplot(x=acc_series.index, y=acc_series.values, ax=ax1)
ax1.set_ylabel("Accuracy (%)")
ax1.set_xlabel("Extracted Metrics")
ax1.set_title("Accuracy per Entity")
ax1.set_ylim(0, 110)  # headroom above 100% for the bar labels
annotate_bars(ax1)
plt.show()

In [None]:
# Confusion matrix for each metric
# (confusion_matrix is imported in the notebook's import cell.)

# Label sets (and their display order) per kind of column.
status_labels = ['positive', 'negative', 'not performed', 'blank']
score_labels = ['0', '2', '3', '4', '5', '6', '7', '8', 'blank']
her2_labels = ['negative (unknown)', 'negative (0)', 'negative (1+)', 'borderline (2+)', 'positive (3+)', 'not performed', 'blank']

labels_by_column = {
    'ER_STATUS_1': status_labels,
    'PR_STATUS_1': status_labels,
    'ER_SCORE_1': score_labels,
    'PR_SCORE_1': score_labels,
}

for ent, pred in zip(actual_cols, extracted_cols):
    # Any column without an explicit entry uses the HER2 label set.
    labels = labels_by_column.get(ent, her2_labels)
    cm = confusion_matrix(gt_vs_llm[ent], gt_vs_llm[pred], labels=labels)

    fig, ax = plt.subplots(figsize=(7, 5))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
        linewidths=0.1,
        linecolor='grey',
        ax=ax,
    )
    ax.set_xlabel("Extracted")
    ax.set_ylabel("Actual")
    ax.set_title(f"Confusion Matrix for {ent}")

    # Blank out zero-count cells so only populated cells carry a number.
    for text in ax.texts:
        if text.get_text() == "0":
            text.set_text("")
    plt.show()

In [None]:
# Stack the processed single-tumour extractions with the multi-tumour reports.
results_all = pd.concat(
    [metrics_output_df_processed, multi_tumour_reports],
    join='outer',
    ignore_index=True,
)

# Drop the report text columns from both outputs before joining them on the ID column.
report_text_cols = ["REPORT", "preprocessed_REPORT"]
multi_flags = multi_tumour_output_df.drop(columns=report_text_cols)
metric_values = metrics_output_df_processed.drop(columns=report_text_cols)

# Left join keeps every multi-tumour row; clashing columns from the multi-tumour
# side get the '_multi' suffix, while the metric side keeps its names.
all_eval_df = multi_flags.merge(metric_values, on=id_col, how='left', suffixes=('_multi', None))
all_eval_df = records.merge(all_eval_df, on=id_col)

In [None]:
# Save to S3 if required
# file_name = "descriptive file name here"
# save_dataframe_to_s3(all_eval_df,bucket_name,f"final_outputs/{file_name}.csv")