# Prompt Engineering Workflow

In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv

from src.helpers import load_config_from_yaml
from src.text_preprocessor import TextPreprocessor
from src.prompt_builder import PromptBuilder
from src.model_request import ModelRequest
from src.extractor_pipeline import ExtractorPipeline
from src.post_processor import PostProcessor
from src.custom_logging import setup_logging
from src.cme_evaluator import CMEEvaluator
from src.load_data import save_eval_df_to_s3, load_dataframe_from_s3, save_dataframe_to_s3
import config.pipeline_config as conf
from config.validation_config import ValidSchema

## Load and Initiate Config

In [None]:
# Load variables from the .env file first, so that they are already present in
# the environment when the config is read and S3 is accessed further down.
load_dotenv()

# Define config file path
conf_file_path = "./config/local.yaml"
# Load config
yaml_conf = load_config_from_yaml(file_path=conf_file_path)

# Get bucket name and data name from config.
bucket_name = yaml_conf.get("BUCKET_NAME")
data_name = yaml_conf.get("THE_DATA")

# Model and S3 info:
model_id = yaml_conf.get("MODEL_ID")
model_args = yaml_conf.get("MODEL_ARGS")
output_folder = yaml_conf.get("YOUR_S3_FOLDER")
cme_prompt_id = yaml_conf.get("PROMPT_MANAGEMENT_ID")
cme_prompt_name = yaml_conf.get("PROMPT_MANAGEMENT_NAME")

# Score columns ER_SCORE_1..4 followed by PR_SCORE_1..4, handed to the loader
# as its float columns.
float_columns = [f"{marker}_SCORE_{i}" for marker in ("ER", "PR") for i in range(1, 5)]

# Load in the record table. Use the .head(x) to only use the first x reports (useful for a quick test).
records = load_dataframe_from_s3(bucket_name, data_name, float_columns).head(10)

# To keep things simple for now we are only looking at records where one tumour is present.
# NOTE: this filter runs after .head(x), so fewer than x records may remain.
records = records[records['Multiple Tumours'] != 'Y']

print(f"There are {records.shape[0]} records in this dataframe.")

## Editing the Prompt
This is where you edit the prompt.

In [None]:
# EDIT YOUR PROMPT HERE
# system_prompt applies to Llama and Claude models only.
system_prompt = None
# prompt_layout is the prompt template. {document} and {accepted_values} are
# placeholders; presumably they are filled in by PromptBuilder - verify.
prompt_layout = """
You have this document:
{document}

I would like you to extract out only ER Status, ER Score, PR Status, PR Score 
and HER2 Status and return the output in a JSON markdown structure. Exclude explanations and extra information from your response.

Every entity extracted must have a value from the accepted values below:

{accepted_values}
"""

## Running the Extractor Pipeline

This can take up to 10 minutes when running it on all reports.

In [None]:
# RUN THE PIPELINE HERE

# Logging is configured with console output disabled and file output enabled.
setup_logging(
    enable_console=False,
    enable_file=True,
    console_log_level=conf.console_log_level,
    log_dir=conf.log_dir,
)

preprocessor = TextPreprocessor()

# Build the prompt from the layout and system prompt defined in this notebook.
prompter = PromptBuilder(
    model_id=model_id,
    prompt_layout=prompt_layout,
    system_prompt=system_prompt,
    accepted_values=conf.accepted_values,
)

# To use a prompt version stored in prompt management instead, replace the
# prompter above with:
# prompter = PromptBuilder(
#     model_id=model_id,
#     system_prompt=system_prompt,
#     prompt_id=cme_prompt_id,
#     prompt_version=0,
#     accepted_values=conf.accepted_values,
# )

requester = ModelRequest(model_id, model_args, prompter)

extractor_pipeline = ExtractorPipeline(
    config_file_path=conf_file_path,
    preprocessor=preprocessor,
    model_request=requester,
    valid_structure=ValidSchema,
    accepted_values=conf.accepted_values,
)

# Run the extraction over the loaded records.
output_df = extractor_pipeline.run(df=records)

If you would like to reload a prompt, the cell below lists the prompt versions with their respective descriptions.

In [None]:
# LIST VERSIONS OF YOUR PROMPT
# cme_prompt_id is the PROMPT_MANAGEMENT_ID value read from the YAML config.
prompter.list_prompt_versions(cme_prompt_id)

## Post Process
Applies the post processing steps to the LLM output

In [None]:
# Names of the post-processing steps, in the order they are registered.
step_names = (
    "map_two_part_scores",
    "map_score",
    "score_to_status",
    "apply_general_mapping",
)

# Column groups and mappings referenced by the step settings below.
score_columns = ["er_score", "pr_score"]
entity_columns = ["er_status", "er_score", "pr_status", "pr_score", "her2_status"]
her2_mapping = {
    "0": "negative (0)",
    "1+": "negative (1+)",
    "2+": "borderline (2+)",
    "3+": "positive (3+)",
}

post_processor = PostProcessor(output_df, conf.accepted_values.keys())

# Look up each step's bound method on the post-processor by name.
functions = {step: getattr(post_processor, step) for step in step_names}

# Configure which post-processing functions run, which columns they run on
# and their mapping instructions. Each step gets its own copy of the column list.
settings = {
    "map_two_part_scores": {
        "enabled": True,
        "args": [{"cols_to_map": list(score_columns)}],
    },
    "map_score": {
        "enabled": True,
        "args": [{"cols_to_map": list(score_columns)}],
    },
    "score_to_status": {
        "enabled": True,
        "args": [{"pairs": [("er_score", "er_status"), ("pr_score", "pr_status")]}],
    },
    "apply_general_mapping": {
        "enabled": True,
        "args": [
            {"mapping": her2_mapping, "cols_to_map": ["her2_status"]},
            {"mapping": {"null": np.nan}, "cols_to_map": list(entity_columns)},
        ],
    },
}

output_df_processed = post_processor.run(functions, settings)

## Set-up Evaluator

This creates the comparison dataframe.

In [None]:
# DEFINE WHICH COLUMNS TO COMPARE
# Each field maps its actual column "<FIELD>_1" to its extracted column "<field>_p".
compared_fields = ("er_status", "er_score", "pr_status", "pr_score", "her2_status")
original_compare_cols = {
    f"{field.upper()}_1": f"{field}_p" for field in compared_fields
}

# Name of the status column passed to the evaluator's status helpers.
status_column = "status_processed"

This instantiates the evaluator class.

In [None]:
# SET UP THE EVALUATOR
id_column = "PATHOLOGY_ID"

# Join the loaded records to the post-processed extractions on the record id.
# how="inner" is the pandas default, spelled out here for clarity.
eval_df = records.merge(output_df_processed, how="inner", on=id_column)

evaluator = CMEEvaluator(
    df=eval_df,
    id_col=id_column,
    comparison_dict=original_compare_cols,
    accepted_values=conf.final_accepted_values,
)

In [None]:
# UNCOMMENT THIS - if you want to load the results for a specific description, i.e. one that complements your reloaded prompt_version.

# evaluator = CMEEvaluator(comparison_dict=original_compare_cols,
#                          accepted_values=conf.final_accepted_values,
#                          id_col="PATHOLOGY_ID",
#                          bucket_name=bucket_name,
#                          folder=output_folder,
#                          list_saved=True
#                         )

## Evaluating the JSON Output and Unaccepted Values

### First Check the breakdown of statuses

* **valid**: The output parsed to JSON, all the expected keys are present, and every value is in the accepted value list.
* **partial**: The output parsed to JSON, but some of the keys are missing or the value for a given key is not an accepted value.
* **invalid**: The output parsed to JSON, but none of the expected keys are present.
* **validation_failed**: The output could not be parsed as JSON.

We want to maximise the number of valids.

In [None]:
# Show the breakdown of statuses (valid / partial / invalid / validation_failed),
# using the column named by status_column.
evaluator.get_status_summary(status_column)

### Note down why some were "validation failed"
JSON has to be of a particular structure. So it might be there are special characters in the JSON or extra commas when there shouldn't be etc.

Any notes here can be fed into the prompt, but might also need to be changed in the post-processing of the JSON itself.

In [None]:
# Raise the pandas display limits for column width and row count before
# showing the "validation_failed" rows.
pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 300
evaluator.get_validation_failed(status_column)

### Note down why some were "invalid"
There might be something wrong with how the JSON is being constructed with the keys, such as weird characters in the keys, or completely wrong value of the key.

Any notes here can be fed into the prompt, but might also need to be changed in the post-processing of the JSON itself.

In [None]:
# Raise the pandas display limits for column width and row count before
# showing the "invalid" rows.
pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 300
evaluator.get_invalid(status_column)

### Note down why some were "partial"
In this scenario some of the JSON's keys might be wrongly defined. 

Any notes here can be fed into the prompt, but might also need to be changed in the post-processing of the JSON itself.

In [None]:
# Raise the pandas display limits for column width and row count before
# showing the "partial" rows.
pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 300
evaluator.get_partial(status_column)

### Note down when a non-accepted value has been extracted out instead

Below is the list of accepted values for each Status and Score. 

The LLM might have extracted out something different. Use this as an opportunity to work out how to improve the instructions fed to the model.

In [None]:
# Display the accepted values defined in config.pipeline_config.
conf.accepted_values

In [None]:
# This prints ALL of the non-accepted values. If the output is too long, comment this call out
# and uncomment the call below for a specific "actual" column instead.
evaluator.get_non_accepted_summary_all()

# To look at a single "actual" column, use the commented call below.
# evaluator.get_non_accepted_summary("HER2_STATUS_1")

If you are unsure as to why a value has been extracted like this, you can print reports below to show values for a given text id.

In [None]:
# Print the report text for one record; change id_val to the record you want to read.
# NOTE(review): "REPORT_x" looks like the suffixed REPORT column produced by the merge - confirm.
evaluator.print_text(text_col="REPORT_x", id_val=0)

### Evaluating the Correctness of Extraction

The plot below will give an overview of:
1. **Percentage of Correct Metric Count Across Reports**: For each row, the number of values extracted correctly out of the ER/PR/HER2 statuses and scores.
2. **Percentage of Correct per Comparison Column**: For each of ER/PR/HER2 status/score, how much of it was extracted correctly.

In [None]:
# Plot the row-wise count of correct values and the percentage correct per comparison column.
evaluator.plot_correctness_and_rowwise_distribution()

For each ER/PR/HER2 status/score, the plots below give an overview of:
1. **Actual V Extracted Matrix**: This is a count matrix, to show what the actual value is in the report, and what was actually extracted.
   * **key_missing**: This means a key was missing from the JSON for this value.
   * **validation_failed**: This wasn't extracted because the validation failed.
   * **non-accepted values**: Highlights if any were assigned to a value that is not in the accepted list.
2. **Per-Value Metrics**: This displays the precision, recall, and f1 score for each value that is extracted.

In [None]:
# Plot the actual-vs-extracted matrix and the per-value metrics for every comparison column.
evaluator.plot_per_metric_plots_for_all()

## Exploring the specific differences between Actual and Extracted

The cell below will retrieve the documents where the "wrong" value has been extracted.

In [None]:
# Actual column to inspect; one of:
# ER_STATUS_1, PR_STATUS_1, ER_SCORE_1, PR_SCORE_1, HER2_STATUS_1
actual_column = "PR_SCORE_1"
extracted_column = original_compare_cols[actual_column]

# Change these to the values you see
# NOTE(review): both values are compared as strings - confirm the columns hold strings.
actual_value = "1"  # i.e. positive
extracted_value = "0"  # i.e. negative

# Select the rows of the evaluation dataframe where the actual and extracted
# values match the pair chosen above.
results = evaluator.df
actual_matches = results[actual_column] == actual_value
extracted_matches = results[extracted_column] == extracted_value
results[actual_matches & extracted_matches]

Use the cell below to explore the reports:

In [None]:
# Print the report text for one record; change id_val to the record you want to read.
# NOTE(review): "REPORT_x" looks like the suffixed REPORT column produced by the merge - confirm.
evaluator.print_text(text_col="REPORT_x", id_val=5)

## Lastly! Was this a good run? Do you want to save something about this run? 

If so please run the last two cells to save your prompt and eval dataframe. We will use the same description here so we can link up prompt with outputs.

In [None]:
# If you don't know what version to write, please use this cell to help display the last version saved.
# If you don't know which version number to use, run this cell to list the saved prompt versions.
prompter.list_prompt_versions(cme_prompt_id)

In [None]:
overall_description = "EDIT ME" # Tell me what was good about this run, what changes were made, etc. Why did you save it?

In [None]:
# Saves Prompt Version - make sure prompt_version is the correct one before running.
prompt_version = 1
prompter.save_prompt_version(
    cme_prompt_id,
    cme_prompt_name,
    prompt_version,
    overall_description,
)

# Saves Evaluation Outputs under the same description, so the prompt and its
# outputs can be linked up.
save_eval_df_to_s3(
    df=eval_df,
    bucket_name=bucket_name,
    folder=output_folder,
    description=overall_description,
)