# 1. Load Raw Data (with Sentences and Ground Truths)

In [1]:
import os
import pandas as pd

# Build the path of the pickled raw data file, relative to this notebook.
file_name = 'data_oncology_granular.pkl'
data_path = os.path.join('..', 'data', file_name)
print(f"Raw data path: {data_path}")

# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
data_df = pd.read_pickle(data_path)

# Preview the first record.
data_df.head(1)

Raw data path: ../data/data_oncology_granular.pkl


Unnamed: 0,text,ner_chunk,count_onco,ground_truth_list
296,The postoperative pathology was invasive carci...,"[(chunk, 18, 26, pathology, {'sentence': '0', ...",7.0,"[{'entity_type': 'Pathology_Test', 'chunk': 'p..."


# 2. Task Selection

In [11]:
import json

# Read the task configuration from file. The encoding is given explicitly so
# the JSON is decoded the same way regardless of the platform's default locale.
with open("./sources/tasks_list.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Extract the per-task settings (task_list) and the evaluation options
# (eval_options_dict) that the following cells index into.
task_list = data["task_list"]
eval_options_dict = data["eval_options_dict"]


## 2.1 Select Task to Get Predictions

In [22]:
# Name of the task whose entities will be predicted; it is used as a key of
# task_list in the cells below.
entity_under_test = "Oncological"
# Report the selection before moving on.
print(f"Predictions will be generated for the entities: {entity_under_test}")

Predictions will be generated for the entities: Oncological


# 3. Prepare Data for Prediction

In [23]:
# Look up the ground-truth filter settings configured for the selected task.
# NOTE(review): these two values are only read here; no filtering is applied
# in this cell - confirm whether a filter step is still expected.
selected_task = task_list[entity_under_test]
covered_gt_label_list = selected_task['covered_gt_label_list']
count_filter = selected_task['count_filter']

# Label for the origin of the predictions; the evaluation cell uses it in the
# result file name and passes it to the evaluator.
prediction_source = "ChatGPT"

# Work on a copy so the raw frame stays untouched, and give every row its own
# empty list in a new 'prediction_list' column.
processed_gt_df = data_df.copy()
processed_gt_df.loc[:, 'prediction_list'] = [[] for _ in processed_gt_df.index]

# 4. GPT Predictions 

## 4.1 Load Prompt and Initiate Entity Extractor

In [24]:
from modules import NerExtraction
# Resolve the prompt file configured for the selected task.
prompt_file_path = os.path.join('.', 'prompts', task_list[entity_under_test]['prompt_file'] )
print(f"Prompt for [{entity_under_test}] extraction task: ({prompt_file_path})")

# Show the prompt as it is stored in the file. Printing it line by line with
# print(line) doubled every line break, because each line read from the file
# already ends with its own newline.
with open(prompt_file_path, 'r') as file:
    print("PROMPT:\n")
    print(file.read())

# Create the ChatGPT NER extractor from the prompt file path.
ner_extraction = NerExtraction.ChatGPTNER(prompt_file_path)

Prompt for [Oncological] extraction task: (./prompts/11_02_Oncological.txt)
PROMPT:



You are a highly experienced and skilled medical annotator who have been working on medical texts to label medical entities.



I will provide you some entity types with sample chunks and I want you to find similar entities from given texts and label them with right entity types.



-  Entity Type: Oncological

    Instruction: include all the cancer, tumor or metastasis related extractions mentioned in the document, of the patient or someone else.



    Examples:

    a) given sample sentence:

    His mother was diagnosed with colon cancer in her 50s, but she died of cancer of the esophagus at age 86.

    Oncological Entities in above given text: colon cancer, cancer of the esophagus



    b) given sample sentence:

    She was diagnosed with pseudomyxoma peritonei in 1994.

    Oncological Entities in above given text: pseudomyxoma peritonei



I want you to extract all Oncological type entitie

## 4.2 Get Prediction

In [28]:
# !!! Run without any changes

from modules import BatchProcessing
from modules.ProcessPredData import corrected_json, get_list_of_entities
import datetime

# Assign an automatic, timestamped name for the saved prediction data.
now = datetime.datetime.now()
file_name = f"{entity_under_test}_preds_{now.strftime('%m%d_%H%M')}"

# exist_ok=True creates the output folder when it is missing and is a no-op
# otherwise, so there is no gap between an existence check and the creation.
os.makedirs('processed_data', exist_ok=True)

# Path without extension; the evaluation cell appends ".csv" when reading the
# saved predictions back.
processed_data_path = os.path.join('.', 'processed_data', file_name)

print(processed_data_path)

# Only the first five rows are sent for prediction.
batch_processor = BatchProcessing.ProcessBatch(
    processed_gt_df.head(5),
    ner_extraction,
    corrected_json,
    get_list_of_entities,
    processed_data_path
)

results_df = batch_processor.do_processing()

./processed_data/Oncological_preds_0405_1127
Given dataframe shape (5, 5)
Getting predictions from API started...
Query: 1 | index: 296 | status: SUCCESS
Query: 2 | index: 604 | status: SUCCESS
Query: 3 | index: 637 | status: SUCCESS
Query: 4 | index: 770 | status: SUCCESS
Query: 5 | index: 774 | status: SUCCESS
Getting predictions from API finished.
final df shape: (5, 5)
file saved as ./processed_data/Oncological_preds_0405_1127.csv 


# 5. Evaluation

## 5.1 Selection of entity for evaluation

In [25]:
###!!! USER INPUT MIGHT BE NECESSARY if there are multiple entities for evaluation under the given task
# Used as a key of eval_options_dict by the evaluation cell below.
entity_for_benchmark = "Oncological_Granular"

## 5.2 Run Evaluation 

In [26]:
# No user interaction needed, just run the cell
from modules import Evaluation
import datetime
now = datetime.datetime.now()

# Timestamped name for the evaluation output of this run.
eval_file_to_save = entity_for_benchmark + "_" + prediction_source + f"_eval_{now.strftime('%m%d_%H%M')}"

# Evaluation options configured for the selected benchmark entity.
selected_entity_prediction = eval_options_dict[entity_for_benchmark]["selected_entity_prediction"]
selected_entity_gt = eval_options_dict[entity_for_benchmark]["selected_entity_gt"]
gt_type_dict = eval_options_dict[entity_for_benchmark]["gt_type_dict"]

# Predictions are read back from the CSV written by the prediction cell.
file_path__to_read_prediction = f"{processed_data_path}.csv"
# alternative to reading prediction results from file 
# dataframe = results_df !!! make >> file_path__to_read_prediction = None 

# Create the results folder before evaluating: the evaluator's own log line
# reports saving into ./eval_results, so the folder should already exist by
# then. exist_ok=True makes this a no-op when the folder is present.
os.makedirs('eval_results', exist_ok=True)

evaluator = Evaluation.Evaluate(
    file_path=file_path__to_read_prediction, 
    dataframe=None, 
    prediction_source=prediction_source
)

eval_results = evaluator.get_match_counts(
    selected_entity_prediction, 
    selected_entity_gt, 
    eval_file_to_save,
    gt_type_dict
)

# Append the result as one JSON document per line. Without the trailing
# newline, repeated runs wrote back-to-back objects ("{...}{...}") that cannot
# be parsed again; with it, each line of the file is a standalone JSON object.
with open('eval_results/eval_result.json', 'a') as f:
    json.dump(eval_results, f)
    f.write("\n")
    print("Results appended to file: eval_results/eval_result.json")

# Display the evaluation summary as the cell output.
eval_results

Evaluation results saved as ./eval_results/Oncological_Granular_ChatGPT_eval_0405_1125.xlsx
Results appended to file: eval_results/eval_result.json


{'version': 'Oncological_Granular_ChatGPT_eval_0405_1125',
 'selected_entity_prediction': ['Oncological'],
 'selected_entity_gt': ['Tumor_Finding',
  'Site_Lymph_Node',
  'Adenopathy',
  'Cancer_Dx',
  'Cancer_Score',
  'Cancer_Surgery',
  'Chemotherapy',
  'Grade',
  'Metastasis',
  'Pathology_Result',
  'Pathology_Test',
  'Staging'],
 'full_match': 8,
 'accuracy_full_match': 0.11,
 'partial_match': 23,
 'accuracy_partial_match': 0.44,
 'no_match': 40,
 'gt_count': 71,
 'fp_count': 3}