# 1. Load Raw Data (with Sentences and Ground Truths)

In [1]:
import os
import pandas as pd

# Build the relative path to the pickled sample (texts with ground-truth lists).
file_name = '100_rows_with_gt_jsl.pkl'
data_path = os.path.join('..', 'data/deid/', file_name)
print(f"Raw data path: {data_path}")

# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
data_df = pd.read_pickle(data_path)

# Display a single row as a quick check of the available columns.
data_df.head(1)

Raw data path: ../data/deid/100_rows_with_gt_jsl.pkl


Unnamed: 0,id,text,ground_truth_list,jsl_prediction_list,prediction_list
0,929334185,\n929334185\nFIH\n8151167\n53653/y9m1\n539442\...,"[{'entity_type': 'ID', 'chunk': '929334185', '...","[{'entity_type': 'ID', 'chunk': '929334185', '...",[]


# 2. Task Selection

In [2]:
import json

# Read the task configuration from the JSON file.
with open("./sources/tasks_list.json", "r") as f:
    data = json.load(f)

# Keep the two top-level sections used by later cells:
#   task_list         -> looked up by task name (prompt file, covered labels)
#   eval_options_dict -> looked up by the entity chosen for benchmarking
task_list, eval_options_dict = data["task_list"], data["eval_options_dict"]


## 2.1 Select Task to Get Predictions

In [3]:
# Task to run predictions for; later cells use it as a key into `task_list`.
entity_under_test = "Deid"
# Echo the selection so the choice is visible in the notebook output.
print(f"Predictions will be generated for the entities: {entity_under_test}")

Predictions will be generated for the entities: Deid


# 3. Prepare Data for Prediction

In [4]:
# Label identifying where the predictions come from; reused by the evaluation cell.
prediction_source = "ChatGPT"

# Presumably the ground-truth labels covered by the selected task (read from the
# task configuration; not referenced again in the cells visible here).
covered_gt_label_list = task_list[entity_under_test]['covered_gt_label_list']

# Work on a copy so the loaded frame is left untouched, then give every row
# its own fresh, empty prediction list.
processed_gt_df = data_df.copy()
processed_gt_df.loc[:, 'prediction_list'] = [[] for _ in processed_gt_df.index]

# 4. GPT Predictions 

## 4.1 Load Prompt and Initiate Entity Extractor

In [5]:
from modules import NerExtraction
# Resolve the prompt file configured for the selected task.
prompt_file_path = os.path.join('.', 'prompts', task_list[entity_under_test]['prompt_file'] )
print(f"Prompt for [{entity_under_test}] extraction task: ({prompt_file_path})")

# Show the prompt as it is stored on disk.
# Lines iterated from a file keep their trailing newline, so printing them one
# by one (print() appends another "\n") double-spaces the text. Printing the
# whole content in a single call preserves the original layout.
with open(prompt_file_path, 'r') as file:
    print("PROMPT:\n")
    print(file.read())

# Build the extractor from the same prompt file; it is passed to the batch
# processor in the prediction cell.
ner_extraction = NerExtraction.ChatGPTNER(prompt_file_path)

Prompt for [Deid] extraction task: (./prompts/13_01_DEID.txt)
PROMPT:

Please extract the following entities from the provided medical record text, specifically hospital discharge notes and/or discharge summaries:

ID, DATE, AGE, PHONE, PERSON, ORGANIZATION, LOCATION.

IMPOTANT!!! Do not return any other entities except these ones.



I have provided sample sentences for each entity type below:



ID:

"Mr. Smith's patient ID is 123456789 and he has been visiting our clinic since 2020."

Example:

[{'entity_type': 'ID', 'chunk': '123456789'}]



DATE:

"Mrs. Johnson had her last appointment on March 21, 2023, and her next appointment is scheduled for April 18, 2023."

Example:

[{'entity_type': 'DATE', 'chunk': 'March 21, 2023'},

{'entity_type': 'DATE', 'chunk': 'April 18, 2023'}]



AGE:

"Mr. Anderson is a 45-year-old patient with a history of hypertension."

Example:

[{'entity_type': 'AGE', 'chunk': '45'}]



PHONE:

"You can reach Dr. Adams at 555-123-4567 for any questions regar

## 4.2 Get Prediction

In [6]:
# !!! Run without any changes

from modules import BatchProcessing
from modules.ProcessPredData import corrected_json, get_list_of_entities
import datetime

# Assign an automatic, timestamped base name (no extension) for the saved
# predictions; the evaluation cell reads the result back as "<path>.csv".
now = datetime.datetime.now()
file_name = f"{entity_under_test}_preds_{now.strftime('%m%d_%H%M')}"

# exist_ok=True creates the folder when it is missing and does nothing when it
# is already there, avoiding the gap between a separate exists() check and the
# makedirs() call.
os.makedirs('processed_data', exist_ok=True)
processed_data_path = os.path.join('.', 'processed_data', file_name)

print(processed_data_path)

# Only the first 5 rows are sent for prediction in this run.
batch_processor = BatchProcessing.ProcessBatch(
    processed_gt_df.head(5),
    ner_extraction,
    corrected_json,
    get_list_of_entities,
    processed_data_path
)

# Runs the predictions; the returned frame is kept as an in-memory alternative
# to re-reading the saved file during evaluation.
results_df = batch_processor.do_processing()

./processed_data/Deid_preds_0414_1659
Given dataframe shape (5, 5)
Getting predictions from API started...
Query: 1 | index: 0 | status: SUCCESS


2023-04-14 16:59:32,124 - ProcessPredData - ERROR - generate_start_end_index function error decoding to str: need a bytes-like object, NoneType found
2023-04-14 16:59:32,126 - ProcessPredData - ERROR - generate_start_end_index function error decoding to str: need a bytes-like object, NoneType found
2023-04-14 16:59:32,127 - ProcessPredData - ERROR - generate_start_end_index function error decoding to str: need a bytes-like object, NoneType found


Query: 2 | index: 1 | status: SUCCESS
Query: 3 | index: 2 | status: SUCCESS
Query: 4 | index: 3 | status: SUCCESS


2023-04-14 17:00:18,406 - ProcessPredData - ERROR - generate_start_end_index function error decoding to str: need a bytes-like object, NoneType found


Query: 5 | index: 4 | status: SUCCESS
Getting predictions from API finished.
final df shape: (5, 5)
file saved as ./processed_data/Deid_preds_0414_1659.csv 


# 5. Evaluation

## 5.1 Selection of entity for evaluation

In [7]:
# !!! USER INPUT MAY BE NEEDED: choose which entity to benchmark when the
# selected task offers more than one entity for evaluation.
entity_for_benchmark = "Deid"

## 5.2 Run Evaluation 

In [8]:
# No user interaction needed, just run the cell
from modules import Evaluation
import datetime

# Timestamped label identifying this evaluation run.
now = datetime.datetime.now()
eval_file_to_save = entity_for_benchmark + "_" + prediction_source + f"_eval_{now.strftime('%m%d_%H%M')}"

# Evaluation options configured for the entity being benchmarked.
eval_options = eval_options_dict[entity_for_benchmark]
selected_entity_prediction = eval_options["selected_entity_prediction"]
selected_entity_gt = eval_options["selected_entity_gt"]
gt_type_dict = eval_options["gt_type_dict"]

# Predictions are read back from the CSV written by the prediction cell.
file_path__to_read_prediction = f"{processed_data_path}.csv"
# alternative to reading prediction results from file
# dataframe = results_df !!! make >> file_path__to_read_prediction = None

# Create the results folder up front. The recorded run shows the evaluator
# saving its spreadsheet under ./eval_results before this cell's own write;
# whether the evaluator creates the folder itself is not visible here, so make
# sure it exists before anything is written. exist_ok=True makes this a no-op
# when the folder is already present.
os.makedirs('eval_results', exist_ok=True)

evaluator = Evaluation.Evaluate(
    file_path=file_path__to_read_prediction,
    dataframe=None,
    prediction_source=prediction_source
)

eval_results = evaluator.get_match_counts(
    selected_entity_prediction,
    selected_entity_gt,
    eval_file_to_save,
    gt_type_dict
)

# Append this run as ONE line of JSON. Without a separator, repeated runs
# produce back-to-back objects ("{...}{...}") that json.load() cannot parse;
# a trailing newline keeps the file readable line by line. Records appended by
# earlier runs (written without the newline) are left as they are.
eval_log_path = 'eval_results/eval_result.json'
with open(eval_log_path, 'a') as f:
    json.dump(eval_results, f)
    f.write("\n")
# Report success only after the file has been closed (and therefore flushed).
print(f"Results appended to file: {eval_log_path}")

# Display the result dictionary as the cell output.
eval_results

Evaluation results saved as ./eval_results/Deid_ChatGPT_eval_0414_1700.xlsx
Results appended to file: eval_results/eval_result.json


{'version': 'Deid_ChatGPT_eval_0414_1700',
 'selected_entity_prediction': ['ID',
  'DATE',
  'AGE',
  'PHONE',
  'PERSON',
  'LOCATION',
  'ORGANIZATION'],
 'selected_entity_gt': ['ID',
  'DATE',
  'AGE',
  'PHONE',
  'DOCTOR',
  'PATIENT',
  'NAME',
  'HOSPITAL',
  'LOCATION',
  'ORGANIZATION'],
 'full_match': 30,
 'accuracy_full_match': 0.07,
 'partial_match': 25,
 'accuracy_partial_match': 0.13,
 'no_match': 358,
 'gt_count': 413,
 'fp_count': 0}