# 1. Load Raw Data (with Sentences and Ground Truths)

In [1]:
import os
import pandas as pd

# Locate the pickled evaluation set relative to the notebook's working directory.
# Every path segment is passed separately so that no separator is hard-coded
# (a literal 'data/deid/' yields mixed separators on Windows).
file_name = '100_rows_with_gt_jsl.pkl'
data_path = os.path.join('..', 'data', 'deid', file_name)
print("Raw data path:", data_path)

# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# that come from a trusted source.
data_df = pd.read_pickle(data_path)

# Preview the first record only; the notes are long free-text documents.
data_df.head(1)

Raw data path: ../data/deid/100_rows_with_gt_jsl.pkl


Unnamed: 0,id,text,ground_truth_list,jsl_prediction_list,prediction_list
0,929334185,\n929334185\nFIH\n8151167\n53653/y9m1\n539442\...,"[{'entity_type': 'ID', 'chunk': '929334185', '...","[{'entity_type': 'ID', 'chunk': '929334185', '...",[]


# 2. Task Selection

In [2]:
import json

# Read the task configuration. JSON is UTF-8 by specification, so the encoding
# is stated explicitly instead of relying on the platform default.
with open("./sources/tasks_list.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# task_list: per-task settings keyed by task name (later cells read the
#   'covered_gt_label_list' and 'prompt_file' entries).
# eval_options_dict: per-task evaluation options used by the evaluation cell.
task_list = data["task_list"]
eval_options_dict = data["eval_options_dict"]


## 2.1 Select Task to Get Predictions

In [3]:
# Select the task to generate predictions for. The value is used as a key into
# `task_list` by later cells, so it must match a task name in tasks_list.json.
entity_under_test = "Deid"
# Echo the selection so the saved notebook output records which task was run.
print("Predictions will be generated for the entities:", entity_under_test)

Predictions will be generated for the entities: Deid


# 3. Prepare Data for Prediction

In [4]:
# Ground-truth labels covered by the selected task (read from the task config).
covered_gt_label_list = task_list[entity_under_test]['covered_gt_label_list']

# Label for the prediction source; handed to the evaluator in the evaluation cell.
prediction_source = "ChatGPT"

# Build the working frame: a new DataFrame (so `data_df` is left untouched) whose
# `prediction_list` column holds a distinct empty list per row.
# `assign` is used instead of `.loc[:, col] = [[], [], ...]` because `.loc` can
# interpret a list of empty lists as a 2-D (n, 0) value and reject it, whereas
# whole-column assignment stores one object per row.
processed_gt_df = data_df.assign(
    prediction_list=[[] for _ in range(len(data_df))]
)

# 4. GPT Predictions 

## 4.1 Load Prompt and Initiate Entity Extractor

In [5]:
from modules import NerExtraction
# Resolve the prompt file configured for the selected task.
prompt_file_path = os.path.join('.', 'prompts', task_list[entity_under_test]['prompt_file'])
print(f"Prompt for [{entity_under_test}] extraction task: ({prompt_file_path})")

# Echo the prompt exactly as stored. Printing line by line would add a second
# newline after every line (each line already ends with '\n'), which
# double-spaces the text and makes it look different from the actual prompt.
with open(prompt_file_path, 'r') as file:
    print("PROMPT:\n")
    print(file.read())

# The extractor is constructed from the same prompt file path.
ner_extraction = NerExtraction.ChatGPTNER(prompt_file_path)

Prompt for [Deid] extraction task: (./prompts/13_01_DEID.txt)
PROMPT:

Please extract the following entities from the provided medical record text, specifically hospital discharge notes and/or discharge summaries:

ID, DATE, AGE, PHONE, PERSON, ORGANIZATION, LOCATION.

IMPOTANT!!! Do not return any other entities except these ones.



I have provided sample sentences for each entity type below:



ID:

"Mr. Smith's patient ID is 123456789 and he has been visiting our clinic since 2020."

Example:

[{'entity_type': 'ID', 'chunk': '123456789'}]



DATE:

"Mrs. Johnson had her last appointment on March 21, 2023, and her next appointment is scheduled for April 18, 2023."

Example:

[{'entity_type': 'DATE', 'chunk': 'March 21, 2023'},

{'entity_type': 'DATE', 'chunk': 'April 18, 2023'}]



AGE:

"Mr. Anderson is a 45-year-old patient with a history of hypertension."

Example:

[{'entity_type': 'AGE', 'chunk': '45'}]



PHONE:

"You can reach Dr. Adams at 555-123-4567 for any questions regar

## Single GPT Prediction (sanity check on one record)

In [10]:
# Pick one record for a one-off query: row position 3, column position 1.
# NOTE(review): both indices are positional; judging by the frame preview at the
# top of the notebook, column 1 looks like the `text` column -- confirm, since a
# change in column order would silently select a different field.
sentence = processed_gt_df.iloc[3,1]
sentence

'\n356529973\nFIH\n2102647\n73383/3545\n791416\n1/19/1993 12:00:00 AM\nPERSISTENT PRIMARY HYPERPARATHYROIDISM .\nUnsigned\nDIS\nReport Status :\nUnsigned\nADMISSION DATE :\n1-19-93\nDISCHARGE DATE :\n1-25-93\nPRINCIPAL DIAGNOSIS :\npersistent primary hyperparathyroidism .\nASSOCIATED DIAGNOSIS :\n1. hypercalcemia ,\n2. status post previous parathyroidectomy ,\n3. history of depression ,\n4. hypertension ,\n5. cholelithiasis ,\n6. thyroid papillary carcinoma ,\n7. status post total thyroidectomy ,\n8. status post radio-iodine therapy ,\n9. history of pancreatitis ,\n10. status post tube thoracostomy for empyema ,\n11. history of cigarette smoking ,\n12. history of lymphosarcoma , treated with radiation therapy ,\n13. abnormal Papanicolaou smear ,\n14. status post colonic polypectomy ,\n15. nephrolithiasis ,\n16. status post extracorporeal shockwave lithotripsy ,\n17. history of urinary tract infections , recurrent ,\n18. status post appendectomy ,\n19. hiatus hernia ,\n20. migraine head

In [11]:
# Send the single sample to the extractor and keep the raw response for inspection.
result = ner_extraction.do_query(sentence)

In [12]:
# Display the raw response object (the recorded output is an OpenAI chat completion).
result

<OpenAIObject chat.completion id=chatcmpl-75HMmjHYjvwDevQPJYV3NRYe3RZOz at 0x7f9fa46f5220> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "{\"list_of_entities\":\n[\n{\"entity_type\": \"ID\", \"chunk\": \"2102647\"},\n{\"entity_type\": \"AGE\", \"chunk\": \"52\"},\n{\"entity_type\": \"PHONE\", \"chunk\": null},\n{\"entity_type\": \"PATIENT\", \"chunk\": null},\n{\"entity_type\": \"ORGANIZATION\", \"chunk\": \"Memorial Hospital\"},\n{\"entity_type\": \"LOCATION\", \"chunk\": null},\n{\"entity_type\": \"DATE\", \"chunk\": \"1-19-93\"},\n{\"entity_type\": \"DATE\", \"chunk\": \"1-25-93\"}\n]\n}",
        "role": "assistant"
      }
    }
  ],
  "created": 1681492752,
  "id": "chatcmpl-75HMmjHYjvwDevQPJYV3NRYe3RZOz",
  "model": "gpt-3.5-turbo-0301",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 124,
    "prompt_tokens": 1814,
    "total_tokens": 1938
  }
}

In [13]:
# Text content of the first returned choice; in the recorded run this is a JSON string.
result["choices"][0]["message"]['content']

'{"list_of_entities":\n[\n{"entity_type": "ID", "chunk": "2102647"},\n{"entity_type": "AGE", "chunk": "52"},\n{"entity_type": "PHONE", "chunk": null},\n{"entity_type": "PATIENT", "chunk": null},\n{"entity_type": "ORGANIZATION", "chunk": "Memorial Hospital"},\n{"entity_type": "LOCATION", "chunk": null},\n{"entity_type": "DATE", "chunk": "1-19-93"},\n{"entity_type": "DATE", "chunk": "1-25-93"}\n]\n}'

In [15]:
# GPT-3.5 response post-processing.
# `corrected_json` is a project helper applied to the raw response; in the recorded
# run it returns a dict with a single 'list_of_entities' key.
from modules.ProcessPredData import corrected_json, get_list_of_entities

json_result = corrected_json(result)
json_result

{'list_of_entities': [{'entity_type': 'ID', 'chunk': '2102647'},
  {'entity_type': 'AGE', 'chunk': '52'},
  {'entity_type': 'PHONE', 'chunk': None},
  {'entity_type': 'PATIENT', 'chunk': None},
  {'entity_type': 'ORGANIZATION', 'chunk': 'Memorial Hospital'},
  {'entity_type': 'LOCATION', 'chunk': None},
  {'entity_type': 'DATE', 'chunk': '1-19-93'},
  {'entity_type': 'DATE', 'chunk': '1-25-93'}]}

## 4.2 Get Prediction

In [7]:
# !!! Run without any changes

from modules import BatchProcessing
from modules.ProcessPredData import corrected_json, get_list_of_entities
import datetime

# Assign an automatic, timestamped base name (no extension) for the saved predictions.
now = datetime.datetime.now()
file_name = f"{entity_under_test}_preds_{now.strftime('%m%d_%H%M')}"

# Create the output folder if needed. exist_ok=True replaces the separate
# "exists?" check, so there is no gap between checking and creating.
os.makedirs('processed_data', exist_ok=True)
processed_data_path = os.path.join('.', 'processed_data', file_name)

# number of sentences to process
i = 5

print(processed_data_path)

# The batch processor receives the first `i` rows, the extractor, the two
# response post-processing helpers, and the base path to save results under.
batch_processor = BatchProcessing.ProcessBatch(
    processed_gt_df.head(i),
    ner_extraction,
    corrected_json,
    get_list_of_entities,
    processed_data_path
)

results_df = batch_processor.do_processing()

./processed_data/Deid_preds_0414_1712
Given dataframe shape (5, 5)
Getting predictions from API started...
Query: 1 | index: 0 | status: SUCCESS
Query: 2 | index: 1 | status: SUCCESS


2023-04-14 17:12:49,248 - ProcessPredData - ERROR - generate_start_end_index function error decoding to str: need a bytes-like object, NoneType found
2023-04-14 17:12:49,251 - ProcessPredData - ERROR - generate_start_end_index function error decoding to str: need a bytes-like object, NoneType found
2023-04-14 17:12:49,253 - ProcessPredData - ERROR - generate_start_end_index function error decoding to str: need a bytes-like object, NoneType found


Query: 3 | index: 2 | status: SUCCESS


2023-04-14 17:12:55,477 - ProcessPredData - ERROR - generate_start_end_index function error decoding to str: need a bytes-like object, NoneType found
2023-04-14 17:12:55,479 - ProcessPredData - ERROR - generate_start_end_index function error decoding to str: need a bytes-like object, NoneType found


Query: 4 | index: 3 | status: SUCCESS


2023-04-14 17:13:13,779 - ProcessPredData - ERROR - corrected_json function error :malformed or empty prediction from gpt api... skipping this sentence


Getting predictions from API finished.
final df shape: (4, 5)
file saved as ./processed_data/Deid_preds_0414_1712.csv 


# 5. Evaluation

## 5.1 Selection of entity for evaluation

In [8]:
###!!! USER INPUT MIGHT BE NECESSARY if there are multiple entities for evaluation under the given task
# The value is used as a key into `eval_options_dict` by the evaluation cell.
entity_for_benchmark = "Deid"

## 5.2 Run Evaluation 

In [9]:
# No user interaction needed, just run the cell
from modules import Evaluation
import datetime
now = datetime.datetime.now()

# Timestamped name under which the evaluator saves its report.
eval_file_to_save = entity_for_benchmark + "_" + prediction_source + f"_eval_{now.strftime('%m%d_%H%M')}"

# Evaluation options configured for the selected entity.
selected_entity_prediction = eval_options_dict[entity_for_benchmark]["selected_entity_prediction"]
selected_entity_gt = eval_options_dict[entity_for_benchmark]["selected_entity_gt"]
gt_type_dict = eval_options_dict[entity_for_benchmark]["gt_type_dict"]

# Predictions are read back from the CSV written by the prediction cell.
# Alternative (per the original note): pass dataframe=results_df and set
# file_path__to_read_prediction = None to evaluate the in-memory frame instead.
file_path__to_read_prediction = f"{processed_data_path}.csv"

evaluator = Evaluation.Evaluate(
    file_path=file_path__to_read_prediction,
    dataframe=None,
    prediction_source=prediction_source
)

eval_results = evaluator.get_match_counts(
    selected_entity_prediction,
    selected_entity_gt,
    eval_file_to_save,
    gt_type_dict
)

# Create the results folder if it doesn't exist (no separate existence check needed).
os.makedirs('eval_results', exist_ok=True)

# Append this run's summary as ONE JSON document per line. Appending with
# json.dump alone writes records back-to-back with no delimiter, so the file
# could not be parsed after the second run; the trailing newline keeps every
# record individually loadable (read it line by line with json.loads).
with open('eval_results/eval_result.json', 'a') as f:
    json.dump(eval_results, f)
    f.write("\n")
print("Results appended to file: eval_results/eval_result.json")

eval_results

Evaluation results saved as ./eval_results/Deid_ChatGPT_eval_0414_1713.xlsx
Results appended to file: eval_results/eval_result.json


{'version': 'Deid_ChatGPT_eval_0414_1713',
 'selected_entity_prediction': ['ID',
  'DATE',
  'AGE',
  'PHONE',
  'PERSON',
  'LOCATION',
  'ORGANIZATION'],
 'selected_entity_gt': ['ID',
  'DATE',
  'AGE',
  'PHONE',
  'DOCTOR',
  'PATIENT',
  'NAME',
  'HOSPITAL',
  'LOCATION',
  'ORGANIZATION'],
 'full_match': 21,
 'accuracy_full_match': 0.09,
 'partial_match': 11,
 'accuracy_partial_match': 0.13,
 'no_match': 207,
 'gt_count': 239,
 'fp_count': 5}