In [1]:
import json
import yaml
import time
import pandas as pd
import numpy as np
import datetime
import os
import sys
import glob
from google.cloud import bigquery
from google.oauth2 import service_account
from openai import OpenAI
from dateutil import parser
import json
import re
from collections import Counter

# GPT API key
# Strip surrounding whitespace: a key file saved with a trailing newline would
# otherwise carry that newline into the key handed to the OpenAI client.
with open('openai_apikey.txt', 'r') as file:
    apikey = file.read().strip()
if not apikey:
    raise ValueError("openai_apikey.txt is empty; expected the OpenAI API key")
os.environ["OPENAI_API_KEY"] = apikey

# key for accessing our MIMIC-III database saved in bigquery
# Sort the matches so the same key file is chosen on every run when several
# exist, and fail with a descriptive error instead of a bare IndexError when
# none is found.
key_paths = sorted(glob.glob("./config/*.json"))
if not key_paths:
    raise FileNotFoundError("no service-account key (*.json) found in ./config/")
key_path = key_paths[0]
credentials = service_account.Credentials.from_service_account_file(key_path)
client = bigquery.Client(credentials=credentials,
                         project=credentials.project_id)

def calc_metrics(ground_truth, predictions):
    """Score predicted items against reference items, treating both as multisets.

    Input:
        - ground_truth (iterable of hashable): Reference items; duplicates count.
        - predictions (iterable of hashable): Predicted items; duplicates count.

    Output:
        - (precision, recall, f1). Each value is 0 when its denominator is 0.
    """
    expected = Counter(ground_truth)
    found = Counter(predictions)

    # Multiset intersection gives the matches; the two multiset differences
    # give the surplus predictions and the missed reference items.
    tp = sum((expected & found).values())
    fp = sum((found - expected).values())
    fn = sum((expected - found).values())

    if (tp + fp) > 0:
        precision = tp / (tp + fp)
    else:
        precision = 0

    if (tp + fn) > 0:
        recall = tp / (tp + fn)
    else:
        recall = 0

    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0

    return precision, recall, f1

def evaluate(masked, generated):
    """
    Compare the [** ... **]-marked spans of two texts.

    Input:
        - masked (str): Ground-truth text with the expected spans marked
        - generated (str): Text to be evaluated, marked the same way

    Output:
        - Precision, Recall and F1 as returned by calc_metrics

    Only the text between the markers is compared, as exact strings.
    The pattern has no DOTALL flag, so a span containing a line break
    is not extracted.
    """
    marker = r'\[\*\*(.*?)\*\*\]'
    ground_truth, predictions = (
        re.findall(marker, text) for text in (masked, generated)
    )
    return calc_metrics(ground_truth, predictions)

In [None]:
'''
I am still waiting for the credential approval via PhysioNet to download CARMEN-i. 
If it is not approved by our meeting next week, I will request a copy of CARMEN-i.

I thought it might also be a good idea to evaluate GPT's anonymization capability 
across different languages, not only on a single corpus. 

Given that MIMIC text is anonymized, 
we could use GPT to substitute this anonymized data to create real-world-like text. 
Then, we can have GPT anonymize this substituted text again, 
allowing us to assess GPT's anonymization capability on English medical text as well. 
I conducted a pilot test as follows.
'''

In [2]:
# Draw a seeded random sample of 100 Radiology notes from MIMIC-III NOTEEVENTS
# and save it to CSV.
#
# LIMIT without ORDER BY lets BigQuery return an arbitrary set of 10000 rows,
# which would make the seeded sample below differ between runs. Ordering by
# ROW_ID pins the candidate pool.
# NOTE(review): assumes this copy of NOTEEVENTS keeps the ROW_ID column -- confirm.

table_name='cmi-lab.MIMIC_III.NOTEEVENTS'
sql = f"""
SELECT * from `{table_name}` where CATEGORY like "Radiology" order by ROW_ID limit 10000
"""
query_job = client.query(sql)
table = query_job.to_dataframe()

# Chain instead of inplace=True; the fresh 0..99 index is what later cells iterate over.
table = table.sample(n=100, random_state=1).reset_index(drop=True)
table.to_csv('20240705_MIMIC_III_NOTEEVENTS_random100.csv',index=False)

In [3]:
# System prompt for the substitution step: GPT receives a MIMIC report whose
# identifying details appear as [** ... **] placeholders and must return a JSON
# object with two keys, "First version" (plain substituted text) and
# "Second version" (same text with each substitution wrapped in [** and **]).
# NOTE(review): the name example below is written as [First Name] [Last Name],
# without the [** **] markers the prompt describes -- confirm this is intended.
system_content1='''You will be provided with an anonymized radiology report. 
As a helpful assistant, you will substitute the anonymized parts with simulated real-world-like information. 
The information written between [** and **] is anonymized. 
For example, [First Name] [Last Name] can be substituted with a random name such as John Doe. 
The clip numbers are also anonymized. For example, [**Clip Number (Radiology) 62039**] can be substituted with a random clip number such as 1452566. 
Now, for the given radiology report, substitute the anonymized parts with simulated real-world-like information.
You should give me two versions of the substituted radiology report.
In the first version, give me a plain version of the substituted radiology report.
For example, if [**Clip Number (Radiology) 62039**] was substituted with a random clip number such as 1452566, just give me 1452566 as it is.
In the second version, mark the substituted parts with [** and **].
For example, if [**Clip Number (Radiology) 62039**] was substituted with a random clip number such as 1452566, give it to me as [**1452566**].
Return only the substituted radiology reports. Do not comment anything else.
Give the answer in the following JSON format.
{
  "First version": "(give the first version [plain version] here)",
  "Second version": "(give the second version [marked version] here)"
}
'''

# System prompt for the re-anonymization step: GPT receives plain text and must
# return it unchanged except for wrapping personal information in [** and **].
# NOTE(review): "anonimization" is misspelled inside the prompt. It is left
# untouched here because editing the prompt changes what is sent to the model.
system_content2='''You are an anonimization tool in identifying attributes in texts that can identify or quasi-identify a user.
Return only the original text with the identification and labeling of the patient's personal information by adding it between [** and **].
For example, names such as John Doe should be given as [**John Doe**].
Dates such as 2024/07/01 should be given as [**2024/07/01**].
Do not comment anything else.
'''

In [4]:
# Round-trip every sampled note through GPT twice:
#   1. substitute the [** ... **] placeholders with realistic values (JSON reply
#      holding a plain and a marked version);
#   2. ask GPT to re-anonymize the plain version.
# The marked version from step 1 is the ground truth for the step-2 prediction.
#
# The OpenAI client gets its own name so the BigQuery `client` created in the
# setup cell is not overwritten (re-running the query cell would otherwise fail),
# and it is built once rather than twice per note.
openai_client = OpenAI()

result_columns = ['MIMIC_anonymized','GPT_substituted_plain','GPT_substituted_marked','Prediction']

# Rows are collected in a list and turned into a DataFrame once at the end,
# instead of growing the frame with df.loc[len(df)] on every iteration.
# `records` stays available at cell scope if the loop is interrupted.
records = []
for i, note_text in enumerate(table['TEXT']):
    # Step 1: substitution, forced to a JSON object so it can be parsed below.
    response = openai_client.chat.completions.create(
        model='gpt-4o',
        n=1,
        temperature=1.0,
        response_format={ "type": "json_object" },
        messages=[
            {"role": "system", "content": system_content1},
            {"role": "user", "content": [
                {"type": "text", "text": note_text},
            ]}
        ],
    )

    substituted_text_dict = json.loads(response.choices[0].message.content)
    # Read both keys before the second request so a malformed reply fails
    # here rather than after another API call.
    plain_text = substituted_text_dict['First version']
    marked_text = substituted_text_dict['Second version']

    # Step 2: re-anonymization of the plain text (free-form reply).
    response = openai_client.chat.completions.create(
        model='gpt-4o',
        n=1,
        temperature=1.0,
        messages=[
            {"role": "system", "content": system_content2},
            {"role": "user", "content": [
                {"type": "text", "text": plain_text},
            ]}
        ],
    )

    prediction = response.choices[0].message.content

    records.append([
        note_text,    # anonymized MIMIC text
        plain_text,   # the plain version of the substituted text
        marked_text,  # the marked version of the substituted text
        prediction,   # anonymization prediction
    ])
    print(i)

df = pd.DataFrame(records, columns=result_columns)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [8]:
# The above code took a bit of time to process 100 pieces of text data, 
# but I can make the process a lot faster using a more parallel approach.
# (I haven't done it yet since this is still in the pilot stage.)

In [5]:
# Score each row: the marked substitution is the reference, the GPT
# re-anonymization is the candidate. evaluate returns (precision, recall, f1).
row_scores = [
    evaluate(marked, predicted)
    for marked, predicted in zip(df['GPT_substituted_marked'], df['Prediction'])
]

precision = [score[0] for score in row_scores]
recall = [score[1] for score in row_scores]
f1 = [score[2] for score in row_scores]

df['precision'] = precision
df['recall'] = recall
df['f1'] = f1

In [6]:
# Show the per-note texts together with their precision / recall / f1 columns.
df

Unnamed: 0,MIMIC_anonymized,GPT_substituted_plain,GPT_substituted_marked,Prediction,precision,recall,f1
0,[**2143-7-13**] 7:02 PM\n CHEST (PORTABLE AP) ...,2023-7-13 7:02 PM\n CHEST (PORTABLE AP) ...,[**2023-7-13**] 7:02 PM\n CHEST (PORTABLE AP) ...,[**2023-7-13**] 7:02 PM\n CHEST (PORTABLE AP) ...,0.750000,1.000000,0.857143
1,[**2163-2-7**] 2:02 PM\n PORTABLE ABDOMEN ...,2163-2-7 2:02 PM PORTABLE ABDOMEN Clip # 84392...,[**2163-2-7**] 2:02 PM PORTABLE ABDOMEN Clip #...,[**2163-2-7**] [**2:02 PM**] PORTABLE ABDOMEN ...,0.500000,0.666667,0.571429
2,[**2106-5-28**] 1:19 PM\n CAROT/CEREB [**Hospi...,2023-5-28 1:19 PM\n CAROT/CEREB General Hospit...,[**2023-5-28**] 1:19 PM\n CAROT/CEREB [**Gene...,[**2023-5-28 1:19 PM**]\n CAROT/CEREB General ...,0.777778,0.318182,0.451613
3,[**2103-6-26**] 3:34 PM\n PORTABLE ABDOMEN ...,2103-6-26 3:34 PM\n PORTABLE ABDOMEN ...,[**2103-6-26**] 3:34 PM\n PORTABLE ABDOMEN ...,[**2103-6-26**] [**3:34 PM**]\n PORTABLE ABDOM...,0.555556,1.000000,0.714286
4,[**2105-9-23**] 4:52 AM\n CHEST (PORTABLE AP) ...,2023-09-23 4:52 AM CHEST (PORTABLE AP) ...,[**2023-09-23**] 4:52 AM CHEST (PORTABLE AP) ...,[**2023-09-23**] 4:52 AM CHEST (PORTABLE AP) ...,0.571429,0.800000,0.666667
...,...,...,...,...,...,...,...
95,[**2161-10-20**] 10:16 AM\n RENAL U.S. PORT ...,2023-09-15 10:16 AM\n RENAL U.S. PORT ...,[**2023-09-15**] 10:16 AM\n RENAL U.S. PORT ...,[**2023-09-15**] 10:16 AM\n RENAL U.S. PORT ...,0.600000,1.000000,0.750000
96,[**2161-5-2**] 1:35 PM\n CHEST (PA & LAT) ...,5/2/2023 1:35 PM\n CHEST (PA & LAT) ...,[**5/2/2023**] 1:35 PM\n CHEST (PA & LAT) ...,[**5/2/2023**] [**1:35 PM**]\n CHEST (PA & LAT...,0.714286,1.000000,0.833333
97,[**2171-3-31**] 11:40 AM\n CHEST (PA & LAT) ...,"March 31, 2023 11:40 AM\nCHEST (PA & LAT) ...","[**March 31, 2023**] 11:40 AM\nCHEST (PA & LAT...","[**March 31, 2023**] [**11:40 AM**]\nCHEST (PA...",0.714286,1.000000,0.833333
98,[**2166-12-4**] 3:00 PM\n CHEST (PA & LAT) ...,"December 4, 2166 3:00 PM\n CHEST (PA & LAT) ...","December 4, 2166 3:00 PM\n CHEST (PA & LAT) ...","[**December 4, 2166 3:00 PM**]\n CHEST (PA & L...",0.500000,1.000000,0.666667


In [7]:
# Write the pilot results to CSV; index=False leaves the row index out of the file.
df.to_csv('20240705_pilot_study.csv',index=False)

In [9]:
# Mean of the per-note precision scores.
df['precision'].mean()

0.6576333943833943

In [10]:
# Mean of the per-note recall scores.
df['recall'].mean()

0.829822816072816

In [12]:
# Mean of the per-note F1 scores.
df['f1'].mean()

0.7178683975815248

In [13]:
# Ground-truth (marked) text for the row labelled 2.
print(df.loc[2, 'GPT_substituted_marked'])

 [**2023-5-28**] 1:19 PM
 CAROT/CEREB [**General Hospital**]                                                  Clip # [**1452982**]
 Reason: Eval for aneurysm and other vascular anomalies
 Admitting Diagnosis: SUBARACHNOID HEMORRHAGE
  Contrast: OPTIRAY Amt: 248
 ********************************* CPT Codes ********************************
 * [**285**] SEL CATH 3RD ORDER [**Smith**]         [**285**] SEL CATH 3RD ORDER [**Smith**]          *
 * -59 DISTINCT PROCEDURAL SERVICE       [**287**] SEL CATH 2ND ORDER               *
 * -59 DISTINCT PROCEDURAL SERVICE       [**287**] SEL CATH 2ND ORDER               *
 * -59 DISTINCT PROCEDURAL SERVICE       [**820**] ADD'L 2ND/3RD ORDER              *
 * -59 DISTINCT PROCEDURAL SERVICE       [**985**] EXT CAROTID BILAT                *
 * [**304**] CAROTID/CEREBRAL UNILAT         [**289**] VERT/CAROTID A-GRAM              *
 * [**2358**] EXT UNILAT A-GRAM               [**44**] MOD SEDATION, FIRST 30 MIN.      *
 * [**45**] MOD SEDATION, EACH A

In [14]:
# GPT's re-anonymized text for the row labelled 2, for comparison with its ground truth.
print(df.loc[2, 'Prediction'])

[**2023-5-28 1:19 PM**]
 CAROT/CEREB General Hospital                                                  Clip # [**1452982**]
 Reason: Eval for aneurysm and other vascular anomalies
 Admitting Diagnosis: SUBARACHNOID HEMORRHAGE
  Contrast: OPTIRAY Amt: 248
 ********************************* CPT Codes ********************************
 * 285 SEL CATH 3RD ORDER [**Smith**]         285 SEL CATH 3RD ORDER [**Smith**]          *
 * -59 DISTINCT PROCEDURAL SERVICE       287 SEL CATH 2ND ORDER               *
 * -59 DISTINCT PROCEDURAL SERVICE       287 SEL CATH 2ND ORDER               *
 * -59 DISTINCT PROCEDURAL SERVICE       820 ADD'L 2ND/3RD ORDER              *
 * -59 DISTINCT PROCEDURAL SERVICE       985 EXT CAROTID BILAT                *
 * 304 CAROTID/CEREBRAL UNILAT         289 VERT/CAROTID A-GRAM              *
 * 2358 EXT UNILAT A-GRAM               44 MOD SEDATION, FIRST 30 MIN.      *
 * 45 MOD SEDATION, EACH ADDL 15 MIN                                         *
 *******************