In [56]:
import json
import yaml
import time
import pandas as pd
import numpy as np
import datetime
import os
import sys
import glob
from google.cloud import bigquery
from google.oauth2 import service_account
from openai import OpenAI
from dateutil import parser
import json
import re
from collections import Counter
from tqdm import tqdm
import pymssql
from threading import Thread
import functools

# GPT API key
with open('openai_apikey.txt', 'r') as file:
    apikey = file.read()
os.environ["OPENAI_API_KEY"] = apikey

def calc_metrics(ground_truth, predictions):
    ground_truth_counter = Counter(ground_truth)
    predictions_counter = Counter(predictions)

    true_positives = sum((ground_truth_counter & predictions_counter).values())
    false_positives = sum((predictions_counter - ground_truth_counter).values())
    false_negatives = sum((ground_truth_counter - predictions_counter).values())

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f1

def evaluate(masked, generated):
    """ 
    Input: 
        - masked (str): Ground_truth text
        - generated(str): Text to be evaluated

    Output:
        - Precision, Recall and F1 (float)
    """
    ground_truth = re.findall(r'\[\*\*(.*?)\*\*\]', masked)
    predictions = re.findall(r'\[\*\*(.*?)\*\*\]', generated)
    #print(ground_truth)
    #print(predictions)
    
    return calc_metrics(ground_truth, predictions)

def levenshtein_distance(s1, s2, show_progress=True):
    """
    Calcula la distancia de Levenshtein entre dos cadenas.

    La distancia de Levenshtein es el número mínimo de operaciones de edición 
    (inserción, eliminación o sustitución de un carácter) necesarias para 
    transformar una cadena en otra.

    Parámetros:
        s1 (str): Primera cadena
        s2 (str): Segunda cadena
        show_progress (bool): Si es True, muestra una barra de progreso. 
                              Por defecto es False.
    Retorna:
        int: La distancia de Levenshtein entre s1 y s2
    """
    # Usar tqdm solo si show_progress es True
    iterable = tqdm(s1) if show_progress else s1

    if len(s1) < len(s2):
        s1, s2 = s2, s1
    if len(s2) == 0:
        return len(s1)

    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(iterable):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]

CARMEN_base_path='CARMEN-I_v1.01b 2/CARMEN-I_v1.01b/txt/'

txtlist=os.listdir(CARMEN_base_path+'masked/')

# This is for connecting to the MSSQL server via pymssql. Adjust it according to your configuration.
conn = pymssql.connect(host=r"(local)", database='Ddrive5', charset='utf8')

# This is the table name that you will create in the MSSQL server to save the GPT responses.
# Adjust it according to your preferences.
newtablename1='20240712_test_Replaced_enclosed'
newtablename2='20240712_test_Prediction'

In [65]:
system_content1='''Identify all masked patterns enclosed within [** and **] in the masked data. 
then find the corresponding parts in the original data, and enclose these parts with [** and **]
Do not omit any part of the original data when giving the results.
Return only the results and do not comment anything else.'''

system_content2='''You are an anonimization tool in identifying attributes in texts that can identify or quasi-identify a user.
Return only the original text with the identification and labeling of the patient's personal information by adding it between [** and **].
For example, names such as John Doe should be given as [**John Doe**].
Dates such as 2024/07/01 should be given as [**2024/07/01**].
Do not comment anything else.
'''

In [66]:
df=pd.DataFrame(columns=['Replaced','Masked','Replaced_enclosed','Prediction'])
#for i in range(len(txtlist)):
for i in range(3): 
    with open(CARMEN_base_path+'replaced/'+txtlist[i], 'r', encoding='utf-8') as file:
        Replaced = file.read()
    with open(CARMEN_base_path+'masked/'+txtlist[i], 'r', encoding='utf-8') as file:
        Masked = file.read()

    combined_text='Masked data:\n'
    combined_text+=Masked
    combined_text+='\n'
    combined_text+='Original data:\n'
    combined_text+=Replaced
    
    client = OpenAI()
    response = client.chat.completions.create(
        model='gpt-4o',
        n=1,
        temperature=1.0,
        messages=[
            {"role": "system", "content": system_content1},
            {"role": "user", "content": [
                {"type": "text", "text": combined_text},
            ]}
        ],
    )
    
    Replaced_enclosed=response.choices[0].message.content
    
    client = OpenAI()
    response = client.chat.completions.create(
        model='gpt-4o',
        n=1,
        temperature=1.0,
        messages=[
            {"role": "system", "content": system_content2},
            {"role": "user", "content": [
                {"type": "text", "text": Replaced},
            ]}
        ],
    )

    Prediction=response.choices[0].message.content
    
    templist=[]
    templist.append(Replaced) 
    templist.append(Masked) 
    templist.append(Replaced_enclosed) 
    templist.append(Prediction)
    df.loc[len(df)]=templist
    print(i)

0
1
2


In [67]:
precision=[]
recall=[]
f1=[]
for i in range(len(df)):
    cal_met = evaluate(df['Replaced_enclosed'][i], df['Prediction'][i])
    precision.append(cal_met[0])
    recall.append(cal_met[1])
    f1.append(cal_met[2])

df['precision']=precision
df['recall']=recall
df['f1']=f1

In [68]:
df

Unnamed: 0,Replaced,Masked,Replaced_enclosed,Prediction,precision,recall,f1
0,Primera llamada HDOM COVID-19 POSITIVO\n.\nTra...,Primera llamada HDOM COVID-19 POSITIVO\n.\nTra...,Primera llamada HDOM COVID-19 POSITIVO\n.\nTra...,Primera llamada HDOM COVID-19 POSITIVO\n.\nTra...,1.0,0.75,0.857143
1,Realizo llamada telefónica. Refiere buen desca...,Realizo llamada telefónica. Refiere buen desca...,Realizo llamada telefónica. Refiere buen desca...,Realizo llamada telefónica. Refiere buen desca...,0.9375,0.75,0.833333
2,Visita unidad del dolor. Ver informe de evoluc...,Visita unidad del dolor. Ver informe de evoluc...,Visita unidad del dolor. Ver informe de evoluc...,Visita unidad del dolor. Ver informe de evoluc...,1.0,0.666667,0.8


# speeding up

In [97]:
for newtablename in [newtablename1, newtablename2]:
    sql_createtable="CREATE TABLE [" + newtablename +"""] 
    (
        txtname    NVARCHAR(max),
        txt   NVARCHAR(max),
    )

    """
    conn = pymssql.connect(host=r"(local)", database='Ddrive5', charset='utf8')
    with conn:
        with conn.cursor() as cur:
            cur.execute(sql_createtable)
            conn.commit()

In [102]:
def timeout(timeout):
    def deco(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            res = [Exception('function [%s] timeout [%s seconds] exceeded!' % (func.__name__, timeout))]
            def newFunc():
                try:
                    res[0] = func(*args, **kwargs)
                except Exception as e:
                    res[0] = e
            t = Thread(target=newFunc)
            t.daemon = True
            try:
                t.start()
                t.join(timeout)
            except Exception as je:
                print ('error starting thread')
                raise je
            ret = res[0]
            #if isinstance(ret, BaseException):
            #    raise ret
            return ret
        return wrapper
    return deco

@timeout(0.1)
def SQL_Replaced_enclosed(i, system_content, model_ver, numbers, temperature_setting):

    with open(CARMEN_base_path+'replaced/'+txtlist[i], 'r', encoding='utf-8') as file:
        Replaced = file.read()
    with open(CARMEN_base_path+'masked/'+txtlist[i], 'r', encoding='utf-8') as file:
        Masked = file.read()

    combined_text='Masked data:\n'
    combined_text+=Masked
    combined_text+='\n'
    combined_text+='Original data:\n'
    combined_text+=Replaced
    
    client = OpenAI()
    response = client.chat.completions.create(
        model=model_ver,
        n=numbers,
        temperature=temperature_setting,
        messages=[
            {"role": "system", "content": system_content},
            {"role": "user", "content": [
                {"type": "text", "text": combined_text},
            ]}
        ],
    )
    
    Replaced_enclosed=response.choices[0].message.content

    conn = pymssql.connect(host=r"(local)", database='Ddrive5', charset='utf8')
    sql = "INSERT INTO [" + newtablename1+"] (txtname, txt) VALUES (%s, %s)"

    with conn:
        with conn.cursor() as cur:
            cur.execute(sql, (txtlist[i], Replaced_enclosed))
            conn.commit()
            
@timeout(0.1)
def SQL_Prediction(Replaced_enclosed, txtname, system_content, model_ver, numbers, temperature_setting):
    
    client = OpenAI()
    response = client.chat.completions.create(
        model=model_ver,
        n=numbers,
        temperature=temperature_setting,
        messages=[
            {"role": "system", "content": system_content},
            {"role": "user", "content": [
                {"type": "text", "text": Replaced_enclosed},
            ]}
        ],
    )
    
    Prediction=response.choices[0].message.content

    conn = pymssql.connect(host=r"(local)", database='Ddrive5', charset='utf8')
    sql = "INSERT INTO [" + newtablename2+"] (txtname, txt) VALUES (%s, %s)"

    with conn:
        with conn.cursor() as cur:
            cur.execute(sql, (txtname, Prediction))
            conn.commit()


In [99]:
for i in range(500):
    SQL_Replaced_enclosed(i, system_content1, 'gpt-4o', 1, 1.0)

In [100]:
conn = pymssql.connect(host=r"(local)", database='Ddrive5', charset='utf8')
sql_statement="select * from ["+ newtablename1 + "]"
df_SQL_Replaced_enclosed = pd.read_sql(sql=sql_statement, con=conn)

  df_SQL_Replaced_enclosed = pd.read_sql(sql=sql_statement, con=conn)


In [103]:
for i in range(len(df_SQL_Replaced_enclosed)):
    Replaced_enclosed=df_SQL_Replaced_enclosed['txt'][i]
    txtname=df_SQL_Replaced_enclosed['txtname'][i]
    
    SQL_Prediction(Replaced_enclosed, txtname, system_content2, 'gpt-4o', 1, 1.0)

In [104]:
conn = pymssql.connect(host=r"(local)", database='Ddrive5', charset='utf8')
sql_statement="select * from ["+ newtablename2 + "]"
df_SQL_Prediction = pd.read_sql(sql=sql_statement, con=conn)

  df_SQL_Prediction = pd.read_sql(sql=sql_statement, con=conn)


In [116]:
df=pd.merge(df_SQL_Prediction,df_SQL_Replaced_enclosed,left_on='txtname',right_on='txtname',how='inner')
df.columns=['txtname','Prediction','Replaced_enclosed']

In [118]:
Replaced_list=[]
Masked_list=[]
for i in range(len(df)):
    txtname=df['txtname'][i]
    with open(CARMEN_base_path+'replaced/'+txtname, 'r', encoding='utf-8') as file:
        Replaced = file.read()
    with open(CARMEN_base_path+'masked/'+txtname, 'r', encoding='utf-8') as file:
        Masked = file.read()
    Replaced_list.append(Replaced)
    Masked_list.append(Masked)
df['Masked']=Masked_list
df['Replaced']=Replaced_list

In [122]:
precision=[]
recall=[]
f1=[]
for i in range(len(df)):
    cal_met = evaluate(df['Replaced_enclosed'][i], df['Prediction'][i])
    precision.append(cal_met[0])
    recall.append(cal_met[1])
    f1.append(cal_met[2])

df['precision']=precision
df['recall']=recall
df['f1']=f1

In [123]:
df

Unnamed: 0,txtname,Prediction,Replaced_enclosed,Masked,Replaced,precision,recall,f1
0,CARMEN-I_IA_ANTECEDENTES_139.txt,Antecedentes personales: niega\nAlergias: nieg...,Antecedentes personales: niega\nAlergias: nieg...,\nAntecedentes personales: niega\nAlergias: ni...,\nAntecedentes personales: niega\nAlergias: ni...,0.000000,0.000000,0.000000
1,CARMEN-I_IA_ANTECEDENTES_39.txt,Sin antecedentes médicos relevantes.,Sin antecedentes médicos relevantes.,\nSin antecedentes médicos relevantes.\n \n,\nSin antecedentes médicos relevantes.\n \n,0.000000,0.000000,0.000000
2,CARMEN-I_IA_ANTECEDENTES_108.txt,"Paciente de [**37 años**], Alergia a Quinolona...","Paciente de [**37 años**], Alergia a Quinolona...","\nPaciente de [**EDAD_SUJETO_ASISTENCIA**], Al...","\nPaciente de 37 años, Alergia a Quinolonas y ...",1.000000,1.000000,1.000000
3,CARMEN-I_IA_ANTECEDENTES_119.txt,Hombre de [**51 años**] originario de [**Emira...,Hombre de [**51 años**] originario de [**Emira...,\n[**SEXO_SUJETO_ASISTENCIA**] de [**EDAD_SUJE...,\nHombre de 51 años originario de Emiratos Ára...,1.000000,1.000000,1.000000
4,CARMEN-I_IA_ANTECEDENTES_14.txt,ANTECEDENTES PERSONALES\n-Natural de [**Gambia...,ANTECEDENTES PERSONALES\n-Natural de [**Gambia...,ANTECEDENTES PERSONALES\n-Natural de [**PAIS**...,"ANTECEDENTES PERSONALES\n-Natural de Gambia, N...",1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...
495,CARMEN-I_IA_PROCESO_ACTUAL_14.txt,Consulta por clínica que inicia el [**13 de ag...,Consulta por clínica que inicia el [**13 de ag...,\nConsulta por clínica que inicia el [**FECHAS...,\nConsulta por clínica que inicia el 13 de ago...,1.000000,1.000000,1.000000
496,CARMEN-I_IA_EXPLORACION_COMPLEMENTARIA_21.txt,* MICROBIOLOGÍA\n- Serologías ([**30/10**]): V...,* MICROBIOLOGÍA\n- Serologías ([**30/10**]): V...,\n* MICROBIOLOGÍA\n- Serologías ([**FECHAS**])...,"\n* MICROBIOLOGÍA\n- Serologías (30/10): VHB, ...",1.000000,1.000000,1.000000
497,CARMEN-I_IA_EXPLORACION_COMPLEMENTARIA_36.txt,[**20/08/2017**].\n\n* TC TORACOABDOMINAL [**1...,[**20/08/2017**].\n* TC TORACOABDOMINAL [**17/...,\n[**FECHAS**].\n* TC TORACOABDOMINAL [**FECHA...,\n20/08/2017.\n* TC TORACOABDOMINAL 17/08/20:\...,1.000000,1.000000,1.000000
498,CARMEN-I_IA_PROCESO_ACTUAL_23.txt,"Varón de 63 años exfumador, con antecedentes d...","Varón de 63 años exfumador, con antecedentes d...",\n[**SEXO_SUJETO_ASISTENCIA**] de [**EDAD_SUJE...,"\nVarón de 63 años exfumador, con antecedentes...",0.909091,0.909091,0.909091


In [129]:
df.to_csv('20240712_CARMEN_500.csv',index=False)

In [130]:
df=pd.read_csv('20240712_CARMEN_500.csv',encoding='utf8')

In [131]:
df

Unnamed: 0,txtname,Prediction,Replaced_enclosed,Masked,Replaced,precision,recall,f1
0,CARMEN-I_IA_ANTECEDENTES_139.txt,Antecedentes personales: niega\nAlergias: nieg...,Antecedentes personales: niega\nAlergias: nieg...,\nAntecedentes personales: niega\nAlergias: ni...,\nAntecedentes personales: niega\nAlergias: ni...,0.000000,0.000000,0.000000
1,CARMEN-I_IA_ANTECEDENTES_39.txt,Sin antecedentes médicos relevantes.,Sin antecedentes médicos relevantes.,\nSin antecedentes médicos relevantes.\n \n,\nSin antecedentes médicos relevantes.\n \n,0.000000,0.000000,0.000000
2,CARMEN-I_IA_ANTECEDENTES_108.txt,"Paciente de [**37 años**], Alergia a Quinolona...","Paciente de [**37 años**], Alergia a Quinolona...","\nPaciente de [**EDAD_SUJETO_ASISTENCIA**], Al...","\nPaciente de 37 años, Alergia a Quinolonas y ...",1.000000,1.000000,1.000000
3,CARMEN-I_IA_ANTECEDENTES_119.txt,Hombre de [**51 años**] originario de [**Emira...,Hombre de [**51 años**] originario de [**Emira...,\n[**SEXO_SUJETO_ASISTENCIA**] de [**EDAD_SUJE...,\nHombre de 51 años originario de Emiratos Ára...,1.000000,1.000000,1.000000
4,CARMEN-I_IA_ANTECEDENTES_14.txt,ANTECEDENTES PERSONALES\n-Natural de [**Gambia...,ANTECEDENTES PERSONALES\n-Natural de [**Gambia...,ANTECEDENTES PERSONALES\n-Natural de [**PAIS**...,"ANTECEDENTES PERSONALES\n-Natural de Gambia, N...",1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...
495,CARMEN-I_IA_PROCESO_ACTUAL_14.txt,Consulta por clínica que inicia el [**13 de ag...,Consulta por clínica que inicia el [**13 de ag...,\nConsulta por clínica que inicia el [**FECHAS...,\nConsulta por clínica que inicia el 13 de ago...,1.000000,1.000000,1.000000
496,CARMEN-I_IA_EXPLORACION_COMPLEMENTARIA_21.txt,* MICROBIOLOGÍA\n- Serologías ([**30/10**]): V...,* MICROBIOLOGÍA\n- Serologías ([**30/10**]): V...,\n* MICROBIOLOGÍA\n- Serologías ([**FECHAS**])...,"\n* MICROBIOLOGÍA\n- Serologías (30/10): VHB, ...",1.000000,1.000000,1.000000
497,CARMEN-I_IA_EXPLORACION_COMPLEMENTARIA_36.txt,[**20/08/2017**].\n\n* TC TORACOABDOMINAL [**1...,[**20/08/2017**].\n* TC TORACOABDOMINAL [**17/...,\n[**FECHAS**].\n* TC TORACOABDOMINAL [**FECHA...,\n20/08/2017.\n* TC TORACOABDOMINAL 17/08/20:\...,1.000000,1.000000,1.000000
498,CARMEN-I_IA_PROCESO_ACTUAL_23.txt,"Varón de 63 años exfumador, con antecedentes d...","Varón de 63 años exfumador, con antecedentes d...",\n[**SEXO_SUJETO_ASISTENCIA**] de [**EDAD_SUJE...,"\nVarón de 63 años exfumador, con antecedentes...",0.909091,0.909091,0.909091


In [127]:
print(df['Replaced'][1])


Sin antecedentes médicos relevantes.
 



In [128]:
print(df['Masked'][1])


Sin antecedentes médicos relevantes.
 



In [125]:
print(df['Prediction'][0])

Antecedentes personales: niega
Alergias: niega
No toma tratamiento de forma habitual.
