In [78]:
from langchain_openai import ChatOpenAI
import json
import yaml
import time
import pandas as pd
import numpy as np
import datetime
import os
import sys
import glob
from google.cloud import bigquery
from google.oauth2 import service_account
from openai import OpenAI
from dateutil import parser
import json
import re
from collections import Counter
from tqdm import tqdm
import pymssql
from threading import Thread
import functools
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import multiprocessing as mp
from termcolor import colored

# Root folder of the processed corpus; the cells below read its 'masked/' and 'txt/' sub-folders.
#ARMEN_base_path='CARMEN-I_v1.01b 2/CARMEN-I_v1.01b/txt/'
CARMEN_base_path='data/data/processed/'

# Folder holding one annotation CSV per document.
ann_path='data/data/processed/ann/'

# File names found in the 'masked/' folder; the same names are reused to
# locate the matching raw text and annotation files.
txtlist=os.listdir(CARMEN_base_path+'masked/')

# This is for connecting to the MSSQL server via pymssql. Adjust it according to your configuration.
conn = pymssql.connect(host=r"(local)", database='Ddrive5', charset='utf8')


# Collect the distinct annotation categories over every annotation file.
# The set is turned into a list once, after the loop, instead of on every pass.
category_set=set()
for txt_filename in txtlist:
    # NOTE(review): replace() swaps every 'txt' substring of the file name,
    # not only the extension - confirm no document name contains 'txt'.
    tempdf=pd.read_csv(ann_path+txt_filename.replace('txt','csv'), header=None)
    tempdf.columns=['category','position1','position2','phrase']
    category_set.update(tempdf['category'])
categorylist=list(category_set)

# This is the table name that you will create in the MSSQL server to save the GPT responses.
# Adjust it according to your preferences.
newtablename='20240809_test_Prediction2'



# Strip surrounding whitespace: a key file saved with a trailing newline would
# otherwise put that newline into the API key.
with open('openai_apikey2.txt', 'r') as file:
    apikey = file.read().strip()
os.environ["OPENAI_API_KEY"] = apikey

def calc_metrics(ground_truth, predictions):
    """Compute multiset precision, recall and F1 between two label sequences.

    Items are compared by value with multiplicity: an item that occurs twice
    in ``ground_truth`` but once in ``predictions`` counts as one true
    positive and one false negative.

    Returns:
        Tuple ``(precision, recall, f1)``; each value is 0 when its
        denominator is zero.
    """
    truth_counts = Counter(ground_truth)
    predicted_counts = Counter(predictions)

    hits = sum((truth_counts & predicted_counts).values())
    spurious = sum((predicted_counts - truth_counts).values())
    missed = sum((truth_counts - predicted_counts).values())

    predicted_total = hits + spurious
    expected_total = hits + missed

    if predicted_total > 0:
        precision = hits / predicted_total
    else:
        precision = 0

    if expected_total > 0:
        recall = hits / expected_total
    else:
        recall = 0

    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0

    return precision, recall, f1

def evaluate(masked, generated):
    """Score the tagged phrases of ``generated`` against those of ``masked``.

    Input:
        - masked (str): Ground-truth text with phrases wrapped in [** **]
        - generated (str): Text to be evaluated, tagged the same way

    Output:
        - Precision, Recall and F1 (float), as returned by calc_metrics
    """
    tag_pattern = re.compile(r'\[\*\*(.*?)\*\*\]')
    expected_phrases = tag_pattern.findall(masked)
    predicted_phrases = tag_pattern.findall(generated)
    return calc_metrics(expected_phrases, predicted_phrases)

def levenshtein_distance(s1, s2, show_progress=True):
    """
    Compute the Levenshtein distance between two strings.

    The Levenshtein distance is the minimum number of edit operations
    (insertion, deletion or substitution of one character) needed to turn
    one string into the other.

    Parameters:
        s1 (str): First string
        s2 (str): Second string
        show_progress (bool): If True, wrap the outer loop in a tqdm
                              progress bar. Defaults to True.
    Returns:
        int: The Levenshtein distance between s1 and s2
    """
    # Make s1 the longer string so each row has len(s2) + 1 entries.
    if len(s1) < len(s2):
        s1, s2 = s2, s1
    if len(s2) == 0:
        return len(s1)

    # Bind the iterable only AFTER the swap. Binding it before made the outer
    # loop walk the original (shorter) s1 while the inner loop walked that
    # same string, so e.g. ("ab", "abc") was scored as 0.
    iterable = tqdm(s1) if show_progress else s1

    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(iterable):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]

def get_cos_sim(text_hoped, text_generated):
    """Return the TF-IDF cosine similarity between two texts.

    Both texts are vectorised together with a token pattern that keeps
    '-' and '/' inside tokens. Returns 0.0 when the similarity cannot be
    computed.
    """
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w[\w\-/]*\b")

    try:
        # fit_transform is inside the try because it is the call that raises
        # ValueError when neither text yields a token (empty vocabulary).
        tfidf_matrix = vectorizer.fit_transform([text_hoped, text_generated])
        cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    except ValueError:
        return 0.0
    return cosine_sim[0][0]

def replace_special_characters(text):
    """Return ``text`` with every listed punctuation character replaced by a space.

    The character class includes '.', '/' and '-', so 'dic/05' becomes
    'dic 05'. Each matched character is replaced one-for-one, so the length
    of the string does not change.
    """
    special_chars = re.compile(r'[!@#$%^&*()_+={}\[\]:;"\'<>,?\\|`~./-]')
    return special_chars.sub(' ', text)



def find_indices(text, word):
    """Return the start offsets of every non-overlapping literal match of word in text."""
    literal = re.compile(re.escape(word))
    starts = []
    for hit in literal.finditer(text):
        starts.append(hit.start())
    return starts


def replace_nth_occurrence(text, word, n):
    """Wrap the n-th (0-based) occurrence of word in text as [**word**].

    The text is returned unchanged when it has n or fewer occurrences.
    """
    positions = find_indices(text, word)
    if n >= len(positions):
        return text
    begin = positions[n]
    finish = begin + len(word)
    return text[:begin] + '[**' + word + '**]' + text[finish:]


def sort_by_length_descending(str_list):
    """Return a new list of the strings ordered from longest to shortest.

    The sort is stable, so strings of equal length keep their input order.
    """
    ordered = list(str_list)
    ordered.sort(key=len, reverse=True)
    return ordered

def process_text(text):
    """Flatten nested [** ... **] tags, keeping only the outermost pair.

    An opening marker is emitted only when no tag is currently open, and a
    closing marker only when it brings the nesting depth back to zero (or
    when it appears with no tag open at all). Everything else is copied
    through unchanged.
    """
    pieces = []
    depth = 0  # number of currently open '[**' markers
    pos = 0
    total = len(text)

    while pos < total:
        token = text[pos:pos + 3]
        if token == '[**':
            if depth == 0:
                pieces.append('[**')
            depth += 1
            pos += 3
        elif token == '**]':
            if depth > 0:
                depth -= 1
            if depth == 0:
                pieces.append('**]')
            pos += 3
        else:
            pieces.append(text[pos])
            pos += 1

    return ''.join(pieces)

def prediction_process(Prediction, Replaced):
    """Copy the [** **] tags of the model output onto the original text.

    For every distinct phrase tagged in ``Prediction`` (longest first), the
    same number of occurrences is tagged in ``Replaced``, counting from its
    first occurrence. Nested tags produced along the way are flattened with
    process_text before returning.
    """
    # Distinct tagged phrases, longest first so longer phrases are wrapped
    # before the shorter phrases they may contain.
    distinct_phrases = list(set(re.findall(r'\[\*\*(.*?)\*\*\]', Prediction)))
    ordered_phrases = sort_by_length_descending(distinct_phrases)

    tagged_text = Replaced
    for phrase in ordered_phrases:
        # How many times the model tagged this exact phrase.
        tagged_positions = find_indices(Prediction, '[**' + phrase + '**]')

        # Tag occurrence 0, 1, 2, ... of the phrase in the original text.
        for occurrence in range(len(tagged_positions)):
            tagged_text = replace_nth_occurrence(tagged_text, phrase, occurrence)

    return process_text(tagged_text)

'''
def partial_score(X, Y):
    if X in Y or Y in X:
        x_words = X.split()
        y_words = Y.split()

        matches = sum(1 for word in y_words if word in x_words)

        return matches / len(x_words) if len(x_words) > 0 else 0
    else:
        return 0
'''

def partial_score(X, Y):
    """Return the share of X's words that also appear in Y.

    Scores 0 unless one string is a substring of the other. Otherwise the
    whitespace-separated words are matched with multiplicity and the number
    of shared words is divided by the number of words in X.
    """
    if not (X in Y or Y in X):
        return 0

    x_words = X.split()
    if len(x_words) == 0:
        return 0

    # Multiset intersection keeps, per word, the smaller of the two counts.
    shared = Counter(x_words) & Counter(Y.split())
    return sum(shared.values()) / len(x_words)


def _marker_spans(text):
    """Map (start, end) offsets in the marker-free text to each tagged phrase.

    The phrase is stored with special characters replaced by spaces.
    """
    spans = {}
    for cnt, match in enumerate(re.finditer(r'\[\*\*(.*?)\*\*\]', text)):
        # Each earlier tag contributed 6 marker characters and the current
        # opening marker 3 more; removing them gives offsets into the text
        # as it reads without any markers.
        shift = (cnt * 2 + 1) * 3
        spans[(match.start(1) - shift, match.end(1) - shift)] = replace_special_characters(match.group(1))
    return spans


def evaluate2(masked,generated,replaced,ann_df):
    """Position-aware, partial-credit precision/recall/F1 for one document.

    Parameters:
        masked (str): Ground-truth text with phrases wrapped in [** **].
        generated (str): Predicted text, tagged the same way.
        replaced (str): Untagged text; used only to print the document with
            the false-positive spans coloured blue.
        ann_df (DataFrame): Annotation rows with a 'category' column.
            Assumes row k describes the k-th tag of ``masked`` - TODO confirm.

    Returns:
        (precision, recall, f1, FN_category_df), where FN_category_df holds,
        per category of the module-level ``categorylist``, the accumulated
        false-negative score and the number of ground-truth tags.
        Precision (recall) is 0 when the prediction (ground truth) has no tags.
    """
    ground_truth_positions = _marker_spans(masked)
    predictions_positions = _marker_spans(generated)

    totalwordcnt_ground_truth = len(ground_truth_positions)
    score_total=0
    cnt=0
    FN_category_df=pd.DataFrame(columns=['category','FN_count','target_count'])
    FN_category_df['category']=categorylist
    # Float from the start: fractional false-negative scores are added below.
    FN_category_df['FN_count'] = 0.0
    FN_category_df['target_count'] = 0

    # ---- Recall: how much of every ground-truth span the predictions cover.
    for pos_g in ground_truth_positions:
        target_cat = ann_df['category'][cnt]
        FN_category_df.loc[FN_category_df['category'] == target_cat, 'target_count'] += 1
        score_temp2=0

        for pos_p in predictions_positions:
            if (pos_p[0]<=pos_g[0] and pos_p[1]>=pos_g[1]) or (pos_p[0]>=pos_g[0] and pos_p[1]<=pos_g[1]):
                # One span contains the other.
                score_temp = partial_score(ground_truth_positions[pos_g],predictions_positions[pos_p])
                score_total += score_temp
                score_temp2 += score_temp
            elif (pos_p[0]<pos_g[0] and pos_p[1]>pos_g[0] and pos_p[1]<pos_g[1]):
                # Prediction starts before the ground truth and ends inside it:
                # drop the part of the prediction that precedes the ground truth.
                ground_truth_phrase_temp=ground_truth_positions[pos_g]
                prediction_phrase_temp=predictions_positions[pos_p]
                prediction_phrase_temp=prediction_phrase_temp[pos_g[0]-pos_p[0]:]

                score_temp = partial_score(ground_truth_phrase_temp,prediction_phrase_temp)
                score_total += score_temp
                score_temp2 += score_temp

            elif (pos_p[0]>pos_g[0] and pos_p[0]<pos_g[1] and pos_p[1]>pos_g[1]):
                # Prediction starts inside the ground truth and runs past it:
                # drop the part of the prediction that follows the ground truth.
                ground_truth_phrase_temp=ground_truth_positions[pos_g]
                prediction_phrase_temp=predictions_positions[pos_p]
                prediction_phrase_temp=prediction_phrase_temp[:-(pos_p[1]-pos_g[1])]

                score_temp = partial_score(ground_truth_phrase_temp,prediction_phrase_temp)
                score_total += score_temp
                score_temp2 += score_temp

        if score_temp2 < 1:
            # The uncovered remainder counts as a false negative for this category.
            FN_score = 1-score_temp2
            FN_cat = ann_df['category'][cnt]
            FN_category_df.loc[FN_category_df['category'] == FN_cat, 'FN_count'] += FN_score
        cnt+=1

    # Guard the division: a document without ground-truth tags used to raise
    # ZeroDivisionError here.
    recall = score_total/totalwordcnt_ground_truth if totalwordcnt_ground_truth > 0 else 0

    # ---- Precision: how much of every predicted span is backed by ground truth.
    totalwordcnt_predictions = len(predictions_positions)
    score_total=0

    FP_dict={}
    for pos_p in predictions_positions:
        score_temp2=0
        for pos_g in ground_truth_positions:
            if (pos_g[0]<=pos_p[0] and pos_g[1]>=pos_p[1]) or (pos_g[0]>=pos_p[0] and pos_g[1]<=pos_p[1]):
                score_temp = partial_score(predictions_positions[pos_p],ground_truth_positions[pos_g])
                score_total += score_temp
                score_temp2 += score_temp

            elif (pos_p[0]<pos_g[0] and pos_p[1]>pos_g[0] and pos_p[1]<pos_g[1]):
                # Drop the part of the ground truth that follows the prediction.
                ground_truth_phrase_temp=ground_truth_positions[pos_g]
                prediction_phrase_temp=predictions_positions[pos_p]
                ground_truth_phrase_temp=ground_truth_phrase_temp[:-(pos_g[1]-pos_p[1])]

                score_temp = partial_score(ground_truth_phrase_temp,prediction_phrase_temp)
                score_total += score_temp
                score_temp2 += score_temp

            elif (pos_p[0]>pos_g[0] and pos_p[0]<pos_g[1] and pos_p[1]>pos_g[1]):
                # Drop the part of the ground truth that precedes the prediction.
                ground_truth_phrase_temp=ground_truth_positions[pos_g]
                prediction_phrase_temp=predictions_positions[pos_p]
                ground_truth_phrase_temp=ground_truth_phrase_temp[pos_p[0]-pos_g[0]:]

                score_temp = partial_score(ground_truth_phrase_temp,prediction_phrase_temp)
                score_total += score_temp
                score_temp2 += score_temp

        if score_temp2 < 1:
            # Not fully backed by ground truth: remember it as a false positive.
            FP_dict[pos_p]=predictions_positions[pos_p]

    # Colour the false positives in the untagged text, working from the end
    # of the text backwards so earlier offsets stay valid, then print it.
    for pos in sorted(FP_dict.keys(), key=lambda x: x[0], reverse=True):
        start, end = pos
        colored_text = colored(replaced[start:end], 'blue')
        replaced = replaced[:start] + colored_text + replaced[end:]
    print('======================')
    print(replaced)
    print('')
    print('')
    print('')
    print('======================')

    # Guard the division: a prediction without any tag used to raise
    # ZeroDivisionError here.
    precision = score_total/totalwordcnt_predictions if totalwordcnt_predictions > 0 else 0

    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1, FN_category_df


In [68]:
# Load every raw text and its masked (tagged) version, and keep only the
# documents whose masked text, once the markers are removed, is exactly as
# long as the raw text.
print('txtlist length',len(txtlist))


def _trim_one_newline(text):
    """Drop at most one leading and one trailing newline; safe on empty text."""
    if text.startswith('\n'):
        text = text[1:]
    if text.endswith('\n'):
        text = text[:-1]
    return text


rows=[]
for i, txt_filename in enumerate(txtlist):
    with open(CARMEN_base_path+'txt/'+txt_filename, 'r', encoding='utf-8') as file:
        Replaced = _trim_one_newline(file.read())
    with open(CARMEN_base_path+'masked/'+txt_filename, 'r', encoding='utf-8') as file:
        Masked = _trim_one_newline(file.read())
    if len(Masked.replace('**]','').replace('[**',''))!=len(Replaced):
        # Report the position of the document that is skipped.
        print(i)
    else:
        rows.append([txt_filename, Replaced, Masked])

# Build the frame once instead of growing it row by row.
df=pd.DataFrame(rows, columns=['txtname','Replaced','Masked'])
print('length',len(df))

txtlist length 1539
0
556
557
559
560
1045
1057
1163
1356
length 1530


In [69]:
# Create the table that stores the model responses: one row per document,
# with the file name and the returned text.
sql_createtable="CREATE TABLE [" + newtablename +"""] 
    (
        txtname    NVARCHAR(max),
        txt   NVARCHAR(max),
    )

    """
conn = pymssql.connect(host=r"(local)", database='Ddrive5', charset='utf8')
with conn, conn.cursor() as cur:
    cur.execute(sql_createtable)
    conn.commit()

In [70]:
def timeout(timeout):
    """Decorator factory: run the function in a daemon thread, wait ``timeout`` seconds.

    The wrapped call returns whatever is available once the wait is over:
    the function's return value, the exception it raised (returned, not
    re-raised), or a generic ``Exception`` describing the timeout when the
    thread has not finished yet. The thread is not stopped on timeout.
    """
    def deco(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Pre-filled with the timeout error; overwritten by the worker
            # only if it finishes (or fails) in time.
            outcome = [Exception('function [%s] timeout [%s seconds] exceeded!' % (func.__name__, timeout))]

            def run_in_background():
                try:
                    outcome[0] = func(*args, **kwargs)
                except Exception as error:
                    outcome[0] = error

            worker = Thread(target=run_in_background, daemon=True)
            try:
                worker.start()
                worker.join(timeout)
            except Exception as thread_error:
                print ('error starting thread')
                raise thread_error
            # The result is returned as is, even when it is an exception object.
            return outcome[0]
        return wrapper
    return deco

            
@timeout(0.1)
def SQL_Prediction(Replaced, txtname, system_content, model_ver, numbers, temperature_setting):
    """Ask the chat model to tag one document and store the answer in SQL.

    Parameters:
        Replaced: Text sent as the human message.
        txtname: Document name stored next to the answer.
        system_content: System prompt.
        model_ver: Model name passed to ChatOpenAI.
        numbers: Not used by this function.
        temperature_setting: Sampling temperature passed to ChatOpenAI.

    Because of the @timeout(0.1) decorator the caller gets control back after
    at most about 0.1 s, while the request and the insert carry on in a
    background thread.
    """
    chat_model = ChatOpenAI(
        model=model_ver,
        temperature=temperature_setting,
        max_tokens=None,
        timeout=None,
        max_retries=2,
    )

    conversation = [("system", system_content), ("human", Replaced)]
    Prediction = chat_model.invoke(conversation).content

    # Each call opens its own connection; values go in as query parameters.
    db_conn = pymssql.connect(host=r"(local)", database='Ddrive5', charset='utf8')
    insert_sql = "INSERT INTO [" + newtablename+"] (txtname, txt) VALUES (%s, %s)"

    with db_conn, db_conn.cursor() as cur:
        cur.execute(insert_sql, (txtname, Prediction))
        db_conn.commit()

# System prompt handed to SQL_Prediction by the cells below. It tells the model
# to return the input text unchanged except for wrapping identifying attributes
# in [** and **], lists the attribute types with examples, and lists what must
# not be tagged or corrected.
system_content2='''You are an anonymization tool in identifying attributes in texts that can identify or quasi-identify a user.
Return only the original text with the identification and labeling of the patient's personal information by adding it between [** and **].
Following are attributes that you must anonymize.

- Names
Example:
En seguimiento por Hematología Centro Médico Aspasia (Dra. Valvanera).   ->
En seguimiento por Hematología [**Centro Médico Aspasia**] (Dra. [**Valvanera**]).

- Ages
Example: 
Varón de 41 años.   ->
[**Varón**] de [**41 años**].

- Sexes
Example: 
Varón de 41 años.   ->
[**Varón**] de [**41 años**].

- Professions
Example: 
Trabaja como profesor.   ->
Trabaja como [**profesor**].

- Relatives
Example: 
Vive con suegro y 2 yernos.   ->
Vive con [**suegro**] y 2 [**yernos**].

- Dates
Example: 
ha estado viviendo en el Centro desde septiembre de 2008.   ->
ha estado viviendo en el [**Centro**] desde [**septiembre de 2008**].

Example:
Historial de tratamiento
25/12-26/12: abacavir/lamivudina + ritonavir + atazanavir.
26/12-28/02: emtricitabina/tenofovir + ritonavir + atazanavir.
->
Historial de tratamiento
[**25/12**]-[**26/12**]: abacavir/lamivudina + ritonavir + atazanavir.
[**26/12**]-[**28/02**]: emtricitabina/tenofovir + ritonavir + atazanavir.

Example:
TC de ago y abr/13. ->
TC de [**ago**] y [**abr/13**].


- Phone numbers
Example: 
contactando con el siguiente número de teléfono +50 88 078 68 49.   ->
contactando con el siguiente número de teléfono [**+50 88 078 68 49**].

- Identification numbers
Example:
El paciente otorga su consentimiento informado para participar en el estudio del protocolo WYX/8408/5545.   ->
El paciente otorga su consentimiento informado para participar en el estudio del protocolo [**WYX/8408/5545.**]

- Institutions, hospitals, health centers, etc
Example: 
En seguimiento por Hematología Centro Médico Aspasia (Dra. Valvanera).   ->
En seguimiento por Hematología [**Centro Médico Aspasia**] (Dra. [**Valvanera**]).
Example:
Control en Centro Salud Mental Reyes Católicos.   ->
Control en [**Centro Salud Mental Reyes Católicos**].

- Countries, territories, streets, etc
Example:
nacido en la República Italiana.   ->
nacido en la [**República Italiana**].
Example:
ha estado viviendo en el Centro desde septiembre de 2008.   ->
ha estado viviendo en el [**Centro**] desde [**septiembre de 2008**].
Example:
la dirección es Calle de Victor Hugo 39.   ->
la dirección es [**Calle de Victor Hugo 39**].

- Website URLs
participar a través del siguiente enlace: https://www.donarsang.gencat.cat/covid19.   ->
participar a través del siguiente enlace: [**https://www.donarsang.gencat.cat/covid19**].

- Other sensitive information such as races, ethnicities, sexual orientation, dietary preferences, etc
Example:
raça blanca   ->
[**raça blanca**]
Example:
Hsh
[**Hsh**]
Example:
Vegetarià
[**Vegetarià**]


Anonymize only the specific element:
For example, 
Dates (e.g., in "Fecha frotis SARS-Cov2: 12/11/2024," only anonymize "12/11/2024" as "[**12/11/2024**]", not the surrounding text).
Place names (e.g., in "Natural de la región de Samara (Rusia)," anonymize as "Natural de la región de [**Samara**] ([**Rusia**])," not the entire phrase).


Do not anonymize the following:
Common nouns (e.g., "pacient" (patient), "trucada" (call))
Medical and anatomical terms (e.g., "amigdalas" (tonsils), "epicóndilo" (epicondyle), "corazón" (heart), "fémur" (femur), "diabetes"))
General terms that are not unique identifiers (e.g., "doctor" unless followed by a name, "hospital" unless it is a specific named hospital)
Durations and time frames (e.g., "hace 30 años" when referring to a time period, not the age of a person)


Do not comment anything else.
Besides the anonymized attributes, provide the rest of the text exactly the same, including special characters and \n symbols.
Do not correct any typos or spacing errors at your discretion.
For example, if the time is written as 31/12/2000-0 9:20:00 with incorrect spacing, do not return it corrected as 31/12/2000-09:20:00.
Also, for example, if FLUTICASONA + AZELA STINA4 is written with incorrect spacing, do not return it corrected as FLUTICASONA + AZELASTINA 4.
Only focus on the anonymization tasks I have specified, and ignore any typos or spacing errors
'''

In [71]:
# Number of documents (the first rows of df) that are sent to the model.
sample_size=100

In [72]:
# Submit the first sample_size documents to the model, one call per document.
for i in range(sample_size):
    txtname = df.loc[i, 'txtname']
    Replaced = df.loc[i, 'Replaced']
    SQL_Prediction(
        Replaced,
        txtname,
        system_content2,
        'gpt-4o',
        1,
        1.0,
    )

In [79]:
# Read back every stored model response and report how many rows there are.
sql_statement = "select * from ["+ newtablename + "]"
conn = pymssql.connect(host=r"(local)", database='Ddrive5', charset='utf8')
df_SQL_Prediction = pd.read_sql(sql=sql_statement, con=conn)
stored_row_count = len(df_SQL_Prediction)
print(stored_row_count)

100


  df_SQL_Prediction = pd.read_sql(sql=sql_statement, con=conn)


In [80]:
# Re-submit the sampled documents that have no stored response yet, and keep
# doing so until every one of them is present in the table.
while True:
    stored_names = set(df_SQL_Prediction['txtname'])
    missing = [i for i in range(sample_size) if df['txtname'][i] not in stored_names]
    if not missing:
        break

    for i in missing:
        txtname=df['txtname'][i]
        Replaced=df['Replaced'][i]
        SQL_Prediction(Replaced, txtname, system_content2, 'gpt-4o', 1, 1.0)
        print('inserted')

    # Give the background calls time to finish before checking again.
    time.sleep(60)
    print('slept')

    # Re-read the table. Without this refresh the check above kept using the
    # snapshot taken before the loop, so the same documents were re-submitted
    # on every pass and the loop never ended.
    conn = pymssql.connect(host=r"(local)", database='Ddrive5', charset='utf8')
    with conn:
        df_SQL_Prediction = pd.read_sql(sql="select * from ["+ newtablename + "]", con=conn)

In [81]:
# Attach each stored response to its source document (only documents that
# have a response are kept), then project the response tags onto the raw text.
df2 = df.merge(df_SQL_Prediction, on='txtname', how='inner')
df2.columns = ['txtname', 'Replaced', 'Masked', 'Prediction']


def _project_tags(row):
    return prediction_process(row['Prediction'], row['Replaced'])


df2['Prediction_processed'] = df2.apply(_project_tags, axis=1)

In [82]:
# Score every document with evaluate2 and accumulate the per-category
# false-negative table across documents.
precision, recall, f1 = [], [], []
FN_category_dfs = pd.DataFrame()
for i in range(len(df2)):
    ann_file = ann_path + df2.loc[i, 'txtname'].replace('txt','csv')
    ann_df = pd.read_csv(ann_file, header=None)
    ann_df.columns = ['category','position1','position2','phrase']

    doc_precision, doc_recall, doc_f1, FN_category_df = evaluate2(
        df2.loc[i, 'Masked'],
        df2.loc[i, 'Prediction_processed'],
        df2.loc[i, 'Replaced'],
        ann_df,
    )
    precision.append(doc_precision)
    recall.append(doc_recall)
    f1.append(doc_f1)

    # Stack this document's table onto the running total and re-aggregate.
    stacked = pd.concat([FN_category_dfs, FN_category_df])
    FN_category_dfs = stacked.groupby('category', as_index=False).sum()

df2['precision'] = precision
df2['recall'] = recall
df2['f1'] = f1

Realizo llamada telefónica. Refiere buen descanso nocturno, sin incidencias. BEGLlamada:
Buen estado general, no tos, no disnea, no fiebre, no diarrea.
Alta el 20/05, cumpliéndose 3 semanas desde el incio de los síntomas.Visita a [34mpacient[0m
Tos: No
Dispnea: poc
SAt=2:97% basal
Ritme deposicional: normal
Temperatura: 36,4
EVA: 5/10 dolor pit
Deambula per l' habitació[34mPacient[0m bon estat general.
Sense àlgies.
No diarrees.
Tarda sense incidències.Trucada de control:
Refereix bon descans nocturn.
No àlgies. No sensació distèrimica. No tos. No díspnea. No deposicions.EVOLUCIÓN Centro COVID
1. Probable neumonía bilateral Covid19
2.Síndrome ansioso
Situación social: vive con sus suegros que son mayores.
----
FECHA INICIO SÍNTOMAS: 27/04/2011
FECHA INGRESO URGENCIAS: 29/04/2011
FECHA INGRESO Hospital COVID: 3/05/2011
---
MEDICACIÓN COVID:
30/04: Azitromicina
---
OXÍGENO DURANTE LA ESTANCIA HOSPITALARIA: no
OXÍGENO AL ALTA/INGRESO HDOM: no
---
Analítica 7/05: PCR 0.40, LDH 177, d d

Natural de Papúa-Nueva Guinea (Ibadán). Vive en Chirivel desde el 2001 y no viaja a su país de origen desde hace 8 años. No alergias medicamentosas ni alimentarias. Niega hábitos tóxicos desde hace [34m1 mes[0m. Antecedentes de consumo de tabaco desde los 13 años hasta los 25 años de medio paquete al día. Consumo de enol desde los 13 años. Empleado de salón de belleza, actualmente no trabaja. Vive con su nuera pequeña y es independiente paralas ABVD.
* ANTECEDENTES PATOLÓGICOS:
- OBESIDAD GRADO I (IMC de 31.1Kg/m2).
- ENOLISMO CRÓNICO desde los 13 años. [34mLa paciente[0m refiere aumento de la ingesta enolica desde hace 3 años en contexto de problemas
psicosociales, valorado por CAS y en actual deshabituación enolica, ha precisado tratamiento con Disulfiram que actualmente no está tomando.
- DEPRESIÓN MAYOR desde hace [34m6 años[0m, valorado por médico de atención primaria, precisó tratamiento con  Citalopram y Mirtizapina.
- Antecedente de herida incisa por agresión con arma bla

Hombre de 62 años sin alergias a medicamentos conocidas. Exfumador de 1paq/día hasta los 62 años sin otros hábitos tóxicos. Independiente para las actividades básicas de la vida diaria. [34mJubilado[0m, trabajaba de cuidador de personas con discapacidad. Vive en domicilio con su prima y sus sobrinos.
* Antecedentes patológicos:
- HIPERPLASIA BENIGNA DE PRÓSTATA en tratamiento con tamsulosina.
INTERVENCIONES QUIRÚRGICAS: colecistectomía, artrodesis cervical.
TRATAMIENTO HABITUAL: Tamsulosina 0.4mg/24h.



Varón de 80 años de edad, sin alergias medicamentosas conocidas. Ex- fumador desde hace más de 10 años. Moderadamente dependiente para las ABVD (Barthel 45), sale a la calle en silla de ruedas, deambulación limitada por hemiparesia residual. Vive con un [34mcuidador[0m 24 horas. No tiene [34mfamilia[0m en Respenda de la Peña.
* ANTECEDENTES PATOLÓGICOS
- DIABETES MELLITUS TIPO 2 en tratamiento con insulina e inhibidores de DPP4. Polineuropatía diabética.
- HIPERTENSION ARTERIAL e

In [83]:
# Plain per-document averages.
for label, column in (('recall avg', 'recall'), ('precision avg', 'precision'), ('f1 avg', 'f1')):
    print(label, np.mean(df2[column]))

# Averages weighted by the number of '[**' markers in each masked document.
df2['masked_count'] = df2['Masked'].str.count(r'\[\*\*')
total_masked_count = np.sum(df2['masked_count'])
total_recall = 0
total_precision = 0
total_f1 = 0
for weight, doc_recall, doc_precision, doc_f1 in zip(
        df2['masked_count'], df2['recall'], df2['precision'], df2['f1']):
    total_recall += doc_recall*weight
    total_precision += doc_precision*weight
    total_f1 += doc_f1*weight
recall_weighted_avg = total_recall/total_masked_count
precision_weighted_avg = total_precision/total_masked_count
f1_weighted_avg = total_f1/total_masked_count
print('recall weighted avg', recall_weighted_avg)
print('precision weighted avg', precision_weighted_avg)
print('f1 weightedavg', f1_weighted_avg)

recall avg 0.9544971797541664
precision avg 0.9048576421661341
f1 avg 0.9232730179310334
recall weighted avg 0.9494918817895105
precision weighted avg 0.9231142353168734
f1 weightedavg 0.9328754438081449


In [84]:
# Save the per-document texts and scores to CSV (index column omitted).
df2.to_csv('20240810_CARMEN_Results.csv',index=False,encoding='utf-8')

In [85]:
# Reload the saved results and make sample_size match the number of rows read.
df2 = pd.read_csv('20240810_CARMEN_Results.csv', encoding='utf-8')
sample_size = len(df2)
print(sample_size)

100


In [88]:
# Display the per-category false-negative totals accumulated by the scoring loop.
FN_category_dfs

Unnamed: 0,category,FN_count,target_count
0,CALLE,0.0,0
1,CENTRO_SALUD,0.0,4
2,EDAD_SUJETO_ASISTENCIA,0.0,96
3,FAMILIARES_SUJETO_ASISTENCIA,10.6,99
4,FECHAS,34.866667,709
5,HOSPITAL,3.25,67
6,ID_CONTACTO_ASISTENCIAL,0.0,0
7,ID_SUJETO_ASISTENCIA,0.0,4
8,INSTITUCION,1.0,7
9,NOMBRE_PERSONAL_SANITARIO,1.0,29


# Viewing examples

In [86]:
# Row of df2 to inspect. This must be an assignment: the original 'a-20' was
# only an expression and never set 'a'.
a=20
print('recall',df2['recall'][a])
print('precision',df2['precision'][a])
print(df2['Masked'][a])

recall 1.0
precision 1.0
Antecedentes personales:
[**Mujer**] de [**49 años**], alérgica al ácido acetilsalicílico, sulfamidas, sulfametoxazol/trometoprim, amoxicilina/clavulánico (rash cutáneo) e hipersensibilidad al abacavir.
Fumadora actual de 3 cig/día, previamente 2-3 paq/día con dosis acumulada de 50 paq/año, no bebedora de alcohol. Consumidora esporádica de cocaína y metanfetamina inhalada, no UDVP. ndependiente para las actividades básicas de la vida diaria, vive en la calle en situación de indigencia.
ANTECEDENTES PATOLÓGICOS:
1. INFECCIÓN POR VIH ESTADIO C3 diagnosticada en [**1990**], adquirida por contacto sexual. Controlada previamente en [**H. Universitario Federico García Lorca**] de [**1990**]-[**1991**], realiza seguimiento en nuestro hospital desde [**dic/05**].
* Control [**may/12**] CD4 4 cél/ul, carga viral VIH 1 754,000 cp/ml.
* Serología IgG VHA +, HBsAg -, IgG VHB +, IgG VHC -, IgG CMV +, VRDL -, IgG toxoplasma +, IgG sarampión +, Ac rubéola +, Ac VVZ +, PPD -.


In [87]:
# Print the tagged prediction of the same example row for comparison.
print(df2['Prediction_processed'][a])

Antecedentes personales:
[**Mujer**] de [**49 años**], alérgica al ácido acetilsalicílico, sulfamidas, sulfametoxazol/trometoprim, amoxicilina/clavulánico (rash cutáneo) e hipersensibilidad al abacavir.
Fumadora actual de 3 cig/día, previamente 2-3 paq/día con dosis acumulada de 50 paq/año, no bebedora de alcohol. Consumidora esporádica de cocaína y metanfetamina inhalada, no UDVP. ndependiente para las actividades básicas de la vida diaria, vive en la calle en situación de indigencia.
ANTECEDENTES PATOLÓGICOS:
1. INFECCIÓN POR VIH ESTADIO C3 diagnosticada en [**1990**], adquirida por contacto sexual. Controlada previamente en [**H. Universitario Federico García Lorca**] de [**1990-1991**], realiza seguimiento en nuestro hospital desde [**dic/05**].
* Control [**may/12**] CD4 4 cél/ul, carga viral VIH 1 754,000 cp/ml.
* Serología IgG VHA +, HBsAg -, IgG VHB +, IgG VHC -, IgG CMV +, VRDL -, IgG toxoplasma +, IgG sarampión +, Ac rubéola +, Ac VVZ +, PPD -.
* HLA-FG B5701 [**dic/6**] +.
*

In [89]:
# Ground-truth and predicted texts of example row 'a', used by the step-by-step cell below.
masked=df2['Masked'][a]
generated=df2['Prediction_processed'][a]

In [90]:
# Step-by-step scoring of one example with newlines removed. Only spans where
# one fully contains the other are scored here.
masked = masked.replace('\n','')
generated = generated.replace('\n','')

# Tagged phrases keyed by their (start, end) offsets in the marker-free text.
ground_truth_positions = {}
for cnt, match in enumerate(re.finditer(r'\[\*\*(.*?)\*\*\]', masked)):
    # 6 marker characters per earlier tag plus 3 for the current opening marker.
    marker_chars = (cnt*2+1)*3
    span = (match.start(1)-marker_chars, match.end(1)-marker_chars)
    ground_truth_positions[span] = replace_special_characters(match.group(1))

predictions_positions = {}
for cnt, match in enumerate(re.finditer(r'\[\*\*(.*?)\*\*\]', generated)):
    marker_chars = (cnt*2+1)*3
    span = (match.start(1)-marker_chars, match.end(1)-marker_chars)
    predictions_positions[span] = replace_special_characters(match.group(1))

# Recall: credit each ground-truth span against the predictions nested with it.
totalwordcnt_ground_truth = len(ground_truth_positions)
score_total = 0
for pos_g, truth_phrase in ground_truth_positions.items():
    for pos_p, predicted_phrase in predictions_positions.items():
        prediction_covers_truth = pos_p[0]<=pos_g[0] and pos_p[1]>=pos_g[1]
        truth_covers_prediction = pos_p[0]>=pos_g[0] and pos_p[1]<=pos_g[1]
        if prediction_covers_truth or truth_covers_prediction:
            score_temp = partial_score(truth_phrase, predicted_phrase)
            score_total += score_temp
            print(score_temp, truth_phrase)

recall = score_total/totalwordcnt_ground_truth

# Precision: the same comparison, seen from the predictions.
totalwordcnt_predictions = len(predictions_positions)
score_total = 0
for pos_p, predicted_phrase in predictions_positions.items():
    for pos_g, truth_phrase in ground_truth_positions.items():
        truth_covers_prediction = pos_g[0]<=pos_p[0] and pos_g[1]>=pos_p[1]
        prediction_covers_truth = pos_g[0]>=pos_p[0] and pos_g[1]<=pos_p[1]
        if truth_covers_prediction or prediction_covers_truth:
            score_temp = partial_score(predicted_phrase, truth_phrase)
            score_total += score_temp

precision = score_total/totalwordcnt_predictions

if (precision + recall) > 0:
    f1 = 2 * (precision * recall) / (precision + recall)
else:
    f1 = 0



1.0 Mujer
1.0 49 años
1.0 1990
1.0 H  Universitario Federico García Lorca
1.0 1990
1.0 1991
1.0 dic 05
1.0 may 12
1.0 dic 6
1.0 mar 12
1.0 25 12
1.0 26 12
1.0 26 12
1.0 28 02
1.0 28 02
1.0 27 03
1.0 28 03
1.0 30 03
1.0 30 03
1.0 3 04
1.0 3 04
1.0 12 6
1.0 12 6
1.0 03 9
1.0 03 9
1.0 09 10
1.0 09 10
1.0 02 12
1.0 02 12
1.0 jun 12
1.0 may 12
1.0 mar 12
1.0 ene 12
1.0 ene 10
1.0 2010
1.0 ene 12
1.0 dic 05
1.0 mar 08
1.0 oct 11
1.0 may 12
1.0 dic 11
1.0 2001
1.0 2011
1.0 2010
1.0 1998
1.0 2009
1.0 1989
1.0 2009
1.0 mar 12
1.0 mar 11


In [91]:
# Display the ground-truth spans: (start, end) offsets mapped to the tagged phrase.
ground_truth_positions

{(24, 29): 'Mujer',
 (33, 40): '49 años',
 (550, 554): '1990',
 (613, 651): 'H  Universitario Federico García Lorca',
 (655, 659): '1990',
 (660, 664): '1991',
 (712, 718): 'dic 05',
 (729, 735): 'may 12',
 (935, 940): 'dic 6',
 (984, 990): 'mar 12',
 (1015, 1020): '25 12',
 (1021, 1026): '26 12',
 (1075, 1080): '26 12',
 (1081, 1086): '28 02',
 (1139, 1144): '28 02',
 (1145, 1150): '27 03',
 (1192, 1197): '28 03',
 (1198, 1203): '30 03',
 (1218, 1223): '30 03',
 (1224, 1228): '3 04',
 (1279, 1283): '3 04',
 (1284, 1288): '12 6',
 (1340, 1344): '12 6',
 (1345, 1349): '03 9',
 (1401, 1405): '03 9',
 (1406, 1411): '09 10',
 (1457, 1462): '09 10',
 (1463, 1468): '02 12',
 (1513, 1518): '02 12',
 (1690, 1696): 'jun 12',
 (1759, 1765): 'may 12',
 (1873, 1879): 'mar 12',
 (1952, 1958): 'ene 12',
 (1994, 2000): 'ene 10',
 (2070, 2074): '2010',
 (2090, 2096): 'ene 12',
 (2138, 2144): 'dic 05',
 (2146, 2152): 'mar 08',
 (2155, 2161): 'oct 11',
 (2382, 2388): 'may 12',
 (2467, 2473): 'dic 11',
 

In [92]:
# Display the predicted spans: (start, end) offsets mapped to the tagged phrase.
predictions_positions

{(24, 29): 'Mujer',
 (33, 40): '49 años',
 (550, 554): '1990',
 (613, 651): 'H  Universitario Federico García Lorca',
 (655, 664): '1990 1991',
 (712, 718): 'dic 05',
 (729, 735): 'may 12',
 (935, 940): 'dic 6',
 (984, 990): 'mar 12',
 (1015, 1020): '25 12',
 (1021, 1026): '26 12',
 (1075, 1080): '26 12',
 (1081, 1086): '28 02',
 (1139, 1144): '28 02',
 (1145, 1150): '27 03',
 (1192, 1197): '28 03',
 (1198, 1203): '30 03',
 (1218, 1223): '30 03',
 (1224, 1228): '3 04',
 (1279, 1283): '3 04',
 (1284, 1288): '12 6',
 (1340, 1344): '12 6',
 (1345, 1349): '03 9',
 (1401, 1405): '03 9',
 (1406, 1411): '09 10',
 (1457, 1462): '09 10',
 (1463, 1468): '02 12',
 (1513, 1518): '02 12',
 (1690, 1696): 'jun 12',
 (1759, 1765): 'may 12',
 (1873, 1879): 'mar 12',
 (1952, 1958): 'ene 12',
 (1994, 2000): 'ene 10',
 (2070, 2074): '2010',
 (2090, 2096): 'ene 12',
 (2138, 2144): 'dic 05',
 (2146, 2152): 'mar 08',
 (2155, 2161): 'oct 11',
 (2382, 2388): 'may 12',
 (2467, 2473): 'dic 11',
 (2519, 2523): '2