In [1]:
from llm.context.llm_context import LLMContext
from llm.strategy.big_llama_model import BigLlamaModel
from llm.strategy.big_llama3_1_model import BigLlama3_1Model
from llm.strategy.small_llama_model import SmallLlamaModel
from llm.strategy.big_mistral_model import BigMistralModel
from llm.strategy.haiku3_model import Haiku3Model
from llm.strategy.sonet3_model import Sonet3Model

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from tqdm import tqdm
import os
import numpy as np 
import pandas as pd
import re
import warnings
import spacy

In [2]:
# Directory whose file names drive the evaluation loop (it is listed with os.listdir).
PATH = './data/processed/txt'

In [3]:
def get_text_and_masked_carmen(name):
    """Load a masked document together with its annotation labels.

    The masked text is read from ``./data/processed/masked/<name>`` and the
    annotations from the headerless CSV ``./data/processed/ann/<stem>.csv``,
    where ``<stem>`` is ``name`` without its final extension.

    Args:
        name: File name of the masked document, e.g. ``'CARMEN-I_CC_1.txt'``.

    Returns:
        A two-element list ``[text_masked, result_dict]`` where ``text_masked``
        is the file content and ``result_dict`` maps each value of CSV column 3
        (the annotated text span) to the value of CSV column 0 (its label).
        When a span appears on several rows, the last row wins.

    Raises:
        FileNotFoundError: If the masked text or the annotation CSV is missing.
        pandas.errors.EmptyDataError: If the annotation CSV is empty.
        KeyError: If the CSV has fewer than four columns.
    """
    masked_path = f'./data/processed/masked/{name}'

    # Explicit encoding: the notes contain accented characters, and pandas
    # already reads the CSV as UTF-8 by default, so both reads now agree
    # regardless of the platform's default locale.
    with open(masked_path, 'r', encoding='utf-8') as masked_file:
        text_masked = masked_file.read()

    # splitext strips only the last extension, so "doc.v1.txt" -> "doc.v1"
    # (the previous split(".")[0] would have truncated it to "doc").
    stem = os.path.splitext(name)[0]
    csv_data = f'./data/processed/ann/{stem}.csv'

    # dtype=str keeps spans such as "34" as text; otherwise an all-numeric column
    # is parsed as integers and never matches the string keys of the model's JSON.
    # keep_default_na=False prevents spans like "NA" from being turned into NaN.
    df = pd.read_csv(csv_data, header=None, dtype=str, keep_default_na=False)

    result_dict = dict(zip(df[3], df[0]))
    return [text_masked, result_dict]

# METRICS

In [4]:
def compute_classification_agreement(dict1, dict2):
    """Compare two key -> label mappings and report where they differ.

    Args:
        dict1: Reference mapping (the caller passes the ground truth here).
        dict2: Mapping to check against ``dict1``.

    Returns:
        A tuple ``(disagreement_in_values, keys_not_present_in_dict2)``:

        * ``disagreement_in_values`` maps every key present in both mappings
          whose values differ to the pair ``(dict1[key], dict2[key])``.
        * ``keys_not_present_in_dict2`` lists the keys of ``dict1`` that are
          missing from ``dict2``.

        Both results follow the insertion order of ``dict1``, so repeated runs
        produce identical output (set operations gave an arbitrary order).
        Keys that appear only in ``dict2`` are not reported.
    """
    # Keys of dict1 that dict2 does not contain at all.
    keys_not_present_in_dict2 = [key for key in dict1 if key not in dict2]

    # Shared keys whose values differ.
    disagreement_in_values = {
        key: (value, dict2[key])
        for key, value in dict1.items()
        if key in dict2 and value != dict2[key]
    }

    return disagreement_in_values, keys_not_present_in_dict2

# Loop

In [5]:
import json
def anonimized(llm=None, name_model="", data=None):
    """Run the model over every file in PATH and print its labelling errors.

    For each file name in ``PATH`` (sorted), the masked text and ground-truth
    labels are loaded with ``get_text_and_masked_carmen``, the masked text is
    sent to the model as the ``"user"`` entry of the prompt, the reply is parsed
    as JSON and compared with the ground truth through
    ``compute_classification_agreement``. Results are printed, not returned.

    Args:
        llm: Strategy object handed to ``LLMContext``.
        name_model: Currently unused; kept so existing calls remain valid.
        data: Prompt mapping (the notebook passes a dict with a ``"system"``
            entry). It is copied, so the caller's object is never modified.
            ``None`` is treated as an empty prompt.

    Returns:
        None.
    """
    context = LLMContext(llm)
    # Work on a copy: writing "user" into the caller's dict leaked the last
    # document into it, and data=None used to fail on every single file.
    base_prompt = dict(data) if data else {}

    for filename in sorted(os.listdir(PATH)):
        print(filename)
        try:
            text_masked, ground_truth = get_text_and_masked_carmen(filename)

            prompt = dict(base_prompt)
            prompt["user"] = text_masked
            text_generated = context.generate_response(prompt)

            parsed = json.loads(text_generated)
            # The prompt asks for an array of single-entry objects, but accept a
            # lone JSON object as well instead of failing inside update().
            if isinstance(parsed, dict):
                parsed = [parsed]

            merged_dict = {}
            for entry in parsed:
                merged_dict.update(entry)

            diferencias, unicas_dict1 = compute_classification_agreement(ground_truth, merged_dict)
            print("Errores de clasificacion:", diferencias)
            print("Claves no enconctradas en el segundo diccionario:", unicas_dict1)
        except Exception as e:
            # Deliberate best-effort: one unreadable file or malformed model
            # reply must not abort the whole run, so report it and move on.
            print("Error")
            print(e)

In [6]:
# Prompt payload for the model. The "system" entry carries the instructions,
# the closed set of allowed labels and one worked example of the expected
# JSON array output.
data = {
    "system": """
You are a clasification tool. 
You have the ability to identify quasi-identifying attributes which are in [**  **] simbols.
Then you will return an array with the anonymized attributes in the order they appear in the text.

Las posibles etiquetas son:
- 'FAMILIARES_SUJETO_ASISTENCIA'
- 'FECHAS'
- 'NUMERO_TELEFONO'
- 'OTROS_SUJETO_ASISTENCIA'
- 'INSTITUCION'
- 'NUMERO_IDENTIF'
- 'EDAD_SUJETO_ASISTENCIA'
- 'HOSPITAL'
- 'PAIS'
- 'TERRITORIO'
- 'NOMBRE_PERSONAL_SANITARIO'
- 'PROFESION'
- 'ID_SUJETO_ASISTENCIA'
- 'CENTRO_SALUD'
- 'SEXO_SUJETO_ASISTENCIA'
- 'CALLE'
- 'ID_CONTACTO_ASISTENCIAL'
- 'URL_WEB'


Examples:
Paciente de [**41 años**], [**albañil**], con dolor lumbar crónico y problemas de movilidad. = [{"41 años": "EDAD_SUJETO_ASISTENCIA"}, {"albañil": "PROFESION"}]

Do not comment anything else. Just return the array.

"""
}

# Helper to list the unique labels found in the annotation CSVs (kept commented out)

In [7]:
# import pandas as pd
# import os
# PATH = './data/processed/ann'
# array_options = []
# for filename in sorted(os.listdir(PATH)):
#     filename_result = f'./data/processed/ann/{filename}'
#     data = pd.read_csv(filename_result)
#     array_options.append(data.iloc[:, 0])

# combined_options = [item for sublist in array_options for item in sublist]
# unique_options_ordered = list(dict.fromkeys(combined_options))
# unique_options_ordered

In [8]:
# Evaluate the BigLlamaModel strategy on every file in PATH; results are printed per file.
anonimized(llm=BigLlamaModel(), name_model="big_llama3", data=data)

CARMEN-I_CC_1.txt
Errores de clasificacion: {'Salud laboral': ('OTROS_SUJETO_ASISTENCIA', 'INSTITUCION')}
Claves no enconctradas en el segundo diccionario: []
CARMEN-I_CC_2.txt
Errores de clasificacion: {'WYX/8408/5545': ('NUMERO_IDENTIF', 'URL_WEB'), 'Hospital COVID': ('INSTITUCION', 'HOSPITAL')}
Claves no enconctradas en el segundo diccionario: []
CARMEN-I_CC_3.txt
Errores de clasificacion: {}
Claves no enconctradas en el segundo diccionario: []
CARMEN-I_CC_4.txt
Errores de clasificacion: {'34': ('FECHAS', 'EDAD_SUJETO_ASISTENCIA')}
Claves no enconctradas en el segundo diccionario: []
CARMEN-I_CC_5.txt
Errores de clasificacion: {}
Claves no enconctradas en el segundo diccionario: []
CARMEN-I_IA_ANTECEDENTES_1.txt
Errores de clasificacion: {'Centro Médico Aspasia': ('HOSPITAL', 'CENTRO_SALUD'), 'Historia clínica electrónica': ('OTROS_SUJETO_ASISTENCIA', 'INSTITUCION')}
Claves no enconctradas en el segundo diccionario: []
CARMEN-I_IA_ANTECEDENTES_10.txt
Errores de clasificacion: {}
Cla

KeyboardInterrupt: 

In [None]:

# Scratch check of a single annotation file. `csv_data` only exists as a local
# inside get_text_and_masked_carmen, so this cell raised NameError on a fresh
# kernel; define it here, pointing at one annotation file of the dataset.
csv_data = './data/processed/ann/CARMEN-I_CC_1.csv'
df = pd.read_csv(csv_data, header=None)

# One single-entry dict per annotation row: {column 3 value: column 0 value}.
result = [{span: label} for span, label in zip(df[3], df[0])]

In [None]:
# Display the per-row {column 3: column 0} dicts built in the previous cell.
result