In [16]:
import os
import threading
from langchain_aws import ChatBedrock
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import pandas as pd
import warnings
import spacy
import numpy as np 
from colorama import init, Fore

In [17]:
init(autoreset=True)

In [18]:
MODELS = [
    "meta.llama3-70b-instruct-v1:0", #Big Llama
    "anthropic.claude-3-haiku-20240307-v1:0", #Big Claude
    "anthropic.claude-3-5-sonnet-20240620-v1:0", #Last Claude
    "anthropic.claude-3-sonnet-20240229-v1:0", #Last middle Claude
    "mistral.mixtral-8x7b-instruct-v0:1", #Big Mistral
    "meta.llama3-8b-instruct-v1:0" #Small Llama
]

In [19]:
prompt1 = "Devuelve solo el texto original con la identificación y etiquetación del texto con la información personal del paciente añadiendolo entre claudators, por ejemplo: [**arquitecto**], [**8/9/21**], [**padre**]. No comentes nada más"

# Poner -> Etiquetas 

In [20]:
system_prompt = """
You are an anonymization tool in identifying attributes in texts that can identify or quasi-identify a user.
Return only the original text with the identification and labeling of the patient's personal information by adding it between [** and **].
Following are attributes that you must anonymize.

- Names
Example:
En seguimiento por Hematología Centro Médico Aspasia (Dra. Valvanera).   ->
En seguimiento por Hematología [**Centro Médico Aspasia**] (Dra. [**Valvanera**]).

- Ages
Example: 
Varón de 41 años.   ->
[**Varón**] de [**41 años**].

- Sexes
Example: 
Varón de 41 años.   ->
[**Varón**] de [**41 años**].

- Professions
Example: 
Trabaja como profesor.   ->
Trabaja como [**profesor**].

- Relatives
Example: 
Vive con suegro y 2 yernos.   ->
Vive con [**suegro**] y 2 [**yernos**].

- Dates
Example: 
ha estado viviendo en el Centro desde septiembre de 2008.   ->
ha estado viviendo en el [**Centro**] desde [**septiembre de 2008**].

- Phone numbers
Example: 
contactando con el siguiente número de teléfono +50 88 078 68 49.   ->
contactando con el siguiente número de teléfono [**+50 88 078 68 49**].

- Identification numbers
Example:
El paciente otorga su consentimiento informado para participar en el estudio del protocolo WYX/8408/5545.   ->
El paciente otorga su consentimiento informado para participar en el estudio del protocolo [**WYX/8408/5545.**]

- Institutions, hospitals, health centers, etc
Example: 
En seguimiento por Hematología Centro Médico Aspasia (Dra. Valvanera).   ->
En seguimiento por Hematología [**Centro Médico Aspasia**] (Dra. [**Valvanera**]).
Example:
Control en Centro Salud Mental Reyes Católicos.   ->
Control en [**Centro Salud Mental Reyes Católicos**].

- Countries, territories, streets, etc
Example:
nacido en la República Italiana.   ->
nacido en la [**República Italiana**].
Example:
ha estado viviendo en el Centro desde septiembre de 2008.   ->
ha estado viviendo en el [**Centro**] desde [**septiembre de 2008**].
Example:
la dirección es Calle de Victor Hugo 39.   ->
la dirección es [**Calle de Victor Hugo 39**].

- Website URLs
participar a través del siguiente enlace: https://www.donarsang.gencat.cat/covid19.   ->
participar a través del siguiente enlace: [**https://www.donarsang.gencat.cat/covid19**].

- Other sensitive information such as races, ethnicities, sexual orientation, dietary preferences, etc
Example:
raça blanca   ->
[**raça blanca**]
Example:
Hsh
[**Hsh**]
Example:
Vegetarià
[**Vegetarià**]

Do not comment anything else.
Besides the anonymized attributes, provide the rest of the text exactly the same, including special characters and \n symbols.
Do not correct any typos or spacing errors at your discretion.
For example, if the time is written as 31/12/2000-0 9:20:00 with incorrect spacing, do not return it corrected as 31/12/2000-09:20:00.
Also, for example, if FLUTICASONA + AZELA STINA4 is written with incorrect spacing, do not return it corrected as FLUTICASONA + AZELASTINA 4.
Only focus on the anonymization tasks I have specified, and ignore any typos or spacing errors
"""

In [21]:
meddocan = """
Anota todos los datos de información personal médica que encuentres en este informe utilizando las guias meddocan a continuación:
Nombres
Datos geográficos
Todos los elementos de las fechas
Números telefónicos
Números FAX
Correos electrónicos
Números de Seguridad Social
Números de registros médicos
Números de beneficiarios del plan de salud
Números de cuenta
Certificado / números de licencia
Identificadores de vehículos y números de serie, incluidas placas
Identificadores de dispositivo y números de serie
URL web
Direcciones de protocolo de Interne
Identificadores biométricos (es decir, escaneo retiniano, huellas dactilar
Fotos de cara completa e imágenes comparable
Cualquier número de identificación único, característica o código
"""

In [22]:
prompt_template = ChatPromptTemplate.from_messages([
    # ("system", "You are an anonymization tool for identifying attributes in texts that can identify or quasi-identify a user. Whenever you find them, you should label them as follows: [**attribute|Generalization**]."),
    ("system", """{system_prompt}"""),
    ("user",    """{text}""")
])

In [23]:
parser = StrOutputParser()

In [24]:
def get_text_and_masked_carmen(name):
    filename = f'./data/processed/txt/{name}'
    filename_result = f'./data/processed/masked/{name}'
    with open(filename, 'r') as archivo:
        text = archivo.read()

    with open(filename_result, 'r') as archivo:
        text_masked = archivo.read()

    return [text, text_masked]

In [25]:
list_data  = []
counter = 0

In [26]:
import threading
llm_small_llama =   ChatBedrock(  
                        model_id=MODELS[5],
                        region_name='eu-west-2',
                        model_kwargs=dict(
                                    temperature = 0.1,
                                    max_tokens = 2000,
                                    ),
                    )
llm_big_llama   =   ChatBedrock(    
                        model_id=MODELS[0],
                        region_name='eu-west-2',
                        model_kwargs=dict(
                                    temperature = 0.1,
                                    max_tokens = 2000,
                                    ),
                    )
llm_haiku       =   ChatBedrock(    
                        model_id=MODELS[1],
                        region_name='eu-west-3',
                        model_kwargs=dict(
                                    temperature = 0.1,
                                    max_tokens = 2000,
                                    ),
                    )
llm_sonet       =   ChatBedrock(    
                        model_id=MODELS[3],
                        region_name='eu-west-3',
                        model_kwargs=dict(
                                    temperature = 0.1,
                                    max_tokens = 2000,
                                    ),
                    )
llm_mistral     =   ChatBedrock(
                        model_id=MODELS[4],
                        region_name='eu-west-2',
                        model_kwargs=dict(
                                    temperature = 0.1,
                                    max_tokens = 2000,
                                    ),
                    )

In [27]:
def save_file(filename, text):
    with open(filename, 'w') as archivo:
        archivo.write(text)

In [28]:
def create_folder(name):
    try:
        os.mkdir(name)
        print(f"Folder '{name}' created successfully")
    except FileExistsError:
        pass
    except Exception as e:
        print(f"Error creating the folder '{name}': {e}")


# Metrics

In [29]:
import re
import warnings
import spacy

warnings.filterwarnings("ignore", message="\[W008\] Evaluating Doc.similarity based on empty vectors")
# Cargar el modelo de lenguaje en español mediano

nlp = spacy.load("es_core_news_md")
# Función de similitud de embeddings
def embedding_similarity(str1, str2, threshold=0.8):
    doc1 = nlp(str1)
    doc2 = nlp(str2)
    similarity = doc1.similarity(doc2)
    return similarity >= threshold

def eliminar_adverbios_preposiciones_determinantes(texto):
    doc = nlp(texto)
    # Eliminar preposiciones (ADP) y determinantes (DET)
    tokens_filtrados = [token.text for token in doc if token.pos_ not in ('ADP', 'DET')]
    return ' '.join(tokens_filtrados)

## Cosine similarity

In [30]:
def get_cos_sim(text_hoped, text_generated):
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w[\w\-/]*\b")
    tfidf_matrix = vectorizer.fit_transform([text_hoped, text_generated])

    try:
        cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    except: 
        return 0.0
    return cosine_sim[0][0]

## Levenshtein distance

In [31]:
from tqdm import tqdm

def levenshtein_distance(s1, s2, show_progress=True):
    """
    Calcula la distancia de Levenshtein entre dos cadenas.

    La distancia de Levenshtein es el número mínimo de operaciones de edición 
    (inserción, eliminación o sustitución de un carácter) necesarias para 
    transformar una cadena en otra.

    Parámetros:
        s1 (str): Primera cadena
        s2 (str): Segunda cadena
        show_progress (bool): Si es True, muestra una barra de progreso. 
                              Por defecto es False.
    Retorna:
        int: La distancia de Levenshtein entre s1 y s2
    """
    # Usar tqdm solo si show_progress es True
    iterable = tqdm(s1) if show_progress else s1

    if len(s1) < len(s2):
        s1, s2 = s2, s1
    if len(s2) == 0:
        return len(s1)

    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(iterable):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]



## Precision and Recall

In [32]:
def calc_metrics(ground_truth, predictions):
    # Convertir arrays de ground_truth y predictions a listas de str
    ground_truth_processed = np.array([eliminar_adverbios_preposiciones_determinantes(str(item)) for item in ground_truth])
    predictions_processed = np.array([eliminar_adverbios_preposiciones_determinantes(str(item)) for item in predictions])

    # Crear matrices de similitud de coseno y embedding 
    get_cos_sim_vectorized = np.vectorize(lambda gt, pred: get_cos_sim(str(gt), str(pred)))
    embedding_similarity_vectorized = np.vectorize(lambda gt, pred: embedding_similarity(str(gt), str(pred)))

    cosine_results = get_cos_sim_vectorized(ground_truth_processed[:, None], predictions_processed[None, :])
    embedding_results = embedding_similarity_vectorized(ground_truth_processed[:, None], predictions_processed[None, :])

    # Promediar las similitudes
    avg_similarities = (cosine_results + embedding_results) / 2

    # Determinar verdaderos positivos
    matches = avg_similarities > 0.5
    true_positives = np.sum(np.any(matches, axis=1))

    # Determinar falsos negativos
    false_negatives = len(ground_truth) - true_positives

    # Determinar falsos positivos
    predicted_matches = np.any(matches, axis=0)
    false_positives = len(predictions) - np.sum(predicted_matches)

    # Cálculo de métricas
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1

def evaluate(masked, generated):
    """ 
    Input: 
        - masked (str): Ground_truth text
        - generated(str): Text to be evaluated

    Output:
        - Precision, Recall and F1 (float)
    """
    ground_truth = re.findall(r'\[\*\*(.*?)\*\*\]', masked)
    predictions = re.findall(r'\[\*\*(.*?)\*\*\]', generated)
    labels = [ground_truth, predictions]
    
    return [calc_metrics(ground_truth, predictions), labels]

# Loop

In [33]:
#!!! METRICS

def anonimized(llm=None, name_model=""):
    counter = 0
    path = './data/processed/txt'
    chain = prompt_template | llm | parser
    list_data  = []
    for filename in sorted(os.listdir(path)):
        metrics_data = {}
        metrics_data["filename"] = filename
        try:
            [text, text_hoped] = get_text_and_masked_carmen(filename)
            text_generated = chain.invoke({"max_tokens": 2000, "system_prompt": system_prompt, "text": text})
            create_folder(f'data/anon/raw/{name_model}')
            save_file(f'data/anon/raw/{name_model}/{filename}', text_generated)
            # print(f"THE COUNT IS {counter}")
            # print(f"THE FILENAME IS {filename}")
            # print(text_generated)
            # print(" ")
            # print("========================================")
            # print(text_hoped)
            # print("========================================")
            # print("========================================")
            # print("========================================")
            # print(" ")
            # print(" ")
            # print(" ")
            [cal_met, labels] = evaluate(text_hoped, text_generated)    
            cosine_sim = get_cos_sim(text_hoped, text_generated)
            text_generated = text_generated.replace('[**', '').replace('**]', '')
            text_hoped = text_hoped.replace('[**', '').replace('**]', '')
            result = levenshtein_distance(text_generated, text_hoped[:len(text_generated)], show_progress=False)
            metrics_data["precision"] = cal_met[0]
            metrics_data["recall"] = cal_met[1]
            metrics_data["f1"] = cal_met[2]
            metrics_data["cos"] = cosine_sim
            metrics_data["levenshtein"] = result
            metrics_data["labels hoped"] = labels[0]
            metrics_data["labels generated"] = labels[1]
            metrics_data["fail"] = 0
            if int(metrics_data["levenshtein"]) == 0:
                metrics_data["inv_levenshtein"] = 1
            else: 
                metrics_data["inv_levenshtein"] = (1/metrics_data["levenshtein"])
            metrics_data["overall"] = metrics_data["precision"] + metrics_data["recall"] +  metrics_data["f1"] + metrics_data["cos"] + metrics_data["inv_levenshtein"]
            list_data.append(metrics_data)
            counter += 1
            if counter > 5:
                break
        except Exception as e:
            print(e) 
            metrics_data["fail"] = 1
        finally:
            list_data.append(metrics_data)
    list_data = pd.DataFrame(list_data)
    list_data.to_csv(f'data/metrics/{name_model}.csv')


In [34]:
thread_small_llama = threading.Thread(target=anonimized, args=(llm_small_llama, "small_llama" ))
thread_big_llama = threading.Thread(target=anonimized, args=(llm_big_llama, "big_llama" ))
thread_haiku = threading.Thread(target=anonimized, args=(llm_haiku, "haiku" ))
thread_sonet = threading.Thread(target=anonimized, args=(llm_sonet, "sonet" ))
thread_mistral = threading.Thread(target=anonimized, args=(llm_mistral, "mistral" ))

In [35]:
thread_small_llama.start()
thread_big_llama.start()
thread_haiku.start()
thread_sonet.start()
thread_mistral.start()

In [36]:
thread_big_llama.join()
thread_haiku.join()
thread_sonet.join()
thread_small_llama.join()
thread_mistral.join()

Error raised by bedrock service: An error occurred (ValidationException) when calling the InvokeModel operation: Malformed input request: #: extraneous key [max_tokens] is not permitted, please reformat your input and try again.
Error raised by bedrock service: An error occurred (ValidationException) when calling the InvokeModel operation: Malformed input request: #: extraneous key [max_tokens] is not permitted, please reformat your input and try again.
Error raised by bedrock service: An error occurred (ValidationException) when calling the InvokeModel operation: Malformed input request: #: extraneous key [max_tokens] is not permitted, please reformat your input and try again.
Error raised by bedrock service: An error occurred (ValidationException) when calling the InvokeModel operation: Malformed input request: #: extraneous key [max_tokens] is not permitted, please reformat your input and try again.
Error raised by bedrock service: An error occurred (ValidationException) when callin

In [None]:
haiku_data = pd.read_csv('./data/metrics/haiku.csv')
mistral_data = pd.read_csv('./data/metrics/mistral.csv')
sonet_data = pd.read_csv('./data/metrics/sonet.csv')
small_llama_data = pd.read_csv('./data/metrics/small_llama.csv')
big_llama_data = pd.read_csv('./data/metrics/big_llama.csv')

In [None]:
haiku_data = haiku_data[haiku_data['fail'] == 0]
mistral_data = mistral_data[mistral_data['fail'] == 0]
sonet_data = sonet_data[sonet_data['fail'] == 0]
small_llama_data = small_llama_data[small_llama_data['fail'] == 0]
big_llama_data = big_llama_data[big_llama_data['fail'] == 0]

In [None]:
print(haiku_data["precision"].mean())
print(mistral_data["precision"].mean())
print(sonet_data["precision"].mean())
print(small_llama_data["precision"].mean())
print(big_llama_data["precision"].mean())


0.5638462194913809
0.37747872816838335
0.7711574074074073
0.4613777027570132
0.7388096652802535


In [None]:
print(haiku_data["recall"].mean())
print(mistral_data["recall"].mean())
print(sonet_data["recall"].mean())
print(small_llama_data["recall"].mean())
print(big_llama_data["recall"].mean())

0.9445175438596491
0.2064788329519451
0.9516447368421052
0.47004513094330025
0.7806731502669718


In [None]:
print(haiku_data["f1"].mean())
print(mistral_data["f1"].mean())
print(sonet_data["f1"].mean())
print(small_llama_data["f1"].mean())
print(big_llama_data["f1"].mean())

0.6634361188200225
0.2321243869630966
0.8329043310341264
0.393632730558018
0.7426805030004204


In [None]:
print(haiku_data["cos"].mean())
print(mistral_data["cos"].mean())
print(sonet_data["cos"].mean())
print(small_llama_data["cos"].mean())
print(big_llama_data["cos"].mean())

0.9815436460076422
0.8539475065430507
0.984524020484903
0.9130081388689327
0.9098922058898773


In [None]:
print(haiku_data['levenshtein'].mean())
print(mistral_data['levenshtein'].mean())
print(sonet_data['levenshtein'].mean())
print(small_llama_data['levenshtein'].mean())
print(big_llama_data['levenshtein'].mean())

6.0
133.33333333333334
23.5
11.333333333333334
42.166666666666664


In [None]:
print(haiku_data['inv_levenshtein'].mean())
print(mistral_data['inv_levenshtein'].mean())
print(sonet_data['inv_levenshtein'].mean())
print(small_llama_data['inv_levenshtein'].mean())
print(big_llama_data['inv_levenshtein'].mean())

0.47441077441077445
0.35437925170068024
0.4665065287016506
0.18109536662168244
0.24509564509564505


In [None]:
print(haiku_data['overall'].mean())
print(mistral_data['overall'].mean())
print(sonet_data['overall'].mean())
print(small_llama_data['overall'].mean())
print(big_llama_data['overall'].mean())

3.6277543025894694
2.0244087063271556
4.006737024470191
2.4191590697489467
3.4171511695331684
