In [1]:
from llm.context.llm_context import LLMContext
from llm.strategy.big_llama_model import BigLlamaModel
from llm.strategy.small_llama_model import SmallLlamaModel
from llm.strategy.big_mistral_model import BigMistralModel

from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import os
import numpy as np 
import pandas as pd
import re
import warnings
import spacy

In [2]:
PATH = './data/processed/txt'

# Functions

In [3]:
def create_folder(name):
    try:
        os.mkdir(name)
        print(f"Folder '{name}' created successfully")
    except FileExistsError:
        pass
    except Exception as e:
        print(f"Error creating the folder '{name}': {e}")

In [4]:
def save_file(filename, text):
    with open(filename, 'w') as archivo:
        archivo.write(text)

In [5]:
def get_text_and_masked_carmen(name):
    filename = f'./data/processed/txt/{name}'
    filename_result = f'./data/processed/masked/{name}'
    with open(filename, 'r') as archivo:
        text = archivo.read()

    with open(filename_result, 'r') as archivo:
        text_masked = archivo.read()

    return [text, text_masked]

In [6]:
def store_metrics(metrics_data, precision, recall, f1, cosine_sim, levenshtein_distance, labels, name_model):
    metrics_data["precision"] = precision
    metrics_data["recall"] = recall
    metrics_data["f1"] = f1
    metrics_data["cos"] = cosine_sim
    metrics_data["levenshtein"] = levenshtein_distance
    metrics_data["name_model"] = name_model
    metrics_data["ground_truth"] = labels[0]
    metrics_data["generated"] = labels[1]
    if int(metrics_data["levenshtein"]) == 0:
        metrics_data["inv_levenshtein"] = 1
    else: 
        metrics_data["inv_levenshtein"] = (1/metrics_data["levenshtein"])
        metrics_data["overall"] = metrics_data["precision"] + metrics_data["recall"] +  metrics_data["f1"] + metrics_data["cos"] + metrics_data["inv_levenshtein"]
    return metrics_data

In [7]:
def save_metrics(name_model, list_data):
    list_data = pd.DataFrame(list_data)
    list_data.to_csv(f'data/metrics/{name_model}.csv')

# Metrics

In [8]:
warnings.filterwarnings("ignore", message="\[W008\] Evaluating Doc.similarity based on empty vectors")
nlp = spacy.load("es_core_news_md")


In [9]:
def embedding_similarity(str1, str2, threshold=0.8):
    doc1 = nlp(str1)
    doc2 = nlp(str2)
    similarity = doc1.similarity(doc2)
    return similarity >= threshold

In [10]:
def eliminar_adverbios_preposiciones_determinantes(texto):
    doc = nlp(texto)
    # Eliminar preposiciones (ADP) y determinantes (DET)
    tokens_filtrados = [token.text for token in doc if token.pos_ not in ('ADP', 'DET')]
    return ' '.join(tokens_filtrados)

In [11]:
def levenshtein_distance(s1, s2, show_progress=True):
    """
    Calcula la distancia de Levenshtein entre dos cadenas.

    La distancia de Levenshtein es el número mínimo de operaciones de edición 
    (inserción, eliminación o sustitución de un carácter) necesarias para 
    transformar una cadena en otra.

    Parámetros:
        s1 (str): Primera cadena
        s2 (str): Segunda cadena
        show_progress (bool): Si es True, muestra una barra de progreso. 
                              Por defecto es False.
    Retorna:
        int: La distancia de Levenshtein entre s1 y s2
    """
    

    # Usar tqdm solo si show_progress es True
    iterable = tqdm(s1) if show_progress else s1

    if len(s1) < len(s2):
        s1, s2 = s2, s1
    if len(s2) == 0:
        return len(s1)

    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(iterable):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]

In [12]:
def get_cos_sim(text_hoped, text_generated):
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w[\w\-/]*\b")
    tfidf_matrix = vectorizer.fit_transform([text_hoped, text_generated])

    try:
        cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    except: 
        return 0.0
    return cosine_sim[0][0]

In [13]:
def calc_metrics(ground_truth, predictions):
    # Convertir arrays de ground_truth y predictions a listas de str
    ground_truth_processed = np.array([eliminar_adverbios_preposiciones_determinantes(str(item)) for item in ground_truth])
    predictions_processed = np.array([eliminar_adverbios_preposiciones_determinantes(str(item)) for item in predictions])

    # Crear matrices de similitud de coseno y embedding 
    get_cos_sim_vectorized = np.vectorize(lambda gt, pred: get_cos_sim(str(gt), str(pred)))
    embedding_similarity_vectorized = np.vectorize(lambda gt, pred: embedding_similarity(str(gt), str(pred)))

    cosine_results = get_cos_sim_vectorized(ground_truth_processed[:, None], predictions_processed[None, :])
    embedding_results = embedding_similarity_vectorized(ground_truth_processed[:, None], predictions_processed[None, :])

    # Promediar las similitudes
    avg_similarities = (cosine_results + embedding_results) / 2

    # Determinar verdaderos positivos
    matches = avg_similarities > 0.5
    true_positives = np.sum(np.any(matches, axis=1))

    # Determinar falsos negativos
    false_negatives = len(ground_truth) - true_positives

    # Determinar falsos positivos
    predicted_matches = np.any(matches, axis=0)
    false_positives = len(predictions) - np.sum(predicted_matches)

    # Cálculo de métricas
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1


In [14]:
def evaluate(masked, generated):
    """ 
    Input: 
        - masked (str): Ground_truth text
        - generated(str): Text to be evaluated

    Output:
        - Precision, Recall and F1 (float)
    """
    ground_truth = re.findall(r'\[\*\*(.*?)\*\*\]', masked)
    predictions = re.findall(r'\[\*\*(.*?)\*\*\]', generated)
    labels = [ground_truth, predictions]
    
    return [calc_metrics(ground_truth, predictions), labels]

In [15]:
def calculate_metrics(text_hoped, text_generated):
    cosine_sim = get_cos_sim(text_hoped, text_generated)
    [cal_met, labels] = evaluate(text_hoped, text_generated)
    print(labels)
    text_generated = text_generated.replace('[**', '').replace('**]', '')
    text_hoped = text_hoped.replace('[**', '').replace('**]', '')
    result = levenshtein_distance(text_generated, text_hoped[:len(text_generated)], show_progress=False)
    return cosine_sim, cal_met, result, labels

In [16]:
def replace_special_characters(text):
    # Define the pattern to match special characters including . / and -
    pattern = r'[!@#$%^&*()_+={}\[\]:;"\'<>,?\\|`~./-]'
    
    # Replace the matched characters with a space
    result = re.sub(pattern, ' ', text)
    
    return result

def evaluate2(masked,generated):
    masked=masked.replace('\n','')
    generated=generated.replace('\n','')

    ground_truth_matches = re.finditer(r'\[\*\*(.*?)\*\*\]', masked)
    ground_truth_positions = {}
    cnt=0
    for match in ground_truth_matches:
        start = match.start(1)-(cnt*2+1)*3  # start of the group (excluding [**)
        end = match.end(1)-(cnt*2+1)*3
        cnt+=1# end of the group (excluding **])
        ground_truth_positions[(start, end)] = replace_special_characters(match.group(1))

    predictions_matches = re.finditer(r'\[\*\*(.*?)\*\*\]', generated)
    predictions_positions = {}
    cnt=0
    for match in predictions_matches:
        start = match.start(1)-(cnt*2+1)*3  # start of the group (excluding [**)
        end = match.end(1)-(cnt*2+1)*3
        cnt+=1# end of the group (excluding **])
        predictions_positions[(start, end)] = replace_special_characters(match.group(1))

    totalwordcnt_ground_truth = len(ground_truth_positions)
    score_total=0
    for pos_g in ground_truth_positions:
        for pos_p in predictions_positions:
            if (pos_p[0]<=pos_g[0] and pos_p[1]>=pos_g[1]) or (pos_p[0]>=pos_g[0] and pos_p[1]<=pos_g[1]):
                score_temp = partial_score(ground_truth_positions[pos_g],predictions_positions[pos_p])
                score_total += score_temp

    score_total = score_total/totalwordcnt_ground_truth
    recall = score_total

    totalwordcnt_predictions = len(predictions_positions)
    score_total=0
    for pos_p in predictions_positions:
        for pos_g in ground_truth_positions:
            if (pos_g[0]<=pos_p[0] and pos_g[1]>=pos_p[1]) or (pos_g[0]>=pos_p[0] and pos_g[1]<=pos_p[1]):
                score_temp = partial_score(predictions_positions[pos_p],ground_truth_positions[pos_g])
                score_total += score_temp

    score_total = score_total/totalwordcnt_predictions
    precision = score_total
    
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f1

# Loop

In [17]:
def anonimized(llm=None, name_model="", data=None):
    counter = 0
    context = LLMContext(llm)
    list_data  = []
    for filename in sorted(os.listdir(PATH)):
        metrics_data = {}
        metrics_data["filename"] = filename
        try:
            [text, text_hoped] = get_text_and_masked_carmen(filename)
            data["user"] = text
            text_generated = context.generate_response(data)
            
            
            # #### Second metrics
            # precision, recall, f1 = evaluate2(text_hoped, text_generated)
            # print(f"precision: {precision}, recall: {recall}, f1: {f1}")
            # #### Second metrics

            create_folder(f'data/anon/raw/{name_model}')
            save_file(f'data/anon/raw/{name_model}/{filename}', text_generated)
            cosine_sim, cal_met, result, labels = calculate_metrics(text_hoped, text_generated)
            metrics_data = store_metrics(metrics_data, cal_met[0],cal_met[1], cal_met[2], cosine_sim, result, labels, name_model)
            list_data.append(metrics_data)
            counter += 1
            if counter > 1:
                break
        except Exception as e:
            print(e) 
            metrics_data["fail"] = 1
        finally:
            list_data.append(metrics_data)
    save_metrics(name_model, list_data)
    

# Prompt 1

In [18]:
data = {}
data["system"] = """
You are an anonymization tool in identifying attributes in texts that can identify or quasi-identify a user.
Return only the original text with the identification and labeling of the patient's personal information by adding it between [** and **].
Following are attributes that you must anonymize.

- Names
Example:
En seguimiento por Hematología Centro Médico Aspasia (Dra. Valvanera).   ->
En seguimiento por Hematología [**Centro Médico Aspasia**] (Dra. [**Valvanera**]).

- Family number:
2 yernos ->
[**2 yernos**]

- Ages
Example: 
Varón de 41 años.   ->
[**Varón**] de [**41 años**].

- Sexes
Example: 
Varón de 41 años.   ->
[**Varón**] de [**41 años**].

- Professions
Example: 
Trabaja como profesor.   ->
Trabaja como [**profesor**].

- Relatives
Example: 
Vive con suegro y 2 yernos.   ->
Vive con [**suegro**] y 2 [**yernos**].

- Dates
Example: 
ha estado viviendo en el Centro desde septiembre de 2008.   ->
ha estado viviendo en el [**Centro**] desde [**septiembre de 2008**].

- Phone numbers
Example: 
contactando con el siguiente número de teléfono +50 88 078 68 49.   ->
contactando con el siguiente número de teléfono [**+50 88 078 68 49**].

- Identification numbers
Example:
El paciente otorga su consentimiento informado para participar en el estudio del protocolo WYX/8408/5545.   ->
El paciente otorga su consentimiento informado para participar en el estudio del protocolo [**WYX/8408/5545.**]

- Institutions, hospitals, health centers, etc
Example: 
En seguimiento por Hematología Centro Médico Aspasia (Dra. Valvanera).   ->
En seguimiento por Hematología [**Centro Médico Aspasia**] (Dra. [**Valvanera**]).
Example:
Control en Centro Salud Mental Reyes Católicos.   ->
Control en [**Centro Salud Mental Reyes Católicos**].

- Countries, territories, streets, etc
Example:
nacido en la República Italiana.   ->
nacido en la [**República Italiana**].
Example:
ha estado viviendo en el Centro desde septiembre de 2008.   ->
ha estado viviendo en el [**Centro**] desde [**septiembre de 2008**].
Example:
la dirección es Calle de Victor Hugo 39.   ->
la dirección es [**Calle de Victor Hugo 39**].

- Website URLs
participar a través del siguiente enlace: https://www.donarsang.gencat.cat/covid19.   ->
participar a través del siguiente enlace: [**https://www.donarsang.gencat.cat/covid19**].

- Other sensitive information such as races, ethnicities, sexual orientation, dietary preferences, etc
Example:
raça blanca   ->
[**raça blanca**]
Example:
Hsh
[**Hsh**]
Example:
Vegetarià
[**Vegetarià**]

Do not comment anything else.
Besides the anonymized attributes, provide the rest of the text exactly the same, including special characters and \n symbols.
Do not correct any typos or spacing errors at your discretion.
For example, if the time is written as 31/12/2000-0 9:20:00 with incorrect spacing, do not return it corrected as 31/12/2000-09:20:00.
Also, for example, if FLUTICASONA + AZELA STINA4 is written with incorrect spacing, do not return it corrected as FLUTICASONA + AZELASTINA 4.
Only focus on the anonymization tasks I have specified, and ignore any typos or spacing errors
"""

In [19]:
len(data["system"])

3118

In [20]:
anonimized(llm=BigLlamaModel(), name_model="big_llama3", data=data)

Folder 'data/anon/raw/big_llama3' created successfully
[['hilandero', 'suegro', '2 yernos', '8/9/21', '7/9/21', '976028134', '983425634', '8/9/21', '7/9/21', '8/9/21', '7/9/21', '8/9/21', '7/9/21', 'Salud laboral', '15/09', '10.09.2021'], ['hilandero', 'suegro', 'yernos', '8/9/21', '7/9/21', 'HDOM', '976028134', '983425634', '8/9/21', '7/9/21', '8/9/21', '7/9/21', '15/09', '8/9/21', '7/9/21', '10.09.2021', 'HDOM', 'Salud laboral']]
[['20/05', 'Centro COVID', 'suegros', '27/04/2011', '29/04/2011', 'Hospital COVID', '3/05/2011', '30/04', '7/05', '7/05', '18/05', 'Centro COVID', 'suegros', '27/04/2011', '29/04/2011', '3/05/2011', '30/04', '7/05', '7/05', 'WYX/8408/5545'], ['20/05', 'Centro COVID', 'suegros', '27/04/2011', '29/04/2011', 'Hospital COVID', '3/05/2011', '30/04', '7/05', '7/05', '18/05', 'Centro COVID', 'suegros', '27/04/2011', '29/04/2011', 'Hospital SALUT', '3/05/2011', '30/04', '7/05', '7/05', 'WYX/8408/5545']]


In [21]:
anonimized(llm=SmallLlamaModel(), name_model="small_llama3", data=data)

Folder 'data/anon/raw/small_llama3' created successfully
[['hilandero', 'suegro', '2 yernos', '8/9/21', '7/9/21', '976028134', '983425634', '8/9/21', '7/9/21', '8/9/21', '7/9/21', '8/9/21', '7/9/21', 'Salud laboral', '15/09', '10.09.2021'], ['hilandero', 'suegro', 'yernos', 'rinitis alergica', 'Teléfono HDOM', '976028134', '983425634', 'HDOM', 'Salud laboral']]
[['20/05', 'Centro COVID', 'suegros', '27/04/2011', '29/04/2011', 'Hospital COVID', '3/05/2011', '30/04', '7/05', '7/05', '18/05', 'Centro COVID', 'suegros', '27/04/2011', '29/04/2011', '3/05/2011', '30/04', '7/05', '7/05', 'WYX/8408/5545'], ['20/05', 'paciente', 'bon estat general', 'bon descans nocturn', 'Centro COVID', 'suegros', '27/04/2011', '29/04/2011', 'Hospital', '3/05/2011', 'Azitromicina', 'buen estado general', 'hoy', 'paciente', 'paciente', 'WYX/8408/5545', 'paciente']]


In [23]:
anonimized(llm=BigMistralModel(), name_model="big_mistral", data=data)

Folder 'data/anon/raw/big_mistral' created successfully
[['hilandero', 'suegro', '2 yernos', '8/9/21', '7/9/21', '976028134', '983425634', '8/9/21', '7/9/21', '8/9/21', '7/9/21', '8/9/21', '7/9/21', 'Salud laboral', '15/09', '10.09.2021'], ['hilandero', 'suegro', '2 yernos', '8/9/21', '7/9/21', '976028134', '983425634', '8/9/21', '7/9/21', '8/9/21', '7/9/21', '15/09', '8/9/21', '7/9/21', '10.09.2021']]
[['20/05', 'Centro COVID', 'suegros', '27/04/2011', '29/04/2011', 'Hospital COVID', '3/05/2011', '30/04', '7/05', '7/05', '18/05', 'Centro COVID', 'suegros', '27/04/2011', '29/04/2011', '3/05/2011', '30/04', '7/05', '7/05', 'WYX/8408/5545'], ['36,4', 'suegros', '27/04/2011', '29/04/2011', '3/05/2011', '0.40', '177', '200', '2100', 'suegros', '27/04/2011', '29/04/2011', '3/05/2011', '0.40', '177', '200', '2100', '35,8', 'WYX/8408/5545']]
