In [1]:
import boto3
import json
import pandas as pd
print('Running boto3 version:', boto3.__version__)

Running boto3 version: 1.34.138


In [2]:
import boto3
import os
# os.environ["AWS_DEFAULT_REGION"] = "eu-west-3"
# os.environ["AWS_STS_REGIONAL_ENDPOINTS"] = "regional"

# Configuration

In [3]:
boto3_session = boto3.session.Session()
bedrock = boto3.client(service_name='bedrock-runtime')

# Variables

In [4]:
model_id = "meta.llama3-70b-instruct-v1:0"

In [5]:
def get_text_and_masked_carmen(name):
    filename = f'./data/processed/txt/{name}'
    filename_result = f'./data/processed/masked/{name}'
    with open(filename, 'r') as archivo:
        text = archivo.read()

    with open(filename_result, 'r') as archivo:
        text_masked = archivo.read()

    return [text, text_masked]

In [6]:
def generate_prompt(text, init_configuration, orden, max_gen_len=2048, temperature=0.1, top_p=0.9):
    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    {init_configuration}
    <|eot_id|>
    <|start_header_id|>user<|end_header_id|>
    {orden}
    {text}
    <|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
    body = json.dumps({
        "prompt": prompt,
        "max_gen_len":max_gen_len,
        "temperature":temperature,
        "top_p":top_p
    })
    return body

# Metricas

In [7]:
import re
from collections import Counter

def calc_metrics(ground_truth, predictions):
    ground_truth_counter = Counter(ground_truth)
    predictions_counter = Counter(predictions)

    true_positives = sum((ground_truth_counter & predictions_counter).values())
    false_positives = sum((predictions_counter - ground_truth_counter).values())
    false_negatives = sum((ground_truth_counter - predictions_counter).values())

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1

def evaluate(masked, generated):
    """ 
    Input: 
        - masked (str): Ground_truth text
        - generated(str): Text to be evaluated

    Output:
        - Precision, Recall and F1 (float)
    """
    ground_truth = re.findall(r'\[\*\*(.*?)\*\*\]', masked)
    predictions = re.findall(r'\[\*\*(.*?)\*\*\]', generated)
    labels = [ground_truth, predictions]
    
    return [calc_metrics(ground_truth, predictions), labels]

## Cosine similarity

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_cos_sim(text_hoped, text_generated):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text_hoped, text_generated])

    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return cosine_sim[0][0]
    

## Levenshtein Distance

In [9]:
from tqdm import tqdm

def levenshtein_distance(s1, s2, show_progress=True):
    """
    Calcula la distancia de Levenshtein entre dos cadenas.

    La distancia de Levenshtein es el número mínimo de operaciones de edición 
    (inserción, eliminación o sustitución de un carácter) necesarias para 
    transformar una cadena en otra.

    Parámetros:
        s1 (str): Primera cadena
        s2 (str): Segunda cadena
        show_progress (bool): Si es True, muestra una barra de progreso. 
                              Por defecto es False.
    Retorna:
        int: La distancia de Levenshtein entre s1 y s2
    """
    # Usar tqdm solo si show_progress es True
    iterable = tqdm(s1) if show_progress else s1

    if len(s1) < len(s2):
        s1, s2 = s2, s1
    if len(s2) == 0:
        return len(s1)

    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(iterable):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]



# Loop

In [11]:
import os
init_configuration = """You are an anonimization tool in identifying attributes in texts that can identify or quasi-identify a user."""
orden = """ Devuelve solo el texto original con la identificación y etiquetación del texto con la información personal del paciente añadiendolo entre claudators, por ejemplo: [**arquitecto**], [**8/9/21**], [**padre**]. No comentes nada más """
path = './data/processed/txt'
list_data  = []
counter = 0
for filename in os.listdir(path):
    metrics_data = {}
    [text, text_hoped] = get_text_and_masked_carmen(filename)
    body = generate_prompt(text, init_configuration, orden)
    response = bedrock.invoke_model(body=body, modelId=model_id)
    response = json.loads(response.get('body').read())
    text_generated = response['generation']
    [cal_met, labels] = evaluate(text_hoped, text_generated)
    cosine_sim = get_cos_sim(text_hoped, text_generated)
    text_generated = text_generated.replace('[**', '').replace('**]', '')
    text_hoped = text_hoped.replace('[**', '').replace('**]', '')
    result = levenshtein_distance(text_generated, text_hoped[:len(text_generated)], show_progress=False)

    metrics_data["filename"] = filename
    metrics_data["precision"] = cal_met[0]
    metrics_data["recall"] = cal_met[1]
    metrics_data["f1"] = cal_met[2]
    metrics_data["cos"] = cosine_sim
    metrics_data["levenshtein"] = result
    metrics_data["labels hoped"] = labels[0]
    metrics_data["labels generated"] = labels[1]
    list_data.append(metrics_data)
    counter += 1
    print(counter)
    if counter == 100:
        break

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100


In [12]:
metrics_info = pd.DataFrame(list_data)
metrics_info

Unnamed: 0,filename,precision,recall,f1,cos,levenshtein,labels hoped,labels generated
0,CARMEN-I_IR_339.txt,0.000000,0.000000,0.000000,0.998484,14,[62 años],"[62, 2]"
1,CARMEN-I_IR_477.txt,0.500000,0.333333,0.400000,0.998596,14,"[Varón, 80 años, 30/08/2013]","[80, 30/08/2013]"
2,CARMEN-I_IA_EVOL_136.txt,0.823529,0.700000,0.756757,1.000000,2,"[Clinica COVID, 73 años, Hospital Clínico Univ...","[73, Hospital Clínico Universitario Ferrán i C..."
3,CARMEN-I_IA_EVOL_91.txt,0.500000,1.000000,0.666667,1.000000,5,"[10/11/2014, 15/10/2014]","[10/11/2014, 15/10/2014, domicilio, centro soc..."
4,CARMEN-I_IA_PROCESO_ACTUAL_103.txt,0.733333,1.000000,0.846154,1.000000,2,"[Centro COVID, Clinica COVID, 20 de noviembre,...","[hospital, Centro COVID, Clinica COVID, 20 de ..."
...,...,...,...,...,...,...,...,...
95,CARMEN-I_IA_EVOL_123.txt,0.666667,0.600000,0.631579,1.000000,2,"[julio del 2023, 26/11, 14.10.2023, 17.10.2023...","[2023, 26/11, 14.10.2023, 17.10.2023, 16.10.20..."
96,CARMEN-I_IA_EVOL_137.txt,0.833333,0.500000,0.625000,1.000000,2,"[R967, 8/3/2023, 7/9/2023, 7/9/2023, Varón, 69...","[8/3/2023, 7/9/2023, 7/9/2023, 3/09, 7/09, Dr...."
97,CARMEN-I_IR_476.txt,0.142857,0.500000,0.222222,0.999365,14,"[81 años, 25/6/2011]","[81, 55%, 9000, 1, 25/6/2011, 37 mm, hiliares ..."
98,CARMEN-I_IR_310.txt,0.333333,0.375000,0.352941,0.999519,14,"[77 años, día 24/12, día 11/4, día 17/1, 19/9/...","[77, 24/12, 11/4, 17/1, 19/9/2008, 9 mm, 2007,..."


In [13]:
metrics_info.to_csv('data/metrics/metrics.csv')