In [23]:
import os
import re
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import ipywidgets as widgets
from IPython.display import display
from scipy.interpolate import griddata



In [24]:
# Location of the usability results for the selected number of quasi-identifiers.
qids = 12
usability_results_path = f'../data/adults/results/usability/{qids}_qids/'

# Result files are named k_{k}-l_{l}-t_{t}-results.csv (t may be an int or a float).
pattern = re.compile(r'k_\d+-l_\d+-t_(\d+\.\d+|\d+)-results\.csv')

# Maps (k, l, t) -> DataFrame with the contents of the matching results file.
usability_dict = {}
for filename in os.listdir(usability_results_path):
    if pattern.fullmatch(filename) is None:
        # Not a results file; ignore it.
        continue
    # The three numbers in the file name are k, l and t, in that order.
    k_raw, l_raw, t_raw = re.findall(r'\d+\.\d+|\d+', filename)
    params = (int(float(k_raw)), int(float(l_raw)), float(t_raw))
    # read_csv defaults: first row is the header, no index column is read.
    usability_dict[params] = pd.read_csv(os.path.join(usability_results_path, filename))

In [25]:
# Parameter grid that every later cell expects to be present in usability_dict.
k_all_values = [1, 2, 4, 8, 16, 32, 64]
l_all_values = [1, 2]
t_all_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

# Walk the grid in k -> l -> t order and stop at the first missing combination.
expected_keys = [
    (k_val, l_val, t_val)
    for k_val in k_all_values
    for l_val in l_all_values
    for t_val in t_all_values
]
for key in expected_keys:
    k, l, t = key
    if key in usability_dict:
        continue
    raise Exception(f'No se ha encontrado el archivo para k={k}, l={l}, t={t}')

### Resultados de aplicar únicamente k-anonimización

La siguiente gráfica muestra la mediana de las métricas `accuracy`, `precision`, `recall` y `f1_score` para diferentes valores de k, manteniendo l=1 y t=1 (los valores con los que el código filtra los resultados), es decir, aplicando solo k-anonimización sobre los datos.

In [31]:
# Keep only the runs with l=1 and t=1.0, so that k is the only parameter that
# varies. Note the filter is t == 1.0 (t=0 is not among t_all_values).
filtered = {
    k_val: df
    for (k_val, l_val, t_val), df in usability_dict.items()
    if l_val == 1 and t_val == 1.0 and k_val in k_all_values
}

metrics = ['accuracy', 'precision', 'recall', 'f1_score']
metric_colors = ['#636efa', '#EF553B', '#00cc96', '#ab63fa']

# One frame per metric with columns ['k', <metric>], built column-wise instead
# of appending one dict per row.
box_acc_df, box_pre_df, box_rec_df, box_f1_df = (
    pd.concat(
        [
            df[[metric]].astype(float).assign(k=k_val)[['k', metric]]
            for k_val, df in filtered.items()
        ],
        ignore_index=True,
    )
    for metric in metrics
)

# Long format: one row per (k, value, metric).
box_df = pd.concat([
    frame.rename(columns={metric: 'value'}).assign(metric=metric)
    for metric, frame in zip(metrics, [box_acc_df, box_pre_df, box_rec_df, box_f1_df])
])

# Median of every metric for each k (groupby sorts by k, so lines are drawn left to right).
median_df = box_df.groupby(['k', 'metric'], as_index=False)['value'].median()

fig = go.Figure()

# One median line per metric.
for metric, color in zip(metrics, metric_colors):
    med = median_df[median_df['metric'] == metric]
    fig.add_trace(go.Scatter(
        x=med['k'],
        y=med['value'],
        mode='lines+markers',
        name=f'{metric.capitalize()}',
        line=dict(color=color, width=3),
        marker=dict(size=10),
        showlegend=True,
    ))

# Sorted so the tick list does not depend on the order in which files were read.
k_ticks = sorted(box_df['k'].unique())
y_ticks = [1, 0.8, 0.6, 0.4, 0.2, 0]

# Single layout call: the figure only holds Scatter traces, so no box-plot
# options are needed, and the axis is fully configured before exporting.
fig.update_layout(
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=-0.25,
        xanchor='center',
        x=0.5,
        font=dict(size=16),
        title=dict(text='Métricas', font=dict(size=18))
    ),
    xaxis=dict(
        title='K',
        type='log',
        tickvals=k_ticks,
        ticktext=[f"{int(val)}" for val in k_ticks],
        title_font=dict(size=22),
        tickfont=dict(size=16)
    ),
    yaxis=dict(
        title='Métricas',
        title_font=dict(size=22),
        tickfont=dict(size=16),
        tickvals=y_ticks,
        ticktext=[str(val) for val in y_ticks]
    ),
    margin=dict(l=0, r=0, t=0, b=0),
    height=400,
    width=800
)
fig.write_image("../Memoria/images/graphs/usability-vs-k_anonymity.png")
fig.show()

### Resultados de aplicar únicamente l-diversity

La siguiente gráfica muestra la distribución de las métricas `accuracy` y `precision` para diferentes valores de l, manteniendo k=1 y t=1 (los valores con los que el código filtra los resultados), es decir, aplicando solo l-diversity sobre los datos.

In [27]:
# Keep only the runs with k=1 and t=1.0, so that l is the only parameter that
# varies. Note the filter is t == 1.0 (t=0 is not among t_all_values).
filtered = {
    l_val: df
    for (k_val, l_val, t_val), df in usability_dict.items()
    if k_val == 1 and t_val == 1.0 and l_val in l_all_values
}

# One frame per metric with columns ['l', <metric>]. These are built from this
# cell's own `filtered` data before being combined, so the cell never reads
# frames left behind by a previously executed cell.
box_acc_df, box_pre_df = (
    pd.concat(
        [
            df[[metric]].astype(float).assign(l=l_val)[['l', metric]]
            for l_val, df in filtered.items()
        ],
        ignore_index=True,
    )
    for metric in ('accuracy', 'precision')
)

# Long format: one row per (l, value, metric), as expected by px.box.
box_df = pd.concat([
    box_acc_df.rename(columns={'accuracy': 'value'}).assign(metric='accuracy'),
    box_pre_df.rename(columns={'precision': 'value'}).assign(metric='precision')
])

fig = px.box(
    box_df,
    x='l',
    y='value',
    color='metric',
    points=False,
    labels={'value': 'Métrica', 'l': 'L', 'metric': 'Métrica'},
    height=600,
    color_discrete_map={
        'accuracy': '#636efa',
        'precision': '#EF553B'
    },
    category_orders={'metric': ['accuracy', 'precision']}
)
# Capitalize the legend entries while keeping each trace's colour.
fig.for_each_trace(
    lambda trace: trace.update(
        name=trace.name.capitalize(),
        legendgroup=trace.name.capitalize(),
        marker_color=trace.marker.color,
    )
)
fig.update_layout(boxmode='group', boxgroupgap=0)
fig.update_yaxes(
    title_text='Métrica',
    type='log',
    # On a log axis the range is given as exponents: 10**-1.5 (~0.03) to 10**0.5 (~3.16).
    range=[-1.5, 0.5],
    tickvals=[0.05, 0.1, 0.5, 1],
)
fig.show()

### Resultados de aplicar únicamente t-closeness

La siguiente gráfica muestra la mediana de las métricas `accuracy`, `precision`, `recall` y `f1_score` para diferentes valores de t, manteniendo k=1 y l=1, es decir, aplicando solo t-closeness sobre los datos.

In [32]:
# Select the runs with k=1 and l=1, so that t is the only parameter that varies.
filtered = {}
for (k_val, l_val, t_val), df in usability_dict.items():
    if k_val == 1 and l_val == 1 and t_val in t_all_values:
        filtered[t_val] = df

# Flatten every metric into a list of {'t': ..., <metric>: ...} records.
data_acc = [
    {'t': t_val, 'accuracy': v}
    for t_val, df in filtered.items()
    for v in df['accuracy'].values.astype(float)
]
data_pre = [
    {'t': t_val, 'precision': v}
    for t_val, df in filtered.items()
    for v in df['precision'].values.astype(float)
]
data_rec = [
    {'t': t_val, 'recall': v}
    for t_val, df in filtered.items()
    for v in df['recall'].values.astype(float)
]
data_f1 = [
    {'t': t_val, 'f1_score': v}
    for t_val, df in filtered.items()
    for v in df['f1_score'].values.astype(float)
]

box_acc_df = pd.DataFrame(data_acc)
box_pre_df = pd.DataFrame(data_pre)
box_rec_df = pd.DataFrame(data_rec)
box_f1_df = pd.DataFrame(data_f1)

# Stack the four metrics into one long frame with columns t / value / metric.
long_frames = []
for metric_name, metric_frame in [
    ('accuracy', box_acc_df),
    ('precision', box_pre_df),
    ('recall', box_rec_df),
    ('f1_score', box_f1_df),
]:
    renamed = metric_frame.rename(columns={metric_name: 'value'})
    long_frames.append(renamed.assign(metric=metric_name))
box_df = pd.concat(long_frames)

# Median of each metric for every value of t.
median_df = box_df.groupby(['t', 'metric'], as_index=False)['value'].median()

fig = go.Figure()

# Draw one median line per metric, each with its own colour.
line_colors = {
    'accuracy': '#636efa',
    'precision': '#EF553B',
    'recall': '#00cc96',
    'f1_score': '#ab63fa',
}
for metric, color in line_colors.items():
    med = median_df.loc[median_df['metric'] == metric]
    median_line = go.Scatter(
        x=med['t'],
        y=med['value'],
        mode='lines+markers',
        name=metric.capitalize(),
        line=dict(color=color, width=3),
        marker=dict(size=10),
        showlegend=True,
    )
    fig.add_trace(median_line)

legend_style = dict(
    orientation='h',
    yanchor='bottom',
    y=-0.25,
    xanchor='center',
    x=0.5,
    font=dict(size=16),
    title=dict(text='Métricas', font=dict(size=18)),
)
x_axis_style = dict(
    title='t',
    tickvals=box_df['t'].unique(),
    title_font=dict(size=22),
    tickfont=dict(size=16),
)
y_axis_style = dict(
    title='Métricas',
    title_font=dict(size=22),
    tickfont=dict(size=16),
)
fig.update_layout(
    legend=legend_style,
    xaxis=x_axis_style,
    yaxis=y_axis_style,
    margin=dict(l=0, r=0, t=0, b=0),
    height=400,
    width=800,
    boxmode='group',
    boxgroupgap=0,
)
# Export is currently disabled:
# fig.write_image("../Memoria/images/graphs/usability-vs-t_closeness.png")
fig.show()

### Resultados de aplicar únicamente k-anonymity y l-diversity.

In [34]:
# Keep the runs where only k and l vary. The other cells select these runs with
# t == 1.0; 0 is accepted here as well.
# NOTE(review): t=0 is not among t_all_values -- confirm whether it is still needed.
kl_metrics = ['accuracy', 'precision']
kl_scores = pd.concat(
    [
        df[kl_metrics].astype(float).assign(k=k_val, l=l_val)
        for (k_val, l_val, t_val), df in usability_dict.items()
        if t_val in (0, 1)
    ],
    ignore_index=True,
)

# Only l=1 and l=2 are plotted (one colour per metric/l pair).
plotted_l_values = (1, 2)
color_map = {
    ('accuracy', 1): 'blue',
    ('accuracy', 2): 'skyblue',
    ('precision', 1): 'red',
    ('precision', 2): 'orange'
}

# Median of each metric for every (k, l) pair; groupby sorts by k within each l.
median_kl = kl_scores.groupby(['k', 'l'], as_index=False)[kl_metrics].median()
median_kl = median_kl[median_kl['l'].isin(plotted_l_values)]

fig = go.Figure()

# One line per (metric, l), ordered metric first and then l.
for metric in kl_metrics:
    for l_val in plotted_l_values:
        curve = median_kl[median_kl['l'] == l_val]
        if curve.empty:
            # No results for this l: do not add an empty legend entry.
            continue
        fig.add_trace(go.Scatter(
            x=curve['k'],
            y=curve[metric],
            mode='lines+markers',
            name=f"{metric.capitalize()} (l={l_val})",
            marker=dict(size=10),
            line=dict(width=3, color=color_map[(metric, l_val)]),
            showlegend=True,
        ))

# The x axis is derived from this cell's own k values, not from a frame left
# behind by another cell.
k_ticks = sorted(median_kl['k'].unique())
y_ticks = [1, 0.8, 0.6, 0.4, 0.2, 0]

fig.update_layout(
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=-0.25,
        xanchor='center',
        x=0.5,
        font=dict(size=16),
        title=dict(text='Métricas', font=dict(size=18))
    ),
    xaxis=dict(
        title='K',
        type='log',
        tickvals=k_ticks,
        ticktext=[f"{int(val)}" for val in k_ticks],
        title_font=dict(size=22),
        tickfont=dict(size=16)
    ),
    yaxis=dict(
        title='Métricas',
        title_font=dict(size=22),
        tickfont=dict(size=16),
        tickvals=y_ticks,
        ticktext=[str(val) for val in y_ticks]
    ),
    margin=dict(l=0, r=0, t=0, b=0),
    height=400,
    width=800
)
fig.write_image("../Memoria/images/graphs/usability-vs-k_anonymity-l_diversity.png")
fig.show()


Las siguientes superficies muestran cómo varían las métricas de `accuracy` y `precision` en función de los parámetros de anonimización k y t, para cada valor de l. Estas gráficas permiten visualizar el impacto combinado de k-anonymity y t-closeness sobre la utilidad de los datos, facilitando la identificación de regiones donde se logra un mejor equilibrio entre privacidad y precisión de los modelos. El uso de superficies suavizadas mediante interpolación ayuda a observar tendencias generales y posibles puntos óptimos en el espacio de parámetros.

In [None]:
def _build_surface(metric: str, selected_l: int) -> go.Figure:
    """Build the interpolated 3D surface of a metric over (k, t) for one l.

    Args:
        metric: Column of the results frames to plot (e.g. 'accuracy').
        selected_l: Value of l whose results are used.

    Returns:
        A figure with a single Surface trace; x is k (log axis), y is t and z
        is the interpolated median of ``metric``.

    Raises:
        ValueError: If ``usability_dict`` holds no results for ``selected_l``.
    """
    rows = [
        {'k': k_val, 't': t_val, metric: df[metric].median()}
        for (k_val, l_val, t_val), df in usability_dict.items()
        if l_val == selected_l and t_val >= 0.1
    ]
    if not rows:
        raise ValueError(f'No hay resultados para l={selected_l}')

    medians = pd.DataFrame(rows).groupby(['k', 't'], as_index=False).median()

    # Regular grid used to smooth the surface: log-spaced in k, linear in t.
    k_vals = medians['k'].unique()
    t_vals = medians['t'].unique()
    k_grid, t_grid = np.meshgrid(
        np.logspace(np.log10(k_vals.min()), np.log10(k_vals.max()), 50),
        np.linspace(t_vals.min(), t_vals.max(), 50)
    )

    # Interpolate the metric from the measured (k, t) points onto the grid.
    z_grid = griddata(
        medians[['k', 't']].values,
        medians[metric].values,
        (k_grid, t_grid),
        method='cubic'
    )

    fig = go.Figure(data=go.Surface(
        z=z_grid,
        x=k_grid,
        y=t_grid,
        colorscale='Viridis',
        showscale=False
    ))

    # Axis configuration in one place; ticks on the log axis sit on the measured k values.
    fig.update_scenes(
        xaxis=dict(
            type='log',
            tickvals=k_vals,
            ticktext=[str(int(val)) for val in k_vals],
            title_text='K'
        ),
        yaxis=dict(title_text='T'),
        zaxis=dict(title_text=metric.capitalize())
    )
    fig.update_layout(
        margin=dict(l=0, r=0, t=0, b=0),
        height=600,
        width=600
    )
    return fig


metric_values = ['accuracy', 'precision']
l_values = [1, 2]

# Figures for every widget combination are built up front so that switching
# between them does not recompute the interpolation.
precalculated_figs = {
    (metric, l): _build_surface(metric, l)
    for metric in metric_values for l in l_values
}

# for (metric, l), fig in precalculated_figs.items():
#     if metric == 'accuracy':
#         fig.write_image(f"../Memoria/images/graphs/{metric}-l_{l}-surface.png")


def plot_surface(metric: str, selected_l: int) -> go.Figure:
    """Return the surface figure for (metric, selected_l), building it on a cache miss.

    The builder has its own name (``_build_surface``), so a combination that is
    not cached is computed and stored instead of making this function call itself.
    """
    cache_key = (metric, selected_l)
    if cache_key not in precalculated_figs:
        precalculated_figs[cache_key] = _build_surface(metric, selected_l)
    return precalculated_figs[cache_key]


# Widgets to choose the metric and the value of l
metric_selector = widgets.ToggleButtons(
    options=metric_values,
    value='accuracy',
    description='Métrica:',
    button_style=''
)
l_selector = widgets.SelectionSlider(
    options=l_values,
    value=1,
    description='l:',
    continuous_update=False
)
# An Output widget lets the figure be replaced in place instead of appended.
output = widgets.Output()


def update_plot(change=None):
    """Redraw the surface for the current widget selection inside ``output``.

    ``change`` is the payload passed by ``observe``; it is not used.
    """
    with output:
        output.clear_output(wait=True)
        fig = plot_surface(metric_selector.value, l_selector.value)
        fig.show()


metric_selector.observe(update_plot, names='value')
l_selector.observe(update_plot, names='value')

display(metric_selector, l_selector, output)
update_plot()


ToggleButtons(description='Métrica:', options=('accuracy', 'precision'), value='accuracy')

SelectionSlider(continuous_update=False, description='l:', options=(1, 2), value=1)

Output()