In [1]:
#### PARAMS ####
# Experiment-wide configuration constants read by the cells below.
NUM_ROUNDS = 100
NUM_CLIENTS = 10  # Default `num_clients` of create_client_data.
RANDOM_SEED = 42  # Seeds `random`, NumPy and TensorFlow; also the random_state of the client-data shuffle.

BATCH_SIZE = 32
NUM_SHADOW_MODELS = 5
GLOBAL_MODEL_EPOCHS = 10
SHADOW_TRAIN_ROUNDS = 10
SHADOW_DATA_FRACTION = 0.01  # Default `shadow_fraction` of split_data_for_models.

PROB_RANGE = 0.1  # Default `prob_range` of analyze_probability_transitions.

ENTRENO_ANTES = True  # True: shadow models are trained beforehand; False: shadow models infer dynamically.

# Global constants
FRACTION = 0.2  # Fraction of CSV rows sampled by load_and_preprocess_data.

APLICAR_RUIDO = False  # Whether noise is applied during the process.
RUIDO_ANTES = False # If True, noise is applied before training; if False, after.
RUIDO_PER = 0.4  # Proportion of data rows affected by noise.
NOISE_STD = 0.2  # Standard deviation of the applied noise.
EPSILON = 1.0  # Privacy parameter (smaller = more privacy, more noise).
DELTA = 1e-5  # Delta for Gaussian noise (may be None to use Laplace noise only).
SENSITIVITY = 1.0  # Noise sensitivity (depends on the range of the data or updates).


LABEL_FLIPPING = False
FLIPPING_ANTES = False
PROB_FLIP_0 = 0.2
PROB_FLIP_1 = 0.2
FLIP_TARGET = "Slice"  # apply_label_flipping accepts only "label" or "Slice".

PROPERTY_THRESHOLD = 0.5  # Probability cut-off used to binarise predictions in calculate_and_log_metrics.
LEARNING_RATE = 0.1  # NOTE(review): not referenced by the visible code (models compile with optimizer='adam') - confirm usage.
DATA_FILE_PATH = 'label_bi_10.csv'

PREFIJO_SAVE = "resultados_iniciales"  # Output directory for result files.
RESULTS_CSV_FILE = f'{PREFIJO_SAVE}/federated_learning_results.csv'
SHADOW_WEIGHTS_CSV_FILE = f"{PREFIJO_SAVE}/weights.csv"

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve, confusion_matrix
import csv
import os
import random
import logging
import tensorflow as tf
from concurrent.futures import ThreadPoolExecutor
import tqdm
from numpy.random import laplace, normal

# Seed Python's `random`, NumPy's global RNG and TensorFlow with the same
# RANDOM_SEED value.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)


In [3]:
#### LOG ####
LOG_DIR = 'logs'

# Create the log and results directories. `exist_ok=True` keeps the cell
# idempotent on re-run and removes the check-then-create race of
# `os.path.exists()` followed by `os.makedirs()`.
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(PREFIJO_SAVE, exist_ok=True)

# Send INFO-and-above records both to logs/execution.log and to the stream
# handler (notebook output).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(os.path.join(LOG_DIR, 'execution.log')),
        logging.StreamHandler()
    ]
)

# Module-level logger used by every function in the notebook.
logger = logging.getLogger(__name__)

In [4]:
def validate_configuration():
    """
    Check the module-level configuration flags for contradictory or
    out-of-range values.

    Raises:
        ValueError: on the first inconsistent flag combination or invalid
            numeric parameter found.
    """
    logger.info("Validating configuration...")

    # "Before training" switches only make sense when their parent feature is on.
    if FLIPPING_ANTES and not ENTRENO_ANTES:
        raise ValueError("FLIPPING_ANTES=True no tiene sentido cuando ENTRENO_ANTES=False.")
    if RUIDO_ANTES and not APLICAR_RUIDO:
        raise ValueError("RUIDO_ANTES=True no tiene sentido cuando APLICAR_RUIDO=False.")
    if FLIPPING_ANTES and not LABEL_FLIPPING:
        raise ValueError("FLIPPING_ANTES=True no tiene sentido cuando LABEL_FLIPPING=False.")

    # Purely informational: the run has no perturbation at all.
    if not (APLICAR_RUIDO or LABEL_FLIPPING):
        logger.info("Ni ruido ni label flipping están activados. El experimento no incluye perturbaciones en los datos.")

    # Noise parameters are only checked when noise is enabled.
    if APLICAR_RUIDO:
        if EPSILON <= 0:
            raise ValueError("EPSILON debe ser mayor que 0.")
        if DELTA is not None and not (0 < DELTA < 1):
            raise ValueError("DELTA debe estar entre 0 y 1 si se usa ruido gaussiano.")
        if SENSITIVITY <= 0:
            raise ValueError("SENSITIVITY debe ser mayor que 0.")

    # Flip probabilities are only checked when label flipping is enabled.
    if LABEL_FLIPPING:
        for param_name, probability in (("PROB_FLIP_0", PROB_FLIP_0), ("PROB_FLIP_1", PROB_FLIP_1)):
            if not (0 <= probability <= 1):
                raise ValueError(f"{param_name} debe estar entre 0 y 1.")

    logger.info("Configuration validation completed successfully.")


In [5]:
#### DATA ####

def load_and_preprocess_data(file_path):
    """
    Read the CSV at `file_path`, keep the columns used by the experiment,
    sample a FRACTION of the rows and drop rows with missing values.

    Returns:
        (X, y_label, y_slice): the feature frame (every kept column except
        'label' and 'Slice'), the 'label' column and the 'Slice' column.

    Raises:
        AssertionError: if no rows remain after preprocessing.
    """
    logger.info(f"Loading and preprocessing data from {file_path}")

    # Columns required by the experiment; everything else in the CSV is discarded.
    necessary_columns = [
        'Src IP', 'Src Port', 'Dst Port', 'Protocol', 'Flow Duration', 'Total Fwd Packet',
        'Fwd Packet Length Std', 'ACK Flag Count', 'Fwd Seg Size Min', 'label', 'Slice',
    ]
    raw = pd.read_csv(file_path)

    # Sample first, renumber the rows, then drop incomplete rows (so the
    # resulting index may contain gaps).
    df = (
        raw[necessary_columns]
        .sample(frac=FRACTION)
        .reset_index(drop=True)
        .dropna()
    )
    print(f"Tamaño del dataset {df.shape}")

    # Targets: classification label and the property indicator.
    y_label, y_slice = df['label'], df['Slice']
    X = df.drop(columns=['label', 'Slice'])

    logger.info(f"Columns after loading: {list(df.columns)}")

    assert len(X) > 0, "The dataset does not contain enough data after preprocessing."

    return X, y_label, y_slice

def create_client_data(X, y_label, y_slice, num_clients=NUM_CLIENTS):
    """
    Split the data into per-client shards and optionally perturb them.

    Clients are created in pairs: one holding only rows where `y_slice == 1`
    ("has property") and one holding only rows where `y_slice == 0`. Every
    client receives the same number of rows. With an odd `num_clients` only
    `num_clients - 1` clients are produced.

    Parameters:
        X: feature DataFrame.
        y_label: Series of classification labels aligned with X.
        y_slice: Series of property indicators (0/1) aligned with X.
        num_clients: number of clients to create (at least 2).

    Returns:
        list of dicts with keys 'X', 'y_label', 'y_slice' (0 or 1) and
        'has_property' (bool).

    Raises:
        ValueError: if `num_clients` is smaller than 2 (previously this
            surfaced as a ZeroDivisionError).
    """
    num_pairs = num_clients // 2
    if num_pairs < 1:
        raise ValueError(
            f"num_clients must be at least 2 to build with/without-property client pairs, got {num_clients}."
        )

    logger.info(f"Creating data for {num_clients} clients with security configurations")

    # Shuffle all rows together so features and both targets stay aligned.
    data = pd.concat([X, y_label, y_slice], axis=1).sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
    X, y_label, y_slice = data.iloc[:, :-2], data.iloc[:, -2], data.iloc[:, -1]

    # Separate rows by property.
    with_mask = y_slice == 1
    without_mask = y_slice == 0
    X_with_property = X[with_mask]
    y_label_with_property = y_label[with_mask]
    X_without_property = X[without_mask]
    y_label_without_property = y_label[without_mask]

    # Same shard size for every client, limited by the scarcer group.
    min_data_size = min(len(X_with_property), len(X_without_property)) // num_pairs
    if min_data_size == 0:
        logger.warning(
            "Not enough rows to give every client at least one sample; clients will receive empty datasets."
        )

    client_data = []
    for i in range(num_pairs):
        rows = slice(i * min_data_size, (i + 1) * min_data_size)
        client_data.append({
            'X': X_with_property.iloc[rows],
            'y_label': y_label_with_property.iloc[rows].copy(),
            'y_slice': 1,
            'has_property': True
        })
        client_data.append({
            'X': X_without_property.iloc[rows],
            'y_label': y_label_without_property.iloc[rows].copy(),
            'y_slice': 0,
            'has_property': False
        })

    # Apply label flipping / noise before local training (if configured).
    if FLIPPING_ANTES:
        logger.info("Applying label flipping BEFORE local training")
        for client in client_data:
            client['y_label'] = apply_flipping(client['y_label'], FLIP_TARGET, PROB_FLIP_0, PROB_FLIP_1)

    if RUIDO_ANTES:
        logger.info("Applying noise BEFORE local training")
        for client in client_data:
            client['X'] = apply_noise(client['X'], RUIDO_PER, NOISE_STD, EPSILON, DELTA, SENSITIVITY, ruido_antes=True)

    return client_data


In [6]:
#### MODELS ####
def split_data_for_models(X, y_label, y_slice, shadow_fraction=SHADOW_DATA_FRACTION):
    """
    Randomly partition the data between the global model and the shadow models.

    Parameters:
        X, y_label, y_slice: positionally aligned pandas objects.
        shadow_fraction: share of rows reserved for the shadow models.

    Returns:
        ((X_global, y_label_global, y_slice_global),
         (X_shadow, y_label_shadow, y_slice_shadow),
         number_of_shadow_rows)
    """
    logger.info("Dividiendo datos para el modelo global y los modelos sombra.")

    shadow_count = int(len(X) * shadow_fraction)

    # Shuffle row positions with NumPy's global RNG; the first `shadow_count`
    # go to the shadow set and the remainder to the global set.
    order = np.arange(len(X))
    np.random.shuffle(order)
    shadow_positions, global_positions = order[:shadow_count], order[shadow_count:]

    def _take(positions):
        # Select the same rows from the features and from both targets.
        return X.iloc[positions], y_label.iloc[positions], y_slice.iloc[positions]

    shadow_part = _take(shadow_positions)
    global_part = _take(global_positions)

    logger.info(f"Datos divididos: {len(global_part[0])} para el modelo global, {len(shadow_part[0])} para los modelos sombra.")
    return global_part, shadow_part, len(shadow_part[0])

def create_global_model(input_shape):
    """
    Build and compile the binary classifier used as the global model.

    Parameters:
        input_shape: number of input features (the input layer is created
            with shape `(input_shape,)`).

    Returns:
        A compiled `tf.keras.Sequential` with 64- and 32-unit ReLU hidden
        layers and a single sigmoid output.
    """
    logger.info(f"Creating global model with input shape {input_shape}")

    dense = tf.keras.layers.Dense
    layer_stack = [
        tf.keras.layers.Input(shape=(input_shape,)),
        dense(64, activation='relu'),
        dense(32, activation='relu'),
        dense(1, activation='sigmoid'),  # probability for the binary label
    ]
    model = tf.keras.Sequential(layer_stack)

    # Adam optimizer (default settings) with binary cross-entropy loss.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_shadow_model(input_shape):
    """
    Build a shadow model; it is created by `create_global_model` and so has
    the same architecture as the global model.
    """
    logger.info(f"Creating shadow model with input shape {input_shape}")
    shadow = create_global_model(input_shape)
    return shadow

def initialize_clients(client_data, global_model, global_model_epochs, batch_size):
    """
    Construct one SimulatedFlowerClient per entry of `client_data` using a
    thread pool.

    Returns:
        list of clients in the same order as `client_data`; the client id is
        the entry's position in that list.
    """
    logger.info("Initializing simulated clients in parallel")

    with ThreadPoolExecutor(max_workers=10) as pool:
        # Submit every construction first, then collect results in
        # submission order so ids and list positions line up.
        pending = []
        for client_id, data in enumerate(client_data):
            job = pool.submit(
                SimulatedFlowerClient, client_id, data, global_model, global_model_epochs, batch_size
            )
            pending.append(job)
        clients = [job.result() for job in pending]

    return clients

class SimulatedFlowerClient:
    """
    Simulated federated-learning client holding a local dataset and a
    private copy of the global model.
    """

    def __init__(self, cid, data, global_model, global_model_epochs, batch_size):
        """
        Store the client's data and training settings and build its local model.

        Parameters:
            cid: client identifier (used in log messages).
            data: dict with at least the keys 'X' and 'y_label'.
            global_model: Keras model that is cloned for this client.
            global_model_epochs: epochs per call to `fit`.
            batch_size: batch size used by `fit`.
        """
        logger.info(f"Initializing simulated client {cid}")
        self.cid = cid
        self.data = data
        self.global_model_epochs = global_model_epochs
        self.batch_size = batch_size

        # Clone the architecture, copy the current global weights and compile
        # the clone so the client owns an independent model.
        local_model = tf.keras.models.clone_model(global_model)
        local_model.set_weights(global_model.get_weights())
        local_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        self.model = local_model

    def get_parameters(self):
        """Return the current weights of the client's model."""
        return self.model.get_weights()

    def fit(self, parameters):
        """
        Train locally starting from `parameters` and report the result.

        Returns:
            (weights, num_examples, {"updates": updates}) where `updates` is
            the per-layer difference between trained and received weights.
            When post-training noise is enabled it is added to `updates`
            only; the returned weights are the un-noised trained weights.
        """
        features, labels = self.data['X'], self.data['y_label']

        self.model.set_weights(parameters)
        self.model.fit(features, labels, epochs=self.global_model_epochs, batch_size=self.batch_size, verbose=0)

        # Per-layer weight deltas produced by this round of local training.
        trained_weights = self.model.get_weights()
        updates = [after - before for after, before in zip(trained_weights, parameters)]

        # Optional noise on the updates (post-training mode).
        noise_after_training = APLICAR_RUIDO and not RUIDO_ANTES
        if noise_after_training:
            logger.info(f"Applying noise to updates for client {self.cid} AFTER training")
            updates = apply_noise(updates, RUIDO_PER, NOISE_STD, EPSILON, DELTA, SENSITIVITY, ruido_antes=False)

        # Optional label flipping after training; this replaces the stored
        # labels, so it affects subsequent calls to fit/evaluate.
        flip_after_training = LABEL_FLIPPING and not FLIPPING_ANTES
        if flip_after_training:
            logger.info(f"Applying label flipping AFTER local training for client {self.cid}")
            self.data['y_label'] = apply_flipping(self.data['y_label'], FLIP_TARGET, PROB_FLIP_0, PROB_FLIP_1)

        return self.model.get_weights(), len(self.data['X']), {"updates": updates}

    def evaluate(self, parameters):
        """
        Evaluate `parameters` on the client's local data.

        Returns:
            (loss, num_examples, {"accuracy": accuracy})
        """
        self.model.set_weights(parameters)
        loss, accuracy = self.model.evaluate(self.data['X'], self.data['y_label'], verbose=0)
        sample_count = len(self.data['X'])
        return loss, sample_count, {"accuracy": accuracy}

def train_shadow_models(X_shadow, y_label_shadow, y_slice_shadow, num_shadow_models, global_model, global_model_epochs, batch_size):
    """
    Train the shadow (attack) models on simulated weight updates.

    For every shadow sample, a clone of the global model is trained for one
    epoch on that single sample; the flattened weight delta becomes one
    training row whose target is the sample's property indicator. Each shadow
    model is then a small binary classifier fitted on those rows.

    Parameters:
        X_shadow: shadow features (sliceable as `X_shadow[i:i+1]`).
        y_label_shadow: shadow labels (sliceable as `y_label_shadow[i:i+1]`).
        y_slice_shadow: pandas Series with the property indicator per sample.
        num_shadow_models: number of classifiers to train.
        global_model: Keras model whose architecture and weights seed every
            per-sample clone.
        global_model_epochs: epochs used to fit each shadow classifier.
        batch_size: batch size used to fit each shadow classifier.

    Returns:
        list of trained Keras models; an empty list when ENTRENO_ANTES is False.

    Raises:
        ValueError: if `X_shadow` has no rows (previously an IndexError was
            raised later while reading the shape of the empty update matrix).
    """
    if not ENTRENO_ANTES:
        logger.info("Skipping shadow model training because ENTRENO_ANTES is False.")
        return []

    num_samples = len(X_shadow)
    if num_samples == 0:
        raise ValueError("X_shadow is empty: cannot build the shadow-model training set.")

    logger.info(f"Training {num_shadow_models} shadow models prior to federated learning.")

    # Reset the index so the property indicator can be read by integer position.
    y_slice_shadow = y_slice_shadow.reset_index(drop=True)

    # The starting weights are the same for every sample, so read them once.
    initial_weights = global_model.get_weights()

    # Simulate one weight update per shadow sample.
    X_shadow_data = []
    y_shadow_labels = []
    for idx in range(num_samples):
        if idx % 10 == 0:
            # Progress is counted in samples, not in shadow models.
            logger.info(f'Processing shadow sample {idx + 1}/{num_samples}')

        # Fresh clone per sample so optimizer state never carries over.
        shadow_model = tf.keras.models.clone_model(global_model)
        shadow_model.set_weights(initial_weights)
        shadow_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

        # Train on a single example to generate its weight update.
        shadow_model.fit(X_shadow[idx:idx+1], y_label_shadow[idx:idx+1],
                         epochs=1, batch_size=1, verbose=0)
        final_weights = shadow_model.get_weights()

        # Flattened per-layer delta = one feature row for the attack models.
        updates = [final_w - init_w for final_w, init_w in zip(final_weights, initial_weights)]
        flattened_updates = np.concatenate([u.flatten() for u in updates])

        X_shadow_data.append(flattened_updates)
        y_shadow_labels.append(y_slice_shadow[idx])

    X_shadow_data = np.array(X_shadow_data)
    y_shadow_labels = np.array(y_shadow_labels)

    # Fit the shadow classifiers on the generated updates.
    shadow_models = []
    for i in range(num_shadow_models):
        logger.info(f"Training shadow model {i + 1}/{num_shadow_models}")
        shadow_model = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(X_shadow_data.shape[1],)),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])
        shadow_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        shadow_model.fit(X_shadow_data, y_shadow_labels,
                         epochs=global_model_epochs, batch_size=batch_size, verbose=0)
        shadow_models.append(shadow_model)

    logger.info("All shadow models trained successfully.")
    return shadow_models

def dynamic_inference_with_custom_model(averaged_updates, input_dim=2753):
    """
    Run dynamic property inference (ENTRENO_ANTES=False mode) on averaged
    weight updates.

    Parameters:
        averaged_updates: either a flat 1-D array of update values or a
            list/tuple of per-layer arrays (as returned by
            `average_client_updates`); per-layer input is flattened and
            concatenated first.
        input_dim: length the update vector is truncated or zero-padded to
            before it is fed to the model. Defaults to 2753, the value that
            was previously hard-coded.

    Returns:
        The model's output probability (scalar).

    NOTE(review): the model comes from `custom_model(input_dim)` on every
    call and is not trained in this function, so the prediction is produced
    by freshly initialised weights - confirm this is intended.
    """
    logger.info(f"Dynamic inference with input dimension {input_dim}")

    # Accept both a flat vector and a list of per-layer arrays.
    if isinstance(averaged_updates, (list, tuple)):
        if len(averaged_updates) > 0:
            flat_updates = np.concatenate([np.ravel(layer) for layer in averaged_updates])
        else:
            flat_updates = np.zeros(0)
    else:
        flat_updates = np.ravel(np.asarray(averaged_updates))

    # Fit the vector to the model's input size.
    current_dim = len(flat_updates)
    if current_dim > input_dim:
        logger.warning(f"Truncating updates from {current_dim} to {input_dim}")
        adjusted_updates = flat_updates[:input_dim]
    elif current_dim < input_dim:
        logger.warning(f"Padding updates from {current_dim} to {input_dim}")
        adjusted_updates = np.pad(flat_updates, (0, input_dim - current_dim), mode='constant')
    else:
        adjusted_updates = flat_updates

    # Standardise; the small constant avoids division by zero for constant vectors.
    normalized_updates = (adjusted_updates - np.mean(adjusted_updates)) / (np.std(adjusted_updates) + 1e-8)
    model_input = normalized_updates.reshape(1, -1)

    # Run the inference.
    shadow_model = custom_model(input_dim)
    prob = shadow_model.predict(model_input, verbose=0)[0][0]
    logger.info(f"Dynamic inference probability: {prob:.4f}")
    return prob


def infer_property(shadow_models, updates):
    """
    Estimate the property probability from per-layer weight updates.

    Parameters:
        shadow_models: iterable of models exposing `predict(x, verbose=0)`.
        updates: list of per-layer arrays; they are flattened and
            concatenated into a single input row.

    Returns:
        Mean of the probabilities predicted by the shadow models.
    """
    feature_row = np.concatenate([layer.flatten() for layer in updates]).reshape(1, -1)

    # One scalar prediction per shadow model, averaged into the final score.
    scores = [shadow.predict(feature_row, verbose=0)[0][0] for shadow in shadow_models]
    return np.mean(scores)

def average_client_updates(client_updates):
    """
    Average the clients' updates layer by layer.

    Parameters:
        client_updates: non-empty list with one entry per client; each entry
            is a list of per-layer arrays. The layer count is taken from the
            first client.

    Returns:
        list with one element-wise mean array per layer.
    """
    layer_count = len(client_updates[0])
    return [
        np.mean([client[layer] for client in client_updates], axis=0)
        for layer in range(layer_count)
    ]

def custom_model(input_shape):
    """
    Build and compile the model used for inference when ENTRENO_ANTES=False.

    Parameters:
        input_shape: length of the input vector.

    Returns:
        A compiled `tf.keras.Sequential`: Dense(128, relu) ->
        BatchNormalization -> Dense(64, relu) -> Dropout(0.3) ->
        Dense(1, sigmoid).
    """
    logger.info(f"Creating custom shadow model with input shape {input_shape}")

    keras_layers = tf.keras.layers
    layer_stack = [
        keras_layers.Input(shape=(input_shape,)),
        # First dense block followed by batch normalisation.
        keras_layers.Dense(128, activation='relu'),
        keras_layers.BatchNormalization(),
        # Second dense block with dropout as regularisation.
        keras_layers.Dense(64, activation='relu'),
        keras_layers.Dropout(0.3),
        # Single sigmoid unit: binary prediction output.
        keras_layers.Dense(1, activation='sigmoid'),
    ]
    network = tf.keras.Sequential(layer_stack)

    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network


In [7]:
#### SEGURIDAD ####
def apply_label_flipping(y_data, flip_target, prob_flip_0, prob_flip_1):
    """
    Randomly flip binary values (0 <-> 1) in a label or property column.

    Both sets of entries to flip are chosen from the ORIGINAL values before
    anything is modified. The previous implementation picked the 1 -> 0
    candidates after the 0 -> 1 pass, so freshly flipped entries could be
    flipped straight back and the 1 -> 0 count was inflated.

    Parameters:
        y_data: pandas Series holding the values, or a DataFrame containing a
            column named `flip_target`.
        flip_target: 'label' or 'Slice'; only used as the column name when
            `y_data` is a DataFrame.
        prob_flip_0: fraction of the 0-valued entries turned into 1.
        prob_flip_1: fraction of the 1-valued entries turned into 0.

    Returns:
        A new Series with the flipped values; the input is not modified.

    Raises:
        ValueError: if `flip_target` is not 'label' or 'Slice'.
        KeyError: if `y_data` is a DataFrame without the `flip_target` column.
        TypeError: if `y_data` is neither a Series nor a DataFrame.
    """
    if flip_target not in ["label", "Slice"]:
        raise ValueError(f"Invalid flip_target: {flip_target}. Must be 'label' or 'Slice'.")

    if isinstance(y_data, pd.Series):
        # Series: the series itself is the target.
        target_data = y_data.copy()
    elif isinstance(y_data, pd.DataFrame):
        # DataFrame: work on the column named by flip_target.
        if flip_target in y_data.columns:
            target_data = y_data[flip_target].copy()
        else:
            raise KeyError(f"Column '{flip_target}' is not found in DataFrame data.")
    else:
        raise TypeError(f"Unsupported data type: {type(y_data)}. Must be pandas DataFrame or Series.")

    # Positions (not index labels) of the original zeros and ones, so that
    # duplicate index labels cannot cause extra flips.
    original_values = target_data.to_numpy()
    zero_positions = np.flatnonzero(original_values == 0)
    one_positions = np.flatnonzero(original_values == 1)

    num_flips_0_to_1 = int(len(zero_positions) * prob_flip_0)
    num_flips_1_to_0 = int(len(one_positions) * prob_flip_1)

    # Sample without replacement; skip the draw entirely when nothing is flipped.
    if num_flips_0_to_1 > 0:
        flipped_0_positions = np.random.choice(zero_positions, num_flips_0_to_1, replace=False)
    else:
        flipped_0_positions = zero_positions[:0]
    if num_flips_1_to_0 > 0:
        flipped_1_positions = np.random.choice(one_positions, num_flips_1_to_0, replace=False)
    else:
        flipped_1_positions = one_positions[:0]

    # The two position sets are disjoint, so the assignments cannot interfere.
    target_data.iloc[flipped_0_positions] = 1
    target_data.iloc[flipped_1_positions] = 0

    logger.info(f"Flipped {num_flips_0_to_1} entries from 0 -> 1 and {num_flips_1_to_0} from 1 -> 0.")

    return target_data


def _dp_noise_sampler(epsilon, delta, sensitivity, moment, target=""):
    """Log the chosen mechanism once and return `draw(shape)` sampling Gaussian (delta given) or Laplace (delta None) noise."""
    if delta is not None:
        # Gaussian mechanism.
        noise_scale = sensitivity * np.sqrt(2 * np.log(1.25 / delta)) / epsilon
        logger.info(f"Aplicando ruido Gaussiano con privacidad diferencial{target} {moment} del entrenamiento.")
        return lambda shape: normal(0, noise_scale, size=shape)

    # Laplace mechanism.
    noise_scale = sensitivity / epsilon
    logger.info(f"Aplicando ruido Laplaciano con privacidad diferencial{target} {moment} del entrenamiento.")
    return lambda shape: laplace(0, noise_scale, size=shape)


def add_continuous_noise(data, ruido_per, ruido_std, epsilon, delta=None, sensitivity=1.0, ruido_antes=True):
    """
    Add Gaussian or Laplace noise following differential-privacy scaling.

    Parameters:
    - data: pd.DataFrame of samples, or list of weight-update arrays.
    - ruido_per: float in [0, 1], proportion of DataFrame rows that receive
      noise (ignored for lists: every update array is noised in full).
    - ruido_std: float, kept for interface compatibility. It is currently
      unused: the only code path that read it required epsilon <= 0, which is
      rejected up front.
    - epsilon: float > 0, privacy parameter (larger = less noise).
    - delta: float or None. When given, Gaussian noise with scale
      sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon is used; when None,
      Laplace noise with scale sensitivity / epsilon.
    - sensitivity: float, sensitivity used to scale the noise.
    - ruido_antes: bool, whether the noise is applied before or after
      training (only affects log messages).

    Returns:
    - For a DataFrame: a copy in which the selected rows of the continuous
      columns (float64/int64, excluding 'label' and 'Slice') carry noise.
      Those columns are returned as float64, so integer columns can hold the
      real-valued noise without relying on implicit upcasting.
    - For a list: a new list of noised arrays.

    Raises:
    - ValueError: if ruido_per is outside [0, 1] or epsilon <= 0.
    - TypeError: if data is neither a DataFrame nor a list.
    """
    if not (0 <= ruido_per <= 1):
        raise ValueError("El parámetro ruido_per debe estar entre 0 y 1.")

    if epsilon <= 0:
        raise ValueError("El parámetro epsilon debe ser mayor que 0.")

    moment = 'antes' if ruido_antes else 'después'

    if isinstance(data, pd.DataFrame):
        # Continuous columns eligible for noise.
        continuous_columns = data.select_dtypes(include=['float64', 'int64']).columns.difference(['label', 'Slice'])
        logger.info(f"Columnas continuas seleccionadas para aplicar ruido: {list(continuous_columns)}")

        # Rows that will receive noise.
        num_samples = int(len(data) * ruido_per)
        noisy_indices = np.random.choice(data.index, size=num_samples, replace=False)
        logger.info(f"Número de muestras seleccionadas para ruido: {num_samples} de {len(data)}")

        draw = _dp_noise_sampler(epsilon, delta, sensitivity, moment)
        noise = draw((num_samples, len(continuous_columns)))

        # Work on a copy whose continuous columns are float64 so that adding
        # real-valued noise to integer columns is well defined.
        noisy_data = data.astype({column: 'float64' for column in continuous_columns})
        if num_samples > 0 and len(continuous_columns) > 0:
            selected = noisy_data.loc[noisy_indices, continuous_columns].to_numpy()
            noisy_data.loc[noisy_indices, continuous_columns] = selected + noise
        return noisy_data

    elif isinstance(data, list):
        # A list is treated as per-layer weight updates.
        logger.info(f"Aplicando ruido a actualizaciones de los pesos {moment} del entrenamiento")
        draw = _dp_noise_sampler(epsilon, delta, sensitivity, moment, target=" a los pesos")
        return [update + draw(update.shape) for update in data]

    else:
        raise TypeError("El parámetro 'data' debe ser un DataFrame o una lista.")
    
def apply_noise(data, ruido_per, ruido_std, epsilon, delta, sensitivity, ruido_antes):
    """
    Add differential-privacy noise to data or weight updates.

    Thin wrapper that forwards every argument to `add_continuous_noise` and
    returns its result.
    """
    return add_continuous_noise(
        data,
        ruido_per,
        ruido_std,
        epsilon,
        delta=delta,
        sensitivity=sensitivity,
        ruido_antes=ruido_antes,
    )


def apply_flipping(data, flip_target, prob_flip_0, prob_flip_1):
    """
    Flip labels or property values according to the given configuration.

    Thin wrapper that forwards every argument to `apply_label_flipping` and
    returns its result.
    """
    flipped = apply_label_flipping(data, flip_target, prob_flip_0, prob_flip_1)
    return flipped


In [8]:
#### OUTPUTS ####

def calculate_correct_predictions(results, dynamic_threshold=0.50):
    """
    Score round-to-round probability movements against the property ground truth.

    Starting at the fourth entry, each round is compared with the previous one:
    - property unchanged (both present or both absent): correct when the
      probability moved by less than 10% of the running threshold;
    - property lost: correct when the probability decreased;
    - property gained: correct when the probability increased.

    The running threshold starts at `dynamic_threshold`; it is raised to the
    midpoint of the two probabilities on an increase (if higher) and lowered
    to it on a decrease (if lower).

    Parameters:
        results: list of dicts with 'property_probability' and 'has_property'.
        dynamic_threshold: initial value of the running threshold.

    Returns:
        (correct_predictions, incorrect_predictions, final_threshold)
    """
    hits = 0
    misses = 0
    threshold = dynamic_threshold

    # Consecutive pairs (previous, current) with current starting at index 3.
    for previous, current in zip(results[2:], results[3:]):
        current_prob = current['property_probability']
        previous_prob = previous['property_probability']
        had_property = bool(previous['has_property'])
        has_property = bool(current['has_property'])

        # Update the running threshold.
        midpoint = (current_prob + previous_prob) / 2
        if current_prob > previous_prob:
            threshold = max(threshold, midpoint)
        elif current_prob < previous_prob:
            threshold = min(threshold, midpoint)

        # Decide what movement counts as correct for this transition.
        if had_property == has_property:
            is_correct = abs(current_prob - previous_prob) < threshold * 0.1
        elif had_property:
            is_correct = current_prob < previous_prob
        else:
            is_correct = current_prob > previous_prob

        if is_correct:
            hits += 1
        else:
            misses += 1

    evaluated = hits + misses
    accuracy = hits / evaluated if evaluated > 0 else 0

    logger.info(f"Correct predictions: {hits}, Incorrect predictions: {misses}, Accuracy: {accuracy:.4f}")
    return hits, misses, threshold


def plot_combined_probability_loss_accuracy(results, output_path):
    """
    Save a two-panel figure to `<output_path>/probability_loss_accuracy.png`.

    - Top: property probability per round, with the rounds where the property
      was present highlighted in red.
    - Bottom: loss (left axis) and accuracy (right axis) per round.

    The first three rounds are left out of both panels.

    Parameters:
        results: list of dicts with the keys 'round', 'property_probability',
            'loss', 'accuracy' and 'has_property'.
        output_path: directory in which the PNG is written.
    """
    plotted = list(results)[3:]
    round_nums = [r['round'] for r in plotted]
    probabilities = [r['property_probability'] for r in plotted]
    losses = [r['loss'] for r in plotted]
    accuracies = [r['accuracy'] for r in plotted]

    fig, axs = plt.subplots(2, 1, figsize=(12, 10))

    # Panel 1: probability per round.
    ax_prob = axs[0]
    ax_prob.plot(round_nums, probabilities, marker='o', color='blue', label='Property Probability')

    # Draw all property rounds with a single scatter call so the legend entry
    # always appears when at least one such round exists. Previously the
    # label was only attached when the first plotted round had the property.
    property_rounds = [r['round'] for r in plotted if r['has_property']]
    property_probs = [r['property_probability'] for r in plotted if r['has_property']]
    if property_rounds:
        ax_prob.scatter(property_rounds, property_probs, color='red', zorder=5, label='Rounds with Property')

    ax_prob.set_xlabel('Round')
    ax_prob.set_ylabel('Property Probability')
    ax_prob.set_title('Property Probability by Round (ignoring first 3 rounds)')
    ax_prob.grid(True)
    ax_prob.legend()

    # Panel 2: loss and accuracy on twin y-axes.
    ax_loss = axs[1]
    ax_loss.plot(round_nums, losses, label='Loss', color='red', marker='x')
    ax_loss.set_xlabel('Round')
    ax_loss.set_ylabel('Loss', color='red')
    ax_loss.tick_params(axis='y', labelcolor='red')
    ax_loss.grid(True)

    ax_acc = ax_loss.twinx()
    ax_acc.plot(round_nums, accuracies, label='Accuracy', color='blue', marker='o')
    ax_acc.set_ylabel('Accuracy', color='blue')
    ax_acc.tick_params(axis='y', labelcolor='blue')

    fig.tight_layout()
    fig.savefig(f"{output_path}/probability_loss_accuracy.png")
    # Close this specific figure so repeated calls do not accumulate open figures.
    plt.close(fig)
    

def plot_combined_roc_threshold(fpr, tpr, thresholds, auc_roc, optimal_threshold, output_path):
    """
    Save a two-panel figure to `<output_path>/roc_threshold.png`.

    - Left: ROC curve with the diagonal as the random-guess reference.
    - Right: TPR and FPR as functions of the threshold, with the optimal
      threshold marked by a vertical line.

    Parameters:
        fpr, tpr, thresholds: equal-length sequences describing the ROC curve.
        auc_roc: area under the ROC curve (shown in the legend).
        optimal_threshold: threshold highlighted on the right panel.
        output_path: directory in which the PNG is written.
    """
    fig, (roc_ax, rate_ax) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: ROC curve.
    roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC Curve (AUC = {auc_roc:.2f})')
    roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Guess')
    roc_ax.set(xlabel='False Positive Rate', ylabel='True Positive Rate', title='ROC Curve')
    roc_ax.legend(loc="lower right")
    roc_ax.grid(True)

    # Right panel: both rates against the decision threshold.
    rate_curves = (
        (tpr, 'True Positive Rate (TPR)', 'green'),
        (fpr, 'False Positive Rate (FPR)', 'red'),
    )
    for values, curve_label, curve_color in rate_curves:
        rate_ax.plot(thresholds, values, label=curve_label, color=curve_color, lw=2)
    rate_ax.axvline(optimal_threshold, color='blue', linestyle='--', label=f'Optimal Threshold ({optimal_threshold:.2f})')
    rate_ax.set(xlabel='Threshold', ylabel='Rate', title='TPR and FPR vs. Threshold')
    rate_ax.legend(loc="best")
    rate_ax.grid(True)

    fig.tight_layout()
    plt.savefig(f"{output_path}/roc_threshold.png")
    plt.close()


def save_results_to_csv(results, transitions, metrics, custom_precision):
    """
    Write three CSV files into the PREFIJO_SAVE directory:

    - round_results.csv: one row per round.
    - transitions.csv: one row per round-to-round transition.
    - final_metrics.csv: one 'Metric,Value' row per entry of `metrics`, plus
      a final 'Custom Precision' row formatted as a percentage.

    Parameters:
        results: iterable of per-round dicts.
        transitions: iterable of dicts with 'round', 'prev_probability',
            'current_probability' and 'transition'.
        metrics: dict mapping metric name to value.
        custom_precision: number written with two decimals and a '%' suffix.
    """
    def _write_table(filename, fieldnames, rows):
        # Header first, then the rows in iteration order.
        with open(f"{PREFIJO_SAVE}/{filename}", 'w', newline='') as handle:
            writer = csv.DictWriter(handle, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)

    def _metric_rows():
        for metric_name, metric_value in metrics.items():
            yield {'Metric': metric_name, 'Value': metric_value}
        yield {'Metric': 'Custom Precision', 'Value': f"{custom_precision:.2f}%"}

    # Per-round results.
    _write_table(
        "round_results.csv",
        ['Round', 'Prediction', 'Probability', 'Clients with Property',
         'Clients without Property', 'Has Property', 'Loss', 'Accuracy'],
        (
            {
                'Round': entry['round'],
                'Prediction': entry['prediction'],
                'Probability': entry['property_probability'],
                'Clients with Property': entry['clients_with_property'],
                'Clients without Property': entry['clients_without_property'],
                'Has Property': entry['has_property'],
                'Loss': entry['loss'],
                'Accuracy': entry['accuracy'],
            }
            for entry in results
        ),
    )

    # Transitions.
    _write_table(
        "transitions.csv",
        ['Round', 'Prev Probability', 'Current Probability', 'Transition'],
        (
            {
                'Round': step['round'],
                'Prev Probability': step['prev_probability'],
                'Current Probability': step['current_probability'],
                'Transition': step['transition'],
            }
            for step in transitions
        ),
    )

    # Final metrics.
    _write_table("final_metrics.csv", ['Metric', 'Value'], _metric_rows())

def calculate_and_log_metrics(results):
    """
    Compute the main evaluation metrics and return them with the ROC data.

    The first 3 rounds of ``results`` are ignored. Hard predictions are
    obtained by comparing each round's probability against PROPERTY_THRESHOLD.

    Args:
        results: Sequence of per-round dicts containing at least the keys
            'has_property' and 'property_probability'. It is also passed
            unchanged to ``calculate_correct_predictions``.

    Returns:
        A tuple ``(metrics, fpr, tpr, thresholds, custom_precision)`` where
        ``metrics`` is a dict of scalar metrics, ``fpr``/``tpr``/``thresholds``
        are the values returned by ``roc_curve`` and ``custom_precision`` is
        the percentage of correct transitions (0 when there are none).
    """
    # Ignore the first 3 rounds. The ground truth is cast to int so that it
    # shares the 0/1 label space used by y_pred.
    y_true = [int(r['has_property']) for r in results][3:]
    y_prob = [r['property_probability'] for r in results][3:]
    y_pred = [1 if prob > PROPERTY_THRESHOLD else 0 for prob in y_prob]

    # ROC curve, its AUC and the threshold that maximises (TPR - FPR)
    fpr, tpr, thresholds = roc_curve(y_true, y_prob)
    auc_roc = auc(fpr, tpr)
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]

    # labels=[0, 1] forces a 2x2 matrix even when only one class is present in
    # both y_true and y_pred; without it the matrix would be 1x1 and the
    # four-way unpacking below would fail.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # Guard every ratio against an empty denominator
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Precision based on probability transitions
    correct_transitions, incorrect_transitions, dynamic_threshold = calculate_correct_predictions(results)
    total_transitions = correct_transitions + incorrect_transitions
    custom_precision = (correct_transitions / total_transitions) * 100 if total_transitions > 0 else 0

    metrics = {
        'ROC AUC': auc_roc,
        'Optimal Threshold': optimal_threshold,
        'Dynamic Threshold': dynamic_threshold,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'True Positive': tp,
        'False Positive': fp,
        'True Negative': tn,
        'False Negative': fn,
    }

    return metrics, fpr, tpr, thresholds, custom_precision


def analyze_probability_transitions(results, prob_range=PROB_RANGE):
    """
    Classify the round-to-round movement of the inferred property probability.

    The first 3 rounds are ignored. Probabilities further than two standard
    deviations from the mean are replaced by the mean before comparison. A
    move is significant when it exceeds ``prob_range`` times the spread
    (max - min) of the filtered probabilities; the spread falls back to 1 when
    all filtered values are equal.

    Args:
        results: Sequence of per-round dicts containing at least the keys
            'property_probability' and 'round'.
        prob_range: Fraction of the filtered probability spread that a change
            must exceed to count as an increase or decrease.

    Returns:
        A list of dicts with keys 'round', 'prev_probability',
        'current_probability' and 'transition'. 'transition' is one of
        "stable", "increase", "decrease" or "unsure" (the latter when the
        change is exactly the significant move). The list is empty when fewer
        than two rounds remain after skipping the first three.
    """
    transitions = []
    probabilities = [r['property_probability'] for r in results][3:]  # Skip the first 3 rounds
    rounds = [r['round'] for r in results][3:]  # Skip the first 3 rounds

    # A transition needs two consecutive rounds. Skipping the statistics when
    # there are fewer avoids NaN means and a ValueError from min()/max() on an
    # empty list.
    if len(probabilities) >= 2:
        # Replace outliers (beyond two standard deviations) with the mean
        prob_mean = np.mean(probabilities)
        prob_std = np.std(probabilities)
        lower_bound = prob_mean - 2 * prob_std
        upper_bound = prob_mean + 2 * prob_std
        filtered_probs = [p if lower_bound <= p <= upper_bound else prob_mean for p in probabilities]

        min_prob, max_prob = min(filtered_probs), max(filtered_probs)
        range_value = max_prob - min_prob if max_prob > min_prob else 1
        significant_move = range_value * prob_range

        for i in range(1, len(filtered_probs)):
            prev_prob = filtered_probs[i - 1]
            curr_prob = filtered_probs[i]
            diff = curr_prob - prev_prob

            # Classify the transition against the dynamic range
            if abs(diff) < significant_move:
                transition = "stable"
            elif diff > significant_move:
                transition = "increase"
            elif diff < -significant_move:
                transition = "decrease"
            else:
                # Reached when abs(diff) equals significant_move exactly
                transition = "unsure"

            transitions.append({
                'round': rounds[i],
                'prev_probability': prev_prob,
                'current_probability': curr_prob,
                'transition': transition
            })

    logger.info(f"Transitions analyzed: {len(transitions)} transitions processed.")
    return transitions



In [9]:
#### EXPERIMENT ####
def simulated_federated_learning(clients, shadow_models, global_model, num_rounds=NUM_ROUNDS, threshold=PROPERTY_THRESHOLD):
    """
    Run the simulated federated-learning loop and record per-round results.

    Each round randomly picks either every client that has the property or
    every client that does not, trains the picked clients starting from the
    current global weights, averages their weights into the global model,
    infers the property probability from the averaged updates and evaluates
    the updated global model on all clients.

    Args:
        clients: Client objects exposing ``data`` (a mapping with the keys
            'has_property' and 'y_label'), ``fit(weights)`` and
            ``evaluate(weights)``.
        shadow_models: Passed to ``infer_property`` when ENTRENO_ANTES is
            True; not used otherwise.
        global_model: Model exposing ``get_weights``, ``set_weights`` and
            ``input_shape``.
        num_rounds: Number of rounds to simulate.
        threshold: Probability above which a round is predicted to have the
            property.

    Returns:
        A list with one dict per round containing the keys 'round',
        'property_probability', 'has_property', 'prediction',
        'clients_with_property', 'clients_without_property', 'loss' and
        'accuracy'.

    Raises:
        ValueError: If the group of clients selected for a round is empty.
    """
    logger.info(f"Starting simulated federated learning with {num_rounds} rounds")
    results = []

    clients_with_property = [client for client in clients if client.data['has_property']]
    clients_without_property = [client for client in clients if not client.data['has_property']]

    logger.info(f"{len(clients_with_property)} clients have the property.")
    logger.info(f"{len(clients_without_property)} clients do not have the property.")

    for round_num in tqdm.tqdm(range(1, num_rounds + 1), desc="Federated Learning Progress"):
        logger.info(f"Starting round {round_num}")

        # Randomly choose whether this round uses clients with or without the property
        use_property_clients = random.choice([True, False])
        selected_clients = clients_with_property if use_property_clients else clients_without_property

        # With no clients there is nothing to average: fail with a clear
        # message instead of handing an empty weight list to the global model.
        if not selected_clients:
            group = "with" if use_property_clients else "without"
            raise ValueError(
                f"Round {round_num}: no clients {group} the property are available for training"
            )

        # Current weights of the global model
        global_weights = global_model.get_weights()

        client_weights = []
        client_updates = []

        logger.info(f"Training {len(selected_clients)} selected clients...")

        # Collect the weights and flattened updates from each selected client
        for client in selected_clients:
            w, _, update_info = client.fit(global_weights)
            client_weights.append(w)
            client_updates.append(np.concatenate([u.flatten() for u in update_info['updates']]))

        # Average weights layer by layer and average the updates
        averaged_weights = [np.mean(layer, axis=0) for layer in zip(*client_weights)]
        global_model.set_weights(averaged_weights)
        averaged_updates = average_client_updates(client_updates)

        logger.info(f"Round {round_num}: Averaged updates shape: {len(averaged_updates)}, Expected input dimension: {global_model.input_shape[1]}")

        # Apply label flipping to the selected clients after local training
        if LABEL_FLIPPING and not FLIPPING_ANTES:
            logger.info(f"Applying label flipping AFTER local training for round {round_num}")
            for client in selected_clients:
                client.data['y_label'] = apply_label_flipping(
                    client.data['y_label'], FLIP_TARGET, PROB_FLIP_0, PROB_FLIP_1
                )

        # Shadow-model inference
        if ENTRENO_ANTES:
            property_prob = infer_property(shadow_models, averaged_updates)
        else:
            property_prob = dynamic_inference_with_custom_model(averaged_updates)

        # Evaluate the global model on all clients. The weights are read once
        # instead of once per client.
        # NOTE(review): assumes client.evaluate does not change the global
        # model's weights -- confirm against the client implementation.
        evaluation_weights = global_model.get_weights()
        total_loss, total_accuracy, total_samples = 0, 0, 0
        for client in clients:
            loss, num_samples, metrics = client.evaluate(evaluation_weights)
            total_loss += loss * num_samples
            total_accuracy += metrics['accuracy'] * num_samples
            total_samples += num_samples

        # Sample-weighted average metrics for the round
        avg_loss = total_loss / total_samples
        avg_accuracy = total_accuracy / total_samples

        # Store the round results
        results.append({
            'round': round_num,
            'property_probability': property_prob,
            'has_property': use_property_clients,
            'prediction': property_prob > threshold,
            'clients_with_property': len(clients_with_property) if use_property_clients else 0,
            'clients_without_property': len(clients_without_property) if not use_property_clients else 0,
            'loss': avg_loss,
            'accuracy': avg_accuracy
        })

        logger.info(f"Round {round_num} completed. Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.4f}, Property Prob: {property_prob:.4f}")

    logger.info("Federated learning simulation completed.")
    return results


def main():
    """
    Run the whole experiment: validate the configuration, load and split the
    data, build the clients and the global model, optionally train the shadow
    models, simulate federated learning, and finally compute, save and plot
    the results.
    """
    # Step 0: validate the parameter configuration
    validate_configuration()

    # Step 1: load and preprocess the data
    features, labels, slices = load_and_preprocess_data(DATA_FILE_PATH)

    # Step 2: split the data between the global model and the shadow models
    global_split, shadow_split, _ = split_data_for_models(features, labels, slices)
    global_X, global_y_label, global_y_slice = global_split
    shadow_X, shadow_y_label, shadow_y_slice = shadow_split

    # Step 3: build the per-client data (noise / flipping options applied there)
    logger.info("Creating client data with configured security options")
    per_client_data = create_client_data(global_X, global_y_label, global_y_slice)

    # Step 4: create the global model, sized from the first client's features
    logger.info("Initializing global model")
    input_dim = per_client_data[0]['X'].shape[1]
    global_model = create_global_model(input_dim)

    # Initialise the simulated clients
    clients = initialize_clients(per_client_data, global_model, GLOBAL_MODEL_EPOCHS, BATCH_SIZE)

    # Step 5: shadow models are trained up front only when ENTRENO_ANTES is True
    if not ENTRENO_ANTES:
        logger.info("Skipping shadow model training (ENTRENO_ANTES=False)")
        shadow_models = []  # Nothing is trained beforehand
    else:
        logger.info("Training shadow models prior to federated learning")
        shadow_models = train_shadow_models(
            shadow_X,
            shadow_y_label,
            shadow_y_slice,
            NUM_SHADOW_MODELS,
            global_model,
            GLOBAL_MODEL_EPOCHS,
            BATCH_SIZE,
        )

    # Step 6: federated-learning simulation
    logger.info("Starting federated learning simulation")
    results = simulated_federated_learning(clients, shadow_models, global_model)

    # Step 7: analyse the probability transitions
    logger.info("Analyzing probability transitions")
    transitions = analyze_probability_transitions(results, PROB_RANGE)

    # Step 8: compute the evaluation metrics
    logger.info("Calculating evaluation metrics")
    metrics, fpr, tpr, roc_thresholds, custom_precision = calculate_and_log_metrics(results)

    # Step 9: save the results and generate the plots
    logger.info("Saving results and generating visualizations")
    save_results_to_csv(results, transitions, metrics, custom_precision)
    output_prefix = f"{PREFIJO_SAVE}/"
    plot_combined_probability_loss_accuracy(results, output_prefix)
    plot_combined_roc_threshold(
        fpr,
        tpr,
        roc_thresholds,
        metrics['ROC AUC'],
        metrics['Optimal Threshold'],
        output_prefix,
    )

    logger.info("Federated learning simulation completed")


In [None]:
# Entry point: run the full experiment only when this code executes as the
# main program, not when it is imported as a module.
if __name__ == "__main__":
    main()

2025-02-03 06:25:27,005 - __main__ - INFO - Validating configuration...
2025-02-03 06:25:27,007 - __main__ - INFO - Ni ruido ni label flipping están activados. El experimento no incluye perturbaciones en los datos.
2025-02-03 06:25:27,008 - __main__ - INFO - Configuration validation completed successfully.
2025-02-03 06:25:27,008 - __main__ - INFO - Loading and preprocessing data from label_bi_10.csv
2025-02-03 06:25:28,002 - __main__ - INFO - Columns after loading: ['Src IP', 'Src Port', 'Dst Port', 'Protocol', 'Flow Duration', 'Total Fwd Packet', 'Fwd Packet Length Std', 'ACK Flag Count', 'Fwd Seg Size Min', 'label', 'Slice']
2025-02-03 06:25:28,002 - __main__ - INFO - Dividiendo datos para el modelo global y los modelos sombra.
2025-02-03 06:25:28,013 - __main__ - INFO - Datos divididos: 122192 para el modelo global, 1234 para los modelos sombra.
2025-02-03 06:25:28,013 - __main__ - INFO - Creating client data with configured security options
2025-02-03 06:25:28,014 - __main__ - INF

Tamaño del dataset (123426, 11)


2025-02-03 06:25:28,181 - __main__ - INFO - Initializing simulated client 2
2025-02-03 06:25:28,183 - __main__ - INFO - Initializing simulated client 3
2025-02-03 06:25:28,184 - __main__ - INFO - Initializing simulated client 4
2025-02-03 06:25:28,187 - __main__ - INFO - Initializing simulated client 5
2025-02-03 06:25:28,190 - __main__ - INFO - Initializing simulated client 6
2025-02-03 06:25:28,191 - __main__ - INFO - Initializing simulated client 7
2025-02-03 06:25:28,196 - __main__ - INFO - Initializing simulated client 8
2025-02-03 06:25:28,198 - __main__ - INFO - Initializing simulated client 9
2025-02-03 06:25:28,341 - __main__ - INFO - Training shadow models prior to federated learning
2025-02-03 06:25:28,341 - __main__ - INFO - Training 5 shadow models prior to federated learning.
2025-02-03 06:25:28,342 - __main__ - INFO - Processing shadow model 1/1234








2025-02-03 06:25:34,506 - __main__ - INFO - Processing shadow model 11/1234
2025-02-03 06:25:40,627 - __main__ - INFO - Processing shadow model 21/1234
2025-02-03 06:25:47,132 - __main__ - INFO - Processing shadow model 31/1234
2025-02-03 06:25:53,101 - __main__ - INFO - Processing shadow model 41/1234
2025-02-03 06:25:58,878 - __main__ - INFO - Processing shadow model 51/1234
2025-02-03 06:26:04,673 - __main__ - INFO - Processing shadow model 61/1234
2025-02-03 06:26:10,665 - __main__ - INFO - Processing shadow model 71/1234
2025-02-03 06:26:16,954 - __main__ - INFO - Processing shadow model 81/1234
