# Hito 2: FL Inter-empresa

Se entrena un único modelo de manera federada entre las dos empresas. De esta forma, se mantendrá la privacidad de los datos de cada empresa PERO no internamente dentro de las empresas. Se medirá el rendimiento del modelo respecto a los baselines de empresas y centralizado.

El cuaderno es similar a Hito1-IntraEmpresa pero con las modificaciones necesarias para que cada cliente (dos en total) tiene los datos completos de una de las empresas.

# Configuración del escenario mediante Flower

Se hará uso del modulo 'simulation' (version 0.18.0 de la libería) que elimina las restricciones de escalabilidad que se tenían respecto a versiones anteriores

Es necesario instalar Flower con el extra de 'simulation'

In [1]:
# !pip install -U flwr["simulation"]

In [2]:
import os
import math

# Make TensorFlow logs less verbose
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import flwr as fl
import tensorflow as tf

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

from keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization
from tensorflow.keras.optimizers import Adadelta

from typing import Dict

from flwr.common.logger import log
from logging import INFO
from csv import writer


from numpy.random import seed
from tensorflow.keras.utils import set_random_seed

import matplotlib.pyplot as plt

In [3]:
class FlowerClient(fl.client.NumPyClient):
    def __init__(self, cid, model, x_train, y_train, x_val, y_val) -> None:
        self.cid = cid
        self.model = model
        self.x_train, self.y_train = x_train, y_train
        self.x_val, self.y_val = x_val, y_val

    def get_parameters(self):
        return self.model.get_weights()

    def fit(self, parameters, config):
        self.model.set_weights(parameters)
        
        seed(1)
        set_random_seed(2)
        
        self.model.fit(self.x_train, self.y_train,
                       epochs=int(config["epochs"]),
                       batch_size=32,
                       verbose=0)
        
        return self.model.get_weights(), len(self.x_train), {}

    def evaluate(self, parameters, config):
        self.model.set_weights(parameters)
        
        loss, acc = self.model.evaluate(self.x_val, self.y_val, verbose=0)
        
        return loss, len(self.x_val), {"accuracy": acc, "client": self.cid}

In [4]:
''' FUNCION PARA CARGAR LOS DATOS DE UN CLIENTE EN PARTICULAR '''
def prepare_model_data(client_file):
    df = pd.read_csv(client_file)
    
    train, test = train_test_split(df, test_size=0.30, random_state=42)
    
    X_train = train[['psd_delta', 'psd_theta', 'psd_alpha', 'psd_beta', 'psd_gamma','eog_blinks', 'eog_var']]
    X_test = test[['psd_delta', 'psd_theta', 'psd_alpha', 'psd_beta', 'psd_gamma','eog_blinks', 'eog_var']]
    y_train = train['y_class']
    y_test = test['y_class']
    
    scaler = StandardScaler()

    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    return X_train, X_test, y_train, y_test

''' FUNCION PARA CARGAR LOS DATOS DE UNA EMPRESA '''
def get_data_empresa(empresa):
    base_path = "./data/horizontal_v3"
    # Cargar y procesar datos de todos sus clientes
    clientes = os.listdir(f"{base_path}/{empresa}/")
    
    try:
        clientes.remove(UNSEEN_CLIENT)
    except:
        pass
    
    X_train, X_test, y_train, y_test = prepare_model_data(f'{base_path}/{empresa}/{clientes[0]}')
    
    for file in clientes[1:]:
        path = f'{base_path}/{empresa}/{file}'
        X_train_act, X_test_act, y_train_act, y_test_act = prepare_model_data(path)

        X_train = np.vstack((X_train, X_train_act))
        X_test = np.vstack((X_test, X_test_act))
        y_train = np.concatenate((y_train, y_train_act))
        y_test = np.concatenate((y_test, y_test_act))
        
    return X_train, X_test, y_train, y_test

In [5]:
def client_fn(cid: str) -> fl.client.Client:
    # Model best hyperparameters (Ver notebook Hito0-Optimizacion-Baseline)
    neurons = 36
    activation = "relu"
    learning_rate = 0.180165
    optimizer = Adadelta(learning_rate=learning_rate)
    
    input_shape = (7,)
    
    # Create model
    model = Sequential()
    
    model.add(Dense(neurons, input_shape=input_shape, activation=activation))
    
    model.add(BatchNormalization())
        
    model.add(Dense(neurons, activation=activation))
    model.add(Dense(neurons, activation=activation))
    model.add(Dense(neurons, activation=activation))
    
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # Load data partition    
    x_train_cid, x_val_cid, y_train_cid, y_val_cid = get_data_empresa(cid)

    # Create and return client
    return FlowerClient(cid, model, x_train_cid, y_train_cid, x_val_cid, y_val_cid)

In [15]:
class SaveModelStrategy(fl.server.strategy.FedAvg):
    def aggregate_fit(self, rnd, results, failures):
        aggregated_weights = super().aggregate_fit(rnd, results, failures)
        
#         if aggregated_weights is not None:
#             # Save aggregated_weights
#             print(f"Saving round {rnd} aggregated_weights...")
#             np.savez(f"./rounds/hito2_v3-UC_{UNSEEN_CLIENT}-round-{rnd}-weights.npz", aggregated_weights)
            
        return aggregated_weights

    def aggregate_evaluate(self, rnd, results, failures):
        super_result = super().aggregate_evaluate(rnd, results, failures)
        
        log(
            INFO,
            f"round-{rnd}-EVALUATION"
        )
        
        accuracy = []
        data = {}
        for r in results:
            acc = r[1].metrics["accuracy"]
            client = r[1].metrics["client"]
            data[client] = acc
            accuracy.append(acc)
        
        df = pd.DataFrame(data, index=[0], columns=sorted(data.keys()))
        df.to_csv(f"./results/hito2.csv", mode='a', index=False, header=False)
        
        log(
            INFO,
            sorted(data.items())
        )
        
        np.array(accuracy)
        log(
            INFO,
            f"Aggregated accuracy: {np.mean(accuracy)} +- {np.std(accuracy)}"
        )
        
        return super_result

In [16]:
seed(1)
set_random_seed(2)

neurons = 36
activation = "relu"
learning_rate = 0.180165
optimizer = Adadelta(learning_rate=learning_rate)

input_shape = (7,)

# Create model
model = Sequential()

model.add(Dense(neurons, input_shape=input_shape, activation=activation))

model.add(BatchNormalization())

model.add(Dense(neurons, activation=activation))
model.add(Dense(neurons, activation=activation))
model.add(Dense(neurons, activation=activation))

model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Get model weights as a list of NumPy ndarray's
weights = model.get_weights()
# Serialize ndarrays to `Parameters`
parameters = fl.common.weights_to_parameters(weights)

In [17]:
# Configuracion de parametros para el entrenamiento desde el servidor
def fit_config(rnd: int) -> Dict[str, str]:
    config = {
        "round": str(rnd),
        "epochs": str(1) 
    }
    return config

In [None]:
# Ahora los clientes son las dos empresas
CLIENTS_IDS = ["empresa_1", "empresa_2", "empresa_3"]

 # Se inicializa el fichero de resultados
header = CLIENTS_IDS

# Start Flower simulation
    # min_fit_clients y min_eval_clients se ponen al maximo de clientes disponibles ya que estamos en un escenario controlado
    
results_fed = pd.DataFrame(columns=["UC", "best_rnd", "acc"])

seed(1)
set_random_seed(2)
for i in range(1,22):
    global UNSEEN_CLIENT 
    UNSEEN_CLIENT = f'cliente_{i}.csv'
    
    with open(f"./results/hito2.csv", 'w', ) as f:
        csv_writer = writer(f)
        csv_writer.writerow(header)
    
    fl.simulation.start_simulation(
        client_fn=client_fn,
        clients_ids=CLIENTS_IDS,
        client_resources={"num_cpus": 6},
        num_rounds=50,
        strategy=SaveModelStrategy(
            min_available_clients = len(CLIENTS_IDS),
            min_fit_clients = len(CLIENTS_IDS),
            min_eval_clients = len(CLIENTS_IDS),
            on_fit_config_fn = fit_config,
            on_evaluate_config_fn = fit_config,
            accept_failures=False,
            initial_parameters=parameters
        ),
    )
    
    df = pd.read_csv('./results/hito2.csv')
    df["mean"] = df.mean(numeric_only=True, axis=1)
    
    fed_res = {
        "UC": UNSEEN_CLIENT,
        "best_rnd": df["mean"].idxmax()+1,
        "acc": df["mean"].max()
    }
    
    results_fed = results_fed.append(fed_res, ignore_index=True)
    
results_fed.to_csv("./results/hito2_UCs.csv", index=False, header=True)

INFO flower 2022-04-14 23:02:05,950 | app.py:144 | Ray initialized with resources: {'GPU': 1.0, 'object_store_memory': 5872553164.0, 'memory': 11745106331.0, 'CPU': 8.0, 'node:127.0.0.1': 1.0}
INFO flower 2022-04-14 23:02:05,951 | app.py:153 | Starting Flower simulation running: {'num_rounds': 50}
INFO flower 2022-04-14 23:02:05,952 | server.py:128 | Initializing global parameters
INFO flower 2022-04-14 23:02:05,952 | server.py:323 | Using initial parameters provided by strategy
INFO flower 2022-04-14 23:02:05,952 | server.py:130 | Evaluating initial parameters
INFO flower 2022-04-14 23:02:05,953 | server.py:143 | FL starting
DEBUG flower 2022-04-14 23:02:05,953 | server.py:265 | fit_round: strategy sampled 3 clients (out of 3)
DEBUG flower 2022-04-14 23:02:14,815 | server.py:277 | fit_round received 3 results and 0 failures
DEBUG flower 2022-04-14 23:02:14,821 | server.py:211 | evaluate_round: strategy sampled 3 clients (out of 3)
DEBUG flower 2022-04-14 23:02:16,048 | server.py:223 |

# Visualización de los resultados

Se mostrarán las métricas obtenidas durante el proceso de entrenamiento federado junto con visualizaciones de la evolucion

In [1]:
import matplotlib

font = {'family' : 'serif',
        'weight' : 'normal',
        'size'   : 16}

matplotlib.rc('font', **font)

df = pd.read_csv('./results/hito2.csv')
# df["mean"] = df.mean(numeric_only=True, axis=1)
df.columns = ["ORG 1", "ORG 2", "ORG 3"]

print(f'Acc máximo: {df.mean(numeric_only=True, axis=1).max()}')

f = plt.figure(figsize=(10,6))
ax = f.add_subplot()

df.plot(ax=ax, ylim=(0.5,1), linewidth=2.5)
ax.set_ylabel("Accuracy")
ax.set_xlabel("Federated round")

plt.savefig('./entrenamiento-federado.pdf')

NameError: name 'pd' is not defined

In [11]:
# Acc máximo: 0.7018698453903198
# 0.7018698453903198

In [12]:
df = pd.read_csv('./results/hito2_UCs.csv')
df

Unnamed: 0,UC,best_rnd,acc
0,cliente_1.csv,16,0.721702
1,cliente_2.csv,46,0.72861
2,cliente_3.csv,11,0.721907
3,cliente_4.csv,15,0.712347
4,cliente_5.csv,12,0.717796
5,cliente_6.csv,49,0.730177
6,cliente_7.csv,18,0.723837
7,cliente_8.csv,12,0.724856
8,cliente_9.csv,9,0.723093
9,cliente_10.csv,45,0.719289
