In [62]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import random
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F

# Seminar 5 - Federated Learning

Javier González Otero - 243078

Jordi Guillén González - 253027

David Sánchez Maldonado - 253798

## Part 0: Data preprocessing

In [20]:
file_path_clients = "data/client_datasets/"  # Path to the directory containing training data for each client

# Lists to store training features and labels DataFrames for each client
train_features_dfs = []
train_labels_dfs = []

# Loop over the 10 clients
for client_id in range(1, 11):
    # Load training features and labels for the current client
    features_df = pd.read_csv(f"{file_path_clients}client_{client_id}_features.csv", header=None)
    labels_df = pd.read_csv(f"{file_path_clients}client_{client_id}_labels.csv", header=None)

    # Append the loaded data to the lists
    train_features_dfs.append(features_df)
    train_labels_dfs.append(labels_df)

# Load test features and labels
test_features_df = pd.read_csv('data/test_features.csv', header=None)
test_labels_df = pd.read_csv('data/test_labels.csv', header=None)

**Check for missing values**

In [36]:
def check_missing_values(df):
    """
    Checks for missing (NaN) values in a DataFrame.

    Prints the total number of missing values and 
    the row indices where they occur, if any.
    """
    total_missing = df.isnull().sum().sum()
    
    if total_missing > 0:
        print(f"Missing values found: {total_missing}")
        missing_rows = df[df.isnull().any(axis=1)]
        print(f"Rows with missing values:\n{missing_rows.index.tolist()}")
    else:
        print("No missing values found.")


In [42]:
print("Checking training data (features and labels):")
for i in range(10):  # 10 clients
    print(f"\nClient {i+1}")
    print("- Features:")
    check_missing_values(train_features_dfs[i])
    print("- Labels:")
    check_missing_values(train_labels_dfs[i])
print("\nChecking test data (features and labels):")
print("- Features:")
check_missing_values(test_features_df)
print("- Labels:")
check_missing_values(test_labels_df)


Checking training data (features and labels):

Client 1
- Features:
No missing values found.
- Labels:
No missing values found.

Client 2
- Features:
No missing values found.
- Labels:
No missing values found.

Client 3
- Features:
No missing values found.
- Labels:
No missing values found.

Client 4
- Features:
No missing values found.
- Labels:
No missing values found.

Client 5
- Features:
No missing values found.
- Labels:
No missing values found.

Client 6
- Features:
No missing values found.
- Labels:
No missing values found.

Client 7
- Features:
No missing values found.
- Labels:
No missing values found.

Client 8
- Features:
No missing values found.
- Labels:
No missing values found.

Client 9
- Features:
No missing values found.
- Labels:
No missing values found.

Client 10
- Features:
No missing values found.
- Labels:
No missing values found.

Checking test data (features and labels):
- Features:
No missing values found.
- Labels:
No missing values found.


No cleaning is needed

**Obtain the data proportion associated to each client**

In [50]:
num_samples_client_1 = len(train_features_dfs[0])
num_samples_client_2 = len(train_features_dfs[1])
num_samples_client_3 = len(train_features_dfs[2])
num_samples_client_4 = len(train_features_dfs[3])
num_samples_client_5 = len(train_features_dfs[4])
num_samples_client_6 = len(train_features_dfs[5])
num_samples_client_7 = len(train_features_dfs[6])
num_samples_client_8 = len(train_features_dfs[7])
num_samples_client_9 = len(train_features_dfs[8])
num_samples_client_10 = len(train_features_dfs[9])

total_samples = (
    num_samples_client_1 + num_samples_client_2 + num_samples_client_3 +
    num_samples_client_4 + num_samples_client_5 + num_samples_client_6 +
    num_samples_client_7 + num_samples_client_8 + num_samples_client_9 +
    num_samples_client_10
)

## Part 1 - ML model preparation

De momenento uso el svm del semi 2 que ha dicho que lo podemos usar. Se puede cambiar al que queramos.

In [57]:
def train_svm_model(X_train, y_train, kernel='rbf', C=1.0, gamma='scale'):
    '''
    Trains a Support Vector Machine (SVM) classifier on the given CSI feature data.

    Parameters:
    - X_train (pd.DataFrame or np.ndarray): CSI features (shape: [n_samples, 270]).
    - y_train (pd.Series or np.ndarray): Labels (values from 1 to 5).
    - kernel (str): Kernel type ('linear', 'rbf', 'poly', etc.)
    - C (float): Regularization parameter.
    - gamma (str or float): Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.

    Returns:
    - model: Trained SVM model.
    - scaler: StandardScaler used for normalization.
    '''
    # flatten train labels
    y_train = y_train.iloc[0, :].values.ravel()

    # Normalize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_train)

    # Train SVM
    svm = SVC(kernel=kernel, C=C, gamma=gamma)
    svm.fit(X_scaled, y_train)

    return svm, scaler

def predict_with_svm(model, scaler, X_test):
    '''
    Generates predictions on the test dataset using a trained SVM model.

    Parameters:
    - model: Trained SVM classifier.
    - scaler: StandardScaler used during training.
    - X_test (pd.DataFrame or np.ndarray): Test features.

    Returns:
    - np.ndarray: Predicted labels for the test samples.
    '''
    X_scaled = scaler.transform(X_test)
    y_pred = model.predict(X_scaled)
    return y_pred

In [64]:
class PoseClassifier(nn.Module):
    def __init__(self, input_dim=270, hidden_dim=128, output_dim=12):
        super(PoseClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)  # logits (use CrossEntropyLoss)

## Part 2 - Preparation of the FL setting

In [66]:
def fl_round_fn(clients_data, selected_clients, global_model, epochs=5, batch_size=32, lr=0.01):
    """
    Performs local training for each selected client and returns their updated weights and dataset sizes.

    Returns
    -------
    updates : list of dict
        List of state_dicts with updated model weights.
    sizes : list of int
        Number of training samples for each client.
    """
    updates = []
    sizes = []

    for client_id in selected_clients:
        X_train, y_train = clients_data[client_id]
        model = PoseClassifier()
        model.load_state_dict(global_model.state_dict())  # Copy global model
        
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()

        model.train()
        X_tensor = torch.tensor(X_train.values, dtype=torch.float32)
        y_tensor = torch.tensor(y_train.values.ravel(), dtype=torch.long)

        dataset = torch.utils.data.TensorDataset(X_tensor, y_tensor)
        loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

        for _ in range(epochs):
            for xb, yb in loader:
                optimizer.zero_grad()
                loss = criterion(model(xb), yb)
                loss.backward()
                optimizer.step()

        updates.append(model.state_dict())
        sizes.append(len(X_train))

    return updates, sizes


In [68]:
def fed_avg(weight_updates, sizes):
    """
    Performs Federated Averaging over model weights.

    Parameters
    ----------
    weight_updates : list of state_dicts (models' weights)
    sizes : list of int, number of samples per client

    Returns
    -------
    avg_weights : state_dict representing the averaged model
    """
    total_size = sum(sizes)
    avg_weights = {}

    for key in weight_updates[0].keys():
        avg_weights[key] = sum(
            update[key] * (size / total_size)
            for update, size in zip(weight_updates, sizes)
        )

    return avg_weights


In [70]:
class Orchestrator:
    def __init__(self, clients_data, model_class):
        """
        Initialize the Orchestrator with client training data and model class.

        Parameters
        ----------
        clients_data : list of tuples
            Each tuple contains (X_train, y_train) for a client.
        model_class : callable
            A class (e.g., PoseClassifier) that returns a PyTorch model instance.
        """
        self.clients_data = clients_data
        self.model_class = model_class
        self.global_model = None

    def initialize_model(self):
        """
        Initializes the global model.
        """
        self.global_model = self.model_class()

    def select_clients(self, num_clients):
        """
        Randomly selects a subset of clients for the current training round.

        Parameters
        ----------
        num_clients : int
            Number of clients to select.
        
        Returns
        -------
        list of int : Selected client indices.
        """
        return random.sample(range(len(self.clients_data)), num_clients)

    def run_federated_training(self, fl_round_fn, fedavg_fn, num_rounds=5, clients_per_round=3, tolerance=1e-3):
        """
        Runs the Federated Learning process over several rounds.

        Parameters
        ----------
        fl_round_fn : function
            Function that performs a federated round and returns weights and sizes.
        fedavg_fn : function
            Function that aggregates model weights using FedAvg.
        num_rounds : int
            Maximum number of training rounds.
        clients_per_round : int
            Number of clients selected in each round.
        tolerance : float
            Convergence criterion based on weight change norm.
        """
        prev_weights = None

        for round_num in range(num_rounds):
            print(f"\n[Round {round_num + 1}] Selecting clients...")
            selected = self.select_clients(clients_per_round)

            print(f"[Round {round_num + 1}] Training local models...")
            updates, sizes = fl_round_fn(self.clients_data, selected, self.global_model)

            print(f"[Round {round_num + 1}] Aggregating with FedAvg...")
            avg_weights = fedavg_fn(updates, sizes)
            self.global_model.load_state_dict(avg_weights)

            # Check for convergence (optional)
            if prev_weights is not None:
                deltas = sum(torch.norm(avg_weights[k] - prev_weights[k]).item() for k in avg_weights)
                print(f"[Round {round_num + 1}] Weight change: {deltas:.6f}")
                if deltas < tolerance:
                    print("[Convergence reached]")
                    break

            prev_weights = {k: v.clone().detach() for k, v in avg_weights.items()}


In [53]:
# Assuming train_svm_model and predict_with_svm functions are defined in the notebook as provided in the context.

def run_federated_svm_training(client_datasets, model_params, num_iterations, clients_per_iteration):
    """
    Runs a Federated Learning training process for SVM models.

    Args:
        client_datasets (list): A list of tuples, where each tuple contains
                                (features_df, labels_df) for a client.
        model_params (dict): Parameters for the initial global model (e.g., kernel, C, gamma).
        num_iterations (int): The number of training iterations.
        clients_per_iteration (int): The number of clients to select in each iteration.

    Returns:
        sklearn.svm.SVC: The final global SVM model after federated training.
                         Note: The aggregation for SVM is a placeholder as direct FedAvg
                         of SVM parameters is not standard.
    """
    # INITIALIZE GLOBAL ML MODEL
    num_clients = len(client_datasets)
    scalers = {} # Store scalers for each client

    # Train an initial model on the first client's data as a starting point
    X_train_initial, y_train_initial = client_datasets[0]
    # Flatten labels for training
    y_train_initial_flat = y_train_initial.iloc[0, :].values.ravel()
    scaler_initial = StandardScaler()
    X_scaled_initial = scaler_initial.fit_transform(X_train_initial)
    global_model = SVC(**model_params)
    global_model.fit(X_scaled_initial, y_train_initial_flat)
    scalers[0] = scaler_initial # Store the scaler for the first client

    print("Global model initialized.")

    # REPEAT STEPS 2-5 UNTIL CONVERGENCE (simulated by num_iterations)
    for iteration in range(num_iterations):
        print(f"\n--- Iteration {iteration + 1} ---")

        # SELECT A SUBSET OF CLIENTS
        selected_clients = random.sample(range(num_clients), clients_per_iteration)
        print(f"Selected clients: {selected_clients}")

        client_models = []
        for client_id in selected_clients:
            print(f"Training on client {client_id}...")

            # SEND GLOBAL MODEL TO CLIENTS (simulated by passing the global_model object)
            # RETRAINED BY THE CLIENTS USING THEIR LOCAL DATASETS
            X_train, y_train = client_datasets[client_id]
            # Use the scaler fitted during initial training or fit a new one per client
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X_train)
            y_train_flat = y_train.iloc[0, :].values.ravel()

            client_model = copy.deepcopy(global_model) # Client receives a copy
            client_model.fit(X_scaled, y_train_flat)
            scalers[client_id] = scaler # Store the scaler for this client

            client_models.append(client_model)
            print(f"Training on client {client_id} complete.")

        # RETRIEVE INDIVIDUAL MODELS FROM SELECTED CLIENTS (collected in client_models list)

        # AGGREGATE THE INDIVIDUAL CONTRIBUTIONS TO UPDATE THE GLOBAL MODEL (FedAvg placeholder)
        print("Aggregating models...")
        # Placeholder for FedAvg aggregation for SVM.
        # As noted previously, direct averaging of SVM parameters is not standard or effective.
        # A proper implementation for SVM in FL would require a different aggregation method.
        # For the purpose of demonstrating the FedAvg structure, we print the weights
        # but do not perform actual SVM parameter averaging. The global model is not updated here.
        total_data_size = sum([client_datasets[i][0].shape[0] for i in selected_clients])
        for client_id in selected_clients:
             client_data_size = client_datasets[client_id][0].shape[0]
             alpha = client_data_size / total_data_size
             print(f"Client {client_id} aggregation weight (alpha): {alpha}")
        # In a real SVM FL, you would update global_model based on client_models here
        # using an appropriate SVM aggregation method.
        # global_model = new_aggregated_model

        print("Aggregation complete.")

    print("\nFederated training finished.")
    return global_model

# Example usage (assuming you have loaded client data into a list of tuples):
# client_datasets = [
#     (client_1_features_df, client_1_labels_df),
#     (client_2_features_df, client_2_labels_df),
#     # Add more client datasets here
# ]

# Dummy data for demonstration
num_clients = 10
client_datasets = []
for i in range(num_clients):
    # Load data files
    client_file = f"{file_path}data/client_datasets/client_{i+1}_features.csv"
    label_file = f"{file_path}data/client_datasets/client_{i+1}_labels.csv"
    features = pd.read_csv(client_file)#pd.DataFrame(pd.read_csv(client_file))
    labels = pd.read_csv(label_file)#pd.DataFrame(pd.read_csv(label_file))
    client_datasets.append([features, labels])

X_train_initial, y_train_initial = client_datasets[0]
y_train_initial.head()

# Define model parameters
#model_params = {'kernel': 'rbf', 'C': 1.0, 'gamma': 'scale'}

# Run the federated training function
# final_global_model = run_federated_svm_training(
#     client_datasets=client_datasets,
#     model_params=model_params,
#     num_iterations=5,
#     clients_per_iteration=2
# )

# After training, you can use the final_global_model for evaluation or inference.
# Note: The scaler used during prediction would depend on how you handle scaling in a real FL scenario.
# For this example, let's assume you use the scaler of the first client as a placeholder:
# if final_global_model and len(client_datasets) > 0:
#     dummy_test_features = pd.DataFrame(np.random.rand(10, 270))
#     # Assuming you stored scalers during training and want to use one (e.g., client 0's scaler)
#     # You would need a way to access the scalers if the function returned them or stored them globally/in an object
#     # For simplicity in this example, this part is commented out as the function doesn't explicitly return scalers.
#     # You would need to adapt the function or class to manage scalers for prediction.
#     print("\nFinal global model is available for prediction (scaling needs to be handled).")
# else:
#      print("\nGlobal model not available after training.")

Unnamed: 0,11,11.1,11.2,11.3,11.4,11.5,11.6,11.7,11.8,11.9,...,9.19,9.20,9.21,9.22,9.23,9.24,9.25,9.26,9.27,9.28


In [None]:
#Load data files

clients_datasets=[]

for i in range(10):
  client_file = f"{file_path}data/client_datasets/client_{i+1}_features.csv"
  label_file = f"{file_path}data/client_datasets/client_{i+1}_labels.csv"
  clients_datasets.append((pd.DataFrame(pd.read_csv(client_file)),pd.DataFrame(pd.read_csv(label_file))))

def federated_learning(clients_datasets):



## Part 3 - Collaborative training of the model