In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error, r2_score

In [2]:
from google.colab import drive
drive.mount('/content/drive')
#Path archivo:
file_path="/content/drive/MyDrive/4º/3r Trimestre/ML/S5/"

Mounted at /content/drive


# Seminar 5 - Federated Learning

Federated learning:

En las slides lo explica

client 1(w1)...client k(wk) (all models with t weights) --> parameter server

Iteration:

1. Generate global model w0
2. Client selection, S belongs to K
3. Push global model to S
4. Local server + aggregation (FedAvg)

Javier González Otero - 243078

Jordi Guillén González - 253027

David Sánchez Maldonado - 253798

## 0 - Data preparation and cleaning (Poner uno de los archivos de cliente como ejemplo ya que todos tienen las mismas features)

In [40]:
# Paths to skelleton data:
train_client1_features = "data/client_datasets/client_1_features.csv"
train_client1_labels = "data/client_datasets/client_1_labels.csv"

# CSI
# Load train_features into a pandas dataframe
dataframe = pd.read_csv(file_path+train_client1_features, sep=r',', header=None)
dataframe2 = pd.read_csv(file_path+train_client1_labels, sep=r',', header=None)

In [34]:
# Check for missing values
null_counts = dataframe.isnull().sum()
print("Missing values per column:\n", null_counts)

Missing values per column:
 0      0
1      0
2      0
3      0
4      0
      ..
309    0
310    0
311    0
312    0
313    0
Length: 314, dtype: int64


### Data statistics

In [35]:
dataframe.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,304,305,306,307,308,309,310,311,312,313
0,11,11,11,11,11,11,11,11,11,11,...,9,9,9,9,9,9,9,9,9,9


## Part 1 - ML model preparation

De momenento uso el svm del semi 2 que ha dicho que lo podemos usar. Se puede cambiar al que queramos.

In [16]:
def train_svm_model(X_train, y_train, kernel='rbf', C=1.0, gamma='scale'):
    '''
    Trains a Support Vector Machine (SVM) classifier on the given CSI feature data.

    Parameters:
    - X_train (pd.DataFrame or np.ndarray): CSI features (shape: [n_samples, 270]).
    - y_train (pd.Series or np.ndarray): Labels (values from 1 to 5).
    - kernel (str): Kernel type ('linear', 'rbf', 'poly', etc.)
    - C (float): Regularization parameter.
    - gamma (str or float): Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.

    Returns:
    - model: Trained SVM model.
    - scaler: StandardScaler used for normalization.
    '''
    # flatten train labels
    y_train = y_train.iloc[0, :].values.ravel()

    # Normalize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_train)

    # Train SVM
    svm = SVC(kernel=kernel, C=C, gamma=gamma)
    svm.fit(X_scaled, y_train)

    return svm, scaler

def predict_with_svm(model, scaler, X_test):
    '''
    Generates predictions on the test dataset using a trained SVM model.

    Parameters:
    - model: Trained SVM classifier.
    - scaler: StandardScaler used during training.
    - X_test (pd.DataFrame or np.ndarray): Test features.

    Returns:
    - np.ndarray: Predicted labels for the test samples.
    '''
    X_scaled = scaler.transform(X_test)
    y_pred = model.predict(X_scaled)
    return y_pred

In [41]:
model, scaler = train_svm_model(dataframe, dataframe2, kernel='rbf', C=10, gamma='scale')
#pred_labels = predict_with_svm(model, scaler, test_features)

## Part 2 - Preparation of the FL setting

In [53]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import random
import copy

# Assuming train_svm_model and predict_with_svm functions are defined in the notebook as provided in the context.

def run_federated_svm_training(client_datasets, model_params, num_iterations, clients_per_iteration):
    """
    Runs a Federated Learning training process for SVM models.

    Args:
        client_datasets (list): A list of tuples, where each tuple contains
                                (features_df, labels_df) for a client.
        model_params (dict): Parameters for the initial global model (e.g., kernel, C, gamma).
        num_iterations (int): The number of training iterations.
        clients_per_iteration (int): The number of clients to select in each iteration.

    Returns:
        sklearn.svm.SVC: The final global SVM model after federated training.
                         Note: The aggregation for SVM is a placeholder as direct FedAvg
                         of SVM parameters is not standard.
    """
    # INITIALIZE GLOBAL ML MODEL
    num_clients = len(client_datasets)
    scalers = {} # Store scalers for each client

    # Train an initial model on the first client's data as a starting point
    X_train_initial, y_train_initial = client_datasets[0]
    # Flatten labels for training
    y_train_initial_flat = y_train_initial.iloc[0, :].values.ravel()
    scaler_initial = StandardScaler()
    X_scaled_initial = scaler_initial.fit_transform(X_train_initial)
    global_model = SVC(**model_params)
    global_model.fit(X_scaled_initial, y_train_initial_flat)
    scalers[0] = scaler_initial # Store the scaler for the first client

    print("Global model initialized.")

    # REPEAT STEPS 2-5 UNTIL CONVERGENCE (simulated by num_iterations)
    for iteration in range(num_iterations):
        print(f"\n--- Iteration {iteration + 1} ---")

        # SELECT A SUBSET OF CLIENTS
        selected_clients = random.sample(range(num_clients), clients_per_iteration)
        print(f"Selected clients: {selected_clients}")

        client_models = []
        for client_id in selected_clients:
            print(f"Training on client {client_id}...")

            # SEND GLOBAL MODEL TO CLIENTS (simulated by passing the global_model object)
            # RETRAINED BY THE CLIENTS USING THEIR LOCAL DATASETS
            X_train, y_train = client_datasets[client_id]
            # Use the scaler fitted during initial training or fit a new one per client
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X_train)
            y_train_flat = y_train.iloc[0, :].values.ravel()

            client_model = copy.deepcopy(global_model) # Client receives a copy
            client_model.fit(X_scaled, y_train_flat)
            scalers[client_id] = scaler # Store the scaler for this client

            client_models.append(client_model)
            print(f"Training on client {client_id} complete.")

        # RETRIEVE INDIVIDUAL MODELS FROM SELECTED CLIENTS (collected in client_models list)

        # AGGREGATE THE INDIVIDUAL CONTRIBUTIONS TO UPDATE THE GLOBAL MODEL (FedAvg placeholder)
        print("Aggregating models...")
        # Placeholder for FedAvg aggregation for SVM.
        # As noted previously, direct averaging of SVM parameters is not standard or effective.
        # A proper implementation for SVM in FL would require a different aggregation method.
        # For the purpose of demonstrating the FedAvg structure, we print the weights
        # but do not perform actual SVM parameter averaging. The global model is not updated here.
        total_data_size = sum([client_datasets[i][0].shape[0] for i in selected_clients])
        for client_id in selected_clients:
             client_data_size = client_datasets[client_id][0].shape[0]
             alpha = client_data_size / total_data_size
             print(f"Client {client_id} aggregation weight (alpha): {alpha}")
        # In a real SVM FL, you would update global_model based on client_models here
        # using an appropriate SVM aggregation method.
        # global_model = new_aggregated_model

        print("Aggregation complete.")

    print("\nFederated training finished.")
    return global_model

# Example usage (assuming you have loaded client data into a list of tuples):
# client_datasets = [
#     (client_1_features_df, client_1_labels_df),
#     (client_2_features_df, client_2_labels_df),
#     # Add more client datasets here
# ]

# Dummy data for demonstration
num_clients = 10
client_datasets = []
for i in range(num_clients):
    # Load data files
    client_file = f"{file_path}data/client_datasets/client_{i+1}_features.csv"
    label_file = f"{file_path}data/client_datasets/client_{i+1}_labels.csv"
    features = pd.read_csv(client_file)#pd.DataFrame(pd.read_csv(client_file))
    labels = pd.read_csv(label_file)#pd.DataFrame(pd.read_csv(label_file))
    client_datasets.append([features, labels])

X_train_initial, y_train_initial = client_datasets[0]
y_train_initial.head()

# Define model parameters
#model_params = {'kernel': 'rbf', 'C': 1.0, 'gamma': 'scale'}

# Run the federated training function
# final_global_model = run_federated_svm_training(
#     client_datasets=client_datasets,
#     model_params=model_params,
#     num_iterations=5,
#     clients_per_iteration=2
# )

# After training, you can use the final_global_model for evaluation or inference.
# Note: The scaler used during prediction would depend on how you handle scaling in a real FL scenario.
# For this example, let's assume you use the scaler of the first client as a placeholder:
# if final_global_model and len(client_datasets) > 0:
#     dummy_test_features = pd.DataFrame(np.random.rand(10, 270))
#     # Assuming you stored scalers during training and want to use one (e.g., client 0's scaler)
#     # You would need a way to access the scalers if the function returned them or stored them globally/in an object
#     # For simplicity in this example, this part is commented out as the function doesn't explicitly return scalers.
#     # You would need to adapt the function or class to manage scalers for prediction.
#     print("\nFinal global model is available for prediction (scaling needs to be handled).")
# else:
#      print("\nGlobal model not available after training.")

Unnamed: 0,11,11.1,11.2,11.3,11.4,11.5,11.6,11.7,11.8,11.9,...,9.19,9.20,9.21,9.22,9.23,9.24,9.25,9.26,9.27,9.28


In [None]:
#Load data files

clients_datasets=[]

for i in range(10):
  client_file = f"{file_path}data/client_datasets/client_{i+1}_features.csv"
  label_file = f"{file_path}data/client_datasets/client_{i+1}_labels.csv"
  clients_datasets.append((pd.DataFrame(pd.read_csv(client_file)),pd.DataFrame(pd.read_csv(label_file))))

def federated_learning(clients_datasets):



## Part 3 - Collaborative training of the model