<a href="https://colab.research.google.com/github/Giuse1093/CSI_Project4/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
# Installazione librerie (se mancano)
!pip install -q flwr tensorflow pandas numpy scipy scikit-learn

import os
import zipfile

# Extract the training archive only when the target folder is missing.
# The folder is checked first so that an already-extracted dataset is accepted
# even when the zip file is no longer around (previously that case wrongly
# printed the "upload the zip" warning).
if os.path.exists("CSV_train"):
    print("Cartella CSV_train già presente.")
elif os.path.exists("CSV_train.zip"):
    print("Decompressione CSV_train.zip in corso...")
    with zipfile.ZipFile("CSV_train.zip", 'r') as zip_ref:
        zip_ref.extractall("CSV_train")
    print("Fatto!")
else:
    # Neither the folder nor the archive is available: nothing to train on.
    print("ATTENZIONE: Carica il file 'CSV_train.zip' su Colab!")

Cartella CSV_train già presente.


In [21]:
import pandas as pd
import numpy as np
import ast
from scipy.fft import fft
from sklearn.preprocessing import StandardScaler
import warnings

# Silence warnings for the rest of the session.
# NOTE(review): the filter is registered with no category/module restriction,
# so it hides every warning, not only minor pandas ones.
warnings.filterwarnings('ignore')

# Columns that contain the time series
TS_COLS = ['hr_time_series', 'resp_time_series', 'stress_time_series']

def clean_series_interpolation(series_str):
    """
    Parse a stringified numeric sequence and repair its invalid samples.

    Steps:
    1. Parse the string into a float array with ``ast.literal_eval``.
    2. Replace non-positive values (<= 0) with NaN.
    3. Linearly interpolate the NaN gaps, filling in both directions so that
       leading/trailing gaps are covered too and the sequence length is kept.
    4. Replace any NaN still left (no valid sample at all) with 0.

    Parameters
    ----------
    series_str : str
        Text of a Python list literal, e.g. ``"[60, -1, 62]"``.

    Returns
    -------
    numpy.ndarray
        The cleaned float array. An empty array is returned when the input is
        not a string or when parsing/cleaning fails.
    """
    if not isinstance(series_str, str):
        return np.array([])

    try:
        # 1. Parsing
        data = np.array(ast.literal_eval(series_str), dtype=float)

        # 2. Mark sensor errors / placeholders (-1, -2, 0) as missing
        data[data <= 0] = np.nan

        # 3. Linear interpolation (needs a pandas Series)
        s_data = pd.Series(data).interpolate(method='linear', limit_direction='both')

        # 4. Whatever is still NaN (e.g. every sample was invalid) becomes 0
        return s_data.fillna(0).values
    except Exception:
        # Best effort: a malformed cell yields an empty series instead of
        # aborting the whole dataframe. `Exception` (rather than a bare
        # `except`) lets KeyboardInterrupt / SystemExit propagate.
        return np.array([])

def extract_features_datasets_py(clean_data):
    """
    Summarize a cleaned series with six scalar descriptors.

    The returned list is, in order: mean, standard deviation, 25th percentile,
    median, 75th percentile and spectral energy (sum of the squared FFT
    magnitudes divided by the number of samples). An empty input yields six
    zeros.

    NOTE(review): the original author states this mirrors `calculate_params`
    in `datasets.py`; that file is not visible here, so confirm if it matters.
    """
    n_samples = len(clean_data)
    if n_samples == 0:
        return [0.0] * 6

    # Location and spread statistics, appended in output order
    summary = [np.mean(clean_data), np.std(clean_data)]
    for quantile in (25, 50, 75):  # 50 is the median
        summary.append(np.percentile(clean_data, quantile))

    # Spectral energy from the FFT magnitudes
    magnitudes = np.abs(fft(clean_data))
    summary.append(np.sum(magnitudes**2) / n_samples)

    return summary

def process_dataframe(df):
    """
    Convert a raw dataframe into a dataframe of numeric features.

    For every row, each column listed in TS_COLS is cleaned with
    `clean_series_interpolation` and summarized with
    `extract_features_datasets_py`; the per-column feature lists are
    concatenated in TS_COLS order. The result has one row per input row and a
    fresh default index.
    """
    # Descriptive column names: <signal>_<statistic>, grouped by signal
    signal_tags = ['hr', 'resp', 'stress']
    stat_tags = ['mean', 'std', 'p25', 'p50', 'p75', 'energy']
    feature_names = [f"{sig}_{stat}" for sig in signal_tags for stat in stat_tags]

    feature_rows = []
    for _, record in df.iterrows():
        per_row = []
        for ts_col in TS_COLS:
            # Clean the raw series, then extract its features
            cleaned = clean_series_interpolation(record[ts_col])
            per_row += extract_features_datasets_py(cleaned)
        feature_rows.append(per_row)

    return pd.DataFrame(feature_rows, columns=feature_names)

print("Funzioni di preprocessing definite correttamente.")

Funzioni di preprocessing definite correttamente.


In [22]:
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

def get_model(input_dim):
    """
    Build and compile a dense feed-forward network for regression.

    Architecture: Input(input_dim) -> Dense(128, relu) -> Dropout(0.3)
    -> Dense(64, relu) -> Dropout(0.2) -> Dense(32, relu) -> Dense(1, linear).
    Compiled with the Adam optimizer, MSE loss and MAE as metric.
    """
    # (units, dropout rate) per hidden layer; None means no dropout afterwards.
    # Dropout is there to limit overfitting, since each client has little data.
    hidden_spec = ((128, 0.3), (64, 0.2), (32, None))

    layer_stack = [layers.Input(shape=(input_dim,))]
    for units, drop_rate in hidden_spec:
        layer_stack.append(layers.Dense(units, activation='relu'))
        if drop_rate is not None:
            layer_stack.append(layers.Dropout(drop_rate))

    # Output layer: a single linear neuron producing the predicted score
    layer_stack.append(layers.Dense(1, activation='linear'))

    model = models.Sequential(layer_stack)
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

print("Modello definito.")

Modello definito.


In [23]:
# --- CELLA 4: FEDERATED LEARNING (CORRETTA) ---
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split  # <--- ECCO L'IMPORT CHE MANCAVA
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import callbacks

# --- CONFIGURATION ---
TRAIN_ROOT = "CSV_train"
ROUNDS = 15
EPOCHS_PER_CLIENT = 3
BATCH_SIZE = 16
# Clients with fewer rows than this are skipped (too little data to split).
_MIN_CLIENT_ROWS = 10


def _list_csv_paths(folder):
    """Return the full paths of the .csv files directly inside `folder`, sorted by name."""
    # os.listdir gives no ordering guarantee; sorting keeps the concatenated
    # row order (and therefore the seeded train/val split) deterministic.
    return sorted(
        os.path.join(folder, name)
        for name in os.listdir(folder)
        if name.endswith('.csv')
    )


def _prepare_client_data(client_folder):
    """Load one client's CSVs and return (X_train, y_train, X_val, y_val), or None to skip it."""
    frames = []
    for csv_path in _list_csv_paths(client_folder):
        try:
            frames.append(pd.read_csv(csv_path, sep=';'))
        except Exception as e:
            # Best effort: keep the readable files, but say which one was dropped.
            print(f"   Impossibile leggere {csv_path}: {e}")

    if not frames:
        return None

    # Merge the data of every user in the group
    df_client = pd.concat(frames, ignore_index=True)

    # Cheap checks first: feature extraction yields one row per input row,
    # so both conditions can be tested before doing the expensive work.
    if 'label' not in df_client.columns or len(df_client) < _MIN_CLIENT_ROWS:
        return None

    X_client = process_dataframe(df_client)
    y_client = df_client['label'].values

    # Local train/validation split (fixed seed)
    X_train, X_val, y_train, y_val = train_test_split(
        X_client, y_client, test_size=0.2, random_state=42
    )

    # Local normalization: the scaler is fitted on the training part only
    scaler = StandardScaler()
    return scaler.fit_transform(X_train), y_train, scaler.transform(X_val), y_val


def _average_weights(weight_sets):
    """Return the per-layer element-wise mean of several model weight lists."""
    # NOTE: plain unweighted mean - every client counts the same, regardless
    # of how many samples it trained on.
    return [np.mean(layer_group, axis=0) for layer_group in zip(*weight_sets)]


def run_federated_learning_smart():
    """
    Simulate federated training over the client folders found under TRAIN_ROOT.

    A folder is a client when it directly contains at least one .csv file with
    'train' in its name. Each round, every usable client trains a fresh copy
    of the model initialised with the current global weights, and the
    resulting weights are averaged into the new global weights.

    Returns
    -------
    The trained global Keras model, or None when no training CSV is found.
    """
    # 1. AUTOMATIC DISCOVERY OF THE GROUPS (CLIENTS)
    client_folders = [
        root
        for root, dirs, files in os.walk(TRAIN_ROOT)
        if any(f.endswith('.csv') and 'train' in f for f in files)
    ]

    if not client_folders:
        print("❌ ERRORE: Nessun file .csv di training trovato! Verifica il file zip.")
        # A single None (not a tuple) so that the caller's check on the
        # returned model correctly detects the failure.
        return None

    print(f"✅ Trovati {len(client_folders)} Gruppi/Client:")
    for c in client_folders:
        print(f"   - {c} ({len(_list_csv_paths(c))} file csv)")

    # 2. GLOBAL MODEL INITIALISATION
    # A few rows of one file are enough to learn the number of features.
    first_csv = _list_csv_paths(client_folders[0])[0]

    print("\nAnalisi dimensioni input...")
    sample_df = pd.read_csv(first_csv, sep=';')
    sample_X = process_dataframe(sample_df.head(5))
    input_dim = sample_X.shape[1]
    print(f"Dimensioni Feature Input: {input_dim}")

    global_model = get_model(input_dim)
    global_weights = global_model.get_weights()

    # 3. CLIENT DATA PREPARATION
    # Loading, feature extraction, split and scaling do not depend on the
    # round (the split seed is fixed), so they are done once per client
    # instead of being repeated in every round.
    client_datasets = {}
    for client_folder in client_folders:
        try:
            prepared = _prepare_client_data(client_folder)
        except Exception as e:
            print(f"   Errore nel gruppo {os.path.basename(client_folder)}: {e}")
            continue
        if prepared is not None:
            client_datasets[client_folder] = prepared

    # --- FEDERATED LOOP ---
    print("\nInizio Training Federato...")
    for round_num in range(ROUNDS):
        print(f"\n--- Round {round_num + 1}/{ROUNDS} ---")
        local_weights_list = []
        local_losses = []

        for client_folder, (X_train_scaled, y_train, X_val_scaled, y_val) in client_datasets.items():
            try:
                # Local training starts from the current global weights
                client_model = get_model(input_dim)
                client_model.set_weights(global_weights)

                es = callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

                history = client_model.fit(
                    X_train_scaled, y_train,
                    validation_data=(X_val_scaled, y_val),
                    epochs=EPOCHS_PER_CLIENT,
                    batch_size=BATCH_SIZE,
                    verbose=0,
                    callbacks=[es]
                )

                local_weights_list.append(client_model.get_weights())
                local_losses.append(history.history['val_loss'][-1])

            except Exception as e:
                # One failing client must not stop the round
                print(f"   Errore nel gruppo {os.path.basename(client_folder)}: {e}")

        # AGGREGATION
        if local_weights_list:
            global_weights = _average_weights(local_weights_list)
            global_model.set_weights(global_weights)

            avg_loss = np.mean(local_losses)
            print(f"   >>> Loss di Validazione Media (Global): {avg_loss:.2f}")
        else:
            print("   Nessun client ha completato il training in questo round.")

    return global_model

# RUN: execute the federated training and keep whatever the function returns
final_model = run_federated_learning_smart()

✅ Trovati 9 Gruppi/Client:
   - CSV_train/CSV_train/group1 (5 file csv)
   - CSV_train/CSV_train/group3 (5 file csv)
   - CSV_train/CSV_train/group4 (5 file csv)
   - CSV_train/CSV_train/group2 (5 file csv)
   - CSV_train/CSV_train/group0 (5 file csv)
   - CSV_train/CSV_train/group6 (5 file csv)
   - CSV_train/CSV_train/group7 (5 file csv)
   - CSV_train/CSV_train/group5 (5 file csv)
   - CSV_train/CSV_train/group8 (5 file csv)

Analisi dimensioni input...
Dimensioni Feature Input: 18

Inizio Training Federato...

--- Round 1/15 ---
   >>> Loss di Validazione Media (Global): 5786.79

--- Round 2/15 ---
   >>> Loss di Validazione Media (Global): 5197.73

--- Round 3/15 ---
   >>> Loss di Validazione Media (Global): 4049.89

--- Round 4/15 ---
   >>> Loss di Validazione Media (Global): 2427.88

--- Round 5/15 ---
   >>> Loss di Validazione Media (Global): 1304.96

--- Round 6/15 ---
   >>> Loss di Validazione Media (Global): 974.91

--- Round 7/15 ---
   >>> Loss di Validazione Media (Gl

In [24]:
# A usable model exposes `predict`. Testing for it (instead of truthiness)
# sends every failure value - None, or a non-empty tuple such as (None, None),
# which is truthy - to the failure branch instead of crashing on `.predict`.
if hasattr(final_model, "predict"):
    print("\nGenerazione predizioni per x_test.csv...")
    df_test = pd.read_csv("x_test.csv", sep=';')

    # Feature extraction
    X_test_extracted = process_dataframe(df_test)

    # Normalization: the scaler is fitted on the test features themselves
    # (the scalers used during training are local to each client and are not
    # returned by the training function).
    scaler_test = StandardScaler()
    X_test_scaled = scaler_test.fit_transform(X_test_extracted)

    # Prediction
    predictions = final_model.predict(X_test_scaled)

    # Build the submission CSV: one predicted label per test id
    submission = pd.DataFrame({
        'id': df_test['id'],
        'label': predictions.flatten()
    })

    submission.to_csv('submission.csv', index=False)
    print("File 'submission.csv' pronto per il download!")
    print(submission.head())
else:
    print("Training fallito, nessun modello generato.")


Generazione predizioni per x_test.csv...
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
File 'submission.csv' pronto per il download!
   id      label
0   0  84.308388
1   1  72.784363
2   2  72.416115
3   3  77.850410
4   4  75.255302
