<a href="https://colab.research.google.com/github/Giuse1093/CSI_Project4/blob/main/data_preprocessing4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================================
# 1) SETUP LIBRERIE E UNZIP
# ============================================================
!pip install -q flwr tensorflow pandas numpy scipy scikit-learn
!pip install -q "cryptography>=44.0.1,<45.0.0"

import os
import zipfile
import warnings
warnings.filterwarnings('ignore')

import flwr as fl
import tensorflow as tf

print("Flower OK:", fl.__version__)
print("TensorFlow OK:", tf.__version__)

# Make the training data available under ./CSV_train.
# The folder is checked first: once it exists the archive is no longer needed,
# so a missing zip must not trigger the "upload the file" warning.
if os.path.exists("CSV_train"):
    print("Cartella CSV_train già presente.")
elif os.path.exists("CSV_train.zip"):
    print("Decompressione CSV_train.zip in corso...")
    with zipfile.ZipFile("CSV_train.zip", 'r') as zip_ref:
        zip_ref.extractall("CSV_train")
    print("Fatto!")
else:
    # Neither the extracted folder nor the archive is present.
    print("ATTENZIONE: Carica il file 'CSV_train.zip' su Colab!")


# ============================================================
# 2) PREPROCESSING: PULIZIA + FEATURES DA SERIE TEMPORALI
# ============================================================
import pandas as pd
import numpy as np
import ast
from scipy.fft import fft
from sklearn.preprocessing import StandardScaler

# Names of the columns that contain the time series
TS_COLS = ['hr_time_series', 'resp_time_series', 'stress_time_series']

def clean_series_interpolation(series_str):
    """
    Parse a stringified numeric list and repair invalid samples.

    Steps:
    1. Parse the string into a 1-D float array.
    2. Mark samples <= 0 (sensor error codes such as -1, -2, 0) and
       non-finite samples as NaN.
    3. Fill the gaps by linear interpolation; leading/trailing gaps take the
       nearest valid value (``limit_direction='both'``).

    Args:
        series_str: String representation of a list of numbers.

    Returns:
        A 1-D float ``numpy`` array with no NaN, or an empty array when the
        input is not a string, cannot be parsed, is not one-dimensional, has
        fewer than 5 samples, or contains no valid sample at all.
    """
    if not isinstance(series_str, str):
        return np.array([])

    try:
        data = np.array(ast.literal_eval(series_str), dtype=float)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Malformed or non-numeric content: treat as "no data".
        return np.array([])

    # Scalars and nested lists are not a usable time series.
    if data.ndim != 1:
        return np.array([])

    # Series that are too short carry little information.
    if len(data) < 5:
        return np.array([])

    # Replace invalid readings with NaN so they can be interpolated.
    data[~np.isfinite(data) | (data <= 0)] = np.nan

    # With no valid sample there is nothing to interpolate from; returning
    # the all-NaN array would propagate NaN into the extracted features.
    if np.isnan(data).all():
        return np.array([])

    # At least one valid sample exists, so interpolating in both directions
    # leaves no NaN behind.
    repaired = pd.Series(data).interpolate(method='linear', limit_direction='both')
    return repaired.to_numpy()

def extract_features_datasets_py(clean_data):
    """
    Summarise a cleaned series with six statistics.

    Returns a list ``[mean, std, p25, p50, p75, spectral_energy]`` where the
    spectral energy is the sum of squared FFT magnitudes divided by the
    number of samples. An empty input yields six zeros.
    """
    n_samples = len(clean_data)
    if n_samples == 0:
        return [0.0] * 6

    # Time-domain statistics
    average = np.mean(clean_data)
    spread = np.std(clean_data)
    q1, q2, q3 = (np.percentile(clean_data, q) for q in (25, 50, 75))

    # Frequency-domain statistic
    power = np.abs(fft(clean_data)) ** 2
    energy = np.sum(power) / n_samples

    return [average, spread, q1, q2, q3, energy]

def process_dataframe(df):
    """
    Turn a raw dataframe into a dataframe of numeric features.

    For every row, each column listed in TS_COLS is cleaned and summarised
    into six statistics; the per-column statistics are concatenated in
    TS_COLS order. The result has a fresh default index and columns named
    ``<prefix>_<stat>``.
    """
    feature_names = [
        f"{prefix}_{stat}"
        for prefix in ['hr', 'resp', 'stress']
        for stat in ['mean', 'std', 'p25', 'p50', 'p75', 'energy']
    ]

    def _row_features(record):
        # Concatenate the statistics of every time-series column of one row;
        # a missing column is passed on as NaN.
        values = []
        for series_col in TS_COLS:
            cleaned = clean_series_interpolation(record.get(series_col, np.nan))
            values += extract_features_datasets_py(cleaned)
        return values

    feature_rows = [_row_features(record) for _, record in df.iterrows()]
    return pd.DataFrame(feature_rows, columns=feature_names)

def clean_main_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    General cleaning of the raw dataframe.

    Steps:
    - drop rows whose ``label`` is missing (when the column exists)
    - replace the sentinel codes (-1, -2) with NaN on numeric feature columns
    - drop rows in which more than 40% of the columns are NaN
    - impute the remaining numeric NaN with the column median

    The ``label`` column is never modified: it is the training target, so
    imputing it would fabricate target values.

    Args:
        df: Raw dataframe; it is not modified.

    Returns:
        A cleaned copy of ``df``.
    """
    df = df.copy()

    # 1) Drop rows without a label; copy so later assignments act on an
    #    independent frame rather than on a filtered view.
    if 'label' in df.columns:
        df = df[df['label'].notna()].copy()

    # 2) Replace sentinel codes with NaN on numeric feature columns
    sentinel_values = [-1, -2]
    num_cols = [
        c for c in df.select_dtypes(include=[np.number]).columns
        if c != 'label'
    ]
    if num_cols:
        df[num_cols] = df[num_cols].replace(sentinel_values, np.nan)

    # 3) Drop rows with more than 40% NaN. Integer arithmetic is used so the
    #    limit is never rounded in the permissive direction: at most
    #    floor(0.4 * n_cols) NaN are tolerated per row.
    n_cols = df.shape[1]
    thresh = n_cols - (2 * n_cols) // 5
    df = df.dropna(axis=0, thresh=thresh)

    # 4) Impute the remaining numeric NaN with the column median
    for c in num_cols:
        median_val = df[c].median()
        df[c] = df[c].fillna(median_val)

    return df

print("Preprocessing definito correttamente.")


# ============================================================
# 3) MODELLO NN MIGLIORATO
# ============================================================
from tensorflow.keras import layers, models, callbacks, losses

def get_model(input_dim):
    """
    Build and compile the dense regression network.

    Three hidden blocks (128, 64, 32 units) each apply Dense + ReLU followed
    by BatchNormalization; the first two also apply Dropout(0.2). A single
    linear unit produces the output. The model is compiled with Adam
    (learning rate 1e-3), Huber loss (delta=5.0) and MAE/MSE metrics.
    """
    # (units, dropout rate or None) for each hidden block, in order
    hidden_blocks = ((128, 0.2), (64, 0.2), (32, None))

    stack = [layers.Input(shape=(input_dim,))]
    for units, drop_rate in hidden_blocks:
        stack.append(layers.Dense(units, activation='relu'))
        stack.append(layers.BatchNormalization())
        if drop_rate is not None:
            stack.append(layers.Dropout(drop_rate))
    stack.append(layers.Dense(1, activation='linear'))

    network = models.Sequential(stack)

    adam = tf.keras.optimizers.Adam(learning_rate=1e-3)
    huber = losses.Huber(delta=5.0)
    network.compile(optimizer=adam, loss=huber, metrics=['mae', 'mse'])
    return network

print("Modello definito.")


# ============================================================
# 4) FEDERATED LEARNING MIGLIORATO (FEDAVG PESATO)
# ============================================================
from sklearn.model_selection import train_test_split

# Root folder that is walked to discover the client folders
TRAIN_ROOT = "CSV_train"
# Number of federated rounds
ROUNDS = 15
# Maximum local epochs each client trains per round
EPOCHS_PER_CLIENT = 5
# Mini-batch size used for local training
BATCH_SIZE = 16

def _load_client_data(client_folder):
    """
    Load, clean, split and scale the data of one client folder.

    All ``.csv`` files in the folder are read (in sorted name order, so the
    row order and therefore the split are reproducible), concatenated,
    cleaned and converted to features. The data is split 80/20 with a fixed
    random state and standardised with a scaler fitted on the training part.

    Returns:
        ``(X_train_scaled, y_train, X_val_scaled, y_val)``, or ``None`` when
        the client has no readable CSV, no ``label`` column, fewer than 10
        rows, or fewer than 5 training rows.
    """
    frames = []
    for file_name in sorted(os.listdir(client_folder)):
        if not file_name.endswith('.csv'):
            continue
        csv_path = os.path.join(client_folder, file_name)
        try:
            frames.append(pd.read_csv(csv_path, sep=';'))
        except Exception as e:
            # Best effort: one unreadable file must not discard the client,
            # but the skip is reported instead of being silent.
            print(f"   File ignorato {csv_path}: {e}")

    if not frames:
        return None

    df_client = clean_main_dataframe(pd.concat(frames, ignore_index=True))
    if 'label' not in df_client.columns:
        return None

    # Features + target
    X_client = process_dataframe(df_client)
    y_client = df_client['label'].values

    # Skip clients with too little data
    if len(X_client) < 10:
        return None

    # Local train/validation split
    X_train, X_val, y_train, y_val = train_test_split(
        X_client, y_client, test_size=0.2, random_state=42
    )
    if len(X_train) < 5:
        return None

    # Local scaling: statistics come from the training part only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    return X_train_scaled, y_train, X_val_scaled, y_val


def run_federated_learning_smart():
    """
    Train a global model with sample-weighted federated averaging (FedAvg).

    Every folder under TRAIN_ROOT that contains at least one ``.csv`` file
    with ``train`` in its name is treated as a client. Client data is loaded
    and preprocessed once, before the rounds start, because it does not
    change between rounds. In each of the ROUNDS rounds every client trains
    a fresh copy of the model initialised with the current global weights;
    the resulting weights are averaged, weighted by each client's number of
    training samples.

    Returns:
        The global Keras model, or ``None`` when no training CSV is found.
    """
    # 1. Automatic client discovery
    client_folders = []
    for root, dirs, files in os.walk(TRAIN_ROOT):
        csv_files = [f for f in files if f.endswith('.csv') and 'train' in f]
        if csv_files:
            client_folders.append(root)

    if not client_folders:
        print("❌ ERRORE: Nessun file .csv di training trovato! Verifica il file zip.")
        return None

    print(f"✅ Trovati {len(client_folders)} Gruppi/Client:")
    for c in client_folders:
        n_files = len([f for f in os.listdir(c) if f.endswith('.csv')])
        print(f"   - {c} ({n_files} file csv)")

    # 2. Global model initialisation: the input size is read from a small
    #    sample of the first client's first CSV.
    first_folder = client_folders[0]
    first_csv = sorted(f for f in os.listdir(first_folder) if f.endswith('.csv'))[0]

    print("\nAnalisi dimensioni input...")
    sample_df = pd.read_csv(os.path.join(first_folder, first_csv), sep=';')
    sample_df = clean_main_dataframe(sample_df)
    sample_X = process_dataframe(sample_df.head(5))
    input_dim = sample_X.shape[1]
    print(f"Dimensioni Feature Input: {input_dim}")

    global_model = get_model(input_dim)
    global_weights = global_model.get_weights()

    # 3. Load and preprocess every client once. The split uses a fixed
    #    random state, so repeating this work each round would only
    #    reproduce the same arrays.
    client_data = {}
    for client_folder in client_folders:
        try:
            prepared = _load_client_data(client_folder)
        except Exception as e:
            print(f"   Errore nel gruppo {os.path.basename(client_folder)}: {e}")
            continue
        if prepared is not None:
            client_data[client_folder] = prepared

    # --- FEDERATED LOOP ---
    print("\nInizio Training Federato...")
    for round_num in range(ROUNDS):
        print(f"\n--- Round {round_num + 1}/{ROUNDS} ---")
        local_weights_list = []
        local_losses = []
        local_sizes = []

        for client_folder, prepared in client_data.items():
            X_train_scaled, y_train, X_val_scaled, y_val = prepared
            client_name = os.path.basename(client_folder)
            try:
                # Local model starts from the current global weights
                client_model = get_model(input_dim)
                client_model.set_weights(global_weights)

                es = callbacks.EarlyStopping(
                    monitor='val_loss',
                    patience=2,
                    restore_best_weights=True
                )

                history = client_model.fit(
                    X_train_scaled, y_train,
                    validation_data=(X_val_scaled, y_val),
                    epochs=EPOCHS_PER_CLIENT,
                    batch_size=BATCH_SIZE,
                    verbose=0,
                    callbacks=[es]
                )

                last_hist = history.history
                val_loss = last_hist['val_loss'][-1]
                print(f"   Client {client_name} "
                      f"val_mae={last_hist['val_mae'][-1]:.2f}, "
                      f"val_loss={val_loss:.2f}")

                # A diverged client (NaN/inf loss) would turn the averaged
                # global weights into NaN, so it is left out of this round.
                if not np.isfinite(val_loss):
                    print(f"   Client {client_name} escluso: val_loss non finita.")
                    continue

                local_weights_list.append(client_model.get_weights())
                local_losses.append(val_loss)
                local_sizes.append(len(X_train_scaled))

            except Exception as e:
                print(f"   Errore nel gruppo {client_name}: {e}")

        # Aggregation (sample-weighted FedAvg)
        if local_weights_list:
            total_samples = np.sum(local_sizes)
            avg_weights = []
            for layer_idx in range(len(global_weights)):
                layer_stack = np.stack(
                    [
                        w[layer_idx] * (n / total_samples)
                        for w, n in zip(local_weights_list, local_sizes)
                    ],
                    axis=0
                )
                avg_weights.append(np.sum(layer_stack, axis=0))

            global_weights = avg_weights
            global_model.set_weights(global_weights)

            avg_loss = np.average(local_losses, weights=local_sizes)
            print(f"   >>> Loss di Validazione Media (Global pesata): {avg_loss:.3f}")
        else:
            print("   Nessun client ha completato il training in questo round.")

    return global_model

# Run the federated training; final_model is None when no training CSV was found
final_model = run_federated_learning_smart()


# ============================================================
# 5) PREDIZIONI SU x_test.csv E CREAZIONE submission.csv
# ============================================================
if final_model is not None:
    print("\nGenerazione predizioni per x_test.csv...")

    # The test file is read as semicolon-separated
    df_test = pd.read_csv("x_test.csv", sep=';')

    # Basic cleaning of the test set (no label handling): sentinel codes
    # become NaN and are then imputed with the column median
    df_test_clean = df_test.copy()
    sentinel_values = [-1, -2]
    num_cols_test = df_test_clean.select_dtypes(include=[np.number]).columns
    df_test_clean[num_cols_test] = df_test_clean[num_cols_test].replace(sentinel_values, np.nan)
    for c in num_cols_test:
        median_val = df_test_clean[c].median()
        df_test_clean[c] = df_test_clean[c].fillna(median_val)

    # Feature extraction
    X_test_extracted = process_dataframe(df_test_clean)

    # Normalisation (fitted on the test set itself, for simplicity)
    # NOTE(review): training fits a separate scaler per client, so these
    # statistics differ from the ones seen during training — confirm this
    # is acceptable.
    scaler_test = StandardScaler()
    X_test_scaled = scaler_test.fit_transform(X_test_extracted)

    # Prediction
    predictions = final_model.predict(X_test_scaled)

    # Build the submission CSV: one prediction per test id
    submission = pd.DataFrame({
        'id': df_test['id'],
        'label': predictions.flatten()
    })

    submission.to_csv('submission.csv', index=False)
    print("File 'submission.csv' pronto per il download!")
    print(submission.head())
else:
    # The training function returned None, so there is no model to predict with
    print("Training fallito, nessun modello generato.")
