<a href="https://colab.research.google.com/github/Giuse1093/CSI_Project4/blob/main/data_preprocessing4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# ============================================================
# 1) SETUP LIBRERIE
# ============================================================
!pip install -q flwr tensorflow pandas numpy scipy scikit-learn
!pip install -q "cryptography>=44.0.1,<45.0.0"

import os
import zipfile
import warnings
warnings.filterwarnings('ignore')

import flwr as fl
import tensorflow as tf
print("Flower OK:", fl.__version__)
print("TensorFlow OK:", tf.__version__)

if not os.path.exists("CSV_train") and os.path.exists("CSV_train.zip"):
    print("Decompressione CSV_train.zip in corso...")
    with zipfile.ZipFile("CSV_train.zip", 'r') as zip_ref:
        zip_ref.extractall("CSV_train")
    print("Fatto!")
elif not os.path.exists("CSV_train.zip"):
    print("ATTENZIONE: Carica il file 'CSV_train.zip' su Colab!")
else:
    print("Cartella CSV_train già presente.")

import pandas as pd
import numpy as np
import ast
from scipy.fft import fft
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models, callbacks, losses

# ============================================================
# 2) PREPROCESSING ROBUSTO
# ============================================================

TS_COLS = ['hr_time_series', 'resp_time_series', 'stress_time_series']

# Feature statiche forti da estrarre
STATIC_COLS = [
    'sleep_sleepTimeSeconds',
    'sleep_deepSleepSeconds',
    'sleep_lightSleepSeconds',
    'sleep_remSleepSeconds',
    'sleep_awakeSleepSeconds',
    'sleep_averageRespirationValue',
    'sleep_avgHeartRate',
    'sleep_avgSleepStress',
    'hr_restingHeartRate',
    'act_totalCalories',
    'act_activeTime'
]

def clean_series_interpolation(series_str):
    """
    Pulizia robusta della serie temporale:
    - Parsing sicuro
    - NaN per valori invalidi
    - Interpolazione lineare
    - Riempimento con mediana (non 0)
    """
    if not isinstance(series_str, str):
        return np.array([])

    try:
        data = np.array(ast.literal_eval(series_str), dtype=float)

        # Serie troppo corte sono inutili
        if len(data) < 5:
            return np.array([])

        # Valori invalidi -> NaN
        data[data <= 0] = np.nan

        s_data = pd.Series(data)

        # Interpolazione lineare (preserva trend)
        s_data = s_data.interpolate(method='linear', limit_direction='both')

        # Se restano NaN, usa mediana della serie (robusto)
        if s_data.isna().any():
            median_val = s_data.median()
            if not np.isfinite(median_val):
                return np.array([])
            s_data = s_data.fillna(median_val)

        return s_data.values
    except Exception:
        return np.array([])

def extract_features_ts(clean_data):
    """
    Estrae features da serie temporale (stile prof):
    Mean, Std, P25, P50, P75, Energia Spettrale FFT
    """
    if len(clean_data) == 0:
        return [0.0] * 6

    mean = float(np.mean(clean_data))
    std = float(np.std(clean_data))
    p25 = float(np.percentile(clean_data, 25))
    p50 = float(np.percentile(clean_data, 50))
    p75 = float(np.percentile(clean_data, 75))

    f_transform = fft(clean_data)
    spectral_energy = float(np.sum(np.abs(f_transform) ** 2) / len(clean_data))

    # Sanity check
    vals = [mean, std, p25, p50, p75, spectral_energy]
    vals = [0.0 if (not np.isfinite(v)) else v for v in vals]

    return vals

def clean_main_dataframe(df: pd.DataFrame, is_train: bool = True) -> pd.DataFrame:
    """
    Pulizia del dataframe principale:
    - Rimuove righe senza label (se train)
    - Sostituisce sentinelle (-1, -2) con NaN
    - Droppa righe con troppi NaN
    - Imputa NaN con mediana per colonna
    """
    df = df.copy()

    # Rimuovi righe senza label
    if is_train and 'label' in df.columns:
        df = df[df['label'].notna()]
        # Clipping label nel range realistico [0, 100]
        df = df[df['label'].between(0, 100)]

    # Sostituisci sentinelle
    sentinel_values = [-1, -2]
    num_cols = df.select_dtypes(include=[np.number]).columns
    for col in num_cols:
        df[col] = df[col].replace(sentinel_values, np.nan)

    # Droppa righe con troppi NaN (>40%)
    thresh = int(0.6 * df.shape[1])
    df = df.dropna(axis=0, thresh=thresh)

    # Imputa NaN con mediana per colonna
    for col in num_cols:
        if col in df.columns:
            median_val = df[col].median()
            if np.isfinite(median_val):
                df[col] = df[col].fillna(median_val)
            else:
                df[col] = df[col].fillna(0.0)

    return df

def process_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Estrae feature da series temporali + feature statiche.
    Output: DataFrame con ~29 feature (18 TS + 11 statiche)
    """
    df = df.copy()

    # === FEATURES DA SERIE TEMPORALI ===
    ts_features = []

    for _, row in df.iterrows():
        row_feats = []

        for col in TS_COLS:
            if col not in row.index:
                # Se colonna manca, 6 zeri
                row_feats.extend([0.0] * 6)
            else:
                clean_seq = clean_series_interpolation(row[col])
                feats = extract_features_ts(clean_seq)
                row_feats.extend(feats)

        ts_features.append(row_feats)

    prefixes = ['hr', 'resp', 'stress']
    suffixes = ['mean', 'std', 'p25', 'p50', 'p75', 'energy']
    ts_col_names = [f"{p}_{s}" for p in prefixes for s in suffixes]

    df_ts = pd.DataFrame(ts_features, columns=ts_col_names, index=df.index)

    # === FEATURES STATICHE ===
    static_cols_present = [c for c in STATIC_COLS if c in df.columns]
    df_static = df[static_cols_present].copy() if static_cols_present else pd.DataFrame(index=df.index)

    # Sanity check su statiche
    if len(df_static) > 0:
        df_static = df_static.replace([np.inf, -np.inf], np.nan)
        for c in df_static.columns:
            median_val = df_static[c].median()
            if np.isfinite(median_val):
                df_static[c] = df_static[c].fillna(median_val)
            else:
                df_static[c] = 0.0

    # Concatena
    X = pd.concat([df_ts, df_static], axis=1)
    X = X.replace([np.inf, -np.inf], 0.0).fillna(0.0)

    return X

print("Preprocessing robusto definito.")

# ============================================================
# 3) MODELLO NN MIGLIORATO
# ============================================================

def get_model(input_dim):
    """
    NN densa con:
    - BatchNormalization (stabilizza training)
    - Dropout moderato (0.2, non 0.3)
    - Huber loss (robusto agli outlier)
    - Adam con lr=1e-3
    """
    model = models.Sequential([
        layers.Input(shape=(input_dim,)),

        layers.Dense(128, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.2),

        layers.Dense(64, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.2),

        layers.Dense(32, activation='relu'),
        layers.BatchNormalization(),

        layers.Dense(1, activation='linear')
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss=losses.Huber(delta=5.0),
        metrics=['mae', 'mse']
    )
    return model

print("Modello NN definito.")

# ============================================================
# 4) FEDERATED LEARNING MIGLIORATO
# ============================================================

TRAIN_ROOT = "CSV_train"
ROUNDS = 15
EPOCHS_PER_CLIENT = 5
BATCH_SIZE = 16

def run_federated_learning_smart():
    """
    Federated Learning con:
    - FedAvg pesato per numero campioni
    - Normalizzazione coerente per client
    - Early stopping per client
    - Logging dettagliato
    """

    # 1. Scoperta client
    client_folders = []
    for root, dirs, files in os.walk(TRAIN_ROOT):
        csv_files = [f for f in files if f.endswith('.csv') and 'train' in f]
        if len(csv_files) > 0:
            client_folders.append(root)

    if not client_folders:
        print("❌ ERRORE: Nessun file CSV trovato!")
        return None

    print(f"✅ Trovati {len(client_folders)} Client:")
    for c in client_folders:
        n_files = len([f for f in os.listdir(c) if f.endswith('.csv')])
        print(f"   - {c} ({n_files} file)")

    # 2. Inizializza modello globale
    first_folder = client_folders[0]
    first_csv = [f for f in os.listdir(first_folder) if f.endswith('.csv')][0]

    print("\nAnalisi dimensioni...")
    sample_df = pd.read_csv(os.path.join(first_folder, first_csv), sep=';')
    sample_df = clean_main_dataframe(sample_df, is_train=True)
    sample_X = process_dataframe(sample_df.head(5))
    input_dim = sample_X.shape[1]
    print(f"Feature Input: {input_dim}")

    global_model = get_model(input_dim)
    global_weights = global_model.get_weights()

    # 3. Loop federato
    print("\nInizio Training Federato...")
    for round_num in range(ROUNDS):
        print(f"\n--- Round {round_num + 1}/{ROUNDS} ---")

        local_weights_list = []
        local_losses = []
        local_sizes = []

        for client_folder in client_folders:
            try:
                # Carica dati del client
                group_csvs = [
                    os.path.join(client_folder, f)
                    for f in os.listdir(client_folder)
                    if f.endswith('.csv')
                ]

                df_list = []
                for csv_path in group_csvs:
                    try:
                        df_temp = pd.read_csv(csv_path, sep=';')
                        df_list.append(df_temp)
                    except Exception:
                        pass

                if not df_list:
                    continue

                df_client = pd.concat(df_list, ignore_index=True)
                df_client = clean_main_dataframe(df_client, is_train=True)

                if 'label' not in df_client.columns or len(df_client) == 0:
                    continue

                # Estrai feature
                X_client = process_dataframe(df_client)
                y_client = df_client['label'].values

                if len(X_client) < 20:
                    continue

                # Split train/val
                X_train, X_val, y_train, y_val = train_test_split(
                    X_client, y_client, test_size=0.2, random_state=42
                )

                n_train = len(X_train)
                if n_train < 10:
                    continue

                # Scaling locale
                scaler = StandardScaler()
                X_train_scaled = scaler.fit_transform(X_train)
                X_val_scaled = scaler.transform(X_val)

                # Training locale
                client_model = get_model(input_dim)
                client_model.set_weights(global_weights)

                es = callbacks.EarlyStopping(
                    monitor='val_loss',
                    patience=3,
                    restore_best_weights=True
                )

                history = client_model.fit(
                    X_train_scaled, y_train,
                    validation_data=(X_val_scaled, y_val),
                    epochs=EPOCHS_PER_CLIENT,
                    batch_size=BATCH_SIZE,
                    verbose=0,
                    callbacks=[es]
                )

                val_mae = history.history['val_mae'][-1]
                val_loss = history.history['val_loss'][-1]

                print(f"   {os.path.basename(client_folder)}: "
                      f"mae={val_mae:.2f}, loss={val_loss:.2f}, samples={n_train}")

                local_weights_list.append(client_model.get_weights())
                local_losses.append(val_loss)
                local_sizes.append(n_train)

            except Exception as e:
                print(f"   Errore in {os.path.basename(client_folder)}: {str(e)[:40]}")

        # Aggregazione (FedAvg pesato)
        if local_weights_list:
            total_samples = np.sum(local_sizes)
            avg_weights = []

            for layer_idx in range(len(global_weights)):
                weighted_layer = np.zeros_like(global_weights[layer_idx])

                for w, n in zip(local_weights_list, local_sizes):
                    weighted_layer += w[layer_idx] * (n / total_samples)

                avg_weights.append(weighted_layer)

            global_weights = avg_weights
            global_model.set_weights(global_weights)

            avg_loss = np.average(local_losses, weights=local_sizes)
            print(f"   >>> Media pesata Loss: {avg_loss:.3f}")
        else:
            print("   ⚠️  Nessun client completato.")

    return global_model

final_model = run_federated_learning_smart()

# ============================================================
# 5) PREDIZIONI E SUBMISSION
# ============================================================

if final_model is not None:
    print("\nGenerazione predizioni...")

    df_test = pd.read_csv("x_test.csv", sep=';')

    # Pulizia test (senza label)
    df_test_clean = clean_main_dataframe(df_test, is_train=False)

    # Estrazione feature
    X_test_extracted = process_dataframe(df_test_clean)

    # Scaling
    scaler_test = StandardScaler()
    X_test_scaled = scaler_test.fit_transform(X_test_extracted)

    # Predizioni
    predictions = final_model.predict(X_test_scaled, verbose=0)

    # Post-processing
    predictions = np.nan_to_num(predictions, nan=75.0, posinf=100.0, neginf=0.0)
    predictions = np.clip(predictions, 0, 100)
    predictions = np.rint(predictions).astype(int)

    # Submission
    submission = pd.DataFrame({
        'id': df_test_clean['id'], # Changed from df_test['id'] to df_test_clean['id']
        'label': predictions.flatten()
    })

    # Sanity check
    assert not submission['label'].isna().any(), "⚠️  Ci sono NaN!"
    assert (submission['label'] >= 0).all(), "⚠️  Valori < 0!"
    assert (submission['label'] <= 100).all(), "⚠️  Valori > 100!"

    submission.to_csv('submission.csv', index=False)

    print("✅ File 'submission.csv' pronto!")
    print("\nStatistiche predizioni:")
    print(f"  Min: {submission['label'].min()}")
    print(f"  Max: {submission['label'].max()}")
    print(f"  Media: {submission['label'].mean():.1f}")
    print(f"  Mediana: {submission['label'].median():.1f}")
    print(f"  Std: {submission['label'].std():.1f}")
    print("\nPrime 10 righe:")
    print(submission.head(10))

else:
    print("❌ Training fallito.")

IndentationError: unexpected indent (ipython-input-2663707233.py, line 107)