<a href="https://colab.research.google.com/github/Giuse1093/CSI_Project4/blob/main/data_processing5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
 # --- CELLA 1: INSTALLAZIONE E SETUP ---

!pip install -q flwr pandas numpy scipy scikit-learn


import os

import zipfile

import ast

import numpy as np

import pandas as pd

from scipy.fft import fft

from scipy.stats import skew, kurtosis, linregress, entropy

from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, mean_absolute_error

import warnings


# Silence every warning category for the rest of the session. This also hides
# deprecation and numerical warnings raised by the imported libraries.
warnings.filterwarnings('ignore')


# Dataset extraction: unpack the training archive once, into ./CSV_train.
if not os.path.exists("CSV_train") and os.path.exists("CSV_train.zip"):
    print("Decompressione CSV_train.zip...")
    with zipfile.ZipFile("CSV_train.zip", 'r') as zip_ref:
        zip_ref.extractall("CSV_train")
    print("Fatto!")
elif not os.path.exists("CSV_train"):
    # Reached only when neither the extracted folder nor the archive exists.
    # Testing the folder (not the archive) avoids a spurious "upload the zip"
    # warning when the data has already been extracted and the zip removed.
    print(" ATTENZIONE: Carica il file 'CSV_train.zip' e 'x_test.csv' su Colab!")



Decompressione CSV_train.zip...
Fatto!


In [None]:
 # --- CELLA 2: PREPROCESSING MEDICAL GRADE ---


# Columns whose cells hold a time series serialized as a string.
# process_dataframe() expands each of these that is present in the input frame
# into 14 summary features and silently skips any that is absent.
TS_COLS = ['hr_time_series', 'resp_time_series', 'stress_time_series', 'activity_time_series']


def clean_series_interpolation(series_str):
    """Parse a stringified numeric sequence and fill in its invalid samples.

    Samples that are <= 0 are treated as missing. Missing samples are replaced
    by linear interpolation between valid neighbours; gaps at the start or end
    of the sequence take the nearest valid value.

    Args:
        series_str: String containing a Python literal sequence of numbers,
            e.g. ``"[60, 0, 62]"``.

    Returns:
        np.ndarray of floats:
          * an empty array when ``series_str`` is not a ``str``;
          * ``np.array([0.0])`` when the string cannot be parsed/converted or
            contains no positive sample;
          * the cleaned sequence otherwise.
    """
    if not isinstance(series_str, str):
        return np.array([])

    try:
        data = np.array(ast.literal_eval(series_str), dtype=float)

        # Non-positive readings are considered sensor dropouts.
        data[data <= 0] = np.nan
        if np.all(np.isnan(data)):
            return np.array([0.0])

        s_data = pd.Series(data)
        s_data = s_data.interpolate(method='linear', limit_direction='both')
        # Safety net: nothing should be left to fill after a two-sided interpolation.
        s_data = s_data.fillna(0)
        return s_data.values
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Best-effort fallback for malformed or non-numeric content. Only the
        # errors that parsing/conversion can raise are handled here, so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return np.array([0.0])


def calculate_params(matrix):
    """
    Summarise a 1-D numeric sequence with 14 statistical and spectral features.

    Args:
        matrix: 1-D array-like of numbers.

    Returns:
        list of 14 values, in this order: mean, std, min, max, 25th/50th/75th
        percentile, skewness, kurtosis, linear-trend slope, spectral energy,
        RMSSD of successive differences, number of mean crossings, and the
        entropy of a 10-bin histogram. A sequence with fewer than two samples
        yields fourteen 0.0 values.
    """
    values = np.asarray(matrix, dtype=float)

    if len(values) <= 1:
        # Must stay in sync with the length of the list returned below.
        return [0.0] * 14

    # 1. Basic statistics
    mean = np.mean(values)
    std = np.std(values)
    min_val = np.min(values)
    max_val = np.max(values)

    # 2. Percentiles
    p25 = np.percentile(values, 25)
    p50 = np.percentile(values, 50)
    p75 = np.percentile(values, 75)

    # 3. Shape and trend. Skewness/kurtosis are undefined for a constant
    #    sequence, so they fall back to 0.0 when there is no spread.
    sk = skew(values) if std > 0 else 0.0
    ku = kurtosis(values) if std > 0 else 0.0

    try:
        slope = linregress(np.arange(len(values)), values)[0]
        if np.isnan(slope):
            slope = 0.0
    except (ValueError, ArithmeticError):
        # Best-effort: a failed regression means "no trend"; unrelated
        # exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
        slope = 0.0

    # 4. Spectral energy: sum of squared FFT magnitudes, normalised by length.
    f_trans = fft(values)
    energy = np.sum(np.abs(f_trans) ** 2) / len(values)

    # 5. RMSSD proxy (root mean square of successive differences)
    diff = np.diff(values)
    rmssd = np.sqrt(np.mean(diff ** 2)) if len(diff) > 0 else 0.0

    # 6. Crossings of the mean: count the sign changes of (value - mean).
    zero_crossings = np.where(np.diff(np.sign(values - mean)))[0].size

    # 7. Shannon entropy of a 10-bin histogram. The small epsilon keeps empty
    #    bins strictly positive; entropy() normalises the counts itself.
    counts, _ = np.histogram(values, bins=10, density=True)
    ent = entropy(counts + 1e-10)

    # 14 features in total
    return [mean, std, min_val, max_val, p25, p50, p75, sk, ku, slope, energy, rmssd, zero_crossings, ent]


def process_dataframe(df):
    """Turn a raw frame into a purely tabular feature frame.

    The output keeps every scalar column of ``df`` (any column whose name does
    not contain 'time_series' or 'date' and is not 'label' or 'id'), with
    missing values replaced by 0.0, followed by 14 summary features for each
    column of TS_COLS that is present in ``df``.

    Args:
        df: pandas DataFrame with scalar columns and, optionally, the
            serialized time-series columns listed in TS_COLS.

    Returns:
        pandas DataFrame with one row per input row; time-series features are
        named ``<base>_<stat>`` where ``<base>`` is the column name without
        the '_time_series' suffix.
    """
    # Columns copied through as-is: everything that is not a series, a label, an id or a date.
    scalar_cols = [
        name for name in df.columns
        if not ('time_series' in name or name == 'label' or name == 'id' or 'date' in name)
    ]
    # Time-series columns actually available in this frame, in TS_COLS order.
    available_ts = [name for name in TS_COLS if name in df.columns]

    stat_names = ['mean', 'std', 'min', 'max', 'p25', 'p50', 'p75', 'skew', 'kurt', 'slope', 'energy', 'rmssd', 'zcross', 'entropy']
    col_names = list(scalar_cols) + [
        f"{ts_name.replace('_time_series', '')}_{stat}"
        for ts_name in available_ts
        for stat in stat_names
    ]

    records = []
    for _, record in df.iterrows():
        # Scalar features first, with nulls mapped to 0.0 ...
        values = [record[name] if pd.notnull(record[name]) else 0.0 for name in scalar_cols]
        # ... then the 14 statistics of each cleaned time series.
        for ts_name in available_ts:
            values.extend(calculate_params(clean_series_interpolation(record[ts_name])))
        records.append(values)

    return pd.DataFrame(records, columns=col_names)


print("Preprocessing Medical Grade pronto.")

Preprocessing Medical Grade pronto.


In [None]:
 # --- CELLA 3: TRAINING FEDERATO "SMART" (Quality Gating) ---


def run_federated_simulation_smart(train_root="CSV_train", mae_threshold=18.0, min_samples=10):
    """Simulate federated training of a random forest with a local quality gate.

    Every directory under ``train_root`` that contains at least one ``.csv``
    file is treated as one client. Each client trains its own forest on 80% of
    its rows and evaluates it on the remaining 20%; its trees are added to the
    global model only when the local MAE is below ``mae_threshold``. The global
    model is then evaluated on the union of all clients' validation splits,
    with predictions clipped to [0, 100] and rounded to integers.

    Args:
        train_root: Root folder that is walked to discover client folders.
        mae_threshold: A client's trees are accepted only if its local
            validation MAE is strictly below this value.
        min_samples: Clients with fewer rows than this are skipped.

    Returns:
        A fitted RandomForestRegressor whose ``estimators_`` are the trees of
        all accepted clients.

    Raises:
        ValueError: If no client contributed any tree, so no global model can
            be assembled.
    """
    # Sorted so that the client order (and therefore which client seeds the
    # container model) does not depend on the filesystem's listing order.
    client_folders = sorted(
        root
        for root, _dirs, files in os.walk(train_root)
        if any(name.endswith('.csv') for name in files)
    )

    print(f"Training Federato Smart su {len(client_folders)} client...")

    global_estimators = []
    all_val_features = []
    all_val_labels = []

    # Training split of the first usable client; used only to initialise the
    # container model (feature names, number of features).
    first_X_train = None
    first_y_train = None

    accepted_clients = 0
    rejected_clients = 0

    for i, folder in enumerate(client_folders):
        # Sorted so that the row order, and hence the seeded train/validation
        # split, is reproducible across machines.
        csv_paths = sorted(
            os.path.join(folder, name)
            for name in os.listdir(folder)
            if name.endswith('.csv')
        )
        if not csv_paths:
            continue

        df_client = pd.concat(
            [pd.read_csv(path, sep=';') for path in csv_paths],
            ignore_index=True,
        )
        if 'label' not in df_client.columns:
            continue

        y = df_client['label'].values
        X = process_dataframe(df_client)

        # Skip clients with too little data.
        if len(X) < min_samples:
            continue

        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

        if first_X_train is None:
            first_X_train = X_train
            first_y_train = y_train

        # Keep the validation split for the final global evaluation.
        all_val_features.append(X_val)
        all_val_labels.append(y_val)

        # Local training
        clf = RandomForestRegressor(
            n_estimators=60,
            max_depth=20,
            min_samples_leaf=2,
            max_features='sqrt',
            n_jobs=-1,
            random_state=42
        )
        clf.fit(X_train, y_train)

        # --- QUALITY GATE ---
        # The client scores itself on its own validation split before sharing
        # its trees; contributions at or above the threshold are discarded.
        local_pred = clf.predict(X_val)
        local_mae = mean_absolute_error(y_val, local_pred)

        if local_mae < mae_threshold:
            global_estimators.extend(clf.estimators_)
            accepted_clients += 1
        else:
            rejected_clients += 1

        if (i + 1) % 10 == 0:
            print(f"   ...processati {i+1} client (Accettati: {accepted_clients}, Scartati: {rejected_clients})")

    print(f"\nTraining concluso. Contributi integrati: {accepted_clients}/{len(client_folders)}")

    if not global_estimators:
        # Without this guard the assembly below fails with an unrelated
        # scikit-learn error (fit on None / zero estimators).
        raise ValueError(
            "Nessun client ha superato il quality gate: impossibile assemblare il modello globale."
        )

    print("Assemblaggio Modello Globale...")

    # 1. Container model. A single tree is enough to initialise the fitted
    #    attributes; its trees are replaced right below, so fitting
    #    len(global_estimators) trees here would be wasted work.
    dummy_model = RandomForestRegressor(n_estimators=1, random_state=42, n_jobs=-1)
    # 2. Structure initialisation
    dummy_model.fit(first_X_train, first_y_train)
    # 3. Inject the federated trees and keep n_estimators consistent with them
    dummy_model.estimators_ = global_estimators
    dummy_model.n_estimators = len(global_estimators)

    print("\n--- VALUTAZIONE GLOBALE ---")

    X_global_val = pd.concat(all_val_features, ignore_index=True)
    y_global_true = np.concatenate(all_val_labels)

    raw_preds = dummy_model.predict(X_global_val)

    # Same post-processing as the submission: clip to [0, 100], round to int.
    preds_clipped = np.clip(raw_preds, 0, 100)
    preds_integer = np.rint(preds_clipped).astype(int)

    mae = mean_absolute_error(y_global_true, preds_integer)

    print(f"Campioni testati: {len(y_global_true)}")
    print(f"Errore Medio Assoluto (MAE) REALE: {mae:.4f}")

    return dummy_model


# EXECUTION: run the federated simulation and keep the assembled global model.

global_model = run_federated_simulation_smart()

Training Federato Smart su 9 client...

Training concluso. Contributi integrati: 9/9
Assemblaggio Modello Globale...

--- VALUTAZIONE GLOBALE ---
Campioni testati: 204
Errore Medio Assoluto (MAE) REALE: 11.4118

🚀 OTTIMO! Stiamo scendendo verso il target.


In [None]:
 # --- CELLA 4: SUBMISSION FINALE ---

# Build the submission file from x_test.csv using the global model.
if not os.path.exists("x_test.csv"):
    print(" Manca x_test.csv")
else:
    print("\nGenerazione file submission...")
    df_test = pd.read_csv("x_test.csv", sep=';')

    # 1. Same feature extraction that was applied to the training data.
    X_test = process_dataframe(df_test)

    # 2. Column alignment: the model expects exactly the columns it was fitted on.
    trained_features = global_model.feature_names_in_
    # Features missing from the test set are added as constant 0.0 columns ...
    for c in [name for name in trained_features if name not in X_test.columns]:
        X_test[c] = 0.0
    # ... then extra columns are dropped and the training order is restored.
    X_test = X_test[trained_features]

    # 3. Prediction, clipped to [0, 100] and rounded to the nearest integer.
    raw_preds = global_model.predict(X_test)
    preds_clipped = np.clip(raw_preds, 0, 100)
    preds_int = np.rint(preds_clipped).astype(int)

    submission = pd.DataFrame({'id': df_test['id'], 'label': preds_int})
    submission.to_csv('submission_int.csv', index=False)

    print(" File 'submission_int.csv' pronto! (Valori interi 0-100)")
    print(submission.head())


Generazione file submission...
✅ File 'submission_int.csv' pronto! (Valori interi 0-100)
   id  label
0   0     68
1   1     69
2   2     70
3   3     69
4   4     75
