In [1]:
import os

import pandas as pd
from sklearn.inspection import PartialDependenceDisplay

# Input CSV read by the loading cell, and the directory every figure in this
# notebook is saved into.
DATA_PATH = "vm/fl_architectural_dataset.csv"
PLOT_DIR = "plots/rq4"

# Create the figure directory up front; exist_ok makes re-running this cell safe.
os.makedirs(PLOT_DIR, exist_ok=True)
print("Setup complete.")


Setup complete.


In [2]:
# Load the table and keep only rounds after the first (delta undefined there).
df = pd.read_csv(DATA_PATH)
df = df.loc[df["FL Round"] > 1].copy()

# 1st / 99th percentile of ΔF1 over all remaining rows; used below to trim
# extreme outliers.
lower, upper = df["delta_val_f1"].quantile([0.01, 0.99])

# Collapse the spelling variants of the dataset name into a single label.
df["Dataset"] = df["Dataset"].replace(["CIFAR-10", "CIFAR 10"], "CIFAR10")

# Columns that must be non-null for a row to be usable in the models below.
required_cols = [
    "delta_val_f1",
    "Total Time of FL Round",
    "client_selector",
    "heterogeneous_data_handler",
    "message_compressor",
    "Nhigh",
    "Nlow",
    "iid",
    "JSD",
    "FL Round",
]

# Keep rows whose ΔF1 lies inside the (inclusive) percentile band, then drop
# any row missing a required column.
inside_band = df["delta_val_f1"].between(lower, upper)
df_clean = df.loc[inside_band].dropna(subset=required_cols).copy()

print("Dataset cleaned.")
print(df_clean.shape)


Dataset cleaned.
(161882, 33)


In [3]:
# Predictor columns shared by the models in this notebook: the three columns
# later listed as `patterns`, followed by the remaining context columns.
features = (
    ["client_selector", "heterogeneous_data_handler", "message_compressor"]
    + ["Nhigh", "Nlow", "iid", "JSD", "FL Round"]
)

# One model is fitted per (model type, dataset) combination.
group_cols = ["Model Type", "Dataset"]

print("Features defined.")


Features defined.


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

# Ridge regressions of ΔValF1 on the shared predictor set, one per
# (model type, dataset) group. Fitted pipelines and their coefficient vectors
# are stored under the key "<model type>_<dataset>".
linear_models_delta = {}
linear_coeffs_delta = {}

# Group by the shared `group_cols` and select the shared `features` list
# instead of re-typing both, so this cell cannot drift from the definitions
# made in the feature cell.
for (model_type, dataset), subdf in df_clean.groupby(group_cols):

    # Groups with fewer than 500 rows are skipped.
    if len(subdf) < 500:
        continue

    X = subdf[features]
    y = subdf["delta_val_f1"]

    # Predictors are standardised inside the pipeline, so the ridge
    # coefficients are expressed per standard deviation of each predictor.
    model = Pipeline([
        ("scaler", StandardScaler()),
        ("ridge", Ridge(alpha=1.0))
    ])

    model.fit(X, y)

    # Label each coefficient with the column it belongs to.
    coef_df = pd.Series(
        model.named_steps["ridge"].coef_,
        index=X.columns
    )

    key = f"{model_type}_{dataset}"
    linear_models_delta[key] = model
    linear_coeffs_delta[key] = coef_df

    print(f"Fitted ΔValF1 linear model for {key}")


Fitted ΔValF1 linear model for CNN 16k_CIFAR10
Fitted ΔValF1 linear model for CNN 64k_CIFAR10
Fitted ΔValF1 linear model for TextMLP_AGNEWS


In [5]:
# Ridge regressions of the per-round wall time on the shared predictor set,
# one per (model type, dataset) group. Fitted pipelines and their coefficient
# vectors are stored under the key "<model type>_<dataset>".
linear_models_time = {}
linear_coeffs_time = {}

# Group by the shared `group_cols` and select the shared `features` list
# instead of re-typing both, so this cell cannot drift from the definitions
# made in the feature cell.
for (model_type, dataset), subdf in df_clean.groupby(group_cols):

    # Groups with fewer than 500 rows are skipped.
    if len(subdf) < 500:
        continue

    X = subdf[features]
    y = subdf["Total Time of FL Round"]

    # Predictors are standardised inside the pipeline, so the ridge
    # coefficients are expressed per standard deviation of each predictor.
    model = Pipeline([
        ("scaler", StandardScaler()),
        ("ridge", Ridge(alpha=1.0))
    ])

    model.fit(X, y)

    # Label each coefficient with the column it belongs to.
    coef_df = pd.Series(
        model.named_steps["ridge"].coef_,
        index=X.columns
    )

    key = f"{model_type}_{dataset}"
    linear_models_time[key] = model
    linear_coeffs_time[key] = coef_df

    print(f"Fitted Round Time linear model for {key}")


Fitted Round Time linear model for CNN 16k_CIFAR10
Fitted Round Time linear model for CNN 64k_CIFAR10
Fitted Round Time linear model for TextMLP_AGNEWS


In [11]:
import matplotlib.pyplot as plt
import os

PLOT_DIR = "plots/rq4"
os.makedirs(PLOT_DIR, exist_ok=True)

# One horizontal bar chart per fitted ΔValF1 ridge model: coefficients sorted
# ascending, positive bars green and all others red.
for key, coef_df in linear_coeffs_delta.items():
    ordered = coef_df.sort_values()
    bar_colors = ordered.gt(0).map({True: "green", False: "red"}).tolist()

    fig, ax = plt.subplots(figsize=(8, 6))
    ordered.plot(kind="barh", color=bar_colors, ax=ax)

    ax.set_title(f"Linear Coefficients (ΔValF1)\n{key}")
    ax.set_xlabel("Standardized Coefficient")
    fig.tight_layout()

    save_path = os.path.join(PLOT_DIR, f"linear_delta_{key}.png")
    fig.savefig(save_path, dpi=300)
    # Close each figure once written so figures do not accumulate in the loop.
    plt.close(fig)

    print("Saved:", save_path)

Saved: plots/rq4/linear_delta_CNN 16k_CIFAR10.png
Saved: plots/rq4/linear_delta_CNN 64k_CIFAR10.png
Saved: plots/rq4/linear_delta_TextMLP_AGNEWS.png


In [10]:
# One horizontal bar chart per fitted round-time ridge model: coefficients
# sorted ascending. The colour rule is the reverse of the ΔValF1 charts:
# positive bars are red, all others green.
for key, coef_df in linear_coeffs_time.items():
    ordered = coef_df.sort_values()
    bar_colors = ordered.gt(0).map({True: "red", False: "green"}).tolist()

    fig, ax = plt.subplots(figsize=(8, 6))
    ordered.plot(kind="barh", color=bar_colors, ax=ax)

    ax.set_title(f"Linear Coefficients (Round Time)\n{key}")
    ax.set_xlabel("Coefficient (standardized)")
    fig.tight_layout()

    save_path = os.path.join(PLOT_DIR, f"linear_time_{key}.png")
    fig.savefig(save_path, dpi=300)
    # Close each figure once written so figures do not accumulate in the loop.
    plt.close(fig)

    print("Saved:", save_path)


Saved: plots/rq4/linear_time_CNN 16k_CIFAR10.png
Saved: plots/rq4/linear_time_CNN 64k_CIFAR10.png
Saved: plots/rq4/linear_time_TextMLP_AGNEWS.png


In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# The ΔValF1 importance plots read `models_delta`, so the forests are trained
# first and the plots are drawn afterwards in the same cell. With the plotting
# loop placed ahead of the training code, a top-to-bottom run raised
# "NameError: name 'models_delta' is not defined".

# key "<model type>_<dataset>" -> (fitted forest, training feature frame)
models_delta = {}
models_time = {}

features = [
    "client_selector",
    "heterogeneous_data_handler",
    "message_compressor",
    "Nhigh",
    "Nlow",
    "iid",
    "JSD",
    "FL Round"
]


def _fit_forest(X, y):
    """Fit a random forest on an 80/20 split of (X, y).

    Returns a tuple ``(forest, X_train, r2)``: the fitted regressor, the
    training feature frame (kept for the partial-dependence plots), and the
    R² score on the held-out 20 %.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    forest = RandomForestRegressor(
        n_estimators=200,
        max_depth=12,
        random_state=42,
        n_jobs=-1
    )
    forest.fit(X_train, y_train)

    return forest, X_train, forest.score(X_test, y_test)


for (model_type, dataset), subdf in df_clean.groupby(["Model Type", "Dataset"]):

    # Groups with fewer than 500 rows are skipped.
    if len(subdf) < 500:
        continue

    print("\n======================================")
    print(f"Training RF models for {model_type} | {dataset}")
    print("======================================")

    X = subdf[features]
    key = f"{model_type}_{dataset}"

    # -------- ΔValF1 model --------
    rf_delta, X_train_d, r2_delta = _fit_forest(X, subdf["delta_val_f1"])
    models_delta[key] = (rf_delta, X_train_d)
    print("ΔValF1 R²:", r2_delta)

    # -------- Round Time model --------
    rf_time, X_train_t, r2_time = _fit_forest(X, subdf["Total Time of FL Round"])
    models_time[key] = (rf_time, X_train_t)
    print("Round Time R²:", r2_time)


# Feature-importance bar chart for every ΔValF1 forest, most important first.
for key, (rf, X_train) in models_delta.items():
    importance = (
        pd.Series(rf.feature_importances_, index=features)
        .sort_values(ascending=False)
    )

    plt.figure(figsize=(8, 5))
    importance.plot(kind="bar")
    plt.title(f"Feature Importance — ΔValF1 — {key}")
    plt.tight_layout()

    save_path = os.path.join(PLOT_DIR, f"importance_delta_{key}.png")
    plt.savefig(save_path, dpi=300)
    plt.close()

    print("Saved:", save_path)



Training RF models for CNN 16k | CIFAR10
ΔValF1 R²: 0.5281759760609304
Round Time R²: 0.5723291957794329

Training RF models for CNN 64k | CIFAR10
ΔValF1 R²: 0.5072718117297605
Round Time R²: 0.47466478037987925

Training RF models for TextMLP | AGNEWS
ΔValF1 R²: 0.4283694546705855
Round Time R²: 0.30359073853050034


In [13]:
# Feature-importance bar chart for every round-time forest, most important
# feature first.
for key, (rf, X_train) in models_time.items():
    ranked = pd.Series(rf.feature_importances_, index=features)
    ranked = ranked.sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(8, 5))
    ranked.plot(kind="bar", ax=ax)
    ax.set_title(f"Feature Importance — Round Time — {key}")
    fig.tight_layout()

    save_path = os.path.join(PLOT_DIR, f"importance_time_{key}.png")
    fig.savefig(save_path, dpi=300)
    # Close each figure once written so figures do not accumulate in the loop.
    plt.close(fig)

    print("Saved:", save_path)


Saved: plots/rq4/importance_time_CNN 16k_CIFAR10.png
Saved: plots/rq4/importance_time_CNN 64k_CIFAR10.png
Saved: plots/rq4/importance_time_TextMLP_AGNEWS.png


In [14]:
# The three predictor columns that get their own partial-dependence plots.
patterns = [
    "client_selector",
    "heterogeneous_data_handler",
    "message_compressor",
]

# One single-feature partial-dependence plot per (ΔValF1 forest, pattern),
# evaluated on that forest's training rows.
for key, (rf, X_train) in models_delta.items():
    for pattern in patterns:
        save_path = os.path.join(PLOT_DIR, f"pdp_delta_{pattern}_{key}.png")

        fig, ax = plt.subplots(figsize=(6, 4))
        PartialDependenceDisplay.from_estimator(
            estimator=rf,
            X=X_train,
            features=[pattern],
            ax=ax,
        )

        plt.title(f"PDP — ΔValF1 — {pattern} — {key}")
        fig.tight_layout()
        fig.savefig(save_path, dpi=300)
        plt.close(fig)

        print("Saved:", save_path)


Saved: plots/rq4/pdp_delta_client_selector_CNN 16k_CIFAR10.png
Saved: plots/rq4/pdp_delta_heterogeneous_data_handler_CNN 16k_CIFAR10.png
Saved: plots/rq4/pdp_delta_message_compressor_CNN 16k_CIFAR10.png
Saved: plots/rq4/pdp_delta_client_selector_CNN 64k_CIFAR10.png
Saved: plots/rq4/pdp_delta_heterogeneous_data_handler_CNN 64k_CIFAR10.png
Saved: plots/rq4/pdp_delta_message_compressor_CNN 64k_CIFAR10.png
Saved: plots/rq4/pdp_delta_client_selector_TextMLP_AGNEWS.png
Saved: plots/rq4/pdp_delta_heterogeneous_data_handler_TextMLP_AGNEWS.png
Saved: plots/rq4/pdp_delta_message_compressor_TextMLP_AGNEWS.png


In [15]:
# One single-feature partial-dependence plot per (round-time forest, pattern),
# evaluated on that forest's training rows.
for key, (rf, X_train) in models_time.items():
    for pattern in patterns:
        save_path = os.path.join(PLOT_DIR, f"pdp_time_{pattern}_{key}.png")

        fig, ax = plt.subplots(figsize=(6, 4))
        PartialDependenceDisplay.from_estimator(
            estimator=rf,
            X=X_train,
            features=[pattern],
            ax=ax,
        )

        plt.title(f"PDP — Round Time — {pattern} — {key}")
        fig.tight_layout()
        fig.savefig(save_path, dpi=300)
        plt.close(fig)

        print("Saved:", save_path)


Saved: plots/rq4/pdp_time_client_selector_CNN 16k_CIFAR10.png
Saved: plots/rq4/pdp_time_heterogeneous_data_handler_CNN 16k_CIFAR10.png
Saved: plots/rq4/pdp_time_message_compressor_CNN 16k_CIFAR10.png
Saved: plots/rq4/pdp_time_client_selector_CNN 64k_CIFAR10.png
Saved: plots/rq4/pdp_time_heterogeneous_data_handler_CNN 64k_CIFAR10.png
Saved: plots/rq4/pdp_time_message_compressor_CNN 64k_CIFAR10.png
Saved: plots/rq4/pdp_time_client_selector_TextMLP_AGNEWS.png
Saved: plots/rq4/pdp_time_heterogeneous_data_handler_TextMLP_AGNEWS.png
Saved: plots/rq4/pdp_time_message_compressor_TextMLP_AGNEWS.png


In [21]:
# Every unordered pair of the three pattern columns, for two-feature
# partial-dependence plots.
interactions = [
    ("client_selector", "heterogeneous_data_handler"),
    ("client_selector", "message_compressor"),
    ("heterogeneous_data_handler", "message_compressor"),
]

# One two-feature partial-dependence plot per (ΔValF1 forest, pair),
# evaluated on that forest's training rows.
for key, (rf, X_train) in models_delta.items():
    for pair in interactions:
        first, second = pair
        save_path = os.path.join(
            PLOT_DIR, f"interaction_delta_{first}x{second}_{key}.png"
        )

        fig, ax = plt.subplots(figsize=(6, 4))
        PartialDependenceDisplay.from_estimator(
            estimator=rf,
            X=X_train,
            features=[pair],
            ax=ax,
        )

        plt.title(f"ΔValF1 — {key}")
        fig.tight_layout()
        fig.savefig(save_path, dpi=300)
        plt.close(fig)

        print("Saved:", save_path)


Saved: plots/rq4/interaction_delta_client_selectorxheterogeneous_data_handler_CNN 16k_CIFAR10.png
Saved: plots/rq4/interaction_delta_client_selectorxmessage_compressor_CNN 16k_CIFAR10.png
Saved: plots/rq4/interaction_delta_heterogeneous_data_handlerxmessage_compressor_CNN 16k_CIFAR10.png
Saved: plots/rq4/interaction_delta_client_selectorxheterogeneous_data_handler_CNN 64k_CIFAR10.png
Saved: plots/rq4/interaction_delta_client_selectorxmessage_compressor_CNN 64k_CIFAR10.png
Saved: plots/rq4/interaction_delta_heterogeneous_data_handlerxmessage_compressor_CNN 64k_CIFAR10.png
Saved: plots/rq4/interaction_delta_client_selectorxheterogeneous_data_handler_TextMLP_AGNEWS.png
Saved: plots/rq4/interaction_delta_client_selectorxmessage_compressor_TextMLP_AGNEWS.png
Saved: plots/rq4/interaction_delta_heterogeneous_data_handlerxmessage_compressor_TextMLP_AGNEWS.png


In [22]:
# One two-feature partial-dependence plot per (round-time forest, pair),
# evaluated on that forest's training rows.
for key, (rf, X_train) in models_time.items():
    for pair in interactions:
        first, second = pair
        save_path = os.path.join(
            PLOT_DIR, f"interaction_time_{first}_{second}_{key}.png"
        )

        fig, ax = plt.subplots(figsize=(6, 4))
        PartialDependenceDisplay.from_estimator(
            estimator=rf,
            X=X_train,
            features=[pair],
            ax=ax,
        )

        plt.title(f"Round Time — {key}")
        fig.tight_layout()
        fig.savefig(save_path, dpi=300)
        plt.close(fig)

        print("Saved:", save_path)


Saved: plots/rq4/interaction_time_client_selector_heterogeneous_data_handler_CNN 16k_CIFAR10.png
Saved: plots/rq4/interaction_time_client_selector_message_compressor_CNN 16k_CIFAR10.png
Saved: plots/rq4/interaction_time_heterogeneous_data_handler_message_compressor_CNN 16k_CIFAR10.png
Saved: plots/rq4/interaction_time_client_selector_heterogeneous_data_handler_CNN 64k_CIFAR10.png
Saved: plots/rq4/interaction_time_client_selector_message_compressor_CNN 64k_CIFAR10.png
Saved: plots/rq4/interaction_time_heterogeneous_data_handler_message_compressor_CNN 64k_CIFAR10.png
Saved: plots/rq4/interaction_time_client_selector_heterogeneous_data_handler_TextMLP_AGNEWS.png
Saved: plots/rq4/interaction_time_client_selector_message_compressor_TextMLP_AGNEWS.png
Saved: plots/rq4/interaction_time_heterogeneous_data_handler_message_compressor_TextMLP_AGNEWS.png
