# Table 1 — Clustering scores for different values of *k* in k-means

This notebook reproduces **Table 1** of the manuscript  
*"Noise-shaped Synchrony in Neuronal Oscillator Networks"*.

It computes and reports the **clustering-quality scores** (silhouette and Davies–Bouldin) obtained by running k-means on precomputed network synchrony data.

---

## What this notebook does

- Loads precomputed feature and label data from CSV files  
- Applies k-means clustering for several values of *k* and computes clustering-quality metrics (silhouette and Davies–Bouldin scores)  
- Computes quantitative clustering scores reported in the manuscript

This notebook **does not run full network simulations** and is therefore safe and fast to execute.

---

## Input data

The following data files (located in this folder) are used as inputs:

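- `set_B_pca_3_.csv` — precomputed feature matrix used for the clustering analysis
- `n_c_.csv` — noise/coupling value pairs (one row per feature row), attached to the cluster labels when the file is available and its length matches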

---

## Output

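Running all cells will generate the numerical results reported in **Table 1** of the manuscript, written to CSV files (`metrics_*.csv` for the scores and `clusters_*_k*.csv` for the cluster labels of each *k*), and any supporting diagnostic plots saved by the notebook.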

---

## How to run

Simply execute all cells from top to bottom:

- Jupyter menu: **Kernel → Restart & Run All**
- Or execute cells sequentially

Typical runtime on a standard laptop: **a few seconds to under one minute**.


In [6]:
import pandas as pd
import sys

from sklearn.cluster import KMeans
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
)

In [7]:
# File name of the CSV holding the noise/coupling value pairs.
# NOTE(review): run_kmeans_on_file does not read this module-level name; it
# uses its own "n_c_.csv" value internally — confirm whether this assignment
# is still needed.
noise_coupling_file = "n_c_.csv"

In [8]:

def run_kmeans_on_file(input_file, ks=(3,4,5,6,7), k_for_labels=6, random_state=0, n_init=10, algorithm="lloyd", noise_coupling_file="n_c_.csv"):
    """Cluster the rows of a feature CSV with k-means for several values of k.

    For every k in ``ks`` a ``KMeans`` model is fitted on the feature columns
    of ``input_file`` and scored with the silhouette and Davies-Bouldin
    scores. Side effects, all relative to the current working directory:

    * ``clusters_{base}_k{k}.csv`` is written for every k (noise, coupling,
      cluster columns);
    * ``metrics_{base}.csv`` is written with one row per k;
    * a global ``labels_k_{k}_df`` is created for every k.

    ``base`` is derived by splitting ``input_file`` on ``"_"``: with at least
    four parts it is ``"{parts[1]}_{parts[3]}"``, otherwise ``parts[0]``.

    Parameters
    ----------
    input_file : str
        CSV file to cluster. Every column except ``noise``, ``coupling`` and
        ``row_index`` is used as a feature.
    ks : iterable of int
        Cluster counts to evaluate.
    k_for_labels : int
        The k whose label frame is returned; must be one of ``ks``.
    random_state, n_init, algorithm
        Passed through unchanged to ``sklearn.cluster.KMeans``.
    noise_coupling_file : str
        Header-less two-column CSV (noise, coupling) used when ``input_file``
        has no ``noise``/``coupling`` columns of its own. It is only attached
        when its row count equals that of ``input_file``; otherwise NaN
        placeholders are used.

    Returns
    -------
    (metrics_df, labels_df)
        ``metrics_df`` has columns ``k``, ``silhouette``, ``davies_bouldin``;
        ``labels_df`` is the noise/coupling/cluster frame for ``k_for_labels``.

    Raises
    ------
    ValueError
        If ``ks`` is empty, ``k_for_labels`` is not in ``ks``, or the input
        file has no feature columns.
    """
    # Validate the request up front: previously a bad k_for_labels surfaced as
    # a KeyError only after every model was fitted and every CSV was written.
    k_values = tuple(ks)
    if not k_values:
        raise ValueError("ks must contain at least one cluster count.")
    if k_for_labels not in k_values:
        raise ValueError(
            f"k_for_labels={k_for_labels!r} must be one of ks={k_values!r}."
        )

    import numpy as np
    import pandas as pd
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score, davies_bouldin_score

    # Derive the base name used in the output file names (see docstring).
    base_parts = input_file.split("_")
    base = f"{base_parts[1]}_{base_parts[3]}" if len(base_parts) >= 4 else base_parts[0]

    # Read data
    df = pd.read_csv(input_file)

    # --- Build the noise/coupling frame ---
    if {"noise", "coupling"}.issubset(df.columns):
        # Prefer the columns shipped with the features; no need to touch the
        # side file at all in this case.
        nc_df = df[["noise", "coupling"]].reset_index(drop=True)
    else:
        nc = None
        try:
            nc = pd.read_csv(noise_coupling_file, header=None, names=["noise", "coupling"])
        except (OSError, ValueError) as exc:
            # OSError covers a missing/unreadable file; pandas' EmptyDataError
            # and ParserError (and decoding errors) are ValueError subclasses.
            # Stay best-effort, but say so instead of failing silently.
            print(f"Warning: could not read {noise_coupling_file} ({exc}). Proceeding without attaching noise/coupling values.")

        if nc is not None and len(nc) == len(df):
            nc_df = nc.reset_index(drop=True)
        else:
            # Fallback: NaN placeholders, one row per input row.
            nc_df = pd.DataFrame({
                "noise": np.nan,
                "coupling": np.nan,
            }, index=range(len(df))).reset_index(drop=True)
            if nc is not None:
                print(f"Warning: {noise_coupling_file} length ({len(nc)}) != input rows ({len(df)}). Proceeding without attaching noise/coupling values.")

    # --- Feature matrix ---
    feature_cols = [c for c in df.columns if c not in ["noise", "coupling", "row_index"]]
    if not feature_cols:
        raise ValueError(f"{input_file} contains no feature columns to cluster.")
    X = df[feature_cols].to_numpy()
    print("Used features:", feature_cols)

    metrics = []
    labels_dict = {}

    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=random_state, n_init=n_init, algorithm=algorithm)
        kmeans.fit(X)
        labels = kmeans.labels_

        metrics.append({
            "k": k,
            "silhouette": silhouette_score(X, labels),
            "davies_bouldin": davies_bouldin_score(X, labels),
        })

        labels_df = nc_df.copy()
        labels_df["cluster"] = labels
        labels_df.to_csv(f"clusters_{base}_k{k}.csv", index=False)
        labels_dict[k] = labels_df
        # Kept for backward compatibility: other cells may rely on the
        # labels_k_<k>_df globals. Prefer the returned frame in new code.
        globals()[f"labels_k_{k}_df"] = labels_df

    metrics_df = pd.DataFrame(metrics, columns=["k", "silhouette", "davies_bouldin"])
    metrics_path = f"metrics_{base}.csv"
    metrics_df.to_csv(metrics_path, index=False)

    print(f"Saved metrics -> {metrics_path}")
    print(f"Saved labels  -> clusters_{base}_k{k_for_labels}.csv (and for all k in {ks})")

    return metrics_df, labels_dict[k_for_labels]


In [9]:
def _build_cluster_grid(labels_df):
    """Pivot a long (noise, coupling, cluster) table into a 2-D label grid.

    Returns ``(grid, noise_unique, coupling_unique, k)`` where ``grid`` has
    one row per unique coupling value and one column per unique noise value
    (both sorted ascending), cells without data are NaN, and ``k`` is the
    largest label in the grid plus one (labels are assumed to be 0-based).

    Raises ``ValueError`` if a required column is missing or if no row has
    both a noise and a coupling value.
    """
    import numpy as np

    # --- column handling (case-insensitive; str() tolerates non-string labels)
    col_map = {str(c).lower(): c for c in labels_df.columns}
    for req in ("noise", "coupling", "cluster"):
        if req not in col_map:
            raise ValueError(f"labels_df must contain '{req}' column.")
    ncol, ccol, lcol = col_map["noise"], col_map["coupling"], col_map["cluster"]

    # --- extract arrays
    noise_vals = labels_df[ncol].astype(float).to_numpy()
    coupling_vals = labels_df[ccol].astype(float).to_numpy()
    cluster_vals = labels_df[lcol].astype(int).to_numpy()

    # Rows without coordinates cannot be placed on the grid. Dropping them
    # here avoids a phantom NaN row/column and lets us fail with a clear
    # message instead of "cannot convert float NaN to integer" further down.
    has_coords = ~(np.isnan(noise_vals) | np.isnan(coupling_vals))
    if not has_coords.any():
        raise ValueError(
            "labels_df has no rows with non-NaN 'noise' and 'coupling' values; nothing to plot."
        )
    noise_vals = noise_vals[has_coords]
    coupling_vals = coupling_vals[has_coords]
    cluster_vals = cluster_vals[has_coords]

    # --- unique sorted axes plus, for every row, its position on each axis
    noise_unique, noise_pos = np.unique(noise_vals, return_inverse=True)
    coupling_unique, coupling_pos = np.unique(coupling_vals, return_inverse=True)

    # Explicit loop (not fancy-index assignment) so that, for duplicated
    # (noise, coupling) pairs, the last row deterministically wins.
    grid = np.full((len(coupling_unique), len(noise_unique)), np.nan, dtype=float)
    for i, j, cl in zip(coupling_pos, noise_pos, cluster_vals):
        grid[i, j] = cl

    # --- detect number of clusters (assumes 0-based labels)
    k = int(np.nanmax(grid)) + 1
    return grid, noise_unique, coupling_unique, k


def plot_labels_heatmap(labels_df, output_file=None, use_tex=True):
    """Draw cluster labels as a heatmap over the (noise, coupling) grid.

    Parameters
    ----------
    labels_df : pandas.DataFrame
        Must contain ``noise``, ``coupling`` and ``cluster`` columns (matched
        case-insensitively). Rows with NaN noise or coupling are ignored.
    output_file : str or None
        If truthy, the figure is also saved there as a 300-dpi PNG.
    use_tex : bool
        Value assigned to matplotlib's ``text.usetex`` setting.

    Returns
    -------
    (fig, ax)
        The matplotlib figure and axes that were drawn.

    Raises
    ------
    ValueError
        If a required column is missing or no row has usable coordinates.

    Notes
    -----
    Calls ``sns.set_theme`` and ``plt.rcParams.update``, so the styling
    remains in effect for later plots in the same session.
    """
    # Build the grid first: invalid input fails fast, before the plotting
    # libraries are imported and before any global style is changed.
    heatmap_data, noise_unique, coupling_unique, k = _build_cluster_grid(labels_df)

    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from matplotlib.colors import ListedColormap

    # --- discrete colormap with k colors
    # tab20 offers 20 distinct colors; beyond that, request k colors from hsv.
    # (plt.get_cmap is used for both: matplotlib.cm.get_cmap no longer exists
    # in recent matplotlib releases.)
    if k <= 20:
        cmap_base = plt.get_cmap("tab20", k)
    else:
        cmap_base = plt.get_cmap("hsv", k)
    cluster_cmap = ListedColormap(cmap_base.colors if hasattr(cmap_base, "colors")
                                  else [cmap_base(i) for i in range(k)])

    # --- styling
    sns.set_theme(style="white")
    plt.rcParams.update({
        "text.usetex": bool(use_tex),
        "font.family": "serif",
        "axes.linewidth": 1.2,
        "xtick.direction": "in",
        "ytick.direction": "in",
        "xtick.major.size": 6,
        "ytick.major.size": 6,
        "xtick.minor.size": 3,
        "ytick.minor.size": 3,
    })

    fig, ax = plt.subplots(figsize=(10, 6))
    hm = sns.heatmap(
        heatmap_data,
        ax=ax,
        xticklabels=np.round(noise_unique, 4),
        yticklabels=np.round(coupling_unique, 4),
        cmap=cluster_cmap,
        cbar_kws={'label': 'Cluster', 'pad': 0.01, 'ticks': range(k)},
        linewidths=0.1,
        linecolor='gray',
        vmin=0,
        vmax=max(0, k - 1),
    )

    # colorbar cosmetics
    cbar = hm.collections[0].colorbar
    cbar.set_label('Cluster', rotation=90, labelpad=2, fontsize=16)
    cbar.ax.yaxis.set_label_coords(2.5, 0.55)

    # labels
    ax.set_xlabel(r'$n$', fontsize=18)
    ax.set_ylabel(r'$c$', fontsize=18, labelpad=10, rotation=0)
    ax.tick_params(axis='x', labelsize=10, rotation=90)
    ax.tick_params(axis='y', labelsize=10)
    # seaborn puts row 0 at the top; flip so coupling increases upwards
    ax.invert_yaxis()

    plt.tight_layout()
    if output_file:
        plt.savefig(output_file, format="png", dpi=300, bbox_inches='tight')
    plt.show()
    return fig, ax


In [10]:

# Feature matrix to cluster (presumably PCA-reduced features, judging by the
# file name — confirm against the data description).
input_file = "set_B_pca_3_.csv"
metrics_df, labels_df = run_kmeans_on_file(input_file, ks=(3,4,5,6,7))

# Index the table by k so every printed row is labelled with its cluster
# count; with the default 0..4 index the "per k" table did not show k at all.
scores = (
    metrics_df
    .set_index("k")
    .rename(columns={
        "silhouette": "Silhouette",
        "davies_bouldin": "DB",
    })[["Silhouette", "DB"]]
)

print("\nScores per k (Silhouette higher, DB lower is better):\n")
print(scores.to_string(float_format=lambda x: f"{x:.6f}"))


SyntaxError: unterminated string literal (detected at line 7) (2904521664.py, line 7)