In [None]:
import os
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, wasserstein_distance
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import mutual_info_score
import random
from collections import Counter, defaultdict
from itertools import combinations
from scipy.spatial.distance import euclidean
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score, adjusted_rand_score
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from scipy.stats import norm
from scipy import stats
import re

from sklearn.metrics import adjusted_rand_score
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment
import warnings

from tqdm import tqdm
import random

def ep_read(filename):
    """
    Load a Cartool .ep text file into a numpy array.

    Each non-blank line is split on whitespace and every token is parsed as a
    float; blank (or whitespace-only) lines are ignored.

    Parameters:
    filename (str): Path to the file to be read.

    Returns:
    numpy.ndarray: A 2D array with one row per non-blank line of the file.
    """
    rows = []
    with open(filename, 'r') as handle:
        for raw_line in handle:
            # Skip separators / trailing empty lines.
            if not raw_line.strip():
                continue
            rows.append([float(token) for token in raw_line.split()])
    return np.array(rows)

def calculate_spatial_correlation(v1, v2):
    """
    Calculate the Pearson correlation coefficient between two vectors, ignoring NaN values.

    A position is dropped from BOTH vectors when either vector holds NaN
    there, so the remaining values stay paired element by element.

    Parameters:
    v1 (array-like): First vector.
    v2 (array-like): Second vector; must contain the same number of elements as v1.

    Returns:
    float: The Pearson correlation coefficient computed over the positions
           where both v1 and v2 are not NaN.

    Raises:
    ValueError: If v1 and v2 do not contain the same number of elements.
    """
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()
    if a.shape != b.shape:
        raise ValueError(
            f"v1 and v2 must have the same number of elements, got {a.size} and {b.size}"
        )

    # Joint mask: masking each vector on its own NaNs would shift the pairing
    # (or produce unequal lengths) whenever the NaNs sit at different positions.
    valid = ~(np.isnan(a) | np.isnan(b))
    return pearsonr(a[valid], b[valid])[0]

def calculate_inverse_emd(v1, v2):
    """
    Similarity score derived from the Earth Mover's Distance (EMD): 1 - normalized EMD.

    NaN entries are removed from each vector separately before the distance
    is computed. The EMD is normalized by the distance between two point
    masses located at the overall minimum and the overall maximum value.

    Parameters:
    v1 (array-like): First vector.
    v2 (array-like): Second vector.

    Returns:
    float: 1 minus the normalized EMD. 1 indicates identical distributions,
           0 indicates maximally different distributions.
    """
    first = v1[~np.isnan(v1)].ravel()
    second = v2[~np.isnan(v2)].ravel()
    distance = wasserstein_distance(first, second)

    lowest = min(np.min(first), np.min(second))
    highest = max(np.max(first), np.max(second))
    # Worst case: all mass of one distribution at the minimum, all mass of
    # the other at the maximum.
    worst_case = wasserstein_distance([lowest] * len(first), [highest] * len(second))

    # All values identical -> nothing to normalize by; treat as zero distance.
    ratio = 0 if worst_case == 0 else distance / worst_case
    return 1 - ratio

def calculate_mutual_information(v1, v2):
    """
    Calculate the normalized mutual information between two vectors.

    Positions where either vector is NaN are dropped from both vectors so the
    samples stay paired. Each vector is then discretized independently into
    20 uniform-width bins, and the mutual information of the binned values is
    divided by the smaller of the two self-informations (entropies).

    Parameters:
    v1 (array-like): First vector.
    v2 (array-like): Second vector; must contain the same number of elements as v1.

    Returns:
    float: The normalized mutual information, ranging from 0 to 1.
           Higher values indicate stronger statistical dependence between the vectors.
           0.0 is returned when either discretized vector has zero entropy
           (e.g. a constant vector), where the ratio is undefined.

    Raises:
    ValueError: If v1 and v2 do not contain the same number of elements.
    """
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()
    if a.shape != b.shape:
        raise ValueError(
            f"v1 and v2 must have the same number of elements, got {a.size} and {b.size}"
        )

    # Joint mask: mutual information is defined on paired samples, so a NaN in
    # either vector removes that position from both.
    valid = ~(np.isnan(a) | np.isnan(b))

    def _discretize(values):
        # Bin edges are fitted on the vector being transformed (one fit per vector).
        discretizer = KBinsDiscretizer(n_bins=20, encode='ordinal', strategy='uniform', subsample=None)
        return discretizer.fit_transform(values.reshape(-1, 1)).ravel()

    v1_discretized = _discretize(a[valid])
    v2_discretized = _discretize(b[valid])

    mi = mutual_info_score(v1_discretized, v2_discretized)

    # I(X; X) equals the entropy H(X); normalise by the smaller entropy.
    h1 = mutual_info_score(v1_discretized, v1_discretized)
    h2 = mutual_info_score(v2_discretized, v2_discretized)
    denominator = min(h1, h2)
    if denominator <= 0:
        # A vector that falls into a single bin carries no information.
        return 0.0
    return mi / denominator

def calculate_difference(map1, map2, scale=1.0):
    """
    Calculate a similarity metric from the element-wise absolute difference of two maps.

    Each element contributes exp(-scale * |B - A|); the result is the mean of
    these contributions, ignoring elements where the contribution is NaN.

    Parameters:
    map1 (array-like): First map (A).
    map2 (array-like): Second map (B); must be broadcastable against map1.
    scale (float, optional): Scaling factor to adjust the sensitivity of the metric;
                             larger values make the metric decay faster with the
                             difference. Defaults to 1.0, which gives exp(-|B - A|).

    Returns:
    float: Difference metric that is 1 when A = B and approaches 0 when A and B differ significantly.
    """
    a = np.asarray(map1, dtype=float)
    b = np.asarray(map2, dtype=float)

    # Calculate the absolute difference
    difference = np.abs(b - a)

    # Use an exponential decay function to create the metric
    metric = np.exp(-scale * difference)

    # nanmean so positions that are NaN in either map do not poison the mean
    return np.nanmean(metric)

In [None]:
import os
import re
import warnings
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import adjusted_rand_score
from scipy import stats
from scipy.optimize import linear_sum_assignment
import csv

def _match_to_reference(centroids, reference_centroids):
    """Match every centroid to a distinct reference centroid, maximising total correlation.

    Returns an integer array `matched` where `matched[i]` is the index of the
    reference centroid assigned to `centroids[i]`.
    """
    n_rows = centroids.shape[0]
    # np.corrcoef stacks both inputs row-wise; the top-right block holds the
    # centroid-vs-reference correlations.
    correlation = np.corrcoef(centroids, reference_centroids)[:n_rows, n_rows:]
    # linear_sum_assignment minimises cost, so negate to maximise correlation.
    _, matched = linear_sum_assignment(-correlation)
    return matched


def _perturbation_ari(data, labels, n_clusters, noise_level, n_perturbations):
    """Re-cluster `data` under Gaussian noise and return the ARI of each run against `labels`."""
    scores = []
    for _ in range(n_perturbations):
        noise = np.random.normal(0, noise_level, data.shape)
        perturbed_labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit_predict(data + noise)
        scores.append(adjusted_rand_score(labels, perturbed_labels))
    return scores


def plot_clusters_2d_tsne(folder_paths, n_clusters, output_path='.', max_iter=1000, n_perturbations=10, noise_levels=np.linspace(0, 1, 10)):
    """
    Plot 2D t-SNE representation of microstate clusters for multiple folders, perform second-level (group) clustering,
    calculate clustering robustness using noise perturbations, and save metrics to a CSV file.

    For every folder the function:
      1. groups the .ep files by subject ID and runs K-means per subject,
      2. reorders each subject's centroids (and labels) to match the centroids of the
         first processed subject, which serves as the reference for all folders,
      3. measures per-subject robustness as the mean adjusted Rand index (ARI) between the
         original labels and labels obtained after adding Gaussian noise,
      4. runs a second-level K-means on all subject centroids and repeats the noise analysis,
      5. highlights the subjects whose centroids are closest, median and farthest
         (mean distance of each centroid to the second-level centroid it was assigned to),
      6. writes centroid .ep files, a t-SNE scatter plot and, once all folders are done,
         a robustness plot and a CSV with the per-subject metrics.

    Subjects whose files cannot be stacked into a (files, maps, channels) array, or whose
    channel count differs from the reference, are skipped with a warning. Folders without
    any usable subject are skipped as well.

    Parameters:
    folder_paths (list): List of paths to the folders containing .ep files.
    n_clusters (int): Number of clusters for K-means clustering.
    output_path (str, optional): Path to save the output plot and CSV. Defaults to current directory.
    max_iter (int, optional): Maximum number of iterations for t-SNE. Defaults to 1000.
    n_perturbations (int, optional): Number of noise perturbations for analysis. Defaults to 10.
    noise_levels (np.array, optional): Array of noise levels for perturbation analysis. Defaults to np.linspace(0, 1, 10).

    Returns:
    None. All results are written below output_path.
    """

    warnings.filterwarnings("ignore", message="KMeans is known to have a memory leak on Windows with MKL")

    # NOTE: seeds numpy's global RNG so the noise perturbations are reproducible.
    np.random.seed(42)

    # normpath first so that a trailing slash does not produce an empty folder name.
    folder_names = [os.path.basename(os.path.normpath(path)) for path in folder_paths]

    output_ep_folder = os.path.join(output_path, 'Output_ep')
    os.makedirs(output_ep_folder, exist_ok=True)

    # The first four clusters keep their fixed colours; any further cluster
    # falls back to matplotlib's default colour cycle.
    base_colors = ['r', 'g', 'b', 'y']
    cluster_colors = [base_colors[j] if j < len(base_colors) else f'C{j % 10}' for j in range(n_clusters)]
    cluster_names = [chr(ord('A') + j) for j in range(n_clusters)]

    all_ari_scores = []
    all_ari_confidence_intervals = []
    folder_labels = []
    reference_centroids = None

    # Per-subject metrics for the CSV: {subject_id: {column_name: value}}
    csv_data = {}

    for folder_path, folder_name in zip(folder_paths, folder_names):
        # Sorted so the processing order (and hence the reference subject) is deterministic.
        files = sorted(f for f in os.listdir(folder_path) if f.endswith('.ep'))

        subject_files = {}
        for file in files:
            subject_id = file.split('.')[1]  # Assumes the subject ID is the second element in the split by '.'
            subject_files.setdefault(subject_id, []).append(file)

        # Only subjects that were actually clustered end up in these lists, so
        # index i always refers to the same subject in both of them.
        subject_ids = []
        subject_centroid_list = []

        for subject_id, subject_file_list in subject_files.items():
            subject_microstates = [ep_read(os.path.join(folder_path, file)) for file in subject_file_list]

            # Check that all microstates have the same shape
            microstate_shapes = [microstate.shape for microstate in subject_microstates]
            if len(set(microstate_shapes)) > 1:
                print(f"Warning: Microstates for subject {subject_id} in folder {folder_name} have inconsistent shapes: {microstate_shapes}")
                continue

            subject_microstates = np.array(subject_microstates)
            if subject_microstates.ndim != 3:
                print(f"Warning: Microstates for subject {subject_id} in folder {folder_name} are not 2D maps (stacked shape {subject_microstates.shape}); skipping subject")
                continue

            n_samples, n_microstates, n_channels = subject_microstates.shape
            if reference_centroids is not None and n_channels != reference_centroids.shape[1]:
                print(f"Warning: Subject {subject_id} in folder {folder_name} has {n_channels} channels, expected {reference_centroids.shape[1]}; skipping subject")
                continue

            subject_microstates_reshaped = subject_microstates.reshape(n_samples * n_microstates, n_channels)

            kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
            subject_labels = kmeans.fit_predict(subject_microstates_reshaped)
            subject_centroids = kmeans.cluster_centers_

            if reference_centroids is None:
                reference_centroids = subject_centroids
            else:
                matched = _match_to_reference(subject_centroids, reference_centroids)
                # Centroid i moves to position matched[i]; labels are renamed
                # with the same mapping so both stay consistent.
                aligned_centroids = np.empty_like(subject_centroids)
                aligned_centroids[matched] = subject_centroids
                subject_centroids = aligned_centroids
                subject_labels = matched[subject_labels]

            subject_ids.append(subject_id)
            subject_centroid_list.append(subject_centroids)

            # Calculate Rand scores for perturbations
            subject_metrics = csv_data.setdefault(subject_id, {})
            for noise_level in noise_levels:
                rand_scores_for_level = _perturbation_ari(
                    subject_microstates_reshaped, subject_labels, n_clusters, noise_level, n_perturbations)
                subject_metrics[f'Rand_{folder_name}_{noise_level:.2f}'] = np.mean(rand_scores_for_level)

            # Save subject centroids as .ep file
            centroid_filename = f'{folder_name}_{subject_id}_centroids.ep'
            centroid_filepath = os.path.join(output_ep_folder, centroid_filename)
            np.savetxt(centroid_filepath, subject_centroids, fmt='%.6f')
            print(f"Saved centroids for subject {subject_id} in {folder_name} to: {centroid_filepath}")

        if not subject_ids:
            print(f"Warning: No usable subjects found in folder {folder_name}; skipping folder")
            continue

        # Shape: (n_subjects, n_clusters, n_channels)
        all_subject_centroids = np.array(subject_centroid_list)
        n_channels = all_subject_centroids.shape[-1]

        # Perform second-level clustering on the organized centroids
        organized_centroids = all_subject_centroids.reshape(-1, n_channels)
        second_kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        second_level_labels = second_kmeans.fit_predict(organized_centroids)
        second_level_centroids = second_kmeans.cluster_centers_

        # Save second-level centroids as .ep file
        second_level_filename = f'{folder_name}_second_level_centroids.ep'
        second_level_filepath = os.path.join(output_ep_folder, second_level_filename)
        np.savetxt(second_level_filepath, second_level_centroids, fmt='%.6f')
        print(f"Saved second-level centroids for {folder_name} to: {second_level_filepath}")

        # Mean distance between each subject's centroids and the second-level
        # centroid each of them was assigned to. The row order of the
        # second-level centroids is arbitrary, so they are looked up by label
        # rather than paired row by row.
        labels_per_subject = second_level_labels.reshape(len(subject_ids), n_clusters)
        assigned_centroids = second_level_centroids[labels_per_subject]
        mean_distances = np.linalg.norm(all_subject_centroids - assigned_centroids, axis=2).mean(axis=1)

        # Identify subjects with closest, farthest, and median mean distances
        sorted_indices = np.argsort(mean_distances)
        subject1_id = subject_ids[sorted_indices[0]]
        subject2_id = subject_ids[sorted_indices[len(sorted_indices) // 2]]
        subject3_id = subject_ids[sorted_indices[-1]]

        # Fixed labels and markers; with fewer than three subjects the same ID
        # may appear more than once and the later entry wins.
        highlighted_subjects = {
            subject1_id: {'label': 'Subject 1', 'marker': '^'},  # Triangle Up
            subject2_id: {'label': 'Subject 2', 'marker': 's'},  # Square
            subject3_id: {'label': 'Subject 3', 'marker': 'D'}   # Diamond
        }

        # Save their centroids as .ep files with labels indicating their fixed subject numbers
        for subject_id, info in highlighted_subjects.items():
            subject_centroids = all_subject_centroids[subject_ids.index(subject_id)]
            label_slug = info['label'].lower().replace(' ', '_')
            centroid_filename = f'{folder_name}_{subject_id}_{label_slug}_centroids.ep'
            centroid_filepath = os.path.join(output_ep_folder, centroid_filename)
            np.savetxt(centroid_filepath, subject_centroids, fmt='%.6f')
            print(f"Saved centroids for {info['label']} ({subject_id}) in {folder_name} to: {centroid_filepath}")

        # Perform t-SNE on all subject centroids and second-level centroids
        all_centroids = np.vstack([organized_centroids, second_level_centroids])
        # t-SNE requires perplexity < number of samples.
        perplexity = min(8, all_centroids.shape[0] - 1)
        tsne = TSNE(n_components=2, perplexity=perplexity, max_iter=max_iter, random_state=42)
        reduced_centroids = tsne.fit_transform(all_centroids)

        # Separate the reduced centroids
        reduced_subject_centroids = reduced_centroids[:-n_clusters]
        reduced_second_level_centroids = reduced_centroids[-n_clusters:]

        # Plot centroids for each subject and second-level centroids
        fig, ax = plt.subplots(figsize=(12, 10), dpi=100)

        # Highlighted subjects: large markers with legend entries, coloured by
        # the second-level cluster each centroid was assigned to.
        for subject_id, info in highlighted_subjects.items():
            index = subject_ids.index(subject_id)
            subject_reduced_centroids = reduced_subject_centroids[index * n_clusters:(index + 1) * n_clusters]
            for j in range(n_clusters):
                ax.scatter(subject_reduced_centroids[j, 0], subject_reduced_centroids[j, 1],
                           c=cluster_colors[second_level_labels[index * n_clusters + j]],
                           marker=info['marker'], s=500, edgecolors='k',
                           label=f"{info['label']} Centroid {cluster_names[j]}")

        # Plot other subjects without annotations or legend entries
        for i, subject_id in enumerate(subject_ids):
            if subject_id in highlighted_subjects:
                continue
            subject_reduced_centroids = reduced_subject_centroids[i * n_clusters:(i + 1) * n_clusters]
            for j in range(n_clusters):
                ax.scatter(subject_reduced_centroids[j, 0], subject_reduced_centroids[j, 1],
                           c=cluster_colors[second_level_labels[i * n_clusters + j]],
                           marker='o', s=100, alpha=0.5)

        # Plot second-level centroids and add to legend
        for j in range(n_clusters):
            ax.scatter(reduced_second_level_centroids[j, 0], reduced_second_level_centroids[j, 1],
                       c=cluster_colors[j], marker='*', s=500, edgecolors='k', linewidth=2,
                       label=f'Second-level Centroid {cluster_names[j]}')

        ax.set_title(f'{folder_name} - Highlighted Subject Centroids', fontsize=20)
        ax.set_xlabel('t-SNE Component 1', fontsize=18)
        ax.set_ylabel('t-SNE Component 2', fontsize=18)
        ax.tick_params(axis='both', labelsize=14)

        # Arrange legend entries: Subject 1..3 (clusters in order), then second-level centroids
        handles, labels = ax.get_legend_handles_labels()
        label_to_handle = dict(zip(labels, handles))
        desired_labels = [f'{subject_label} Centroid {cluster_name}'
                          for subject_label in ['Subject 1', 'Subject 2', 'Subject 3']
                          for cluster_name in cluster_names]
        desired_labels += [f'Second-level Centroid {cluster_name}' for cluster_name in cluster_names]
        present_labels = [label for label in desired_labels if label in label_to_handle]
        ax.legend([label_to_handle[label] for label in present_labels], present_labels,
                  fontsize=12, bbox_to_anchor=(1.05, 1), loc='upper left')

        plot_filepath = os.path.join(output_path, f'{folder_name}_highlighted_subject_centroids_1s.png')
        fig.tight_layout()
        fig.savefig(plot_filepath, dpi=100, bbox_inches='tight')
        print(f"Saved highlighted subject centroids plot for {folder_name} to: {plot_filepath}")
        plt.close(fig)

        # Perform robustness analysis on the second-level clustering
        ari_scores = []
        ari_confidence_intervals = []
        for noise_level in noise_levels:
            ari_scores_for_level = _perturbation_ari(
                organized_centroids, second_level_labels, n_clusters, noise_level, n_perturbations)

            mean_ari = np.mean(ari_scores_for_level)
            ari_scores.append(mean_ari)

            # 95% confidence interval from the standard error (normal approximation)
            se = stats.sem(ari_scores_for_level)
            ari_confidence_intervals.append((mean_ari - 1.96 * se, mean_ari + 1.96 * se))

        all_ari_scores.append(ari_scores)
        all_ari_confidence_intervals.append(ari_confidence_intervals)
        folder_labels.append(folder_name)

    # Robustness plot for ARI
    fig, ax_robustness = plt.subplots(figsize=(10, 10), dpi=100)
    for folder_label, ari_scores, ari_confidence_intervals in zip(folder_labels, all_ari_scores, all_ari_confidence_intervals):
        ari_scores = np.array(ari_scores)
        lower_ci = np.array([ci[0] for ci in ari_confidence_intervals])
        upper_ci = np.array([ci[1] for ci in ari_confidence_intervals])

        ax_robustness.errorbar(noise_levels, ari_scores,
                               yerr=[ari_scores - lower_ci, upper_ci - ari_scores],
                               fmt='o-', capsize=5, label=folder_label)

    ax_robustness.set_xlabel('Noise Level', fontsize=22)
    ax_robustness.set_ylabel('Adjusted Rand Index', fontsize=22)
    ax_robustness.tick_params(axis='both', labelsize=22)
    ax_robustness.legend(fontsize=22)

    robustness_filepath = os.path.join(output_path, 'second_level_clustering_robustness_analysis_1s.png')
    fig.tight_layout()
    fig.savefig(robustness_filepath, dpi=300, bbox_inches='tight')
    print(f"Saved second-level clustering robustness analysis plot to: {robustness_filepath}")
    plt.close(fig)

    # Save metrics to CSV: one row per subject, one column per (folder, noise level)
    csv_filename = os.path.join(output_path, 'clustering_metrics.csv')
    metric_columns = [f'Rand_{folder_name}_{noise_level:.2f}'
                      for folder_name in folder_names
                      for noise_level in noise_levels]
    with open(csv_filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Subject'] + metric_columns)
        for subject, metrics in csv_data.items():
            # Subjects missing from a folder get an empty cell for its columns.
            writer.writerow([subject] + [metrics.get(column, '') for column in metric_columns])

    print(f"Saved clustering metrics to: {csv_filename}")

In [None]:
# One input folder per analysis window length, all located under ./HC.
folder_paths = [f'./HC/{window}' for window in ('500ms', '1s', '5s', '10s', '20s', '30s')]

# Run the full pipeline with 4 microstate classes; results are written to ./Awake.
plot_clusters_2d_tsne(
    folder_paths,
    n_clusters=4,
    output_path='./Awake',
    max_iter=1000,
    n_perturbations=100,
    noise_levels=np.linspace(0, 1, 10),
)