# Simulation Study for Noise Filtering

This is v0 of the simulation study comparing the sparse jump model (SJM) with the Gaussian HMM and the standard jump model. The goal is to show that the SJM can filter out noisy features through the feature weighting in its algorithm.


In [4]:
# Load packages
import numpy as np
import pandas as pd
from hmmlearn import hmm
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from scipy.optimize import linear_sum_assignment
from jumpmodels.sparse_jump import SparseJumpModel    # Sparse JM class
from jumpmodels.jump import JumpModel   
from scipy import stats
from joblib import Parallel, delayed
import multiprocessing
from scipy.stats import wilcoxon

## 1. Data Simulation & Utility Functions
`simulate_data(T, P, mu, random_state=None)` simulates data from a 2-state Gaussian HMM with correlated noise features. (Note: the implementation below uses a pairwise correlation of 0.185 between the noise features, not 0.1.)

In [3]:
def simulate_data(T, P, mu, random_state=None):
    """
    Draw a length-T sample from a two-state Gaussian hidden Markov model.

    At most the first 15 features are informative: their mean is ``-mu`` in
    state 0 and ``+mu`` in state 1, with independent unit-variance Gaussian
    noise. Every feature after the 15th is zero-mean, unit-variance Gaussian
    noise with a common pairwise correlation of 0.185, regardless of state.

    Parameters
    ----------
    T : int
        Number of time steps to simulate.
    P : int
        Total number of features (columns of X).
    mu : float
        Magnitude of the state-dependent mean of the informative features.
    random_state : optional
        Seed forwarded to ``np.random.default_rng``.

    Returns
    -------
    X : ndarray of shape (T, P)
        Simulated observations.
    states : ndarray of shape (T,), dtype int
        Hidden state (0 or 1) at every time step.
    """
    n_informative = 15
    n_signal = min(P, n_informative)   # informative columns actually present
    n_noise = P - n_informative        # > 0 only when noise columns exist

    generator = np.random.default_rng(random_state)
    state_ids = np.arange(2)

    # Two-state transition matrix (row = current state, column = next state)
    trans = np.array([[0.9980, 0.0020],
                      [0.0114, 0.9886]])

    # Stationary distribution: left eigenvector for eigenvalue 1, normalised
    # so that its entries sum to one.
    eig_values, eig_vectors = np.linalg.eig(trans.T)
    pi = np.real(eig_vectors[:, np.isclose(eig_values, 1)])[:, 0]
    pi = pi / np.sum(pi)

    # Hidden path: first state from the stationary law, then a Markov chain
    states = np.zeros(T, dtype=int)
    states[0] = generator.choice(state_ids, p=pi)
    for step in range(1, T):
        states[step] = generator.choice(state_ids, p=trans[states[step - 1]])

    # State-dependent means; columns past the informative ones stay at zero
    means = np.zeros((2, P))
    means[0, :n_signal] = -mu
    means[1, :n_signal] = mu

    # Cholesky factor of the equicorrelated noise covariance (if any noise)
    chol = None
    if n_noise > 0:
        noise_cov = np.full((n_noise, n_noise), 0.185)
        np.fill_diagonal(noise_cov, 1.0)
        chol = np.linalg.cholesky(noise_cov)

    # Observations are drawn row by row: informative part first, then noise.
    # The draw order is kept so a given seed reproduces the same sample.
    X = np.zeros((T, P))
    for step, state in enumerate(states):
        X[step, :n_signal] = generator.normal(
            loc=means[state, :n_signal], scale=1.0, size=n_signal
        )
        if chol is not None:
            white = generator.normal(loc=0.0, scale=1.0, size=n_noise)
            X[step, n_informative:] = chol @ white

    return X, states

## 2. Aligning Predicted Labels With True Labels Using the Hungarian Algorithm

In [7]:
def align_labels(true_labels, pred_labels):
    """
    Align predicted labels with true labels using the Hungarian algorithm.

    A contingency table is built over the sorted union of the labels found in
    both sequences (rows = true label, columns = predicted label). The
    assignment that maximises the total agreement is then used to rename each
    predicted label to the true label it was matched with.

    The renaming goes through the actual label values rather than their
    positions in the table, so label sets that are not exactly ``0..k-1``
    (for example when both sequences contain only state 1) are handled.

    Parameters
    ----------
    true_labels : array-like of shape (n,)
        Reference labels.
    pred_labels : array-like of shape (n,)
        Predicted labels, identified only up to a permutation.

    Returns
    -------
    ndarray of shape (n,)
        ``pred_labels`` relabelled to agree as much as possible with
        ``true_labels``.

    Raises
    ------
    ValueError
        If the two sequences have different lengths.
    """
    true_arr = np.asarray(true_labels).ravel()
    pred_arr = np.asarray(pred_labels).ravel()
    if true_arr.size != pred_arr.size:
        raise ValueError(
            "true_labels and pred_labels must have the same length "
            f"(got {true_arr.size} and {pred_arr.size})"
        )
    if pred_arr.size == 0:
        return pred_arr.copy()

    # Encode both sequences against one shared, sorted label vocabulary
    labels, codes = np.unique(
        np.concatenate([true_arr, pred_arr]), return_inverse=True
    )
    codes = codes.reshape(-1)
    true_idx = codes[:true_arr.size]
    pred_idx = codes[true_arr.size:]

    # Contingency table: D[i, j] = #samples with true labels[i], pred labels[j]
    D = np.zeros((labels.size, labels.size), dtype=np.int64)
    np.add.at(D, (true_idx, pred_idx), 1)

    # Maximise agreement (linear_sum_assignment minimises, hence the sign)
    row_ind, col_ind = linear_sum_assignment(-D)

    # target[j] = position of the true label matched to predicted label j
    target = np.empty(labels.size, dtype=np.intp)
    target[col_ind] = row_ind
    return labels[target[pred_idx]]

## 3. Setting up the function to calculate the BAC

In [10]:
def calculate_bac(true_states, pred_states):
    """
    Return the balanced accuracy (BAC) of a predicted state sequence.

    The predicted labels are first matched to the true labels with
    ``align_labels`` and the aligned sequence is then scored against
    ``true_states`` with ``balanced_accuracy_score``.
    """
    return balanced_accuracy_score(
        true_states,
        align_labels(true_states, pred_states),
    )

## 4. Functions for model formulation

### 4.1 HMM With Nystrup (2021) initialization

In [13]:
def run_hmm(X, n_components=2, random_state=None):
    """
    Fit a diagonal-covariance Gaussian HMM to X and return its decoded states.

    The start probabilities and the transition matrix are set by hand before
    fitting: a uniform start distribution, and a transition matrix with 0.95
    on the diagonal and the remaining 0.05 split equally over the
    off-diagonal entries. The model is created with ``init_params="mc"``.

    Parameters
    ----------
    X : array-like
        Observations passed to ``fit`` and ``predict``.
    n_components : int, default 2
        Number of hidden states.
    random_state : optional
        Forwarded to ``hmm.GaussianHMM``.

    Returns
    -------
    The state sequence returned by ``predict(X)`` on the fitted model.
    """
    gaussian_hmm = hmm.GaussianHMM(
        n_components=n_components,
        covariance_type='diag',
        n_iter=100,
        random_state=random_state,
        init_params="mc",
        covars_prior=1.0,
    )

    # Uniform start distribution
    gaussian_hmm.startprob_ = np.full(n_components, 1.0 / n_components)

    # Sticky transition matrix: stay with 0.95, share 0.05 among the others
    leave_prob = 0.05 / (n_components - 1)
    sticky = np.full((n_components, n_components), leave_prob)
    np.fill_diagonal(sticky, 0.95)
    gaussian_hmm.transmat_ = sticky

    gaussian_hmm.fit(X)
    return gaussian_hmm.predict(X)

### 4.2 Normal (Standard) Jump Model with Grid Search over λ

In [16]:
def run_jump_model_grid_search(X, true_states, n_components=2, random_state=None):
    """
    Fit a JumpModel for each jump penalty on a grid and keep the best fit.

    The grid is 14 log-spaced penalties from 1e-2 to 1e4. Candidates are
    ranked by balanced accuracy against ``true_states`` (the ground truth is
    used for model selection); on ties the earliest penalty on the grid wins.

    Returns:
        best_labels: Predicted state sequence for the best lambda.
        best_bac: Best balanced accuracy achieved.
        best_lambda: The lambda value that achieved best_bac.
    """
    winner_labels, winner_bac, winner_lambda = None, -1, None

    for penalty in np.logspace(-2, 4, 14):
        candidate = JumpModel(
            n_components=n_components,
            jump_penalty=penalty,
            cont=False,
            max_iter=10,
            random_state=random_state,
        )
        candidate.fit(X)
        candidate_labels = candidate.labels_
        score = calculate_bac(true_states, candidate_labels)

        # Strict comparison: a later penalty must beat the incumbent
        if score > winner_bac:
            winner_labels, winner_bac, winner_lambda = (
                candidate_labels, score, penalty
            )

    return winner_labels, winner_bac, winner_lambda

### 4.3 Sparse Jump Model with Grid Search over λ and kappa

In [19]:
def run_sparse_jump_model_grid_search(X, true_states, n_components=2, random_state=None):
    """
    Fit a SparseJumpModel for each (lambda, kappa) pair and keep the best fit.

    Lambda (the jump penalty) runs over 7 log-spaced values from 1e-1 to 1e2.
    Kappa runs over 14 evenly spaced values from 1 to sqrt(p), where p is the
    number of columns of X; each model receives ``max_feats = kappa**2``.
    Candidates are ranked by balanced accuracy against ``true_states`` (the
    ground truth is used for model selection). On ties the first pair visited
    wins; lambda is the outer loop and kappa the inner loop.

    Returns:
        best_labels: Predicted state sequence for the best combination.
        best_bac: Best balanced accuracy achieved.
        best_lambda: The lambda value that achieved best_bac.
        best_kappa: The kappa value that achieved best_bac.
    """
    n_features = X.shape[1]
    penalty_grid = np.logspace(-1, 2, 7)
    kappa_grid = np.linspace(1, np.sqrt(n_features), 14)

    winner_labels, winner_bac = None, -1
    winner_lambda, winner_kappa = None, None

    candidates = ((lam, kap) for lam in penalty_grid for kap in kappa_grid)
    for penalty, kappa in candidates:
        sjm = SparseJumpModel(
            n_components=n_components,
            jump_penalty=penalty,
            cont=False,
            max_feats=kappa**2,
            max_iter=10,
            random_state=random_state,
        )
        sjm.fit(X)
        candidate_labels = sjm.labels_
        score = calculate_bac(true_states, candidate_labels)

        # Strict comparison: a later pair must beat the incumbent
        if score > winner_bac:
            winner_labels, winner_bac = candidate_labels, score
            winner_lambda, winner_kappa = penalty, kappa

    return winner_labels, winner_bac, winner_lambda, winner_kappa

## 5. Main Execution
Each simulation runs the three models (HMM, jump model, sparse jump model) in turn on the same simulated data set; the results are then combined at the end.


In [22]:
def run_one_simulation(sim, T, P, mu):
    """
    Simulate one data set and evaluate all three models on it.

    ``sim`` is used both as the seed of the simulated data and as the
    ``random_state`` of every model, so a given index is reproducible.

    Returns a 6-tuple:
      (HMM BAC, Jump BAC, Sparse Jump BAC,
       best Jump lambda, best Sparse Jump lambda, best Sparse Jump kappa)
    """
    X, true_states = simulate_data(T, P, mu, random_state=sim)
    model_kwargs = dict(n_components=2, random_state=sim)

    # Gaussian HMM: decode, then score against the simulated states
    bac_hmm = calculate_bac(true_states, run_hmm(X, **model_kwargs))

    # Jump model: result is (labels, bac, lambda)
    jump_fit = run_jump_model_grid_search(X, true_states, **model_kwargs)

    # Sparse jump model: result is (labels, bac, lambda, kappa)
    sparse_fit = run_sparse_jump_model_grid_search(X, true_states, **model_kwargs)

    return (
        bac_hmm,
        jump_fit[1],
        sparse_fit[1],
        jump_fit[2],
        sparse_fit[2],
        sparse_fit[3],
    )

def one_sided_wilcoxon_pvalue(candidate_scores, baseline_scores):
    """
    One-sided Wilcoxon signed-rank p-value for "candidate > baseline".

    When every paired difference is zero (for example both models reach a
    BAC of 1.0 in every simulation) or there are no pairs at all, the test is
    undefined and some SciPy versions raise ``ValueError``. NaN is returned
    in that case so that one degenerate (mu, P) cell cannot abort the sweep.

    Parameters
    ----------
    candidate_scores, baseline_scores : sequence of float
        Paired scores of equal length.

    Returns
    -------
    float
        The p-value, or NaN when the test is undefined.
    """
    diffs = (np.asarray(candidate_scores, dtype=float)
             - np.asarray(baseline_scores, dtype=float))
    if diffs.size == 0 or not np.any(diffs):
        return float("nan")
    _, pvalue = wilcoxon(candidate_scores, baseline_scores, alternative="greater")
    return float(pvalue)


if __name__ == "__main__":
    # --- Settings ---
    T = 500
    mu_values = [0.02, 0.05, 0.1, 0.25, 0.5,]
    p_values = [15, 30, 60, 150, 300]
    n_simulations = 10  # Adjust as needed

    # Initialize lists to store summary results and raw simulation details
    final_rows = []          # for summary results per (mu, P) pair
    simulation_details = []  # for raw per-simulation results

    # Detect the number of CPU cores available
    num_cores = multiprocessing.cpu_count()
    print(f"Detected {num_cores} CPU cores.")

    # Main loop over mu and P pairs
    for mu in mu_values:
        for P in p_values:
            print(f"\nStarting simulations for mu = {mu}, P = {P}")

            # Run simulations in parallel (each simulation does its own grid search).
            # The simulation index doubles as the random seed.
            results = Parallel(n_jobs=num_cores)(
                delayed(run_one_simulation)(sim, T, P, mu)
                for sim in range(n_simulations)
            )

            # Unpack the results for this (mu, P) pair; each result is
            # (hmm_bac, jump_bac, sparse_bac, jump_lambda, sparse_lambda, sparse_kappa)
            hmm_bac_list       = [res[0] for res in results]
            jump_bac_list      = [res[1] for res in results]
            sparse_bac_list    = [res[2] for res in results]
            lambda_jump_list   = [res[3] for res in results]
            lambda_sparse_list = [res[4] for res in results]
            kappa_sparse_list  = [res[5] for res in results]

            # Save the raw simulation results and print each simulation's details
            for sim_idx, res in enumerate(results):
                simulation_details.append({
                    "mu": mu,
                    "P": P,
                    "sim": sim_idx,
                    "HMM_BAC": res[0],
                    "Jump_BAC": res[1],
                    "SparseJump_BAC": res[2],
                    "Jump_Lambda": res[3],
                    "Sparse_Lambda": res[4],
                    "Sparse_Kappa": res[5]
                })
                print(f"Simulation {sim_idx} for mu={mu}, P={P}: "
                      f"HMM_BAC = {res[0]:.3f}, Jump_BAC = {res[1]:.3f}, SparseJump_BAC = {res[2]:.3f}, "
                      f"Jump_Lambda = {res[3]:.3f}, Sparse_Lambda = {res[4]:.3f}, Sparse_Kappa = {res[5]:.3f}")

            # Compute summary statistics for BAC scores (np.std default, ddof=0)
            hmm_mean, hmm_std = np.mean(hmm_bac_list), np.std(hmm_bac_list)
            jump_mean, jump_std = np.mean(jump_bac_list), np.std(jump_bac_list)
            sparse_mean, sparse_std = np.mean(sparse_bac_list), np.std(sparse_bac_list)

            # Compute summary statistics for hyperparameters; the "unique"
            # count shows how many distinct grid values were selected.
            mean_lambda_jump = np.mean(lambda_jump_list)
            std_lambda_jump  = np.std(lambda_jump_list)
            unique_lambda_jump = len(np.unique(lambda_jump_list))

            mean_lambda_sparse = np.mean(lambda_sparse_list)
            std_lambda_sparse  = np.std(lambda_sparse_list)
            unique_lambda_sparse = len(np.unique(lambda_sparse_list))

            mean_kappa_sparse = np.mean(kappa_sparse_list)
            std_kappa_sparse  = np.std(kappa_sparse_list)
            unique_kappa_sparse = len(np.unique(kappa_sparse_list))

            # One-sided Wilcoxon test: is Sparse Jump BAC greater than Jump BAC?
            # pval is NaN when the two lists are identical; NaN < 0.05 is
            # False, so such a cell is simply not highlighted.
            pval = one_sided_wilcoxon_pvalue(sparse_bac_list, jump_bac_list)
            if pval < 0.05:
                sparse_str = f"**{sparse_mean:.2f} ± {sparse_std:.2f}**"
            else:
                sparse_str = f"{sparse_mean:.2f} ± {sparse_std:.2f}"

            # Append summary results to final_rows
            final_rows.append({
                "mu": mu,
                "P": P,
                "HMM (mean ± std)": f"{hmm_mean:.2f} ± {hmm_std:.2f}",
                "Jump (mean ± std)": f"{jump_mean:.2f} ± {jump_std:.2f}",
                "Sparse Jump (mean ± std)": sparse_str,
                "p-value (Jump vs Sparse, Wilcoxon)": f"{pval:.3g}",
                "Jump Lambda (mean ± std, unique)": f"{mean_lambda_jump:.2f} ± {std_lambda_jump:.2f} (n={unique_lambda_jump})",
                "Sparse Lambda (mean ± std, unique)": f"{mean_lambda_sparse:.2f} ± {std_lambda_sparse:.2f} (n={unique_lambda_sparse})",
                "Sparse Kappa (mean ± std, unique)": f"{mean_kappa_sparse:.2f} ± {std_kappa_sparse:.2f} (n={unique_kappa_sparse})"
            })

            # Print summary for this (mu, P) pair
            print(
                f"Finished analysis for mu={mu}, P={P}\n"
                f"  HMM BAC = {hmm_mean:.3f} ± {hmm_std:.3f}\n"
                f"  Jump BAC = {jump_mean:.3f} ± {jump_std:.3f}\n"
                f"  Sparse Jump BAC = {sparse_mean:.3f} ± {sparse_std:.3f}\n"
                f"  Jump Lambda: {mean_lambda_jump:.2f} ± {std_lambda_jump:.2f} (n={unique_lambda_jump})\n"
                f"  Sparse Lambda: {mean_lambda_sparse:.2f} ± {std_lambda_sparse:.2f} (n={unique_lambda_sparse})\n"
                f"  Sparse Kappa: {mean_kappa_sparse:.2f} ± {std_kappa_sparse:.2f} (n={unique_kappa_sparse})\n"
                f"  p-value (Jump vs Sparse, Wilcoxon) = {pval:.3g}"
            )


Detected 16 CPU cores.

Starting simulations for mu = 0.02, P = 15
Simulation 0 for mu=0.02, P=15: HMM_BAC = 0.552, Jump_BAC = 1.000, SparseJump_BAC = 1.000, Jump_Lambda = 49.239, Sparse_Lambda = 31.623, Sparse_Kappa = 1.000
Simulation 1 for mu=0.02, P=15: HMM_BAC = 0.563, Jump_BAC = 0.513, SparseJump_BAC = 0.547, Jump_Lambda = 0.010, Sparse_Lambda = 0.316, Sparse_Kappa = 1.000
Simulation 2 for mu=0.02, P=15: HMM_BAC = 0.437, Jump_BAC = 0.518, SparseJump_BAC = 0.830, Jump_Lambda = 0.242, Sparse_Lambda = 10.000, Sparse_Kappa = 2.326
Simulation 3 for mu=0.02, P=15: HMM_BAC = 0.417, Jump_BAC = 0.542, SparseJump_BAC = 0.664, Jump_Lambda = 0.702, Sparse_Lambda = 3.162, Sparse_Kappa = 1.000
Simulation 4 for mu=0.02, P=15: HMM_BAC = 0.453, Jump_BAC = 0.566, SparseJump_BAC = 0.746, Jump_Lambda = 0.702, Sparse_Lambda = 10.000, Sparse_Kappa = 1.663
Simulation 5 for mu=0.02, P=15: HMM_BAC = 0.400, Jump_BAC = 0.557, SparseJump_BAC = 0.748, Jump_Lambda = 0.702, Sparse_Lambda = 10.000, Sparse_Kappa 

In [8]:
    # Continuation of the `__main__` body: `final_rows` and
    # `simulation_details` are filled by the simulation loop, so this code
    # raises NameError if it runs in a session where that loop has not run.

    # Convert the per-(mu, P) summary rows to a DataFrame, print it, and
    # save it to CSV
    df_results = pd.DataFrame(final_rows)
    print("\nFinal Summary Results:")
    print(df_results, flush=True)
    df_results.to_csv("simulation_results(corr_n10).csv", index=False)

    # Convert the raw per-simulation details to a DataFrame and preview the
    # first rows. Only the preview is produced here: the CSV export on the
    # last line is currently disabled.
    df_sim_details = pd.DataFrame(simulation_details)
    print("\nRaw Simulation Details (first few rows):")
    print(df_sim_details.head(), flush=True)
    #df_sim_details.to_csv("simulation_simulation_details(corr_n10).csv", index=False)

NameError: name 'final_rows' is not defined

In [10]:
    # Convert the raw per-simulation details to a DataFrame and preview the
    # first rows. `simulation_details` is filled by the simulation loop in
    # the `__main__` body, so this raises NameError if that loop has not run
    # in the current session. The CSV export on the last line is currently
    # disabled.
    df_sim_details = pd.DataFrame(simulation_details)
    print("\nRaw Simulation Details (first few rows):")
    print(df_sim_details.head(), flush=True)
    #df_sim_details.to_csv("simulation_simulation_details(corr_n10).csv", index=False)

NameError: name 'simulation_details' is not defined