# Simulation Study for Noise Filtering

This is version 0 (v0) of the simulation study comparing the sparse jump model (SJM) with a hidden Markov model (HMM). The aim is to show that the SJM can filter out noisy features through the feature weighting used in its algorithm.


In [54]:
#load packages
import numpy as np
import pandas as pd
from hmmlearn import hmm
from jumpmodels.jump import JumpModel
from jumpmodels.sparse_jump import SparseJumpModel    # Sparse JM class
from scipy import stats
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import balanced_accuracy_score, confusion_matrix


## 1. Data Simulation & Utility Functions
The function `simulate_data(T, P, mu, random_state=None)` simulates data from a 3-state Gaussian HMM.

In [55]:
def simulate_data(T, P, mu, random_state=None, n_informative=15):
    """
    Simulate data from a 3-state Gaussian HMM.

    State 0 has mean +mu and state 2 has mean -mu on the informative features;
    state 1 has mean 0 everywhere. All remaining features are pure N(0, 1) noise
    in every state.

    Parameters:
        T (int): Number of observations.
        P (int): Total number of features.
        mu (float): Signal magnitude for informative features.
        random_state (int or None): Seed for reproducibility.
        n_informative (int): Number of leading features that carry signal.
            Values larger than P are capped at P. Defaults to 15.

    Returns:
        X (ndarray): Simulated observations (T x P).
        states (ndarray): True state sequence (length T).
    """
    rng = np.random.default_rng(random_state)

    # Fixed 3-state transition matrix; rows are re-normalized so that each one
    # sums to exactly 1 despite the rounded entries.
    transmat = np.array([[0.9903, 0.0047, 0.0050],
                         [0.0157, 0.9666, 0.0177],
                         [0.0284, 0.0300, 0.9416]])
    transmat = transmat / transmat.sum(axis=1, keepdims=True)
    state_ids = np.arange(transmat.shape[0])

    # Stationary distribution: left eigenvector for eigenvalue 1, rescaled to
    # sum to 1 (this also removes any arbitrary sign of the eigenvector).
    eigvals, eigvecs = np.linalg.eig(transmat.T)
    stat = np.real(eigvecs[:, np.isclose(eigvals, 1)])[:, 0]
    stat = stat / np.sum(stat)

    # Generate the state sequence: the first state is drawn from the stationary
    # distribution, every later state from the row of the previous state.
    states = np.zeros(T, dtype=int)
    if T > 0:
        states[0] = rng.choice(state_ids, p=stat)
    for t in range(1, T):
        states[t] = rng.choice(state_ids, p=transmat[states[t - 1]])

    # State-dependent means. Slicing truncates automatically when P is smaller
    # than n_informative, so no separate branch is needed.
    n_signal = max(0, min(int(n_informative), P))
    means = np.zeros((state_ids.size, P))
    means[0, :n_signal] = mu
    means[2, :n_signal] = -mu

    # Observations: N(means[state], I_P), drawn for all rows in a single call.
    X = rng.normal(loc=means[states], scale=1.0)

    return X, states

## 2. Aligning Predicted Labels with True Labels Using the Hungarian Algorithm

In [56]:

def align_labels(true_labels, pred_labels):
    """
    Align predicted labels with true labels using the Hungarian algorithm.

    A contingency table of (true label, predicted label) counts is built over
    the sorted union of all label values. The one-to-one relabeling of the
    predictions that maximizes the number of agreeing observations is then
    applied. The mapping goes from label value to label value, so it also works
    when the label values are not the contiguous integers 0..k-1.

    Parameters:
        true_labels (array-like): True labels, one per observation.
        pred_labels (array-like): Predicted labels, same length as true_labels.

    Returns:
        aligned (ndarray): Predicted labels after optimal permutation.

    Raises:
        ValueError: If the two label sequences do not have the same shape.
    """
    true_arr = np.asarray(true_labels)
    pred_arr = np.asarray(pred_labels)
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            "true_labels and pred_labels must have the same shape, got "
            f"{true_arr.shape} and {pred_arr.shape}"
        )
    if pred_arr.size == 0:
        # Nothing to align.
        return pred_arr.copy()

    # Position of every observation's label inside the sorted label union.
    labels = np.union1d(true_arr, pred_arr)
    true_idx = np.searchsorted(labels, true_arr)
    pred_idx = np.searchsorted(labels, pred_arr)

    # counts[i, j] = number of observations with true label i and prediction j.
    counts = np.zeros((labels.size, labels.size), dtype=np.int64)
    np.add.at(counts, (true_idx, pred_idx), 1)

    # Optimal one-to-one assignment maximizing the total agreement.
    row_ind, col_ind = linear_sum_assignment(counts, maximize=True)

    # relabel[j] is the index of the true label assigned to predicted label j.
    relabel = np.empty(labels.size, dtype=np.intp)
    relabel[col_ind] = row_ind
    aligned = labels[relabel[pred_idx]]
    return aligned

## 3. Setting up the function to calculate the BAC

In [57]:
def calculate_bac(true_states, pred_states):
    """
    Return the Balanced Accuracy (BAC) of a predicted state sequence.

    The predicted labels are first permuted with align_labels so that they
    match the true labels as well as possible; the score is then computed on
    the permuted predictions.
    """
    return balanced_accuracy_score(
        true_states,
        align_labels(true_states, pred_states),
    )


## 4. Functions for model formulation

### 4.1 HMM With Nystrup (2021) initialization

In [58]:
def run_hmm(X, n_components=3, random_state=None):
    """
    Fit a diagonal-covariance Gaussian HMM to X and decode the state sequence.

    Initialization:
      - Uniform start probabilities.
      - Transition matrix with 0.95 on the diagonal and the remaining 0.05
        spread evenly over the other states.
      - Covariance prior of 1.0 (for regularization).
      - At most 100 EM iterations.

    Parameters:
        X (ndarray): Data matrix.
        n_components (int): Number of hidden states.
        random_state (int or None): Seed for reproducibility.

    Returns:
        pred_states (ndarray): State sequence predicted by model.predict(X)
            (Viterbi decoding).
    """
    # Sticky transition matrix: off-diagonal entries share the leftover 0.05.
    off_diagonal = 0.05 / (n_components - 1)
    sticky_transmat = np.full((n_components, n_components), off_diagonal)
    np.fill_diagonal(sticky_transmat, 0.95)

    hmm_settings = {
        "n_components": n_components,   # number of hidden states
        "covariance_type": "diag",      # diagonal covariance matrices
        "n_iter": 100,                  # maximum number of EM iterations
        "random_state": random_state,   # seed for reproducibility
        "init_params": "mc",            # let fit() initialize means and covariances only
        "covars_prior": 1.0,            # prior added to the covariance estimates
    }
    gaussian_hmm = hmm.GaussianHMM(**hmm_settings)

    # Start and transition probabilities are set by hand because they are
    # excluded from init_params above.
    gaussian_hmm.startprob_ = np.ones(n_components) / n_components
    gaussian_hmm.transmat_ = sticky_transmat

    gaussian_hmm.fit(X)
    return gaussian_hmm.predict(X)


### 4.2 Normal (Standard) Jump Model with Grid Search over λ

In [59]:
def run_jump_model_grid_search(X, true_states, n_components=3, random_state=None):
    """
    Grid-search the jump penalty (lambda) of the standard jump model.

    14 lambda values, logarithmically spaced from 1e-2 to 1e4, are tried in
    increasing order. For each one a discrete (cont=False) JumpModel is fitted
    with max_iter=10 and its labels are scored against the true states with
    calculate_bac. A low lambda allows frequent state changes; a high lambda
    penalizes switching and yields fewer jumps.

    Note that the best lambda is chosen using the true states, so the returned
    score is the best achievable one on this grid.

    Parameters:
        X (ndarray): Data matrix.
        true_states (ndarray): The true hidden state sequence.
        n_components (int): Number of states.
        random_state (int or None): Seed for reproducibility.

    Returns:
        best_labels (ndarray): Predicted state sequence for the best lambda.
        best_bac (float): Best balanced accuracy achieved.
    """
    top_score = -1
    top_labels = None

    # 14 penalties on a log grid from 0.01 to 10,000.
    for penalty in np.logspace(-2, 4, 14):
        jump_model = JumpModel(
            n_components=n_components,
            jump_penalty=penalty,
            cont=False,
            max_iter=10,
            random_state=random_state,
        )
        jump_model.fit(X)

        candidate = jump_model.labels_
        score = calculate_bac(true_states, candidate)

        # Strict comparison: on ties the earliest (smallest) lambda is kept.
        if score > top_score:
            top_score, top_labels = score, candidate

    return top_labels, top_score

### 4.3 Sparse Jump Model with Grid Search over λ and kappa

In [60]:
def run_sparse_jump_model_grid_search(X, true_states, n_components=3, random_state=None):
    """
    Grid-search the jump penalty and sparsity level of the Sparse Jump Model (SJM).

    The grid consists of 7 lambda values (log-spaced from 1e-1 to 1e2) crossed
    with 14 kappa values (linearly spaced from 1 to sqrt(P)). Feature selection
    in the SJM is controlled through 'max_feats', which is set to kappa**2.
    Every fitted model is scored against the true states with calculate_bac and
    the best-scoring labels are returned.

    Parameters
    ----------
    X : ndarray
        Data matrix of shape (T, P) where T is the number of observations and
        P the number of features.
    true_states : ndarray
        The true hidden state sequence.
    n_components : int, default=3
        Number of hidden states (clusters).
    random_state : int or None, optional
        Seed for reproducibility.

    Returns
    -------
    best_labels : ndarray
        Predicted state sequence for the best combination of parameters.
    best_bac : float
        The best balanced accuracy achieved.

    Notes
    -----
    Settings passed to SparseJumpModel:

    - jump_penalty : the current lambda. Per the package documentation the SJM
      scales this penalty internally by 1/sqrt(n_features).
    - cont=False : discrete rather than continuous jumps.
    - max_feats : kappa**2, controlling how many features are included.
    - max_iter=30 : iteration cap for the feature-selection (coordinate
      descent) loop.

    All other parameters (tol_w, max_iter_jm, tol_jm, n_init_jm, verbose) keep
    their default values.
    """
    n_features = X.shape[1]
    penalty_grid = np.logspace(-1, 2, 7)
    kappa_grid = np.linspace(1, np.sqrt(n_features), 14)

    top_score = -1
    top_labels = None

    # Lambda is the outer loop and kappa the inner one; with the strict
    # comparison below, ties are resolved in favour of the earliest combination.
    for penalty in penalty_grid:
        for kappa in kappa_grid:
            sparse_model = SparseJumpModel(
                n_components=n_components,
                jump_penalty=penalty,
                cont=False,
                max_feats=kappa**2,
                max_iter=30,
                random_state=random_state,
            )
            sparse_model.fit(X)

            candidate = sparse_model.labels_
            score = calculate_bac(true_states, candidate)
            if score > top_score:
                top_score, top_labels = score, candidate

    return top_labels, top_score


## 5. Main Execution
We run each of the three models in its own section and then combine the results at the end.


In [None]:
if __name__ == "__main__":

    # --- Settings ---
    # T: Number of time points/observations.
    # mu_values: List of signal magnitudes for the informative features.
    # p_values: List of numbers of features (dimensions) to simulate.
    # n_simulations: Number of simulation runs per (mu, P) combination.
    T = 500
    mu_values = [0.25, 0.5, 0.75, 1.0]   # Different signal magnitudes
    p_values = [15, 30, 60, 150, 300]    # Different numbers of features
    n_simulations = 10                   # More simulations yield more robust results

    # final_rows stores one summary dictionary per (mu, P) combination.
    final_rows = []

    # Loop over each combination of signal magnitude and number of features.
    for mu in mu_values:
        for P in p_values:

            # Balanced Accuracy (BAC) scores of each model across simulation runs.
            hmm_bac_list = []      # Gaussian HMM
            jump_bac_list = []     # Standard Jump Model
            sparse_bac_list = []   # Sparse Jump Model

            for sim in range(n_simulations):
                # The run index doubles as the seed, so all three models see the
                # same data set within a run and results are reproducible.
                X, true_states = simulate_data(T, P, mu, random_state=sim)

                # ----------------- 1) Gaussian HMM -------------------
                pred_hmm = run_hmm(X, n_components=3, random_state=sim)
                hmm_bac_list.append(calculate_bac(true_states, pred_hmm))

                # ----------------- 2) Normal Jump Model --------------
                # Best BAC over the lambda grid.
                _, bac_jump = run_jump_model_grid_search(X, true_states,
                                                         n_components=3,
                                                         random_state=sim)
                jump_bac_list.append(bac_jump)

                # ----------------- 3) Sparse Jump Model --------------
                # Best BAC over the (lambda, kappa) grid.
                _, bac_sparse = run_sparse_jump_model_grid_search(X, true_states,
                                                                  n_components=3,
                                                                  random_state=sim)
                sparse_bac_list.append(bac_sparse)

            # Mean and (population, ddof=0) standard deviation of the BAC scores.
            hmm_mean, hmm_std = np.mean(hmm_bac_list), np.std(hmm_bac_list)
            jump_mean, jump_std = np.mean(jump_bac_list), np.std(jump_bac_list)
            sparse_mean, sparse_std = np.mean(sparse_bac_list), np.std(sparse_bac_list)

            # Paired t-test between the standard and the sparse jump model; the
            # runs are paired because both models are fitted to the same data.
            # The null hypothesis is that their mean BAC scores are equal.
            # If both score lists are identical the p-value is NaN, in which
            # case the comparison below is False and nothing is marked.
            tstat, pval = stats.ttest_rel(jump_bac_list, sparse_bac_list)

            # Mark the sparse result in bold when it is significantly higher (p < 0.05).
            if (pval < 0.05) and (sparse_mean > jump_mean):
                sparse_str = f"**{sparse_mean:.2f} ± {sparse_std:.2f}**"
            else:
                sparse_str = f"{sparse_mean:.2f} ± {sparse_std:.2f}"

            # Store the results for this (mu, P) combination.
            final_rows.append({
                "mu": mu,
                "P": P,
                "HMM (mean ± std)": f"{hmm_mean:.2f} ± {hmm_std:.2f}",
                "Jump (mean ± std)": f"{jump_mean:.2f} ± {jump_std:.2f}",
                "Sparse Jump (mean ± std)": sparse_str,
                "p-value (Jump vs Sparse)": f"{pval:.3g}"
            })

            # Progress message once this combination is finished.
            print(f"Finished analysis for mu = {mu}, P = {P}")
    


In [None]:
    # Turn the summary dictionaries collected in final_rows (one per (mu, P)
    # combination) into a pandas DataFrame and print the resulting table.
    df_results = pd.DataFrame(final_rows)
    print(df_results)