In [7]:
#load packages
import numpy as np
import pandas as pd
import dtale
from hmmlearn import hmm
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from scipy.optimize import linear_sum_assignment
from jumpmodels.sparse_jump import SparseJumpModel    # Sparse JM class
from jumpmodels.jump import JumpModel                 # JM class

In [14]:


def simulate_data(T, P, mu, random_state=None):
    """
    Simulate data from a 3-state Gaussian HMM.
    
    Parameters:
        T (int): Number of observations.
        P (int): Total number of features (only first 15 are informative).
        mu (float): Signal magnitude for informative features.
        random_state (int or None): Seed for reproducibility.
        
    Returns:
        X (ndarray): Simulated observations (T x P).
        states (ndarray): True state sequence (length T).
    """
    rng = np.random.default_rng(random_state)
    
    # Transition matrix as given in the study
    transmat = np.array([[0.9903, 0.0047, 0.0050],
                         [0.0157, 0.9666, 0.0177],
                         [0.0284, 0.0300, 0.9416]])
    transmat = transmat / transmat.sum(axis=1, keepdims=True)
    
    # Compute stationary distribution (eigenvector corresponding to eigenvalue 1)
    eigvals, eigvecs = np.linalg.eig(transmat.T)
    stat = np.real(eigvecs[:, np.isclose(eigvals, 1)])
    stat = stat[:, 0]
    stat = stat / np.sum(stat)
    
    # Generate state sequence
    states = np.zeros(T, dtype=int)
    states[0] = rng.choice(np.arange(3), p=stat)
    for t in range(1, T):
        states[t] = rng.choice(np.arange(3), p=transmat[states[t-1]])
    
    # Define state means: state 1 has mean mu for first 15 features,
    # state 2 is zero, and state 3 has mean -mu for first 15 features.
    means = np.zeros((3, P))
    if P >= 15:
        means[0, :15] = mu
        means[2, :15] = -mu
    else:
        means[0, :P] = mu
        means[2, :P] = -mu
    
    # Generate observations: each observation is drawn from N(mean[state], I_P)
    X = np.zeros((T, P))
    for t in range(T):
        X[t] = rng.normal(loc=means[states[t]], scale=1.0, size=P)
    
    return X, states


# Simulate data
T = 500  # Number of observations for display purposes
P = 300   # Number of features for display purposes
mu = 1.0
X, states = simulate_data(T, P, mu, random_state=42)

# Create a DataFrame to display the simulated data and states
df_simulated = pd.DataFrame(X, columns=[f'Feature_{i+1}' for i in range(P)])
df_simulated['State'] = states
df_simulated.index.name = 'Time'

df_simulated.head()



Unnamed: 0_level_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_292,Feature_293,Feature_294,Feature_295,Feature_296,Feature_297,Feature_298,Feature_299,Feature_300,State
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.433215,0.09152,0.580777,-0.056783,-0.170408,-0.779482,0.430301,-0.851537,0.665585,1.085287,...,0.255213,1.350625,0.012053,0.202797,-1.093471,0.396991,0.060386,-1.302652,-0.051197,1
1,-0.07973,1.797561,0.894213,0.011445,0.248787,0.044212,-0.202914,-1.082427,-0.151052,-0.746098,...,-0.229293,-1.592794,-0.912878,0.226786,1.319013,2.809211,-0.586585,1.4353,0.243752,1
2,-0.151248,0.432594,0.061916,0.110396,-0.408333,-1.39811,-1.543625,0.653245,-0.2767,-0.596089,...,0.404855,-1.605271,-0.024297,0.364534,0.556075,0.177261,0.291231,1.473611,1.226034,1
3,-2.867055,-0.317439,-0.164651,-1.752099,0.094183,1.248758,-1.086568,0.336401,-0.915823,-0.671912,...,0.135123,0.450021,-0.227419,-1.615051,0.988532,0.364525,-0.726015,-0.2806,-1.500669,1
4,-1.02541,-0.537811,1.228421,-0.639503,0.640214,-0.538316,0.247948,-0.141396,0.873057,-0.534811,...,0.10469,1.696587,-1.340356,0.769857,-1.893171,0.217273,1.575083,0.488669,0.170625,1




## Align Labels Function

This function aligns predicted labels with true labels using the Hungarian algorithm.



In [9]:
def align_labels(true_labels, pred_labels):
    """
    Align predicted labels with true labels using the Hungarian algorithm.
    
    Returns:
        aligned (ndarray): Predicted labels after optimal permutation.
    """
    D = confusion_matrix(true_labels, pred_labels)
    row_ind, col_ind = linear_sum_assignment(-D)
    mapping = {col: row for row, col in zip(row_ind, col_ind)}
    aligned = np.array([mapping[x] for x in pred_labels])
    return aligned



## Run HMM Function

This function fits a Gaussian HMM to the data using `hmmlearn`.



In [10]:
def run_hmm(X, n_components=3, random_state=None):
    """
    Fit a Gaussian HMM to the data X using hmmlearn.
    
    Parameters:
        X (ndarray): Observations (T x P).
        n_components (int): Number of hidden states.
        random_state (int or None): Seed for reproducibility.
        
    Returns:
        pred_states (ndarray): Inferred state sequence.
    """
    # Let the HMM initialize all parameters by itself (i.e., start probabilities, transition matrix, means, covariances)
    model = hmm.GaussianHMM(n_components=n_components, covariance_type='diag',
                            n_iter=100, random_state=random_state)
    model.fit(X)
    pred_states = model.predict(X)
    return pred_states

# Example usage:
pred_states = run_hmm(X, random_state=42)



## Calculate Balanced Accuracy Function

This function computes the Balanced Accuracy (BAC) after aligning the predicted state labels.



In [11]:
def calculate_bac(true_states, pred_states):
    """
    Compute the Balanced Accuracy (BAC) after aligning the predicted state labels.
    """
    aligned_pred = align_labels(true_states, pred_states)
    bac = balanced_accuracy_score(true_states, aligned_pred)
    return bac



## Main Execution

This section runs the simulation, fits the HMM, and calculates the BAC for different numbers of features.



In [13]:
if __name__ == '__main__':
    # Simulation settings
    T = 500
    mu_values = [0.125, 0.250, 0.375, 0.500]  # Different signal magnitudes
    p_values = [15, 30, 60, 150, 300]  # Different numbers of features
    n_simulations = 10  # Increase to 100 for full replication
    
    results = []
    
    for mu in mu_values:
        for P in p_values:
            bac_hmm_list = []
            bac_jump_normal_list = []
            bac_sparse_jump_list = []
            
            for sim in range(n_simulations):
                # Simulate data and get true state sequence
                X, true_states = simulate_data(T, P, mu, random_state=sim)
                
                # ----- HMM Analysis -----
                pred_states_hmm = run_hmm(X, random_state=sim)
                bac_hmm = calculate_bac(true_states, pred_states_hmm)
                bac_hmm_list.append(bac_hmm)
                
                # ----- Normal (Standard) Jump Model Analysis -----
                # Use the jump penalty value from Nystrup et al.: lambda = 400
                jump_model_normal = JumpModel(n_components=3, jump_penalty=32, cont=False, )
                jump_model_normal.fit(X)
                pred_states_jump_normal = jump_model_normal.labels_
                bac_jump_normal = calculate_bac(true_states, pred_states_jump_normal)
                bac_jump_normal_list.append(bac_jump_normal)
                
                # ----- Sparse Jump Model Analysis -----
                # For sparse jump model, use lambda = 30 and kappa = 17 (i.e., max_feats = 289)
                sparse_jump_model = SparseJumpModel(n_components=3, max_feats=289, jump_penalty=30, random_state=sim)
                sparse_jump_model.fit(X)
                pred_states_sparse_jump = sparse_jump_model.labels_
                bac_sparse_jump = calculate_bac(true_states, pred_states_sparse_jump)
                bac_sparse_jump_list.append(bac_sparse_jump)
            
            mean_bac_hmm = np.mean(bac_hmm_list)
            mean_bac_jump_normal = np.mean(bac_jump_normal_list)
            mean_bac_sparse_jump = np.mean(bac_sparse_jump_list)
            
            results.append({
                'mu': mu, 
                'P': P, 
                'HMM Mean BAC': mean_bac_hmm,
                'Normal Jump Model Mean BAC': mean_bac_jump_normal,
                'Sparse Jump Model Mean BAC': mean_bac_sparse_jump
            })
            print(f"mu = {mu}, P = {P}, HMM BAC = {mean_bac_hmm:.3f}, Normal Jump BAC = {mean_bac_jump_normal:.3f}, Sparse Jump BAC = {mean_bac_sparse_jump:.3f}")
    


mu = 0.125, P = 15, HMM BAC = 0.398, Normal Jump BAC = 0.333, Sparse Jump BAC = 0.333
mu = 0.125, P = 30, HMM BAC = 0.355, Normal Jump BAC = 0.346, Sparse Jump BAC = 0.366
mu = 0.125, P = 60, HMM BAC = 0.363, Normal Jump BAC = 0.412, Sparse Jump BAC = 0.362
mu = 0.125, P = 150, HMM BAC = 0.362, Normal Jump BAC = 0.402, Sparse Jump BAC = 0.352
mu = 0.125, P = 300, HMM BAC = 0.369, Normal Jump BAC = 0.374, Sparse Jump BAC = 0.358
mu = 0.25, P = 15, HMM BAC = 0.541, Normal Jump BAC = 0.469, Sparse Jump BAC = 0.496
mu = 0.25, P = 30, HMM BAC = 0.521, Normal Jump BAC = 0.474, Sparse Jump BAC = 0.497
mu = 0.25, P = 60, HMM BAC = 0.397, Normal Jump BAC = 0.501, Sparse Jump BAC = 0.497
mu = 0.25, P = 150, HMM BAC = 0.356, Normal Jump BAC = 0.453, Sparse Jump BAC = 0.447
mu = 0.25, P = 300, HMM BAC = 0.359, Normal Jump BAC = 0.395, Sparse Jump BAC = 0.442
mu = 0.375, P = 15, HMM BAC = 0.576, Normal Jump BAC = 0.599, Sparse Jump BAC = 0.577
mu = 0.375, P = 30, HMM BAC = 0.551, Normal Jump BAC = 

In [None]:
  df_results = pd.DataFrame(results)
  print(df_results)

      mu    P  HMM Mean BAC  Normal Jump Model Mean BAC  Sparse Jump Model Mean BAC
0   0.25   15      0.541157                    0.469100                    0.496114
1   0.25   30      0.521216                    0.474309                    0.496899
2   0.25   60      0.396980                    0.501286                    0.497270
3   0.25  150      0.355612                    0.453364                    0.446615
4   0.25  300      0.359205                    0.394669                    0.441935
5   0.50   15      0.645824                    0.681246                    0.664919
6   0.50   30      0.558571                    0.669189                    0.754147
7   0.50   60      0.538370                    0.624728                    0.854883
8   0.50  150      0.530867                    0.676857                    0.863040
9   0.50  300      0.547323                    0.597335                    0.858557
10  0.75   15      0.780393                    0.826627                    0