In [19]:
#load packages
import numpy as np
import pandas as pd
import dtale
from hmmlearn import hmm
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from scipy.optimize import linear_sum_assignment
from jumpmodels.sparse_jump import SparseJumpModel    # Sparse JM class
from jumpmodels.jump import JumpModel             
    # JM class

In [None]:
def simulate_data(T, P, mu, random_state=None):
    """
    Simulate data from a 2-state Gaussian HMM.
    
    Parameters:
        T (int): Number of observations.
        P (int): Total number of features (only first 15 are informative).
        mu (float): Signal magnitude for informative features.
        random_state (int or None): Seed for reproducibility.
        
    Returns:
        X (ndarray): Simulated observations (T x P).
        states (ndarray): True state sequence (length T).
    """
    rng = np.random.default_rng(random_state)
    
    # Transition matrix for 2-state HMM with higher switching probability.
    transmat = np.array([[0.9629, 0.0371],
                         [0.2101, 0.7899]])
    
    # Compute stationary distribution (eigenvector corresponding to eigenvalue 1)
    eigvals, eigvecs = np.linalg.eig(transmat.T)
    stat = np.real(eigvecs[:, np.isclose(eigvals, 1)])
    if stat.shape[1] > 0:
        stat = stat[:, 0]
        stat = stat / np.sum(stat)
    else:
        raise ValueError("Eigenvector corresponding to eigenvalue 1 not found.")
    
    # Generate state sequence
    states = np.zeros(T, dtype=int)
    states[0] = rng.choice(np.arange(2), p=stat)
    for t in range(1, T):
        states[t] = rng.choice(np.arange(2), p=transmat[states[t-1]])
    
    # Define state means: state 0 has mean mu for first 15 features,
    # state 1 has mean -mu for first 15 features.
    means = np.zeros((2, P))
    if P >= 15:
        means[0, :15] = -mu
        means[1, :15] = mu
    else:
        means[0, :P] = -mu
        means[1, :P] = mu
    
    # Generate observations: each observation is drawn from N(mean[state], I_P)
    X = np.zeros((T, P))
    for t in range(T):
        X[t] = rng.normal(loc=means[states[t]], scale=1.0, size=P)
    
    return X, states

# Simulate data
T = 500  # Number of observations for display purposes
P = 50   # Number of features for display purposes  
mu = 1.0
X, states = simulate_data(T, P, mu, random_state=2)

# Create a DataFrame to display the simulated data and states
df_simulated = pd.DataFrame(X, columns=[f'Feature_{i+1}' for i in range(P)])
df_simulated['State'] = states
df_simulated.index.name = 'Time'


df_simulated.head()


df_simulated['State'].value_counts()


State
0    441
1     59
Name: count, dtype: int64



## Align Labels Function

This function aligns predicted labels with true labels using the Hungarian algorithm.



In [21]:

df_simulated.head()

Unnamed: 0_level_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_42,Feature_43,Feature_44,Feature_45,Feature_46,Feature_47,Feature_48,Feature_49,Feature_50,State
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-1.36361,-2.242339,-0.373653,-1.803251,-1.765829,-1.226271,-0.526822,-3.413184,-2.121454,-0.824411,...,-1.638939,1.474097,0.31631,1.260773,1.254822,-0.207901,0.341962,0.736665,0.769613,0
1,-1.163919,-1.618185,-0.964713,-1.801543,-1.555934,-3.030179,-0.916357,-2.026891,-1.55584,-1.0527,...,-0.16605,-0.103035,0.493468,0.230938,-0.561804,-1.289163,1.234852,1.417302,-0.454445,0
2,-0.438586,-2.526105,-2.562215,-1.269885,0.471682,-2.046551,-1.834124,0.607595,-2.893448,-0.353778,...,1.687785,-0.442674,-1.399585,-0.447571,-1.157259,0.157904,-0.419365,-1.057424,0.601766,0
3,-1.643434,-0.276239,0.260984,-1.361953,-1.646514,-0.800704,-0.10855,-1.002793,-0.860044,-1.026891,...,-0.061408,-0.393158,1.301048,0.160116,-0.802095,2.400545,-0.500117,0.472446,0.676232,0
4,-0.215678,-0.827084,-0.413326,-1.067902,-2.419182,-1.219811,-1.198412,-1.17772,-0.863908,-1.624703,...,-0.632771,-0.254323,-0.592744,-2.342922,1.077409,-0.303473,-0.949637,0.306412,-0.662438,0


In [22]:
def align_labels(true_labels, pred_labels):
    """
    Align predicted labels with true labels using the Hungarian algorithm.
    
    Returns:
        aligned (ndarray): Predicted labels after optimal permutation.
    """
    D = confusion_matrix(true_labels, pred_labels)
    row_ind, col_ind = linear_sum_assignment(-D)
    mapping = {col: row for row, col in zip(row_ind, col_ind)}
    aligned = np.array([mapping[x] for x in pred_labels])
    return aligned



## Run HMM Function

This function fits a Gaussian HMM to the data using `hmmlearn`.



In [23]:
def run_hmm(X, n_components=2, random_state=None):
    """
    Fit a Gaussian HMM to the data X using hmmlearn.
    
    Parameters:
        X (ndarray): Observations (T x P).
        n_components (int): Number of hidden states.
        random_state (int or None): Seed for reproducibility.
        
    Returns:
        pred_states (ndarray): Inferred state sequence.
    """
    # Let the HMM initialize all parameters by itself (i.e., start probabilities, transition matrix, means, covariances)
    model = hmm.GaussianHMM(n_components=n_components, covariance_type='diag',
                            n_iter=100, random_state=random_state)
    model.fit(X)
    pred_states = model.predict(X)
    return pred_states

# Example usage:
pred_states = run_hmm(X, random_state=42)

Model is not converging.  Current: -35501.36868657541 is not greater than -35501.36868655666. Delta is -1.875014277175069e-08




## Calculate Balanced Accuracy Function

This function computes the Balanced Accuracy (BAC) after aligning the predicted state labels.



In [24]:
def calculate_bac(true_states, pred_states):
    """
    Compute the Balanced Accuracy (BAC) after aligning the predicted state labels.
    """
    aligned_pred = align_labels(true_states, pred_states)
    bac = balanced_accuracy_score(true_states, aligned_pred)
    return bac



## Main Execution

This section runs the simulation, fits the HMM, and calculates the BAC for different numbers of features.



In [25]:
if __name__ == '__main__':
    # Simulation settings
    T = 500
    mu_values = [0.01, 0.05, 0.1, 0.25, 0.5]  # Different signal magnitudes
    p_values = [15, 30, 60, 150, 300]  # Different numbers of features
    n_simulations = 10  # Increase to 100 for full replication
    
    results = []
    
    for mu in mu_values:
        for P in p_values:
            bac_hmm_list = []
            bac_jump_normal_list = []
            bac_sparse_jump_list = []
            
            for sim in range(n_simulations):
                # Simulate data and get true state sequence
                X, true_states = simulate_data(T, P, mu, random_state=sim)
                
                # ----- HMM Analysis -----
                pred_states_hmm = run_hmm(X, random_state=sim)
                bac_hmm = calculate_bac(true_states, pred_states_hmm)
                bac_hmm_list.append(bac_hmm)
                
                # ----- Normal (Standard) Jump Model Analysis -----
                # Use the jump penalty value from Nystrup et al.: lambda = 400
                jump_model_normal = JumpModel(n_components=2, jump_penalty=8, cont=False, )
                jump_model_normal.fit(X)
                pred_states_jump_normal = jump_model_normal.labels_
                bac_jump_normal = calculate_bac(true_states, pred_states_jump_normal)
                bac_jump_normal_list.append(bac_jump_normal)
                
                # ----- Sparse Jump Model Analysis -----
                # For sparse jump model, use lambda = 30 and kappa = 17 (i.e., max_feats = 300)
                sparse_jump_model = SparseJumpModel(n_components=2, max_feats=P, jump_penalty=8, random_state=sim)
                sparse_jump_model.fit(X)
                pred_states_sparse_jump = sparse_jump_model.labels_
                bac_sparse_jump = calculate_bac(true_states, pred_states_sparse_jump)
                bac_sparse_jump_list.append(bac_sparse_jump)
            
            mean_bac_hmm = np.mean(bac_hmm_list)
            mean_bac_jump_normal = np.mean(bac_jump_normal_list)
            mean_bac_sparse_jump = np.mean(bac_sparse_jump_list)
            
            results.append({
                'mu': mu, 
                'P': P, 
                'HMM Mean BAC': mean_bac_hmm,
                'Normal Jump Model Mean BAC': mean_bac_jump_normal,
                'Sparse Jump Model Mean BAC': mean_bac_sparse_jump
            })
            print(f"mu = {mu}, P = {P}, HMM BAC = {mean_bac_hmm:.3f}, Normal Jump BAC = {mean_bac_jump_normal:.3f}, Sparse Jump BAC = {mean_bac_sparse_jump:.3f}")
    


mu = 0.01, P = 15, HMM BAC = 0.503, Normal Jump BAC = 0.503, Sparse Jump BAC = 0.509
mu = 0.01, P = 30, HMM BAC = 0.534, Normal Jump BAC = 0.528, Sparse Jump BAC = 0.485
mu = 0.01, P = 60, HMM BAC = 0.488, Normal Jump BAC = 0.501, Sparse Jump BAC = 0.524
mu = 0.01, P = 150, HMM BAC = 0.515, Normal Jump BAC = 0.501, Sparse Jump BAC = 0.519
mu = 0.01, P = 300, HMM BAC = 0.504, Normal Jump BAC = 0.499, Sparse Jump BAC = 0.488
mu = 0.05, P = 15, HMM BAC = 0.507, Normal Jump BAC = 0.501, Sparse Jump BAC = 0.509
mu = 0.05, P = 30, HMM BAC = 0.492, Normal Jump BAC = 0.524, Sparse Jump BAC = 0.504
mu = 0.05, P = 60, HMM BAC = 0.495, Normal Jump BAC = 0.501, Sparse Jump BAC = 0.522
mu = 0.05, P = 150, HMM BAC = 0.519, Normal Jump BAC = 0.505, Sparse Jump BAC = 0.528
mu = 0.05, P = 300, HMM BAC = 0.495, Normal Jump BAC = 0.496, Sparse Jump BAC = 0.491
mu = 0.1, P = 15, HMM BAC = 0.485, Normal Jump BAC = 0.493, Sparse Jump BAC = 0.537
mu = 0.1, P = 30, HMM BAC = 0.521, Normal Jump BAC = 0.526, Sp

In [26]:
  df_results = pd.DataFrame(results)
  print(df_results)

      mu    P  HMM Mean BAC  Normal Jump Model Mean BAC  Sparse Jump Model Mean BAC
0   0.01   15      0.503402                    0.503172                    0.508603
1   0.01   30      0.533988                    0.528468                    0.485268
2   0.01   60      0.488106                    0.500704                    0.524266
3   0.01  150      0.515233                    0.500899                    0.518607
4   0.01  300      0.503910                    0.498832                    0.487641
5   0.05   15      0.506915                    0.500524                    0.508860
6   0.05   30      0.491564                    0.524083                    0.504205
7   0.05   60      0.495353                    0.500780                    0.522028
8   0.05  150      0.518800                    0.505010                    0.528093
9   0.05  300      0.495246                    0.496008                    0.490567
10  0.10   15      0.485131                    0.492851                    0