In [4]:
#load packages
import numpy as np
import pandas as pd
import dtale
from hmmlearn import hmm
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from scipy.optimize import linear_sum_assignment
from jumpmodels.sparse_jump import SparseJumpModel    # Sparse JM class
from jumpmodels.jump import JumpModel                 # JM class

In [6]:

def simulate_data_correlated(T, P, mu, random_state=None):
    """
    Simulate data from a 2-state Gaussian HMM with correlated noise features.
    
    Parameters:
        T (int): Number of observations.
        P (int): Total number of features. The first 15 features are informative,
                 the remaining (if any) are noise.
        mu (float): Signal magnitude for informative features.
        random_state (int or None): Seed for reproducibility.
        
    Returns:
        X (ndarray): Simulated observations (T x P).
        states (ndarray): True state sequence (length T).
    """
    rng = np.random.default_rng(random_state)
    
    # Transition matrix for 2-state HMM with high persistence.
    transmat = np.array([[0.9979, 0.0021],
                         [0.0120, 0.9880]])
    
    # Compute stationary distribution
    eigvals, eigvecs = np.linalg.eig(transmat.T)
    stat = np.real(eigvecs[:, np.isclose(eigvals, 1)])
    stat = stat[:, 0]
    stat = stat / np.sum(stat)
    
    # Generate state sequence
    states = np.zeros(T, dtype=int)
    states[0] = rng.choice(np.arange(2), p=stat)
    for t in range(1, T):
        states[t] = rng.choice(np.arange(2), p=transmat[states[t-1]])
    
    # Define means for each state
    means = np.zeros((2, P))
    if P >= 15:
        means[0, :15] = -mu
        means[1, :15] = mu
    else:
        means[0, :P] = -mu
        means[1, :P] = mu

    # Handle correlated noise for features beyond the first 15
    informative = 15
    if P > informative:
        num_noise = P - informative
        # Covariance: 1 on diag, 0.1 off-diagonal
        sigma = np.full((num_noise, num_noise), 0.1)
        np.fill_diagonal(sigma, 1.0)
        C = np.linalg.cholesky(sigma)
    else:
        C = None
    
    # Generate observations
    X = np.zeros((T, P))
    for t in range(T):
        # Informative features
        X[t, :min(P, informative)] = rng.normal(
            loc=means[states[t], :min(P, informative)],
            scale=1.0,
            size=min(P, informative)
        )
        # Correlated noise if P > 15
        if P > informative:
            noise_indep = rng.normal(loc=0.0, scale=1.0, size=num_noise)
            noise_corr = C @ noise_indep
            X[t, informative:] = noise_corr
            
    return X, states

# Simulate data
T = 500  
P = 50
mu = 1.0
X, states = simulate_data_correlated(T, P, mu, random_state=1)

# Create a DataFrame
df_simulated = pd.DataFrame(X, columns=[f'Feature_{i+1}' for i in range(P)])
df_simulated['State'] = states
df_simulated.index.name = 'Time'

print(df_simulated.head())
print(df_simulated['State'].value_counts())


      Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Feature_6  Feature_7  Feature_8  Feature_9  Feature_10  ...  Feature_42  Feature_43  Feature_44  Feature_45  Feature_46  Feature_47  Feature_48  Feature_49  Feature_50  State
Time                                                                                                                 ...                                                                                                                   
0     -0.827144  -1.379214  -0.438970  -3.135831  -0.767627  -0.971874  -2.370340   1.175598  -2.387413   -2.077520  ...   -1.312379    0.398908    1.669743    1.065854    0.897108   -0.011932    0.913627   -0.935753    0.690793      0
1     -1.087248   0.130906  -0.533989  -2.089880  -0.872007   0.218221  -2.129234  -1.559140  -1.768673   -2.495646  ...    0.974836   -0.701287   -2.469362   -0.933327    0.149726   -2.896892   -0.353552   -0.337576   -1.395112      0
2     -2.482503  -1.462462  -1.547100   0.259919  -0.625



## Align Labels Function

This function aligns predicted labels with true labels using the Hungarian algorithm.



In [7]:
def align_labels(true_labels, pred_labels):
    """
    Align predicted labels with true labels using the Hungarian algorithm.
    
    Returns:
        aligned (ndarray): Predicted labels after optimal permutation.
    """
    D = confusion_matrix(true_labels, pred_labels)
    row_ind, col_ind = linear_sum_assignment(-D)
    mapping = {col: row for row, col in zip(row_ind, col_ind)}
    aligned = np.array([mapping[x] for x in pred_labels])
    return aligned



## Run HMM Function

This function fits a Gaussian HMM to the data using `hmmlearn`.



In [8]:
def run_hmm(X, n_components=2, random_state=None):
    """
    Fit a Gaussian HMM to the data X using hmmlearn.
    
    Parameters:
        X (ndarray): Observations (T x P).
        n_components (int): Number of hidden states.
        random_state (int or None): Seed for reproducibility.
        
    Returns:
        pred_states (ndarray): Inferred state sequence.
    """
    # Let the HMM initialize all parameters by itself (i.e., start probabilities, transition matrix, means, covariances)
    model = hmm.GaussianHMM(n_components=n_components, covariance_type='diag',
                            n_iter=100, random_state=random_state)
    model.fit(X)
    pred_states = model.predict(X)
    return pred_states

# Example usage:
pred_states = run_hmm(X, random_state=42)



## Calculate Balanced Accuracy Function

This function computes the Balanced Accuracy (BAC) after aligning the predicted state labels.



In [9]:
def calculate_bac(true_states, pred_states):
    """
    Compute the Balanced Accuracy (BAC) after aligning the predicted state labels.
    """
    aligned_pred = align_labels(true_states, pred_states)
    bac = balanced_accuracy_score(true_states, aligned_pred)
    return bac



## Main Execution

This section runs the simulation, fits the HMM, and calculates the BAC for different numbers of features.



In [None]:
if __name__ == '__main__':
    # Simulation settings
    T = 500
    mu_values = [0.125, 0.250, 0.375, 0.5, 1]  # Different signal magnitudes
    p_values = [15, 30, 60, 150, 300]  # Different numbers of features
    n_simulations = 10  # Increase to 100 for full replication
    
    results = []
    
    for mu in mu_values:
        for P in p_values:
            bac_hmm_list = []
            bac_jump_normal_list = []
            bac_sparse_jump_list = []
            
            for sim in range(n_simulations):
                # Simulate data and get true state sequence
                X, true_states = simulate_data_correlated(T, P, mu, random_state=sim)
                
                # ----- HMM Analysis -----
                pred_states_hmm = run_hmm(X, random_state=sim)
                bac_hmm = calculate_bac(true_states, pred_states_hmm)
                bac_hmm_list.append(bac_hmm)
                
                # ----- Normal (Standard) Jump Model Analysis -----
                # Use the jump penalty value from Nystrup et al.: lambda = 400
                jump_model_normal = JumpModel(n_components=2, jump_penalty=30, cont=False, )
                jump_model_normal.fit(X)
                pred_states_jump_normal = jump_model_normal.labels_
                bac_jump_normal = calculate_bac(true_states, pred_states_jump_normal)
                bac_jump_normal_list.append(bac_jump_normal)
                
                # ----- Sparse Jump Model Analysis -----
                # For sparse jump model, use lambda = 30 and kappa = 17 (i.e., max_feats = 300)
                sparse_jump_model = SparseJumpModel(n_components=2, max_feats=50, jump_penalty=30, random_state=sim)
                sparse_jump_model.fit(X)
                pred_states_sparse_jump = sparse_jump_model.labels_
                bac_sparse_jump = calculate_bac(true_states, pred_states_sparse_jump)
                bac_sparse_jump_list.append(bac_sparse_jump)
            
            mean_bac_hmm = np.mean(bac_hmm_list)
            mean_bac_jump_normal = np.mean(bac_jump_normal_list)
            mean_bac_sparse_jump = np.mean(bac_sparse_jump_list)
            
            results.append({
                'mu': mu, 
                'P': P, 
                'HMM Mean BAC': mean_bac_hmm,
                'Normal Jump Model Mean BAC': mean_bac_jump_normal,
                'Sparse Jump Model Mean BAC': mean_bac_sparse_jump
            })
            print(f"mu = {mu}, P = {P}, HMM BAC = {mean_bac_hmm:.3f}, Normal Jump BAC = {mean_bac_jump_normal:.3f}, Sparse Jump BAC = {mean_bac_sparse_jump:.3f}")
    



y_pred contains classes not in y_true


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.



mu = 0.125, P = 15, HMM BAC = 0.594, Normal Jump BAC = 0.598, Sparse Jump BAC = 0.687



y_pred contains classes not in y_true


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.



mu = 0.125, P = 30, HMM BAC = 0.512, Normal Jump BAC = 0.622, Sparse Jump BAC = 0.638



y_pred contains classes not in y_true


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.



mu = 0.125, P = 60, HMM BAC = 0.524, Normal Jump BAC = 0.700, Sparse Jump BAC = 0.680



y_pred contains classes not in y_true


y_pred contains classes not in y_true


y_pred contains classes not in y_true



mu = 0.125, P = 150, HMM BAC = 0.506, Normal Jump BAC = 0.592, Sparse Jump BAC = 0.553



y_pred contains classes not in y_true


y_pred contains classes not in y_true


y_pred contains classes not in y_true



mu = 0.125, P = 300, HMM BAC = 0.520, Normal Jump BAC = 0.556, Sparse Jump BAC = 0.591



y_pred contains classes not in y_true


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.



mu = 0.25, P = 15, HMM BAC = 0.794, Normal Jump BAC = 0.898, Sparse Jump BAC = 0.898



y_pred contains classes not in y_true


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.



mu = 0.25, P = 30, HMM BAC = 0.533, Normal Jump BAC = 0.798, Sparse Jump BAC = 0.891



y_pred contains classes not in y_true


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.



mu = 0.25, P = 60, HMM BAC = 0.525, Normal Jump BAC = 0.822, Sparse Jump BAC = 0.872



y_pred contains classes not in y_true


y_pred contains classes not in y_true


y_pred contains classes not in y_true



mu = 0.25, P = 150, HMM BAC = 0.505, Normal Jump BAC = 0.740, Sparse Jump BAC = 0.704



y_pred contains classes not in y_true


y_pred contains classes not in y_true


y_pred contains classes not in y_true



mu = 0.25, P = 300, HMM BAC = 0.520, Normal Jump BAC = 0.563, Sparse Jump BAC = 0.593



y_pred contains classes not in y_true


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.



mu = 0.375, P = 15, HMM BAC = 0.844, Normal Jump BAC = 0.929, Sparse Jump BAC = 0.930



y_pred contains classes not in y_true


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.



mu = 0.375, P = 30, HMM BAC = 0.778, Normal Jump BAC = 0.873, Sparse Jump BAC = 0.935



y_pred contains classes not in y_true


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.



mu = 0.375, P = 60, HMM BAC = 0.527, Normal Jump BAC = 0.810, Sparse Jump BAC = 0.924



y_pred contains classes not in y_true


y_pred contains classes not in y_true


y_pred contains classes not in y_true



mu = 0.375, P = 150, HMM BAC = 0.505, Normal Jump BAC = 0.764, Sparse Jump BAC = 0.816



y_pred contains classes not in y_true


y_pred contains classes not in y_true


y_pred contains classes not in y_true



mu = 0.375, P = 300, HMM BAC = 0.520, Normal Jump BAC = 0.560, Sparse Jump BAC = 0.588



y_pred contains classes not in y_true


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.



mu = 0.5, P = 15, HMM BAC = 0.906, Normal Jump BAC = 0.950, Sparse Jump BAC = 1.000



y_pred contains classes not in y_true


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.



mu = 0.5, P = 30, HMM BAC = 0.778, Normal Jump BAC = 0.994, Sparse Jump BAC = 0.900



y_pred contains classes not in y_true


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.



mu = 0.5, P = 60, HMM BAC = 0.573, Normal Jump BAC = 0.872, Sparse Jump BAC = 0.932



y_pred contains classes not in y_true


y_pred contains classes not in y_true


y_pred contains classes not in y_true



mu = 0.5, P = 150, HMM BAC = 0.505, Normal Jump BAC = 0.839, Sparse Jump BAC = 0.816



y_pred contains classes not in y_true


y_pred contains classes not in y_true


y_pred contains classes not in y_true



mu = 0.5, P = 300, HMM BAC = 0.520, Normal Jump BAC = 0.731, Sparse Jump BAC = 0.587



y_pred contains classes not in y_true


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.



mu = 1, P = 15, HMM BAC = 0.965, Normal Jump BAC = 0.950, Sparse Jump BAC = 1.000



y_pred contains classes not in y_true


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.



mu = 1, P = 30, HMM BAC = 0.926, Normal Jump BAC = 1.000, Sparse Jump BAC = 1.000



y_pred contains classes not in y_true


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.


A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.



mu = 1, P = 60, HMM BAC = 0.801, Normal Jump BAC = 0.975, Sparse Jump BAC = 1.000



y_pred contains classes not in y_true


y_pred contains classes not in y_true


y_pred contains classes not in y_true



mu = 1, P = 150, HMM BAC = 0.652, Normal Jump BAC = 0.943, Sparse Jump BAC = 0.998



y_pred contains classes not in y_true


y_pred contains classes not in y_true


y_pred contains classes not in y_true



mu = 1, P = 300, HMM BAC = 0.523, Normal Jump BAC = 0.802, Sparse Jump BAC = 0.933


In [14]:
  df_results = pd.DataFrame(results)
  print(df_results)

       mu    P  HMM Mean BAC  Normal Jump Model Mean BAC  Sparse Jump Model Mean BAC
0   0.125   15      0.594136                    0.597981                    0.686885
1   0.125   30      0.511903                    0.622417                    0.637890
2   0.125   60      0.523670                    0.699774                    0.679851
3   0.125  150      0.505938                    0.592089                    0.552995
4   0.125  300      0.520356                    0.555793                    0.590972
5   0.250   15      0.794359                    0.898283                    0.897662
6   0.250   30      0.532919                    0.798348                    0.891202
7   0.250   60      0.524716                    0.821623                    0.871799
8   0.250  150      0.504599                    0.739807                    0.704121
9   0.250  300      0.520313                    0.562854                    0.592684
10  0.375   15      0.844187                    0.929332         