# Simulation Study for Noise Filtering

This is version 0 (v0) of a simulation study comparing the sparse jump model (SJM) with a hidden Markov model (HMM). The goal is to show that the SJM is able to filter out noisy data through the feature weighting used in its algorithm.


In [3]:
#Importing packages
import numpy as np
import pandas as pd

from hmmlearn import hmm
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from scipy.optimize import linear_sum_assignment
from scipy import stats

# Jump model and sparse jump model classes from the jumpmodels package.
from jumpmodels.jump import JumpModel
from jumpmodels.sparse_jump import SparseJumpModel

In [6]:
# ## 1. Data Simulation & Utility Functions

def simulate_data(T, P, mu, random_state=None, n_informative=15):
    """
    Simulate data from a 3-state Gaussian HMM.

    On the informative (leading) features, state 0 has mean +mu, state 1 has
    mean 0 and state 2 has mean -mu. All remaining features have mean 0 in
    every state. Observations are drawn with unit variance, independently
    across features.

    Parameters:
        T (int): Number of observations. T = 0 yields empty arrays.
        P (int): Total number of features.
        mu (float): Signal magnitude for informative features.
        random_state (int or None): Seed for reproducibility.
        n_informative (int): Number of leading features that carry signal.
            Values larger than P are clipped to P. Defaults to 15.

    Returns:
        X (ndarray): Simulated observations (T x P).
        states (ndarray): True state sequence (length T).

    Raises:
        ValueError: If n_informative is negative.
    """
    if n_informative < 0:
        raise ValueError("n_informative must be non-negative")

    rng = np.random.default_rng(random_state)

    # Fixed 3-state transition matrix; rows are re-normalised so each one is
    # an exact probability vector even after rounding of the literals.
    transmat = np.array([[0.9903, 0.0047, 0.0050],
                         [0.0157, 0.9666, 0.0177],
                         [0.0284, 0.0300, 0.9416]])
    transmat = transmat / transmat.sum(axis=1, keepdims=True)
    n_states = transmat.shape[0]

    # Stationary distribution: left eigenvector for the eigenvalue closest
    # to 1, rescaled to sum to one. Picking the closest eigenvalue avoids an
    # IndexError if a tolerance-based mask were to come back empty.
    eigvals, eigvecs = np.linalg.eig(transmat.T)
    stat = np.real(eigvecs[:, np.argmin(np.abs(eigvals - 1))])
    stat = stat / np.sum(stat)

    # Generate the state path: initial state from the stationary
    # distribution, then one transition draw per step.
    states = np.zeros(T, dtype=int)
    if T > 0:  # guard: indexing states[0] on an empty array would raise
        state_ids = np.arange(n_states)
        states[0] = rng.choice(state_ids, p=stat)
        for t in range(1, T):
            states[t] = rng.choice(state_ids, p=transmat[states[t - 1]])

    # State-dependent means. Slicing clips at P, so no separate branch is
    # needed when P < n_informative.
    means = np.zeros((n_states, P))
    means[0, :n_informative] = mu
    means[2, :n_informative] = -mu

    # Observations N(means[state], I_P), drawn in a single vectorised call.
    X = rng.normal(loc=means[states], scale=1.0)

    return X, states

def align_labels(true_labels, pred_labels):
    """
    Align predicted labels with true labels using the Hungarian algorithm.

    A contingency table is built over the sorted union of all label values
    appearing in either input, and the one-to-one relabelling of predicted
    labels that maximises the number of agreements is applied. Labels may be
    any values that np.unique can sort (they do not have to be 0..K-1).

    Parameters:
        true_labels (array-like): Reference labels (1-D).
        pred_labels (array-like): Predicted labels, same length.

    Returns:
        aligned (ndarray): Predicted labels after optimal permutation.

    Raises:
        ValueError: If the two inputs have different lengths.
    """
    true_arr = np.asarray(true_labels).ravel()
    pred_arr = np.asarray(pred_labels).ravel()
    if true_arr.size != pred_arr.size:
        raise ValueError(
            "true_labels and pred_labels must have the same length "
            f"(got {true_arr.size} and {pred_arr.size})"
        )
    if pred_arr.size == 0:
        # Nothing to align.
        return pred_arr.copy()

    # Encode both sequences as positions in the sorted label union so the
    # table indices can be translated back to label values afterwards.
    labels, codes = np.unique(
        np.concatenate([true_arr, pred_arr]), return_inverse=True
    )
    codes = codes.ravel()
    n_obs = true_arr.size
    true_idx, pred_idx = codes[:n_obs], codes[n_obs:]

    # Contingency table: rows = true label position, cols = predicted.
    n_labels = labels.size
    D = np.zeros((n_labels, n_labels), dtype=np.int64)
    np.add.at(D, (true_idx, pred_idx), 1)

    # Maximise total agreement (minimise the negated counts).
    row_ind, col_ind = linear_sum_assignment(-D)

    # col -> row permutation over label positions; D is square, so every
    # column is assigned. Map back to label values, not matrix indices.
    perm = np.empty(n_labels, dtype=np.intp)
    perm[col_ind] = row_ind
    aligned = labels[perm[pred_idx]]
    return aligned

# Simulation settings:
#   T            - number of observations
#   P            - total number of features
#   mu           - signal magnitude for informative features
#   random_state - seed passed to simulate_data
T, P, mu = 500, 50, 1.0
random_state = 42

# Draw one simulated data set together with its true state sequence.
X, states = simulate_data(T, P, mu, random_state)

# Tabulate the features (columns Feature_1 .. Feature_P) and append the
# true state as the last column for inspection.
df_simulated = pd.DataFrame(
    X,
    columns=[f'Feature_{k}' for k in range(1, P + 1)],
).assign(State=states)

# Show the first rows of the simulated data.
print(df_simulated.head())

   Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Feature_6  Feature_7  Feature_8  Feature_9  Feature_10  ...  Feature_42  Feature_43  Feature_44  Feature_45  Feature_46  Feature_47  Feature_48  Feature_49  Feature_50  State
0   1.433215   0.091520   0.580777  -0.056783  -0.170408  -0.779482   0.430301  -0.851537   0.665585    1.085287  ...    1.335784   -0.191344    1.403821   -0.442536    1.455046    0.131486    0.258229    1.564718   -0.361770      1
1  -0.941122  -0.448564   0.452334  -1.565759   0.637471  -0.538771   1.147813  -2.394260  -0.786566   -1.686468  ...    0.262036   -0.899695    0.189843   -1.454822    1.336186    1.247950   -0.252517    0.363454   -2.409922      1
2  -1.156348  -0.293779  -1.072133   0.714396   1.997297  -1.176615  -0.837463   0.235448   1.611116   -1.222374  ...    0.268913   -0.619666    0.471136   -0.533452   -0.411638    1.362643   -1.040586   -2.412780    1.610937      1
3   2.549328  -0.405269  -1.936838  -0.310484  -0.286223  -0.189924 