# Generation of Synthetic Data for Testing the RecurrentHealthEventsHMM

In [3]:
import numpy as np
import pandas as pd
from scipy.stats import bernoulli, norm, poisson
import os

## Set HMM Parameters


In [4]:
# Hidden states are indexed 0 = medium, 1 = high, 2 = low.
n_hidden_states = 3

# Distribution of the hidden state at the first step of every sequence.
initial_prob = np.array([0.5, 0.3, 0.2])  # must sum to 1

# transition_matrix[i, j] = P(next state = j | current state = i);
# every row must sum to 1.
transition_matrix = np.array([
    [0.7, 0.2, 0.1],  # from medium
    [0.1, 0.7, 0.2],  # from high
    [0.2, 0.3, 0.5],  # from low
])

# Fail fast if the parameters are edited into an inconsistent state, instead
# of surfacing later as an obscure error inside np.random.choice.
assert initial_prob.shape == (n_hidden_states,), \
    "initial_prob must have one entry per hidden state"
assert transition_matrix.shape == (n_hidden_states, n_hidden_states), \
    "transition_matrix must be n_hidden_states x n_hidden_states"
assert np.isclose(initial_prob.sum(), 1.0), "initial_prob must sum to 1"
assert np.allclose(transition_matrix.sum(axis=1), 1.0), \
    "each row of transition_matrix must sum to 1"



In [5]:
# Distribution family of each observed variable, in column order.
emission_types = ['gaussian', 'poisson']

# emission_params[state][k] is the parameter dict for emission_types[k]
# when the chain is in hidden state `state`.
emission_params = [
    # 0 - medium
    [
        {'mean': 10, 'std': 1},
        {'mu': 4},
    ],
    # 1 - high
    [
        {'mean': 20, 'std': 5},
        {'mu': 9},
    ],
    # 2 - low
    [
        {'mean': 5, 'std': 2},
        {'mu': 1},
    ],
]

# Per-state Gaussian parameters used to draw the synthetic event time.
# The event time is generated alongside the emissions but is not one of them.
event_time_params = [
    # 0 - medium
    {'mean': 50, 'std': 10},
    # 1 - high
    {'mean': 20, 'std': 10},
    # 2 - low
    {'mean': 90, 'std': 40},
]

## Helper Functions

In [6]:
def sample_emission(state, emission_types, emission_params):
    """Draw one observation per emission type for the given hidden state.

    Parameters
    ----------
    state : int
        Index of the current hidden state; selects ``emission_params[state]``.
    emission_types : sequence of str
        Distribution family of each observed variable. Supported values are
        ``'gaussian'`` and ``'poisson'``.
    emission_params : sequence
        ``emission_params[state][k]`` is the parameter dict for
        ``emission_types[k]``: keys ``'mean'`` and ``'std'`` for gaussian,
        key ``'mu'`` for poisson.

    Returns
    -------
    list
        One sampled value per entry of ``emission_types``, in the same order.

    Raises
    ------
    ValueError
        If an entry of ``emission_types`` is not a supported family.
    """
    emissions = []
    for idx, em_type in enumerate(emission_types):
        params = emission_params[state][idx]
        if em_type == 'gaussian':
            emissions.append(norm.rvs(params['mean'], params['std']))
        elif em_type == 'poisson':
            emissions.append(poisson.rvs(params['mu']))
        else:
            # Skipping an unknown type would return a list shorter than
            # emission_types and silently misalign the output columns.
            raise ValueError(
                f"Unsupported emission type {em_type!r} at index {idx}; "
                "expected 'gaussian' or 'poisson'"
            )
    return emissions

def sample_sequence_length(lam=3.0, min_len=1, max_len=20):
    """Return a random sequence length as an int.

    A value is drawn from an exponential distribution with scale ``lam``,
    bounded to the range ``[min_len, max_len]`` and then truncated to an
    integer.
    """
    draw = np.random.exponential(lam)
    bounded = min(max(draw, min_len), max_len)
    return int(bounded)

In [12]:
def generate_patient_sequences(
    n_subjects=100,
    emission_types=emission_types,
    emission_params=emission_params,
    event_time_params=event_time_params,
    initial_prob=initial_prob,
    transition_matrix=transition_matrix,
    n_hidden_states=n_hidden_states,
    lam=3.0,
    min_len=1,
    max_len=20,
    seed=None
):
    """Simulate per-subject event sequences from a hidden Markov chain.

    For each subject a sequence length is drawn with
    ``sample_sequence_length``, an initial hidden state is drawn from
    ``initial_prob``, and for every step the emissions and a synthetic event
    time are sampled from the current state's parameters before the chain
    moves on according to ``transition_matrix``.

    Parameters
    ----------
    n_subjects : int
        Number of subjects; subject ids run from 1 to ``n_subjects``.
    emission_types, emission_params
        Passed to ``sample_emission``. At least two emission types are
        required because the first two emissions fill ``VARIABLE_1`` and
        ``VARIABLE_2``.
    event_time_params : sequence of dict
        Per-state ``'mean'`` and ``'std'`` of the Gaussian used for the
        synthetic event time.
    initial_prob : array-like
        Probabilities of the initial hidden state.
    transition_matrix : array-like
        Row ``i`` holds the next-state probabilities given current state ``i``.
    n_hidden_states : int
        Number of hidden states.
    lam, min_len, max_len
        Passed to ``sample_sequence_length``; ``max_len`` is also the stride
        between subjects in the ``TIME`` column.
    seed : int or None, optional
        When given, NumPy's global random state is re-seeded with it before
        sampling so the output is reproducible. Note that this affects the
        global RNG used by the rest of the session. ``None`` (default) leaves
        the RNG untouched.

    Returns
    -------
    pandas.DataFrame
        One row per event with columns ``SUBJECT_ID``, ``TIME``
        (``subject_id * max_len + step``), ``VARIABLE_1``, ``VARIABLE_2``,
        ``BINARY_TIME_VARIABLE`` (1 when the event time is <= 30, else 0) and
        ``SYNTHETIC_EVENT_TIME`` (clipped to [1, 365]). Subjects with a single
        event get ``BINARY_TIME_VARIABLE = 0`` and a missing
        ``SYNTHETIC_EVENT_TIME``.
    """
    if seed is not None:
        # The helpers and scipy's rvs() draw from NumPy's global RNG by
        # default, so seeding it makes the whole simulation deterministic.
        np.random.seed(seed)

    columns = [
        "SUBJECT_ID",
        "TIME",
        "VARIABLE_1",
        "VARIABLE_2",
        "BINARY_TIME_VARIABLE",
        "SYNTHETIC_EVENT_TIME",
    ]
    all_rows = []
    for subject_id in range(1, n_subjects + 1):
        n_steps = sample_sequence_length(lam, min_len, max_len)
        state = np.random.choice(n_hidden_states, p=initial_prob)
        for t in range(n_steps):
            emissions = sample_emission(state, emission_types, emission_params)
            # Event time (for plotting, not an emission)
            et_params = event_time_params[state]
            synthetic_event_time = np.clip(norm.rvs(et_params['mean'], et_params['std']), 1, 365)
            binary_time_var = int(synthetic_event_time <= 30)
            # Single-event subjects are treated as censored: the binary flag
            # is forced to 0 and the event time is stored as missing (None).
            if n_steps == 1:
                binary_time_var = 0
                synthetic_event_time = None
            all_rows.append({
                "SUBJECT_ID": subject_id,
                # Offsetting by subject_id * max_len keeps TIME values of
                # different subjects from overlapping (each has < max_len + 1 steps).
                "TIME": subject_id * (max_len) + t,
                "VARIABLE_1": emissions[0],
                "VARIABLE_2": emissions[1],
                "BINARY_TIME_VARIABLE": binary_time_var,
                "SYNTHETIC_EVENT_TIME": synthetic_event_time
            })
            # Transition to next state for next step
            state = np.random.choice(n_hidden_states, p=transition_matrix[state])
    # Explicit columns keep the schema stable even when no rows were generated.
    return pd.DataFrame(all_rows, columns=columns)


## Generate Synthetic Data


In [13]:
# Seed NumPy's global RNG first so that Restart & Run All regenerates the
# same dataset (all sampling in this notebook draws from the global RNG).
np.random.seed(42)

# Generate the synthetic events and give every row a unique event id.
df = generate_patient_sequences(n_subjects=1300)
df["EVENT_ID"] = df.index
df = df[["EVENT_ID", "SUBJECT_ID", "TIME", "VARIABLE_1", "VARIABLE_2", "BINARY_TIME_VARIABLE", "SYNTHETIC_EVENT_TIME"]]
df.head(10)

Unnamed: 0,EVENT_ID,SUBJECT_ID,TIME,VARIABLE_1,VARIABLE_2,BINARY_TIME_VARIABLE,SYNTHETIC_EVENT_TIME
0,0,1,20,18.708193,12,0,
1,1,2,40,20.382434,11,0,32.981958
2,2,2,41,16.60739,8,1,19.797837
3,3,3,60,19.005234,12,0,34.602548
4,4,3,61,21.28707,14,1,17.075735
5,5,4,80,6.259545,0,1,29.17784
6,6,4,81,6.38571,0,0,83.761694
7,7,4,82,9.009438,4,0,41.843321
8,8,4,83,22.323618,9,1,19.869755
9,9,4,84,14.912759,9,1,20.251649


In [20]:
# Bucket the event time into left-closed intervals: [0, 30), [30, 120), [120, inf).
event_time_bin_edges = [0, 30, 120, np.inf]
event_time_bin_labels = ["0-30", "30-120", "120+"]
event_time_category = pd.cut(
    df["SYNTHETIC_EVENT_TIME"],
    bins=event_time_bin_edges,
    labels=event_time_bin_labels,
    right=False,
)

# Rows with a missing event time fall into no bin; assign them to "120+".
df["SYNTHETIC_EVENT_TIME_CAT"] = event_time_category.fillna("120+")

In [21]:
# Show the frame, now including the SYNTHETIC_EVENT_TIME_CAT column.
df

Unnamed: 0,EVENT_ID,SUBJECT_ID,TIME,VARIABLE_1,VARIABLE_2,BINARY_TIME_VARIABLE,SYNTHETIC_EVENT_TIME,SYNTHETIC_EVENT_TIME_CAT
0,0,1,20,18.708193,12,0,,120+
1,1,2,40,20.382434,11,0,32.981958,30-120
2,2,2,41,16.607390,8,1,19.797837,0-30
3,3,3,60,19.005234,12,0,34.602548,30-120
4,4,3,61,21.287070,14,1,17.075735,0-30
...,...,...,...,...,...,...,...,...
3795,3795,1297,25941,9.207257,6,0,42.754648,30-120
3796,3796,1298,25960,7.167341,3,0,,120+
3797,3797,1299,25980,9.482148,5,0,,120+
3798,3798,1300,26000,4.945602,0,0,85.830169,30-120


## Save DataFrame to CSV

In [22]:
# Output directory. Defaults to the dev-container path used by this project;
# set the SYNTHETIC_HMM_DATA_DIR environment variable to write elsewhere
# without editing the notebook.
file_dir = os.environ.get(
    'SYNTHETIC_HMM_DATA_DIR',
    '/workspaces/master-thesis-recurrent-health-events-prediction/data/synthetic_hmm',
)
filename = 'synthetic_test.csv'
os.makedirs(file_dir, exist_ok=True)
df.to_csv(f'{file_dir}/{filename}', index=False)
print(f"Data saved to {file_dir}/{filename}")

Data saved to /workspaces/master-thesis-recurrent-health-events-prediction/data/synthetic_hmm/synthetic_test.csv
