In [1]:
import pandas as pd
import numpy as np

In [2]:
# --- Simulation Configuration ---
N_SAMPLES = 1000  # Number of data points (rows) to generate

# Seed NumPy's global random state once, up front. Every draw in this notebook
# goes through `np.random.*`, so without a seed a Restart & Run All would
# produce a different dataset on every run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Cognitive Load (CL) levels, ordered from lowest to highest
COGNITIVE_LOAD_LEVELS = ['Low', 'Medium', 'High']

# Task Types and their sampling distribution (probabilities align with TASK_TYPES)
TASK_TYPES = ['Lecture', 'Group Discussion', 'Assignment', 'Exam']
TASK_TYPE_PROBS = [0.3, 0.2, 0.3, 0.2]

# Psychological State (derived from Cognitive Load)
PSYCHOLOGICAL_STATES = ['Relaxed', 'Focused', 'Stressed', 'Anxious']

# Noise levels: standard deviations of the Gaussian noise added to each signal
HR_NOISE = 3.0
HRV_NOISE = 5.0
GSR_NOISE = 0.05
EEG_NOISE = 0.5  # General noise for EEG bands
RESP_RATE_NOISE = 0.5
SKIN_TEMP_NOISE = 0.1
BP_NOISE = 5.0
FOCUS_DURATION_NOISE = 60.0 # seconds

# Missing data configuration
MISSING_VALUE_FRACTION = 0.05  # per-value probability of being replaced by NaN
# Columns intended to receive missing values.
# NOTE(review): none of the cells visible in this notebook reads this list;
# the simulation passes MISSING_VALUE_FRACTION for every noisy column.
# Confirm whether the list should gate that behaviour or can be removed.
COLUMNS_FOR_MISSING_DATA = [
    'HRV (ms)', 'GSR (μS)', 'EEG_Delta', 'EEG_Alpha', 'EEG_Beta',
    'Skin Temp (°C)', 'Focus Duration (s)', 'Ambient Noise (dB)'
]

In [3]:
# --- Baseline signal levels: the mean of each measure under 'Low' Cognitive Load ---

# Cardiovascular
BASE_HR = 70.0                # bpm
BASE_HRV = 60.0               # ms
BASE_BP_SYS = 110.0           # mmHg
BASE_BP_DIAS_OFFSET = -30.0   # mmHg; added to a systolic value to obtain the diastolic one

# Electrodermal, respiratory and thermal
BASE_GSR = 0.5                # μS
BASE_RESP_RATE = 12.0         # breaths per minute
BASE_SKIN_TEMP = 33.0         # °C

# EEG band power
BASE_EEG_DELTA = 15.0         # μV^2/Hz (example, often higher in relaxed states)
BASE_EEG_ALPHA = 10.0         # μV^2/Hz
BASE_EEG_BETA = 5.0           # μV^2/Hz

# Behavioural
BASE_FOCUS_DURATION = 600.0   # seconds

In [4]:
# --- Effects of Cognitive Load (CL) ---
# Each table maps a load level to an additive shift applied on top of the
# corresponding BASE_* value. Only 'Medium' and 'High' carry a shift.

# Cardiovascular
CL_EFFECT_HR = {
    'Medium': 5.0,
    'High': 15.0,
}
CL_EFFECT_HRV = {
    'Medium': -10.0,
    'High': -25.0,
}
CL_EFFECT_BP = {
    'Medium': 5.0,
    'High': 15.0,
}

# Electrodermal, respiratory and thermal
CL_EFFECT_GSR = {
    'Medium': 0.2,
    'High': 0.5,
}
CL_EFFECT_RESP_RATE = {
    'Medium': 2.0,
    'High': 5.0,
}
CL_EFFECT_SKIN_TEMP = {
    'Medium': -0.1,
    'High': -0.3,
}

# EEG bands. Delta waves might decrease with higher cognitive load
# (less deep relaxation).
CL_EFFECT_EEG_DELTA = {
    'Medium': -3.0,
    'High': -7.0,
}
CL_EFFECT_EEG_ALPHA = {
    'Medium': -2.0,
    'High': -5.0,
}
CL_EFFECT_EEG_BETA = {
    'Medium': 2.0,
    'High': 5.0,
}

# Behavioural
CL_EFFECT_FOCUS_DURATION = {
    'Medium': -120.0,
    'High': -300.0,
}


def simulate_cognitive_load(task_types):
    """Draw one cognitive-load level per task from a task-specific distribution.

    Args:
        task_types: Iterable of task-type labels.

    Returns:
        A list with one entry per input task, each drawn from
        COGNITIVE_LOAD_LEVELS via ``np.random.choice``. Labels without a
        dedicated distribution are drawn uniformly.
    """
    # Weights are listed in the same order as COGNITIVE_LOAD_LEVELS.
    load_weights_by_task = {
        'Lecture': [0.6, 0.3, 0.1],
        'Group Discussion': [0.2, 0.6, 0.2],
        'Assignment': [0.1, 0.4, 0.5],
        'Exam': [0.05, 0.15, 0.8],
    }
    uniform_weights = [1/3, 1/3, 1/3]

    return [
        np.random.choice(
            COGNITIVE_LOAD_LEVELS,
            p=load_weights_by_task.get(task, uniform_weights),
        )
        for task in task_types
    ]

def derive_psychological_state(cognitive_load_series):
    """Draw one psychological state per cognitive-load value.

    Args:
        cognitive_load_series: Iterable of cognitive-load labels.

    Returns:
        A list with one state per input value. 'Low', 'Medium' and 'High' each
        have their own weighted set of candidate states; any other value gets
        an unweighted draw from PSYCHOLOGICAL_STATES.
    """
    # load level -> (candidate states, matching probabilities)
    state_distributions = {
        'Low': (['Relaxed', 'Focused'], [0.6, 0.4]),
        'Medium': (['Focused', 'Stressed', 'Anxious'], [0.5, 0.3, 0.2]),
        'High': (['Stressed', 'Anxious', 'Focused'], [0.5, 0.4, 0.1]),
    }

    def draw_state(load_level):
        distribution = state_distributions.get(load_level)
        if distribution is None:
            # Unrecognised level: every state is equally likely.
            return np.random.choice(PSYCHOLOGICAL_STATES)
        candidates, weights = distribution
        return np.random.choice(candidates, p=weights)

    return [draw_state(load_level) for load_level in cognitive_load_series]

def add_noise_and_missing(series, noise_std, missing_frac, min_val=None, max_val=None):
    """Return a noisy, optionally clamped copy of ``series`` with random NaN gaps.

    Args:
        series: pandas Series or NumPy array of numeric values; not modified.
        noise_std: Standard deviation of the zero-mean Gaussian noise to add.
        missing_frac: Per-element probability of being replaced with NaN.
        min_val: Optional lower bound applied after the noise is added.
        max_val: Optional upper bound applied after the noise is added.

    Returns:
        A float object of the same container type as ``series``.
    """
    as_float = series.astype(float)
    perturbed = as_float + np.random.normal(0, noise_std, size=as_float.shape)

    # Lower bound first, then upper bound.
    for bound, clamp in ((min_val, np.maximum), (max_val, np.minimum)):
        if bound is not None:
            perturbed = clamp(perturbed, bound)

    # Each element is dropped independently with probability `missing_frac`.
    drop_mask = np.random.rand(len(perturbed)) < missing_frac
    perturbed[drop_mask] = np.nan
    return perturbed

In [5]:
# --- Main Simulation ---
# Builds `df_sim` in stages: categorical context -> noiseless signal means ->
# cognitive-load shifts -> noise, clamping and missing values -> binned
# respiration rate.
print(f"\n--- Starting Simulation for {N_SAMPLES} samples ---")

sim_task_type = np.random.choice(TASK_TYPES, size=N_SAMPLES, p=TASK_TYPE_PROBS)
df_sim = pd.DataFrame({'Task Type': sim_task_type})
df_sim['Cognitive Load'] = simulate_cognitive_load(df_sim['Task Type'])
df_sim['Psychological State'] = derive_psychological_state(df_sim['Cognitive Load'])
# Ambient noise does not depend on task or load: uniform draw plus Gaussian
# jitter, floored at 20 dB.
sim_ambient_noise = np.random.uniform(30, 80, N_SAMPLES).astype(float)
df_sim['Ambient Noise (dB)'] = add_noise_and_missing(sim_ambient_noise, 5.0, MISSING_VALUE_FRACTION, min_val=20.0)

# column -> (mean under 'Low' load, additive shift per elevated load level).
# The dict order determines the column order of the resulting frame.
signal_specs = {
    'Heart Rate (BPM)': (BASE_HR, CL_EFFECT_HR),
    'HRV (ms)': (BASE_HRV, CL_EFFECT_HRV),
    'GSR (μS)': (BASE_GSR, CL_EFFECT_GSR),
    'EEG_Delta': (BASE_EEG_DELTA, CL_EFFECT_EEG_DELTA),
    'EEG_Alpha': (BASE_EEG_ALPHA, CL_EFFECT_EEG_ALPHA),
    'EEG_Beta': (BASE_EEG_BETA, CL_EFFECT_EEG_BETA),
    'Respiration Rate_continuous': (BASE_RESP_RATE, CL_EFFECT_RESP_RATE),
    'Skin Temp (°C)': (BASE_SKIN_TEMP, CL_EFFECT_SKIN_TEMP),
    'BP_Systolic': (BASE_BP_SYS, CL_EFFECT_BP),
    'Focus Duration (s)': (BASE_FOCUS_DURATION, CL_EFFECT_FOCUS_DURATION),
}

# Initialize every signal at its baseline, as float so the shifts below
# never hit an integer column.
for col, (baseline, _) in signal_specs.items():
    df_sim[col] = float(baseline)

# Apply Cognitive Load effects ('Low' rows keep the baseline)
elevated_levels = ['Medium', 'High']
for cl_level in elevated_levels:
    mask = df_sim['Cognitive Load'] == cl_level
    for col, (_, cl_effect) in signal_specs.items():
        df_sim.loc[mask, col] += cl_effect[cl_level]

# Diastolic pressure starts from the *baseline* systolic level and then takes
# half of the systolic load shift. Deriving it from the already-shifted
# BP_Systolic column and adding the half shift on top would count the load
# effect 1.5x, making diastolic react more strongly than systolic.
df_sim['BP_Diastolic'] = BASE_BP_SYS + BASE_BP_DIAS_OFFSET
for cl_level in elevated_levels:
    mask = df_sim['Cognitive Load'] == cl_level
    df_sim.loc[mask, 'BP_Diastolic'] += CL_EFFECT_BP[cl_level] * 0.5

# Add noise and missing values: (column, noise std, lower clamp, upper clamp).
# MISSING_VALUE_FRACTION is applied to every column listed here.
noise_specs = [
    ('Heart Rate (BPM)', HR_NOISE, 40.0, 180.0),
    ('HRV (ms)', HRV_NOISE, 10.0, None),
    ('GSR (μS)', GSR_NOISE, 0.01, None),
    ('EEG_Delta', EEG_NOISE, 0.5, None),
    ('EEG_Alpha', EEG_NOISE, 1.0, None),
    ('EEG_Beta', EEG_NOISE, 1.0, None),
    ('Respiration Rate_continuous', RESP_RATE_NOISE, 5.0, 40.0),
    ('Skin Temp (°C)', SKIN_TEMP_NOISE, 30.0, 37.0),
    ('BP_Systolic', BP_NOISE, 70.0, 200.0),
    ('BP_Diastolic', BP_NOISE, 40.0, 130.0),
    ('Focus Duration (s)', FOCUS_DURATION_NOISE, 0.0, None),
]
for col, noise_std, lower, upper in noise_specs:
    df_sim[col] = add_noise_and_missing(
        df_sim[col], noise_std, MISSING_VALUE_FRACTION, min_val=lower, max_val=upper
    )

# Bin Respiration Rate into three equal-width categories; rows whose
# continuous value is missing are labelled 'Unknown_RR'.
resp_rate_data_for_bins = df_sim['Respiration Rate_continuous'].dropna()
if resp_rate_data_for_bins.empty:
    df_sim['Respiration Rate (BPM)'] = 'Unknown_RR'
else:
    min_rr, max_rr = resp_rate_data_for_bins.min(), resp_rate_data_for_bins.max()
    if min_rr == max_rr:
        # Degenerate range: build fixed-width edges around the single value.
        bins = [min_rr - 1, min_rr, min_rr + 1, min_rr + 2]
    else:
        bins = np.linspace(min_rr, max_rr, 4)
    labels = ['Low_RR', 'Medium_RR', 'High_RR']
    resp_rate_binned = pd.cut(df_sim['Respiration Rate_continuous'], bins=bins, labels=labels, include_lowest=True)
    df_sim['Respiration Rate (BPM)'] = resp_rate_binned.cat.add_categories('Unknown_RR').fillna('Unknown_RR')
df_sim = df_sim.drop(columns=['Respiration Rate_continuous'])


--- Starting Simulation for 1000 samples ---


In [6]:
# Add other independent features (drawn without reference to the other columns)
df_sim['Age'] = np.random.randint(18, 45, N_SAMPLES)  # upper bound is exclusive
df_sim['Gender'] = np.random.choice(
    ['Male', 'Female', 'Other'], N_SAMPLES, p=[0.48, 0.48, 0.04]
)
df_sim['Educational Level'] = np.random.choice(
    ['High School', 'Undergraduate', 'Postgraduate'], N_SAMPLES, p=[0.2, 0.5, 0.3]
)
df_sim['Perceived Task Difficulty'] = np.random.randint(1, 11, N_SAMPLES)

# Click frequency: Poisson with rate heart_rate / 20, floored at 0.1.
# Missing heart rates fall back to the baseline so the rate is always defined.
hr_for_poisson = df_sim['Heart Rate (BPM)'].fillna(BASE_HR)
lambda_click_freq = (hr_for_poisson / 20.0).clip(lower=0.1)
df_sim['Click/Interaction Freq'] = np.random.poisson(lambda_click_freq).astype(int)

# Typing speed: Poisson around 40, lowered by a load-dependent penalty.
cognitive_load_map_typing = {'Low': 0.0, 'Medium': 10.0, 'High': 20.0}
# Levels absent from the map contribute no penalty.
cognitive_load_effect_typing = (
    df_sim['Cognitive Load']
    .map(cognitive_load_map_typing)
    .fillna(0.0)
)
lambda_typing_speed = (40.0 - cognitive_load_effect_typing).clip(lower=0.1)
df_sim['Typing Speed (WPM)'] = np.random.poisson(lambda_typing_speed).astype(int)

In [7]:
# Per-column NaN counts, restricted to the columns that have at least one.
print("\n--- Missing Values in Final Simulated Dataset ---")
missing_summary = df_sim.isna().sum()
print(missing_summary.loc[missing_summary > 0])

# Preview of the first ten rows.
print("\n--- Sample of Simulated Data ---")
print(df_sim.head(10))

# Write the dataset to disk without the row index.
df_sim.to_csv("simulated_psychological_data_v3_with_delta.csv", index=False)
print("\nSimulated dataset saved to simulated_psychological_data_v3_with_delta.csv")


--- Missing Values in Final Simulated Dataset ---
Ambient Noise (dB)    39
Heart Rate (BPM)      73
HRV (ms)              43
GSR (μS)              42
EEG_Delta             61
EEG_Alpha             47
EEG_Beta              44
Skin Temp (°C)        58
BP_Systolic           50
Focus Duration (s)    53
BP_Diastolic          42
dtype: int64

--- Sample of Simulated Data ---
          Task Type Cognitive Load Psychological State  Ambient Noise (dB)  \
0           Lecture            Low             Focused           35.389565   
1           Lecture            Low             Focused           62.051062   
2        Assignment            Low             Relaxed           46.741532   
3  Group Discussion         Medium            Stressed           52.777191   
4  Group Discussion            Low             Focused           72.655554   
5              Exam           High             Anxious           43.111532   
6           Lecture            Low             Relaxed           81.995873   
7  