In [2]:
import pandas as pd
import numpy as np
import os

# CONFIGURATION
INPUT_CSV_PATH = '../../data/adhdata.csv'
OUTPUT_FOLDER = '../../data/augmented'
AUGMENTATION_RATIO = 0.2  # fraction of the patients in each class to augment

# Metadata columns: patient identifier and class label.
# Declared before the signal columns so they can be excluded from them.
patient_col = 'ID'
label_col = 'Class'

# Create output folder if it doesn't exist
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Load the dataset
df = pd.read_csv(INPUT_CSV_PATH)

# Signal columns: every numeric column except the metadata columns.
# Without the exclusion, a numerically-encoded ID or label column would be
# treated as signal and get noise added to it during augmentation.
eeg_cols = [
    col
    for col in df.select_dtypes(include=np.number).columns
    if col not in (patient_col, label_col)
]

# Standard deviation of the additive zero-mean Gaussian noise (gentle noise)
NOISE_STD = 0.01

# One noisy DataFrame per selected patient; concatenated after the loop
augmented_data = []

# Process each class independently so the same fraction of patients is
# augmented in every class
for label in df[label_col].unique():
    class_df = df[df[label_col] == label]
    patient_ids = class_df[patient_col].unique()
    # int() truncates: a class with fewer than 1 / AUGMENTATION_RATIO patients
    # contributes no augmented rows
    n_to_augment = int(len(patient_ids) * AUGMENTATION_RATIO)

    # Sample distinct patients (no patient is augmented twice)
    selected_ids = np.random.choice(patient_ids, size=n_to_augment, replace=False)

    for patient_id in selected_ids:
        # All rows of this patient; only read from, so a single copy below suffices
        group = class_df[class_df[patient_col] == patient_id]
        noise = np.random.normal(loc=0, scale=NOISE_STD, size=(len(group), len(eeg_cols)))
        augmented = group.copy()
        augmented[eeg_cols] = group[eeg_cols].values + noise
        # Suffix marks the rows as synthetic. The f-string also works when IDs
        # are numeric, where `patient_id + 'a'` would raise TypeError.
        augmented[patient_col] = f"{patient_id}a"
        augmented_data.append(augmented)

# Save result
if augmented_data:
    aug_df = pd.concat(augmented_data, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; fall back to an empty
    # frame with the input schema so the output file is still well-formed.
    print("⚠️ No patients were selected for augmentation; writing an empty file.")
    aug_df = df.iloc[0:0].copy()
aug_df.to_csv(os.path.join(OUTPUT_FOLDER, "augmented_noise_data.csv"), index=False)

# Row counts per class; every label other than 'ADHD' is reported as Control.
# Vectorized .sum() avoids a Python-level loop over every row.
is_adhd = aug_df[label_col] == 'ADHD'
print(f"✅ Noise-augmented data saved to {OUTPUT_FOLDER}")
print(f"🧠 ADHD: {is_adhd.sum()} | Control: {(~is_adhd).sum()}")


✅ Noise-augmented data saved to ../../data/augmented
🧠 ADHD: 212973 | Control: 192448
