In [3]:
import pandas as pd
import numpy as np
import os

# CONFIGURATION
INPUT_CSV_PATH = '../../data/adhdata.csv'
OUTPUT_FOLDER = '../../data/augmented'
# Number of synthetic patients to create, as a fraction of each class's
# distinct patient count (0.1 -> 10% per class, rounded down).
AUGMENTATION_RATIO = 0.1

# Column roles in the input table
patient_col = 'ID'     # groups rows into per-patient recordings
label_col = 'Class'    # class label; mixing is only done within one label

# Create output folder if needed
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Load data
df = pd.read_csv(INPUT_CSV_PATH)

# Fail right after loading, with a clear message, rather than with a bare
# KeyError deep inside the augmentation loop.
missing_cols = [col for col in (patient_col, label_col) if col not in df.columns]
if missing_cols:
    raise KeyError(f"{INPUT_CSV_PATH} is missing required column(s): {missing_cols}")

# Every numeric column is averaged when two patients are mixed. The ID and
# label columns are excluded explicitly so they are never blended, even when
# they happen to be stored as numbers.
eeg_cols = [
    col
    for col in df.select_dtypes(include=np.number).columns
    if col not in (patient_col, label_col)
]

# Collects one mixed DataFrame per synthetic patient
augmented_rows = []

# Augment each class separately: a synthetic patient is the element-wise mean
# of the numeric columns of two distinct real patients sharing the same label.
for label in df[label_col].unique():
    label_data = df[df[label_col] == label]
    # Rows with a missing ID cannot be attributed to a patient, so they are
    # not candidates for mixing.
    patient_ids = label_data[patient_col].dropna().unique()
    n_patients = len(patient_ids)
    n_new_patients = int(n_patients * AUGMENTATION_RATIO)

    if n_patients < 2 or n_new_patients == 0:
        continue

    # Only this many distinct unordered pairs exist. Bounding the loop by it
    # guarantees termination when the ratio asks for more mixes than there
    # are pairs (otherwise the rejection sampling below would never finish).
    max_pairs = n_patients * (n_patients - 1) // 2

    used_pairs = set()
    mix_count = 0

    while mix_count < n_new_patients and len(used_pairs) < max_pairs:
        p1, p2 = np.random.choice(patient_ids, 2, replace=False)
        # frozenset makes (p1, p2) and (p2, p1) the same key without needing
        # the IDs to be orderable against each other.
        pair_key = frozenset((p1, p2))
        if pair_key in used_pairs:
            continue  # skip duplicates
        used_pairs.add(pair_key)

        data1 = label_data[label_data[patient_col] == p1].reset_index(drop=True)
        data2 = label_data[label_data[patient_col] == p2].reset_index(drop=True)

        # Recordings may differ in length; only the overlapping prefix is mixed.
        # Both IDs come from label_data itself, so each has at least one row.
        min_len = min(len(data1), len(data2))

        mixed_eeg = (
            data1[eeg_cols].iloc[:min_len].to_numpy()
            + data2[eeg_cols].iloc[:min_len].to_numpy()
        ) / 2.0
        # Columns outside eeg_cols are inherited from the first patient's rows.
        mixed_df = data1.iloc[:min_len].copy()
        mixed_df[eeg_cols] = mixed_eeg
        mixed_df[patient_col] = f"{p1}_{p2}_mix"
        mixed_df[label_col] = label  # preserve class
        augmented_rows.append(mixed_df)
        mix_count += 1  # count only valid mix per class

    if mix_count < n_new_patients:
        print(
            f"⚠️ Class {label!r}: only {mix_count} of {n_new_patients} "
            f"requested mixes could be created (all distinct pairs used)"
        )

# Combine and save
if augmented_rows:
    aug_df = pd.concat(augmented_rows, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list. Fall back to an empty
    # table with the input's columns so the output file still gets a header.
    print("⚠️ No augmented samples were generated; writing an empty table.")
    aug_df = df.iloc[0:0].copy()

aug_df.to_csv(os.path.join(OUTPUT_FOLDER, "augmented_mixed_data.csv"), index=False)

print(f"✅ Mix-augmented data saved to {OUTPUT_FOLDER}")
# These are row counts, not patient counts; every row whose label is not
# 'ADHD' is reported under "Control".
adhd_rows = (aug_df[label_col] == 'ADHD').sum()
control_rows = (aug_df[label_col] != 'ADHD').sum()
print(f"🧠 ADHD: {adhd_rows} | Control: {control_rows}")


✅ Mix-augmented data saved to ../../data/augmented
🧠 ADHD: 98144 | Control: 84758
