In [None]:
import json
import numpy as np
import os
import torch
import toml
import pandas as pd

In [None]:
# GLOBAL CONSTANTS - K-FOLD SPECIFIC
# NOTE(review): the two paths below are absolute, machine-specific paths; they
# must be edited before the notebook can run anywhere else.
PARTICIPANT_IDS = [1, 3, 6, 7, 10]  # Participant IDs to include; folds take consecutive slices of this list as test sets
NUM_FOLDS = 5  # Number of folds; validate_kfold_structure expects fold-0 .. fold-(NUM_FOLDS-1) directories
KFOLD_BASE_DIR = '/home/kuba/Desktop/k-fold'  # Base directory containing the fold-<i> subdirectories
LABELS_PATH = "/home/kuba/projects/puff/paper00/experiments/01/data/smoking_labels_export_2025-07-15_22.json"  # Labels JSON file loaded by the driver cell


#########################################################
#-------------------------------------------------------#
#########################################################

TRAIN_PERCENT = 0.6  # Fraction (0-1) of the non-test participants assigned to the training set
DEV_PERCENT = (1-TRAIN_PERCENT)  # Complement of TRAIN_PERCENT; the dev set receives every non-test participant not used for training
RANDOM_SEED = 70  # Seed passed to np.random.seed before every shuffle / sampling step
USE_GYRO = False  # When True, gyroscope columns are merged with the accelerometer data
LABEL = ['puff', 'puffs']  # Bout labels that are treated as the positive class
LABEL_VALUE = 1  # Value written into the 'label' column for rows inside a matching bout
RESAMPLE = False  # When True, resample() is called on each segment (it currently always raises)
PERCENT_OF_NEGATIVE_WINDOWS_TO_SAMPLE = 0.5  # Fraction of all-negative windows to keep; >= 1.0 disables the filtering
THRESHOLD_FOR_GAP = 30  # minutes; a larger jump in ns_since_reboot splits a recording into separate segments
WINDOW_SIZE = 256  # Number of rows per window
STEP_SIZE = 256  # Same as WINDOW_SIZE for non-overlapping windows

# Global variables for current fold processing
# Only CURRENT_SAVE_DIR is used by the code in this notebook (set by process_fold,
# read by save_config and save_dataset); the others are not referenced by any
# function visible here.
CURRENT_FOLD = 0
CURRENT_SAVE_DIR = ""
TRAIN_IDS = []
DEV_IDS = []
TEST_IDS = []

In [None]:
def validate_kfold_structure():
    """Verify that KFOLD_BASE_DIR holds one ``fold-<i>`` directory per fold.

    Directories named ``fold-0`` .. ``fold-(NUM_FOLDS - 1)`` are required.
    Any other sub-directory only triggers a printed warning.

    Raises:
        FileNotFoundError: if the base directory does not exist or at least
            one expected fold directory is missing.
    """
    if not os.path.exists(KFOLD_BASE_DIR):
        raise FileNotFoundError(f"K-fold base directory does not exist: {KFOLD_BASE_DIR}")

    wanted = {f"fold-{i}" for i in range(NUM_FOLDS)}

    # Only sub-directories count; plain files in the base directory are ignored.
    found = set()
    for entry in os.listdir(KFOLD_BASE_DIR):
        if os.path.isdir(os.path.join(KFOLD_BASE_DIR, entry)):
            found.add(entry)

    absent = wanted - found
    if absent:
        raise FileNotFoundError(f"Missing fold directories: {sorted(absent)}")

    unexpected = found - wanted
    if unexpected:
        print(f"Warning: Found unexpected directories: {sorted(unexpected)}")

    print(f"Validated k-fold structure with {NUM_FOLDS} folds")

def validate_parameters():
    """Validate the k-fold configuration constants before any work is done.

    Checks, in order:
      * PARTICIPANT_IDS is non-empty,
      * NUM_FOLDS is at least 1 (the fold-size arithmetic divides by it),
      * TRAIN_PERCENT + DEV_PERCENT equals 1.0 within a small tolerance,
      * LABELS_PATH exists on disk.

    A NUM_FOLDS larger than the number of participants is allowed but
    produces a printed warning.

    Raises:
        ValueError: on an empty participant list, a non-positive fold count,
            or train/dev fractions that do not sum to 1.
        FileNotFoundError: if the labels file is missing.
    """
    if not PARTICIPANT_IDS:
        raise ValueError("PARTICIPANT_IDS cannot be empty")

    if NUM_FOLDS < 1:
        raise ValueError(f"NUM_FOLDS must be at least 1, got {NUM_FOLDS}")

    # Compare with a tolerance: fractions such as 0.7 + (1 - 0.7) are not
    # guaranteed to be bit-exact 1.0 in binary floating point.
    split_total = TRAIN_PERCENT + DEV_PERCENT
    if abs(split_total - 1.0) > 1e-6:
        raise ValueError(f"TRAIN_PERCENT + DEV_PERCENT must be == 1.0, got {split_total}")

    if NUM_FOLDS > len(PARTICIPANT_IDS):
        # With more folds than participants, the surplus folds get no test
        # participant at all.
        print(f"Warning: NUM_FOLDS ({NUM_FOLDS}) > participants ({len(PARTICIPANT_IDS)}), "
              f"folds beyond the participant count will have an empty test set")

    if not os.path.exists(LABELS_PATH):
        raise FileNotFoundError(f"Labels file does not exist: {LABELS_PATH}")

In [None]:
def generate_kfold_splits():
    """Build the train/dev/test participant assignment for every fold.

    Test sets are consecutive slices of PARTICIPANT_IDS; when the participant
    count is not divisible by NUM_FOLDS, the earliest folds each take one
    extra participant. The participants left over in a fold are shuffled
    (seeded with RANDOM_SEED) and cut into train and dev according to
    TRAIN_PERCENT.

    Returns:
        list[dict]: one dict per fold with the keys 'fold', 'train_ids',
        'dev_ids' and 'test_ids'.
    """
    base_size, extra = divmod(len(PARTICIPANT_IDS), NUM_FOLDS)

    fold_specs = []
    cursor = 0

    for fold_k in range(NUM_FOLDS):
        # The first `extra` folds absorb one leftover participant each.
        held_out_count = base_size + 1 if fold_k < extra else base_size
        stop = cursor + held_out_count
        test_ids = PARTICIPANT_IDS[cursor:stop]
        cursor = stop

        # Everyone not held out for testing is available for train/dev.
        pool = [pid for pid in PARTICIPANT_IDS if pid not in test_ids]

        # The global RNG is re-seeded on every fold, so each fold's shuffle
        # starts from the same generator state.
        np.random.seed(RANDOM_SEED)
        shuffled = np.random.permutation(pool)
        n_train = int(len(shuffled) * TRAIN_PERCENT)

        fold_specs.append({
            'fold': fold_k,
            'train_ids': shuffled[:n_train].tolist(),
            # Dev receives everything after the training cut.
            'dev_ids': shuffled[n_train:].tolist(),
            'test_ids': test_ids
        })

    return fold_specs

In [None]:
def save_config(fold_num, train_ids, dev_ids, test_ids):
    """Write the fold's configuration to ``data_config.toml`` in CURRENT_SAVE_DIR.

    The file records the fold number and participant assignment passed in,
    together with the module-level path and experiment constants.
    CURRENT_SAVE_DIR is created if it does not exist.

    Args:
        fold_num: index of the fold being written.
        train_ids: participant IDs assigned to the training set.
        dev_ids: participant IDs assigned to the development set.
        test_ids: participant IDs assigned to the test set.
    """
    kfold_section = {
        "fold_number": fold_num,
        "total_folds": NUM_FOLDS,
        "participant_ids": PARTICIPANT_IDS
    }
    splits_section = {
        "train_ids": train_ids,
        "dev_ids": dev_ids,
        "test_ids": test_ids
    }
    paths_section = {
        "labels_path": LABELS_PATH,
        "kfold_base_dir": KFOLD_BASE_DIR,
        "save_dir": CURRENT_SAVE_DIR
    }
    experiment_section = {
        "label": LABEL,
        "label_value": LABEL_VALUE,
        "resample": RESAMPLE,
        "random_seed": RANDOM_SEED,
        "window_size": WINDOW_SIZE,
        "step_size": STEP_SIZE,
        "percent_negative_windows": PERCENT_OF_NEGATIVE_WINDOWS_TO_SAMPLE,
        "threshold_gap_minutes": THRESHOLD_FOR_GAP,
        "use_gyro": USE_GYRO
    }
    split_section = {
        "train_percent": TRAIN_PERCENT,
        "dev_percent": DEV_PERCENT,
        "test_percent": 1.0 - TRAIN_PERCENT - DEV_PERCENT,
        "num_participants": len(PARTICIPANT_IDS)
    }

    # Sections are assembled in the order they should appear in the file.
    config = {
        "kfold": kfold_section,
        "splits": splits_section,
        "paths": paths_section,
        "experiment": experiment_section,
        "split": split_section
    }

    os.makedirs(CURRENT_SAVE_DIR, exist_ok=True)
    target_path = os.path.join(CURRENT_SAVE_DIR, 'data_config.toml')
    with open(target_path, "w") as f:
        toml.dump(config, f)

In [None]:

def validate_splits():
    """Check that TRAIN_PERCENT and DEV_PERCENT describe a usable split.

    The test fraction is derived as ``1 - TRAIN_PERCENT - DEV_PERCENT``, so
    summing all three always gives 1 and cannot detect a bad configuration.
    Instead this verifies that each configured fraction lies in [0, 1] and
    that train + dev does not exceed 1 (i.e. the derived test fraction is not
    negative), using a 1e-6 tolerance for floating-point error.

    Raises:
        ValueError: if a fraction is outside [0, 1] or train + dev exceeds 1.
    """
    tolerance = 1e-6

    for name, value in (("TRAIN_PERCENT", TRAIN_PERCENT), ("DEV_PERCENT", DEV_PERCENT)):
        if value < -tolerance or value > 1.0 + tolerance:
            raise ValueError(f"{name} must be between 0 and 1, got {value}")

    test_percent = 1.0 - TRAIN_PERCENT - DEV_PERCENT
    if test_percent < -tolerance:
        raise ValueError(
            f"TRAIN_PERCENT + DEV_PERCENT must not exceed 1, got {TRAIN_PERCENT + DEV_PERCENT}"
        )


In [None]:
def resample(df):
    """Placeholder for resampling a dataframe to a consistent sampling rate.

    Not implemented yet: calling it always prints a notice and raises.

    Args:
        df: the dataframe that would be resampled (currently unused).

    Raises:
        NotImplementedError: always. NotImplementedError is a subclass of
            RuntimeError, so callers catching RuntimeError keep working.
    """
    print("RESAMPLE has not been added yet so you need to implement the function")
    raise NotImplementedError("The resample function has not been implemented")


In [None]:

def check_for_gaps(df):
    """Cut a recording into segments wherever the timestamps jump too far.

    Rows are sorted by 'ns_since_reboot' first. A new segment starts at every
    row whose timestamp exceeds the previous row's by more than
    THRESHOLD_FOR_GAP minutes.

    Args:
        df: dataframe with an 'ns_since_reboot' column.

    Returns:
        list of dataframes in time order; a single-element list holding the
        sorted frame when no gap is found.
    """
    max_gap_ns = THRESHOLD_FOR_GAP * 60 * 1_000_000_000
    df = df.sort_values('ns_since_reboot').reset_index(drop=True)

    deltas = df['ns_since_reboot'].diff()
    # After reset_index the labels equal row positions, so each entry is the
    # position of the first row that follows a gap.
    breakpoints = deltas[deltas > max_gap_ns].index

    if len(breakpoints) == 0:
        return [df]

    # Consecutive edge pairs delimit the segments: [0, b1), [b1, b2), ..., [bk, len).
    edges = [0, *breakpoints, len(df)]
    pieces = []
    for lo, hi in zip(edges[:-1], edges[1:]):
        if lo < hi:
            pieces.append(df.iloc[lo:hi].copy())

    return pieces

In [None]:
def rename_df(df, type):
    """Rename sensor axis columns to ``x_<type>``, ``y_<type>``, ``z_<type>``.

    Two input layouts are accepted, both of which must also contain
    'ns_since_reboot':
      * plain axis names: 'x', 'y', 'z'
      * sensor-prefixed names: '<type>_x', '<type>_y', '<type>_z'

    Args:
        df: dataframe holding the raw sensor columns.
        type: sensor suffix to apply (the callers in this notebook pass
            'accel' or 'gyro').

    Returns:
        A renamed copy of ``df``; the input frame is not modified.

    Raises:
        ValueError: if the columns match neither accepted layout.
    """
    axes = ('x', 'y', 'z')
    present = set(df.columns)
    prefixed = [f"{type}_{axis}" for axis in axes]

    if {'ns_since_reboot', *axes} <= present:
        mapping = {axis: f"{axis}_{type}" for axis in axes}
    elif {'ns_since_reboot', *prefixed} <= present:
        mapping = {f"{type}_{axis}": f"{axis}_{type}" for axis in axes}
    else:
        # Report both accepted layouts so the failure is actionable.
        raise ValueError(
            f"Unexpected column names: {list(df.columns)}; expected "
            f"['ns_since_reboot', 'x', 'y', 'z'] or {['ns_since_reboot', *prefixed]}"
        )

    return df.rename(columns=mapping)

In [None]:
def combine(session, project_path: str) -> pd.DataFrame:
    """Load a session's sensor CSVs into one dataframe.

    Reads 'accelerometer_data.csv' (and 'gyroscope_data.csv' when USE_GYRO is
    set) from ``<project_path>/<session_name>``, normalises the column names
    and, when gyroscope data is used, inner-joins the two sensors on
    'ns_since_reboot'.

    Args:
        session: mapping with a 'session_name' key.
        project_path: directory containing the session folders.

    Returns:
        Dataframe with 'ns_since_reboot' followed by the sensor columns, with
        rows containing NaN removed; an empty dataframe if a CSV is missing.
    """
    session_dir = os.path.join(project_path, session['session_name'])

    try:
        accl = pd.read_csv(os.path.join(session_dir, 'accelerometer_data.csv'))
        if USE_GYRO:
            gyro = pd.read_csv(os.path.join(session_dir, 'gyroscope_data.csv'))
    except FileNotFoundError as e:
        print(f"Warning: Could not find data files for session {session['session_name']}: {e}")
        return pd.DataFrame()

    accl = rename_df(accl, type='accel')

    accel_columns = ['ns_since_reboot', 'x_accel', 'y_accel', 'z_accel']
    # Force the timestamp and accelerometer axes to float.
    for column in accel_columns:
        accl[column] = accl[column].astype(float)

    if not USE_GYRO:
        # Accelerometer-only mode: nothing to merge.
        return accl[accel_columns].dropna()

    gyro = rename_df(gyro, type='gyro')

    # Keep only timestamps present in both sensors.
    merged = pd.merge(accl, gyro, on='ns_since_reboot', how='inner')
    merged = merged[accel_columns + ['x_gyro', 'y_gyro', 'z_gyro']]

    return merged.dropna()

In [None]:

def apply_labels_to_df(df, session) -> pd.DataFrame:
    """Attach a 'label' column marking rows that fall inside annotated bouts.

    Only bouts whose 'label' is listed in LABEL are used. Rows whose
    'ns_since_reboot' lies within a bout's [start_time, end_time] (both ends
    inclusive) get LABEL_VALUE; all other rows get 0.

    Args:
        df: dataframe with an 'ns_since_reboot' column; modified in place.
        session: mapping that may carry a 'bouts' list of annotation dicts.

    Returns:
        The same dataframe object, now carrying the 'label' column. An empty
        dataframe is returned untouched.
    """
    if df.empty:
        return df

    # Gather the (start, end) interval of every bout with a matching label.
    intervals = [
        (bout['start_time'], bout['end_time'])
        for bout in session.get('bouts', [])
        if bout.get('label') in LABEL
    ]

    df['label'] = 0

    timestamps = df['ns_since_reboot']
    for begin, end in intervals:
        inside_bout = (timestamps >= begin) & (timestamps <= end)
        df.loc[inside_bout, 'label'] = LABEL_VALUE

    return df

In [None]:
def create_windows(df):
    """Slice a labelled dataframe into fixed-length windows.

    Windows are WINDOW_SIZE rows long and start every STEP_SIZE rows; a
    trailing partial window is dropped.

    Args:
        df: dataframe with the sensor feature columns and a 'label' column.

    Returns:
        (X, y): arrays stacking the per-window feature rows and the per-row
        labels. Two empty arrays are returned when the frame has fewer than
        WINDOW_SIZE rows.
    """
    if len(df) < WINDOW_SIZE:
        print(f"Warning: DataFrame too small ({len(df)} < {WINDOW_SIZE}), skipping")
        return np.array([]), np.array([])

    # Accelerometer axes are always used; gyroscope axes are optional.
    feature_cols = ['x_accel', 'y_accel', 'z_accel']
    if USE_GYRO:
        feature_cols = feature_cols + ['x_gyro', 'y_gyro', 'z_gyro']

    features = df[feature_cols].values
    labels = df['label'].values

    # Start offsets of every window that fits completely inside the frame.
    starts = range(0, len(df) - WINDOW_SIZE + 1, STEP_SIZE)
    windows_X = [features[start:start + WINDOW_SIZE] for start in starts]
    windows_y = [labels[start:start + WINDOW_SIZE] for start in starts]

    return np.array(windows_X), np.array(windows_y)

In [None]:

def filter_negative_windows(X, y):
    """Keep every positive window and a random subset of the negative ones.

    A window is positive when any value in its label row is > 0. The fraction
    PERCENT_OF_NEGATIVE_WINDOWS_TO_SAMPLE of the negative windows is sampled
    without replacement (seeded with RANDOM_SEED); the surviving windows keep
    their original relative order.

    Args:
        X: array of windows, indexed by window along axis 0.
        y: array of per-window labels; must be 2-D (windows x rows) when
            non-empty, because positives are detected along axis 1.

    Returns:
        (X, y) restricted to the kept windows. The inputs are returned
        unchanged when they are empty or when the configured fraction is
        >= 1.0.
    """
    # The pipeline represents "no data" as 1-D empty arrays, for which the
    # axis=1 reduction below would raise; there is nothing to filter anyway.
    if len(X) == 0:
        return X, y

    if PERCENT_OF_NEGATIVE_WINDOWS_TO_SAMPLE >= 1.0:
        return X, y

    # Find windows with and without labels
    has_label = np.any(y > 0, axis=1)
    print(f'Positive samples: {np.where(has_label)[0].shape} : Negative Samples  {np.where(~has_label)[0].shape}')
    positive_indices = np.where(has_label)[0]
    negative_indices = np.where(~has_label)[0]

    # Sample negative windows
    num_negative_to_keep = int(len(negative_indices) * PERCENT_OF_NEGATIVE_WINDOWS_TO_SAMPLE)
    np.random.seed(RANDOM_SEED)
    sampled_negative_indices = np.random.choice(negative_indices, size=num_negative_to_keep, replace=False)

    # Combine positive and sampled negative windows, restoring original order
    keep_indices = np.sort(np.concatenate([positive_indices, sampled_negative_indices]))

    return X[keep_indices], y[keep_indices]


In [None]:

def process_session(session, project_path):
    """Turn one recording session into windowed feature/label arrays.

    Pipeline: load the sensor data, split it at large time gaps, then for each
    segment optionally resample, attach labels and cut windows.

    Args:
        session: session annotation mapping (passed on to combine and
            apply_labels_to_df).
        project_path: directory that contains the session's data folder.

    Returns:
        (X, y): windows from all segments concatenated along axis 0, or two
        empty arrays when the session yields no windows.
    """
    df = combine(session, project_path)
    if df.empty:
        return np.array([]), np.array([])

    X_parts = []
    y_parts = []

    # Windows never span a gap: each gap-free segment is windowed on its own.
    for segment in check_for_gaps(df):
        if RESAMPLE:
            segment = resample(segment)

        labelled = apply_labels_to_df(segment, session)
        seg_X, seg_y = create_windows(labelled)

        if len(seg_X) > 0:
            X_parts.append(seg_X)
            y_parts.append(seg_y)

    if not X_parts:
        return np.array([]), np.array([])

    return np.concatenate(X_parts, axis=0), np.concatenate(y_parts, axis=0)


In [None]:

def process_participant(participant):
    """Collect the windows of every session belonging to one participant.

    Args:
        participant: mapping with a 'project_path' entry and an optional
            'sessions' list.

    Returns:
        (X, y): the per-session windows concatenated along axis 0, or two
        empty arrays when no session produced any window.
    """
    X_parts = []
    y_parts = []

    for session in participant.get('sessions', []):
        session_X, session_y = process_session(session, participant['project_path'])

        # Sessions without usable windows are skipped.
        if len(session_X) > 0:
            X_parts.append(session_X)
            y_parts.append(session_y)

    if not X_parts:
        return np.array([]), np.array([])

    return np.concatenate(X_parts, axis=0), np.concatenate(y_parts, axis=0)

In [None]:

def make_dataset(ids, labels_data):
    """Assemble the windowed dataset for a set of participants.

    Every project in ``labels_data['projects']`` whose participant ID is in
    ``ids`` is processed; the resulting windows are pooled, negative windows
    are subsampled, and the result is shuffled with RANDOM_SEED.

    Args:
        ids: participant IDs to include.
        labels_data: parsed labels JSON with a 'projects' list.

    Returns:
        (X, y): shuffled window arrays, or two empty arrays when none of the
        selected participants produced any window.
    """
    X_parts = []
    y_parts = []

    for project in labels_data['projects']:
        participant_id = project['participant']['participant_id']

        if participant_id in ids:
            print(f"Processing participant {participant_id} in project: {project['project_name']}")
            part_X, part_y = process_participant(project)

            if len(part_X) > 0:
                X_parts.append(part_X)
                y_parts.append(part_y)

    if not X_parts:
        return np.array([]), np.array([])

    # Pool all participants, then thin out the all-negative windows.
    pooled_X = np.concatenate(X_parts, axis=0)
    pooled_y = np.concatenate(y_parts, axis=0)
    pooled_X, pooled_y = filter_negative_windows(pooled_X, pooled_y)

    # Apply one seeded permutation to X and y so the pairs stay aligned.
    np.random.seed(RANDOM_SEED)
    order = np.random.permutation(len(pooled_X))
    dataset_X = pooled_X[order]
    dataset_y = pooled_y[order]

    print(f"Dataset created with {len(dataset_X):,} windows")

    return dataset_X, dataset_y

In [None]:

def save_dataset(X: np.ndarray, y: np.ndarray, name: str):
    """Write a dataset to ``<CURRENT_SAVE_DIR>/<name>.pt`` as float32 tensors.

    The file holds the tuple ``(X_tensor, y_tensor)``. Axes 1 and 2 of X are
    swapped before saving. Nothing is written when X is empty; a warning is
    printed instead.

    Args:
        X: window features.
        y: window labels.
        name: file stem of the output, e.g. "train".
    """
    if len(X) == 0:
        print(f"Warning: No data to save for {name}")
        return

    # Swap the last two axes so X has shape (batch_size, features, time_steps).
    X_tensor = torch.tensor(X, dtype=torch.float32).transpose(1, 2)
    y_tensor = torch.tensor(y, dtype=torch.float32)

    destination = os.path.join(CURRENT_SAVE_DIR, f"{name}.pt")
    torch.save((X_tensor, y_tensor), destination)
    print(f"Saved {name} dataset with shape X: {X_tensor.shape}, y: {y_tensor.shape}")


In [None]:
def process_fold(fold_num, train_ids, dev_ids, test_ids, labels_data):
    """Build and save the train, dev and test datasets of one fold.

    Points the module-level CURRENT_SAVE_DIR at
    ``<KFOLD_BASE_DIR>/fold-<fold_num>/data``, writes the fold configuration
    there, then creates and saves each split in turn.

    Args:
        fold_num: index of the fold.
        train_ids: participant IDs for the training set.
        dev_ids: participant IDs for the development set.
        test_ids: participant IDs for the test set.
        labels_data: parsed labels JSON handed to make_dataset.
    """
    global CURRENT_SAVE_DIR

    print(f"\n\nProcessing fold {fold_num}/{NUM_FOLDS}...")
    print(f"TRAIN ids: {train_ids}")
    print(f"DEV ids: {dev_ids}")
    print(f"TEST ids: {test_ids}")

    # save_config and save_dataset read this global, so it must be set first.
    CURRENT_SAVE_DIR = os.path.join(KFOLD_BASE_DIR, f"fold-{fold_num}", "data")
    os.makedirs(CURRENT_SAVE_DIR, exist_ok=True)

    save_config(fold_num, train_ids, dev_ids, test_ids)

    # (progress message, participant IDs, output file stem) for each split.
    stages = (
        ("Creating training dataset...", train_ids, "train"),
        ("Creating development dataset...", dev_ids, "dev"),
        ("Creating test dataset...", test_ids, "test"),
    )
    for banner, split_ids, split_name in stages:
        print(banner)
        split_X, split_y = make_dataset(split_ids, labels_data)
        save_dataset(split_X, split_y, split_name)
    

In [None]:
"""Main execution function"""
# Echo the run configuration so the cell output records what was processed.
print("Starting K-Fold Cross-Validation Data Preparation")
print(f"Participants: {PARTICIPANT_IDS}")
print(f"Number of folds: {NUM_FOLDS}")
print(f"Base directory: {KFOLD_BASE_DIR}")
print()

# Validate configuration and directory structure.
# Each of these raises on a bad setup, so nothing is written to disk unless
# all three checks pass.
validate_parameters()
validate_kfold_structure()
validate_splits()

# Generate k-fold splits: one dict per fold with 'fold', 'train_ids',
# 'dev_ids' and 'test_ids'.
print("Generating k-fold splits...")
fold_splits = generate_kfold_splits()

# Load labels data once; the same parsed JSON is reused for every fold.
print(f"Loading labels from: {LABELS_PATH}")
with open(LABELS_PATH, 'r') as f:
    labels_data = json.load(f)

# Process each fold: writes data_config.toml plus train/dev/test .pt files
# into <KFOLD_BASE_DIR>/fold-<k>/data.
for split in fold_splits:
    process_fold(
        split['fold'],
        split['train_ids'],
        split['dev_ids'],
        split['test_ids'],
        labels_data
    )

print("K-Fold cross-validation data preparation completed!")