In [17]:
import json
import numpy as np
import os
import torch
import toml
import pandas as pd

In [18]:
# Experiment constants (read as module-level globals by the functions below)

# JSON export with the labels; opened by the main cell
LABELS_PATH = "/home/kuba/Desktop/smoking_labels_export_2025-07-01_16.json"
# length of the random permutation that create_participant_splits() cuts into train/dev/test
NUM_PARTICIPATES = 10
# split sizes, expressed as fractions of the participants
TRAIN_PERCENT = 0.6
DEV_PERCENT = 0.2
TEST_PERCENT = (1 - TRAIN_PERCENT - DEV_PERCENT)  # derived from the two values above
RANDOM_SEED = 70  # handed to np.random.seed before the random draws in the cells below
USE_GYRO = False  # when True, gyroscope_data.csv is read and merged with the accelerometer data
LABEL = 'puff'  # bout label that gets marked in the y vector
LABEL_VALUE = 1  # what to place in the y vector for samples inside a matching bout
RESAMPLE = False  # when True, resample() is called on every segment (it currently only raises)
PERCENT_OF_NEGATIVE_WINDOWS_TO_SAMPLE = 1.0  # from all windows that don't contain a label, what fraction to keep
THRESHOLD_FOR_GAP = 30  # minutes; converted to nanoseconds in check_for_gaps()
SAVE_DIR = '/home/kuba/projects/puff/test/data'  # config.toml and the .pt files are written here
WINDOW_SIZE = 512  # number of samples per window
WINDOW_OVERLAP = WINDOW_SIZE // 2  # consumed by create_windows() when stepping the sliding window

# participant ids per split; overwritten by create_participant_splits()
TRAIN_IDS = []
DEV_IDS = []
TEST_IDS = []




In [19]:

def save_config():
    """Dump the experiment settings to ``config.toml`` inside SAVE_DIR.

    SAVE_DIR is created when it does not exist yet. Split ids that are held
    as numpy arrays are converted to plain lists before being written.
    """

    def _plain(ids):
        # numpy arrays become regular lists; anything else is passed through
        if isinstance(ids, np.ndarray):
            return ids.tolist()
        return ids

    paths_section = {
        "labels_path": LABELS_PATH,
        "save_dir": SAVE_DIR,
    }
    experiment_section = {
        "label": LABEL,
        "resample": RESAMPLE,
        "random_seed": RANDOM_SEED,
        "window_size": WINDOW_SIZE,
        "window_overlap": WINDOW_OVERLAP,
        "percent_negative_windows": PERCENT_OF_NEGATIVE_WINDOWS_TO_SAMPLE,
        "threshold_gap_minutes": THRESHOLD_FOR_GAP,
        "use_gyro": USE_GYRO,
    }
    split_section = {
        "train_percent": TRAIN_PERCENT,
        "dev_percent": DEV_PERCENT,
        "test_percent": TEST_PERCENT,
        "num_participants": NUM_PARTICIPATES,
    }
    splits_section = {
        "train_ids": _plain(TRAIN_IDS),
        "dev_ids": _plain(DEV_IDS),
        "test_ids": _plain(TEST_IDS),
    }

    config = {
        "paths": paths_section,
        "experiment": experiment_section,
        "split": split_section,
        "splits": splits_section,
    }

    os.makedirs(SAVE_DIR, exist_ok=True)
    config_path = os.path.join(SAVE_DIR, 'config.toml')
    with open(config_path, "w") as f:
        toml.dump(config, f)

In [20]:
def validate_splits(train_percent=None, dev_percent=None, test_percent=None):
    """Check that the train/dev/test fractions form a valid partition.

    Each fraction must lie in [0, 1] (within a small float tolerance) and the
    three must add up to 1. The range check matters because a test fraction
    derived as ``1 - train - dev`` always satisfies the sum check, even when
    train + dev exceeds 1 and the test fraction is negative.

    Args:
        train_percent: train fraction; defaults to TRAIN_PERCENT.
        dev_percent: dev fraction; defaults to DEV_PERCENT.
        test_percent: test fraction; defaults to TEST_PERCENT.

    Raises:
        ValueError: if a fraction is outside [0, 1] or the sum is not 1.
    """
    if train_percent is None:
        train_percent = TRAIN_PERCENT
    if dev_percent is None:
        dev_percent = DEV_PERCENT
    if test_percent is None:
        test_percent = TEST_PERCENT

    tolerance = 1e-6
    fractions = {
        "TRAIN_PERCENT": train_percent,
        "DEV_PERCENT": dev_percent,
        "TEST_PERCENT": test_percent,
    }
    for name, value in fractions.items():
        # the tolerance absorbs float noise such as 1 - 0.9 - 0.1 == -2.8e-17;
        # NaN fails the comparison and is rejected as well
        if not (-tolerance <= value <= 1.0 + tolerance):
            raise ValueError(f"{name} must be between 0 and 1, got {value}")

    total = train_percent + dev_percent + test_percent
    if abs(total - 1.0) > tolerance:
        raise ValueError(f"dataset percents must add up to 1, not {total}")


In [21]:
def create_participant_splits():
    """Randomly partition the participant indices into train/dev/test groups.

    A seeded permutation of ``range(NUM_PARTICIPATES)`` is cut at the train
    and dev boundaries; whatever remains becomes the test group. The results
    are stored in the module-level TRAIN_IDS, DEV_IDS and TEST_IDS and printed.
    """
    global TRAIN_IDS, DEV_IDS, TEST_IDS

    np.random.seed(RANDOM_SEED)
    shuffled_ids = np.random.permutation(NUM_PARTICIPATES)

    n_total = len(shuffled_ids)
    dev_start = int(n_total * TRAIN_PERCENT)
    test_start = dev_start + int(n_total * DEV_PERCENT)

    # cut points: [0, dev_start) -> train, [dev_start, test_start) -> dev, rest -> test
    TRAIN_IDS, DEV_IDS, TEST_IDS = np.split(shuffled_ids, [dev_start, test_start])

    print(f'TRAIN ids: {TRAIN_IDS}')
    print(f'DEV ids: {DEV_IDS}')
    print(f'TEST ids: {TEST_IDS}')


In [22]:
def resample(df):
    """Placeholder for resampling a dataframe to a consistent sampling rate.

    Not implemented yet: the function prints a notice and always raises.

    Args:
        df: dataframe that would be resampled (currently unused).

    Raises:
        NotImplementedError: always. NotImplementedError is a subclass of
            RuntimeError, so callers catching RuntimeError keep working.
    """
    print("RESAMPLE has not been added yet so you need to impliment the function")
    raise NotImplementedError("The resample function has not been implimented ")

In [23]:
def check_for_gaps(df):
    """Cut a dataframe into segments wherever the timestamps jump too far.

    Rows are sorted by ``ns_since_reboot`` and re-indexed from 0. A new
    segment starts at every row whose distance to the previous row exceeds
    THRESHOLD_FOR_GAP minutes. Returns a list of dataframes; when no gap is
    found the list holds only the sorted frame.
    """
    max_gap_ns = THRESHOLD_FOR_GAP * 60 * 1_000_000_000
    df = df.sort_values('ns_since_reboot').reset_index(drop=True)

    deltas = df['ns_since_reboot'].diff()
    # index labels equal row positions here because the index was just reset
    break_points = list(deltas[deltas > max_gap_ns].index)

    if not break_points:
        return [df]

    # consecutive edges delimit one segment each: [0, b1), [b1, b2), ..., [bk, len)
    edges = [0, *break_points, len(df)]
    pieces = []
    for lo, hi in zip(edges[:-1], edges[1:]):
        if lo >= hi:
            continue
        piece = df.iloc[lo:hi].copy()
        if not piece.empty:
            pieces.append(piece)

    return pieces

In [24]:
def combine(session, project_path: str) -> pd.DataFrame:
    """Load the sensor CSVs of one session into a single dataframe.

    Reads ``accelerometer_data.csv`` (and ``gyroscope_data.csv`` when USE_GYRO
    is set) from ``project_path/<session_name>``. The x/y/z columns get a
    sensor suffix, the used columns are cast to float, and with gyro enabled
    both frames are inner-joined on ``ns_since_reboot``. Rows with missing
    values are dropped. A missing file yields a warning and an empty frame.
    """
    session_dir = os.path.join(project_path, session['session_name'])

    gyro = None
    try:
        accl = pd.read_csv(os.path.join(session_dir, 'accelerometer_data.csv'))
        if USE_GYRO:
            gyro = pd.read_csv(os.path.join(session_dir, 'gyroscope_data.csv'))
    except FileNotFoundError as e:
        print(f"Warning: Could not find data files for session {session['session_name']}: {e}")
        return pd.DataFrame()

    def _prepare(frame, suffix):
        # suffix the axis columns so both sensors can live in one frame, then cast to float
        axis_cols = [f"{axis}_{suffix}" for axis in ("x", "y", "z")]
        frame = frame.rename(columns=dict(zip(("x", "y", "z"), axis_cols)))
        for col in ['ns_since_reboot'] + axis_cols:
            frame[col] = frame[col].astype(float)
        return frame

    accl = _prepare(accl, "acc")
    ordered_cols = ['ns_since_reboot', 'x_acc', 'y_acc', 'z_acc']

    if USE_GYRO:
        gyro = _prepare(gyro, "gyro")
        # keep only timestamps present in both sensors
        merged = pd.merge(accl, gyro, on='ns_since_reboot', how='inner')
        ordered_cols = ordered_cols + ['x_gyro', 'y_gyro', 'z_gyro']
    else:
        merged = accl

    return merged[ordered_cols].dropna()

In [25]:

def apply_labels_to_df(df, session) -> pd.DataFrame:
    """Add a ``label`` column marking the samples that fall inside LABEL bouts.

    The column is added to ``df`` in place and starts out as 0. Every sample
    whose ``ns_since_reboot`` lies within the inclusive start/end time of a
    bout labelled LABEL is set to LABEL_VALUE. An empty frame is returned
    unchanged.
    """
    if df.empty:
        return df

    intervals = [
        (bout['start_time'], bout['end_time'])
        for bout in session.get('bouts', [])
        if bout.get('label') == LABEL
    ]

    df['label'] = 0
    timestamps = df['ns_since_reboot']

    for start, stop in intervals:
        # between() is inclusive on both ends by default
        df.loc[timestamps.between(start, stop), 'label'] = LABEL_VALUE

    return df


In [26]:
def create_windows(df, window_size=None, window_overlap=None, feature_cols=None):
    """Cut a labelled dataframe into fixed-size sliding windows.

    Consecutive windows share ``window_overlap`` samples, i.e. window starts
    are ``window_size - window_overlap`` rows apart. With the default
    half-window overlap this step equals the overlap itself.

    Args:
        df: dataframe holding the feature columns and a ``label`` column.
        window_size: samples per window; defaults to WINDOW_SIZE.
        window_overlap: samples shared by neighbouring windows; defaults to
            WINDOW_OVERLAP.
        feature_cols: columns used as features; defaults to the accelerometer
            axes, plus the gyroscope axes when USE_GYRO is set.

    Returns:
        Tuple ``(X, y)``: X has shape (n_windows, window_size, n_features) and
        y has shape (n_windows, window_size). Both are empty 1-D arrays when
        ``df`` has fewer than ``window_size`` rows.

    Raises:
        ValueError: if ``window_size`` is not positive or ``window_overlap``
            is not in ``[0, window_size)``.
    """
    if window_size is None:
        window_size = WINDOW_SIZE
    if window_overlap is None:
        window_overlap = WINDOW_OVERLAP
    if feature_cols is None:
        if USE_GYRO:
            feature_cols = ['x_acc', 'y_acc', 'z_acc', 'x_gyro', 'y_gyro', 'z_gyro']
        else:
            feature_cols = ['x_acc', 'y_acc', 'z_acc']

    if window_size <= 0:
        raise ValueError(f"window size must be positive, got {window_size}")
    if not 0 <= window_overlap < window_size:
        raise ValueError(
            f"window overlap must be in [0, {window_size}), got {window_overlap}"
        )

    if len(df) < window_size:
        print(f"Warning: DataFrame too small ({len(df)} < {window_size}), skipping")
        return np.array([]), np.array([])

    # distance between window starts; using the overlap itself as the step is
    # only right when the overlap is exactly half the window
    step = window_size - window_overlap

    X_data = df[feature_cols].values
    y_data = df['label'].values

    starts = range(0, len(df) - window_size + 1, step)
    windows_X = [X_data[start:start + window_size] for start in starts]
    windows_y = [y_data[start:start + window_size] for start in starts]

    return np.array(windows_X), np.array(windows_y)

In [27]:
def filter_negative_windows(X, y):
    """Keep every labelled window and a random share of the unlabelled ones.

    A window counts as labelled when any entry of its y row is greater than 0.
    PERCENT_OF_NEGATIVE_WINDOWS_TO_SAMPLE is the fraction of unlabelled
    windows to keep; at 1.0 or above the inputs are returned untouched. The
    draw is seeded with RANDOM_SEED and the kept windows stay in their
    original order.
    """
    if PERCENT_OF_NEGATIVE_WINDOWS_TO_SAMPLE >= 1.0:
        return X, y

    # one boolean per window: does it contain at least one labelled sample?
    window_is_positive = (y > 0).any(axis=1)
    print(f"has labels shape: {window_is_positive.shape}")

    positive_idx = np.where(window_is_positive)[0]
    negative_idx = np.where(~window_is_positive)[0]

    n_negative_kept = int(len(negative_idx) * PERCENT_OF_NEGATIVE_WINDOWS_TO_SAMPLE)
    np.random.seed(RANDOM_SEED)
    negative_kept = np.random.choice(negative_idx, size=n_negative_kept, replace=False)

    # merge both groups and restore the original window order
    selected = np.sort(np.concatenate([positive_idx, negative_kept]))

    return X[selected], y[selected]

In [28]:
def process_session(session, project_path):
    """Turn one session into windowed feature and label arrays.

    The session data is loaded with combine(), split on time gaps, optionally
    resampled, labelled and windowed segment by segment. The windows of all
    segments are concatenated along the first axis. Two empty arrays are
    returned when the session has no data or yields no window.
    """
    session_df = combine(session, project_path)
    if session_df.empty:
        return np.array([]), np.array([])

    per_segment_X, per_segment_y = [], []

    # windows never span a time gap: each gap-free segment is windowed on its own
    for piece in check_for_gaps(session_df):
        if RESAMPLE:
            piece = resample(piece)

        labelled = apply_labels_to_df(piece, session)
        seg_X, seg_y = create_windows(labelled)

        if len(seg_X) == 0:
            continue
        per_segment_X.append(seg_X)
        per_segment_y.append(seg_y)

    if len(per_segment_X) == 0:
        return np.array([]), np.array([])

    session_X = np.concatenate(per_segment_X, axis=0)
    session_y = np.concatenate(per_segment_y, axis=0)
    return session_X, session_y

In [29]:
def process_participant(participant):
    """Collect the windows of every session belonging to one participant.

    Each entry of ``participant['sessions']`` is processed with
    process_session() using ``participant['project_path']``; sessions without
    windows are skipped. Returns the concatenated (X, y) arrays, or two empty
    arrays when no session produced data.
    """
    project_path_sessions = participant.get('sessions', [])
    session_X_parts, session_y_parts = [], []

    for session in project_path_sessions:
        sess_X, sess_y = process_session(session, participant['project_path'])
        if len(sess_X) == 0:
            continue
        session_X_parts.append(sess_X)
        session_y_parts.append(sess_y)

    if len(session_X_parts) == 0:
        return np.array([]), np.array([])

    return (
        np.concatenate(session_X_parts, axis=0),
        np.concatenate(session_y_parts, axis=0),
    )

In [30]:
def make_dataset(ids, labels_data):
    """Build one shuffled dataset from the participants listed in ``ids``.

    Every project in ``labels_data['projects']`` whose participant id is in
    ``ids`` is processed; the resulting windows are concatenated, passed
    through filter_negative_windows() and shuffled with a RANDOM_SEED-seeded
    permutation. Returns (X, y), or two empty arrays when nothing was found.
    """
    parts_X, parts_y = [], []

    for project in labels_data['projects']:
        participant_id = project['participant']['participant_id']

        if participant_id in ids:
            print(f"Processing participant {participant_id}")
            part_X, part_y = process_participant(project)

            if len(part_X) > 0:
                parts_X.append(part_X)
                parts_y.append(part_y)

    if len(parts_X) == 0:
        return np.array([]), np.array([])

    # stack all participants, then thin out the unlabelled windows
    dataset_X, dataset_y = filter_negative_windows(
        np.concatenate(parts_X, axis=0),
        np.concatenate(parts_y, axis=0),
    )

    # the same permutation is applied to X and y so windows stay aligned with their labels
    np.random.seed(RANDOM_SEED)
    order = np.random.permutation(len(dataset_X))
    dataset_X, dataset_y = dataset_X[order], dataset_y[order]

    print(f"Dataset created with {len(dataset_X)} windows")

    return dataset_X, dataset_y

In [31]:
def save_dataset(X: np.ndarray, y: np.ndarray, name: str):
    """Save X and y as float32 tensors in ``SAVE_DIR/<name>.pt``.

    X is transposed from (batch, time_steps, features) to
    (batch, features, time_steps) before saving. The pair is stored as a
    tuple ``(X_tensor, y_tensor)``. SAVE_DIR is created when missing, so the
    function does not depend on save_config() having run first.

    Args:
        X: windowed features, one window per row of the first axis.
        y: per-sample labels matching X along the first axis.
        name: file stem of the output file.

    Returns:
        None. Nothing is written when X is empty; a warning is printed instead.
    """
    if len(X) == 0:
        print(f"Warning: No data to save for {name}")
        return

    X_tensor = torch.tensor(X, dtype=torch.float32)
    y_tensor = torch.tensor(y, dtype=torch.float32)

    # transpose X to have shape (batch_size, features, time_steps)
    X_tensor = X_tensor.transpose(1, 2)

    # make sure the target directory exists before writing
    os.makedirs(SAVE_DIR, exist_ok=True)
    save_path = os.path.join(SAVE_DIR, f"{name}.pt")
    torch.save((X_tensor, y_tensor), save_path)
    print(f"Saved {name} dataset with shape X: {X_tensor.shape}, y: {y_tensor.shape}")

In [32]:
"""main execution function"""
# Driver cell: runs the pipeline end to end. The order matters: the splits
# must exist before save_config() records them and before make_dataset() uses them.

# validate the configured split fractions (raises ValueError on a bad configuration)
validate_splits()

# fill the module-level TRAIN_IDS / DEV_IDS / TEST_IDS
create_participant_splits()

# write config.toml into SAVE_DIR (also creates SAVE_DIR)
save_config()

# load the labels JSON export
with open(LABELS_PATH, 'r') as f:
    labels_data = json.load(f)

# create and save the training dataset
# NOTE(review): the file name "trainhh" looks like a scratch name - confirm it is intended
print("Creating training dataset...")
train_X, train_y = make_dataset(TRAIN_IDS, labels_data)
save_dataset(train_X, train_y, "trainhh")

# NOTE(review): dev and test generation below is currently disabled - only the
# training split is produced by this cell.
# print("Creating development dataset...")
# dev_X, dev_y = make_dataset(DEV_IDS, labels_data)
# save_dataset(dev_X, dev_y, "dev")

# print("Creating test dataset...")
# test_X, test_y = make_dataset(TEST_IDS, labels_data)
# save_dataset(test_X, test_y, "test")


TRAIN ids: [5 8 1 9 7 3]
DEV ids: [4 0]
TEST ids: [2 6]
Creating training dataset...
Processing participant 3


Processing participant 1
Dataset created with 87282 windows
Saved trainhh dataset with shape X: torch.Size([87282, 3, 512]), y: torch.Size([87282, 512])
