In [1]:
import json
import numpy as np
import os
import torch
import toml
import pandas as pd

In [2]:
# DEF CONSTATS

LABELS_PATH = "/home/kuba/projects/puff/paper00/experiments/01/data/smoking_labels_export_2025-07-15_22.json"
TRAIN_PERCENT = 0.8
DEV_PERCENT = 0.2
TEST_PERCENT = (1 - TRAIN_PERCENT - DEV_PERCENT)
RANDOM_SEED = 70
USE_GYRO = False
LABEL = {"puff", "puffs"}
LABEL_VALUE = 1 #what to place in the y vector 
RESAMPLE = False
PERCENT_OF_NEGATIVE_WINDOWS_TO_SAMPLE = 0.5 #from all windows that don't contain a label what percent to sample
THRESHOLD_FOR_GAP = 30 #min
SAVE_DIR = '/home/kuba/projects/puff/paper00/experiments/01/data'
WINDOW_SIZE = 1024
STEP_SIZE = WINDOW_SIZE 


train_sessions = []
dev_sessions = []
test_sessions = []

In [3]:
def save_config():
    """save experiment configuration to toml file"""
    config = {
        "paths": {
            "labels_path": LABELS_PATH,
            "save_dir": SAVE_DIR,
        },
        "experiment": {
            "label": LABEL,
            "resample": RESAMPLE,
            "random_seed": RANDOM_SEED,
            "window_size": WINDOW_SIZE,
            "step_size": STEP_SIZE,
            "percent_negative_windows": PERCENT_OF_NEGATIVE_WINDOWS_TO_SAMPLE,
            "threshold_gap_minutes": THRESHOLD_FOR_GAP,
            "use_gyro": USE_GYRO
        },
        "split": {
            "train_percent": TRAIN_PERCENT,
            "dev_percent": DEV_PERCENT,
            "test_percent": TEST_PERCENT,
        },
        "splits": {
            "train_sessions": train_sessions,
            "dev_essions": dev_sessions,
            "test_sessions": test_sessions 
        }
    }
    
    os.makedirs(SAVE_DIR, exist_ok=True)
    with open(os.path.join(SAVE_DIR, 'data_config.toml'), "w") as f:
        toml.dump(config, f)

In [4]:
def validate_splits():
    """makesure  dataset splits add up to 1.0"""
    if abs(TRAIN_PERCENT + DEV_PERCENT + TEST_PERCENT - 1.0) > 1e-6:
        raise ValueError(f"dataset % must add up to 1, not {TRAIN_PERCENT + DEV_PERCENT + TEST_PERCENT}")


In [5]:
def resample(df) :
    """resample dataframe to consistent sampling rate"""
    print("RESAMPLE has not been added yet so you need to impliment the function")
    raise RuntimeError("The resample function has not been implimented ")
    return df

In [6]:
def check_for_gaps(df):
    """split dataframe on time gaps larger than threshold"""
    gap_threshold_ns = THRESHOLD_FOR_GAP * 60 * 1_000_000_000
    df = df.sort_values('ns_since_reboot').reset_index(drop=True)
    time_diffs = df['ns_since_reboot'].diff()
    gap_indices = time_diffs[time_diffs > gap_threshold_ns].index
    
    if len(gap_indices) == 0:
        return [df]
    
    # split into segments
    segments = []
    start_idx = 0
    
    for gap_idx in gap_indices:
        if start_idx < gap_idx:
            segment = df.iloc[start_idx:gap_idx].copy()
            if not segment.empty:
                segments.append(segment)
        start_idx = gap_idx
    
    # add final segment
    if start_idx < len(df):
        final_segment = df.iloc[start_idx:].copy()
        if not final_segment.empty:
            segments.append(final_segment)
    
    return segments

In [7]:
def combine(session, project_path: str) -> pd.DataFrame:
    """combine accelerometer and gyroscope data for a session"""
    data_path = os.path.join(project_path, session['session_name'])
    
    try:
        accl = pd.read_csv(os.path.join(data_path, 'accelerometer_data.csv'))
        if USE_GYRO:
            gyro = pd.read_csv(os.path.join(data_path, 'gyroscope_data.csv'))
    except FileNotFoundError as e:
        print(f"Warning: Could not find data files for session {session['session_name']}: {e}")
        return pd.DataFrame()
    
    # rename columns to avoid conflicts
    accl = accl.rename(columns={"x": "x_acc", "y": "y_acc", "z": "z_acc"})
    
    # ensure data types are correct
    for col in ['ns_since_reboot', 'x_acc', 'y_acc', 'z_acc']:
        accl[col] = accl[col].astype(float)
    
    if USE_GYRO:
        gyro = gyro.rename(columns={"x": "x_gyro", "y": "y_gyro", "z": "z_gyro"})
        for col in ['ns_since_reboot', 'x_gyro', 'y_gyro', 'z_gyro']:
            gyro[col] = gyro[col].astype(float)
        
        # combine accelerometer and gyroscope data
        combined = pd.merge(accl, gyro, on='ns_since_reboot', how='inner')
        column_order = ['ns_since_reboot', 'x_acc', 'y_acc', 'z_acc', 'x_gyro', 'y_gyro', 'z_gyro']
    else:
        # use only accelerometer data
        combined = accl
        column_order = ['ns_since_reboot', 'x_acc', 'y_acc', 'z_acc']
    
    # reorder columns
    combined = combined[column_order]
    
    return combined.dropna()

In [8]:
def apply_labels_to_df(df, session) -> pd.DataFrame:
    """add labels to dataframe based on bout annotations"""
    if df.empty:
        return df
    
    bout_starts = []
    bout_ends = []

    for bout in session.get('bouts', []):
        if bout.get('label') in LABEL:
            bout_starts.append(bout['start_time'])
            bout_ends.append(bout['end_time'])

    df['label'] = 0

    for start, stop in zip(bout_starts, bout_ends):
        mask = (df['ns_since_reboot'] >= start) & (df['ns_since_reboot'] <= stop)
        df.loc[mask, 'label'] = LABEL_VALUE
    
    return df

In [9]:
def create_windows(df):
    """create sliding windows from dataframe"""
    if len(df) < WINDOW_SIZE:
        print(f"Warning: DataFrame too small ({len(df)} < {WINDOW_SIZE}), skipping")
        return np.array([]), np.array([])
    
    if USE_GYRO:
        feature_cols = ['x_acc', 'y_acc', 'z_acc', 'x_gyro', 'y_gyro', 'z_gyro']
    else:
        feature_cols = ['x_acc', 'y_acc', 'z_acc']
        
    X_data = df[feature_cols].values
    y_data = df['label'].values
    
    windows_X = []
    windows_y = []
    
    for i in range(0, len(df) - WINDOW_SIZE + 1, STEP_SIZE):
        window_X = X_data[i:i + WINDOW_SIZE]
        window_y = y_data[i:i + WINDOW_SIZE]
        
        windows_X.append(window_X)
        windows_y.append(window_y)
    
    return np.array(windows_X), np.array(windows_y)

In [10]:
def filter_negative_windows(X, y) :
    """sample negative windows based on configured percentage"""
    if PERCENT_OF_NEGATIVE_WINDOWS_TO_SAMPLE >= 1.0:
        return X, y
    
    # find windows with and without labels
    has_label = np.any(y > 0, axis=1)
    print(f'Positive samples: {np.where(has_label)[0].shape} : Negative Samples  {np.where(~has_label)[0].shape}')
    positive_indices = np.where(has_label)[0]
    negative_indices = np.where(~has_label)[0]
    
    # sample negative windows
    num_negative_to_keep = int(len(negative_indices) * PERCENT_OF_NEGATIVE_WINDOWS_TO_SAMPLE)
    np.random.seed(RANDOM_SEED)
    sampled_negative_indices = np.random.choice(negative_indices, size=num_negative_to_keep, replace=False)
    
    # combine positive and sampled negative windows
    keep_indices = np.concatenate([positive_indices, sampled_negative_indices])
    keep_indices = np.sort(keep_indices)
    
    return X[keep_indices], y[keep_indices]

In [11]:
def process_session(session, project_path) :
    """process a single session and return windowed data"""
    df = combine(session, project_path)

    # sample_interval = df['ns_since_reboot'].diff().median() * 1e-9
    # sample_rate = 1 / sample_interval
    # print(f"Sample rate: {sample_rate} Hz")

    if df.empty:
        return np.array([]), np.array([])
    
    # check for gaps and split if necessary
    segments = check_for_gaps(df)
    
    all_windows_X = []
    all_windows_y = []
    
    for segment in segments:
        if RESAMPLE:
            segment = resample(segment)
        
        segment = apply_labels_to_df(segment, session)
        windows_X, windows_y = create_windows(segment)
        
        if len(windows_X) > 0:
            all_windows_X.append(windows_X)
            all_windows_y.append(windows_y)
    
    if not all_windows_X:
        return np.array([]), np.array([])
    
    # concatenate all segments
    combined_X = np.concatenate(all_windows_X, axis=0)
    combined_y = np.concatenate(all_windows_y, axis=0)
    
    return combined_X, combined_y

In [12]:
def make_dataset(sessions_to_use):
    """create dataset from participant ids"""
    all_X = []
    all_y = []
    
    for session, _, project_path in sessions_to_use:
        if session in sessions_to_use:
            X, y = process_session(session, project_path)

        if len(X) > 0:
            all_X.append(X)
            all_y.append(y)
    
    if not all_X:
        return np.array([]), np.array([])
    
    # concatenate all participants
    dataset_X = np.concatenate(all_X, axis=0)
    dataset_y = np.concatenate(all_y, axis=0)
    
    # filter negative windows
    dataset_X, dataset_y = filter_negative_windows(dataset_X, dataset_y)
    
    # shuffle the dataset
    np.random.seed(RANDOM_SEED)
    indices = np.random.permutation(len(dataset_X))
    dataset_X = dataset_X[indices]
    dataset_y = dataset_y[indices]
    
    print(f"Dataset created with {len(dataset_X):,} windows")
    
    return dataset_X, dataset_y

In [13]:
def split_sessions(all_sessions):
    all_sessions = np.array(all_sessions)
    np.random.seed(RANDOM_SEED)
    random_perm = np.random.permutation(len(all_sessions))

    train_size = int(len(random_perm) * TRAIN_PERCENT)
    dev_size = int(len(random_perm) * DEV_PERCENT)

    train_idxs = random_perm[:train_size]
    dev_idxs = random_perm[train_size:train_size + dev_size]
    test_idxs = random_perm[train_size + dev_size:]
    
    print(f'TRAIN size: {len(train_idxs)}')
    print(f'DEV size: {len(dev_idxs)}')
    print(f'TEST size: {len(test_idxs)}')
    train, dev, test = all_sessions[train_idxs], all_sessions[dev_idxs], all_sessions[test_idxs]


    def add_session_details_to_list(sessions, maping_list):
        try:
            for session, project_info, _ in sessions:
                session_meta_data  = f"{project_info}_{session['session_name']}"
                maping_list.append(session_meta_data)
        except Exception as e:
            print(f"Error in processing the sessin details: {e}")

    add_session_details_to_list(train, train_sessions)
    add_session_details_to_list(dev, dev_sessions)
    add_session_details_to_list(test, test_sessions)

    return train, dev, test

In [14]:
def get_all_sessions(labels_data):
    all_sessions = [] # list of tuples with session ifo and project id
    for project in labels_data['projects']:
        participant_code_and_id = f'{project['project_name']}_{project['participant']['participant_code']}_{project['participant']['participant_id']}'
        for session in project['sessions']:
            # if session['bout_count'] > 0:
            lables_present = set()
            for bout in session['bouts']:
                lables_present.add(bout['label'])

            if LABEL.intersection(lables_present):
                all_sessions.append((session, participant_code_and_id, project['project_path']))

    return all_sessions
    

In [15]:
def save_dataset(X: np.ndarray, y: np.ndarray, name: str):
    """save X and y tensors in a .pt file with the name as name.pt"""
    if len(X) == 0:
        print(f"Warning: No data to save for {name}")
        return
    
    X_tensor = torch.tensor(X, dtype=torch.float32)
    y_tensor = torch.tensor(y, dtype=torch.float32)
    
    # transpose X to have shape (batch_size, features, time_steps)
    X_tensor = X_tensor.transpose(1, 2)
    
    save_path = os.path.join(SAVE_DIR, f"{name}.pt")
    torch.save((X_tensor, y_tensor), save_path)
    print(f"Saved {name} dataset with shape X: {X_tensor.shape}, y: {y_tensor.shape}\n\n")

In [16]:
"""main execution function"""

# validate configuration
validate_splits()

# save configuration
# save_config()

# load labels data
with open(LABELS_PATH, 'r') as f:
    labels_data = json.load(f)

all_sessions = get_all_sessions(labels_data)

train_data, dev_data, test_data = split_sessions(all_sessions)

# create and save datasets
print("Creating training dataset...")
train_X, train_y = make_dataset(train_data)
save_dataset(train_X, train_y, "train")

print("Creating development dataset...")
dev_X, dev_y = make_dataset(dev_data)
save_dataset(dev_X, dev_y, "dev")

# print("Creating test dataset...")
# test_X, test_y = make_dataset(test_data)
# save_dataset(test_X, test_y, "test")


TRAIN size: 43
DEV size: 10
TEST size: 1
Creating training dataset...
Creating development dataset...
