# Prepare Dataset
- Read raw session files that are annoted (include .json file)
- Save just x y z columns
- Window the data into 500 datapoint long windows of xyz
- Save xyz acc data and the labels to pytorch dataset files in:
    - `pipeline/datasets/train_dataset` 
    - `pipeline/datasets/test_dataset` with specified `test_size` 
    - `pipeline/datasets/<session_name>-holdout_dataset` with specified sessions

In [2]:
import pandas as pd
import numpy as np
import torch
import json
from sklearn.model_selection import train_test_split
import os
from torch.utils.data import TensorDataset
from tqdm import tqdm
import plotly.express as px

# Process Raw Data

In [3]:
# make directories
dir = '../test-dataset'

if not os.path.isdir(f'{dir}'):
    os.system(f'mkdir -p {dir}')
else:
    print(f"{dir} directory already exists - delete or rename it")

In [5]:
json_labels = {}

total_length = 0
nfiles = 3     # set to limit number of sessions to read
i=0

raw_dir = f'../smoking_dataset_100Hz/0_raw/'

os.system(f'mkdir {dir}/1_xyz')

for subdir in os.listdir(raw_dir):
    if os.path.exists(f'{raw_dir}/{subdir}/{subdir}_data.json'):
        # json files with annotations exists - dataset is annotated
        i += 1

        # Read labels
        with open(f'{raw_dir}/{subdir}/{subdir}_data.json', 'r') as f:
            annot = json.load(f)
            if not annot.get('puffs'):
                print(f'Skipped {subdir}')
                continue
            json_labels[subdir] = annot
        
        # if dataset is annotated, read raw data file and save just x,y,z in 1_xyz dir
        df = pd.read_csv(f'{raw_dir}/{subdir}/raw_data.csv', header=None, usecols=[2,3,4], names=['x','y','z'])
        json_labels[subdir]['length'] = len(df)
        total_length += len(df)
        df.to_csv(f'{dir}/1_xyz/{subdir}.csv', index=False)

        print(f'{subdir} - Length: {len(df)}')

    else:
        print(f'Skipped {subdir}')
        
    if i == nfiles:
        break

mkdir: cannot create directory ‘../test-dataset/1_xyz’: File exists


Skipped 70
66 - Length: 192500
57 - Length: 206500
41 - Length: 133500


In [6]:
# Window data

os.system(f'mkdir {dir}/2_windowed')

WIN_LEN = 500

for i,file in enumerate(tqdm(os.listdir(f'{dir}/1_xyz/'))):
    X = torch.from_numpy(pd.read_csv(f'{dir}/1_xyz/{file}').to_numpy())
    x = X[:,0].unsqueeze(1)
    y = X[:,1].unsqueeze(1)
    z = X[:,2].unsqueeze(1)

    w = 500 - 1

    xs = [x[:-w]]
    ys = [y[:-w]]
    zs = [z[:-w]]

    for i in range(1,w):
        xs.append(x[i:i-w])
        ys.append(y[i:i-w])
        zs.append(z[i:i-w])

    xs.append(x[w:])
    ys.append(y[w:])
    zs.append(z[w:])
    xs = torch.cat(xs,axis=1).float()
    ys = torch.cat(ys,axis=1).float()
    zs = torch.cat(zs,axis=1).float()

    X = torch.cat([xs,ys,zs],axis=1)

    torch.save(X, f'{dir}/2_windowed/{file.split(".")[0]}.pt')

100%|██████████| 3/3 [00:06<00:00,  2.21s/it]


In [7]:
# Make Labels from annotations

os.system(f'mkdir {dir}/3_labels')

for i, (key, annot) in enumerate(json_labels.items()):
    l = torch.zeros(annot['length'] - (WIN_LEN - 1))
    for j in range(annot['start'], annot['end']):
        for puff in annot['puffs']:
            # If the midpoint of window j is within a puff, label the window as a puff
            if j+WIN_LEN/2 >= puff['start'] and j+WIN_LEN/2 <= puff['end']:
                l[j] = 1
    
    torch.save(l.reshape(-1,1), f'{dir}/3_labels/{key}.pt')

In [8]:
# visualize true labels on continous signal
i = 14

labels = torch.load(f'{dir}/3_labels/{i}.pt').flatten()

df = pd.read_csv(f'{dir}/1_xyz/{i}.csv')
df['label'] = np.pad(labels*10, (WIN_LEN//2-1,WIN_LEN//2), mode='constant', constant_values=0)

fig = px.line(data_frame=df[::5])
fig.show(renderer='browser')

FileNotFoundError: [Errno 2] No such file or directory: '../test-dataset/3_labels/14.pt'

In [9]:
# Prepare Pytorch Datasets
holdout_sessions = [57]#['4', '7', '9', '17', '25', '31', '35', '36', '51', '59']

os.system(f'mkdir -p {dir}/holdouts')
os.system(f'mkdir -p {dir}/4_all/')

j = 0
for i,file in enumerate(tqdm(os.listdir(f'{dir}/2_windowed'))):
    # First, save <n_sessions_holdout> sessions
    print(i)
    session_name = file.split('.')[0]
    if session_name in holdout_sessions:
        print(f'Saving holdout for session {session_name}')
        X_holdout = torch.load(f'{dir}/2_windowed/{file}')
        y_holdout = torch.load(f'{dir}/3_labels/{file}')
        torch.save(TensorDataset(X_holdout, y_holdout), f'{dir}/holdouts/{session_name}-holdout.pt')
        continue

    # print(f'{i} - Saving windows of session {file.split(".")[0]}')
    X = torch.load(f'{dir}/2_windowed/{file}')
    y = torch.load(f'{dir}/3_labels/{file}')

    # Save each window from every session in its own file in '{dir}/4_all'
    for (xi,yi) in zip(X,y):
        torch.save((xi.clone(), yi.clone()), f'{dir}/4_all/{j}.pt')
        j += 1

100%|██████████| 3/3 [00:57<00:00, 19.12s/it]


In [None]:
# Get total number of window-files
n = len(os.listdir(f'{dir}/4_all'))
n

In [None]:
# Train test split window-files into '{dir}/4_all/train' and '{dir}/4_all/test'
test_size = 0.3

os.system(f'mkdir -p {dir}/4_all/train')
os.system(f'mkdir -p {dir}/4_all/test')

# todo possibly stratify by splitting pos. and neg. samples and taking train/test samples from each
(train_idx, test_idx) = train_test_split(range(n), test_size=test_size, shuffle=True, random_state=0)

for i,idx in enumerate(tqdm(train_idx)):
    os.system(f'mv {dir}/4_all/{idx}.pt {dir}/4_all/train/{i}.pt')

for i,idx in enumerate(tqdm(test_idx)):
    os.system(f'mv {dir}/4_all/{idx}.pt {dir}/4_all/test/{i}.pt')