# Prepare Dataset
- Read raw session files that are annoted (include .json file)
- decimate sessions files from 100 Hz to 20 Hz
- Window the data into 100 datapoint (in 5 Hz) long windows of xyz
- Save xyz acc data and the labels to pytorch dataset files in:
    - `pipeline/datasets/train_dataset` 
    - `pipeline/datasets/test_dataset` with specified `test_size` 
    - `pipeline/datasets/<session_name>-holdout_dataset` with specified `n_sessions_holdout`

In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import seaborn as sns
import plotly.express as px
import json
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
from torch.utils.data import TensorDataset,DataLoader
from tqdm import tqdm

# Process Raw Data

In [2]:
# make directories
if not os.path.isdir('pipeline'):
    os.system('mkdir -p pipeline')
else:
    print("pipeline directory already exists - delete or rename it")

In [3]:
json_labels = {}

total_length = 0
nfiles = 5     # set to limit number of sessions to read
i=0

raw_dir = './data'

os.system('mkdir pipeline/1_dm')

for subdir in os.listdir(raw_dir):
    if os.path.exists(f'{raw_dir}/{subdir}/{subdir}_data.json'):
        # json files with annotations exists - dataset is annotated
        i += 1

        # Read labels
        with open(f'{raw_dir}/{subdir}/{subdir}_data.json', 'r') as f:
            annot = json.load(f)
            if not annot.get('puffs'):
                print(f'Skipped {subdir}')
                continue
            json_labels[subdir] = annot
        
        # if dataset is annotated, read raw data file, decimate from 100 to 20 Hz, and save just x,y,z in 1_dm dir
        df = pd.read_csv(f'{raw_dir}/{subdir}/raw_data.csv', header=None, usecols=[2,3,4], names=['x','y','z'])
        json_labels[subdir]['length'] = len(df[::5])
        total_length += len(df[::5])
        df[::5].to_csv(f'pipeline/1_dm/{subdir}.csv', index=False)

        print(f'{subdir} - Length: {len(df[::5])}')

    else:
        print(f'Skipped {subdir}')
        
    if i == nfiles:
        break

27 - Length: 28400
58 - Length: 41100
Skipped 29
67 - Length: 34600
Skipped 13
53 - Length: 31000
Skipped 7
61 - Length: 31200


In [6]:
# Window data

os.system('mkdir pipeline/2_windowed')

for i,file in enumerate(tqdm(os.listdir('pipeline/1_dm/'))):
    df = pd.read_csv(f'pipeline/1_dm/{file}')
    # print(f'{file} - Before {df.shape}')
    w = np.empty((len(df)-99, 300), dtype=float)
    for j in range(len(df)-99):
        w[j] = df[j:j+100].to_numpy().T.flatten()
    # print(f'After {w.shape}')
    np.save(f'pipeline/2_windowed/{file.split(".")[0]}.npy', w)

100%|██████████| 5/5 [00:03<00:00,  1.43it/s]


In [7]:
# Make Labels from annotations

os.system('mkdir pipeline/3_labels')

for i, (key, annot) in enumerate(json_labels.items()):
    l = np.zeros(annot['length'] - 99)
    for j in range(annot['start']//5, annot['end']//5):
        for puff in annot['puffs']:
            if j >= puff['start']//5 and j <= puff['end']//5:
                l[j] = 1
    
    np.save(f'pipeline/3_labels/{key}.npy', l.reshape(-1,1))

mkdir: cannot create directory ‘pipeline/3_labels’: File exists


In [None]:
# # visualize true labels on continous signal
# i = 21
# labels = np.load(f'pipeline/labels/{i}.npy')

# df = pd.read_csv(f'pipeline/1_dm/{i}.csv')
# df['label'] = np.pad(labels*10, (0,99), mode='constant', constant_values=0)

# fig = px.line(data_frame=df)
# fig.show(renderer='browser')

In [10]:
# Prepare Pytorch Datasets
n_sessions_holdout = 2

os.system('mkdir -p pipeline/holdouts')
os.system('mkdir -p pipeline/4_all/')

for i,file in enumerate(tqdm(os.listdir('pipeline/2_windowed'))):
    # First, save <n_sessions_holdout> sessions
    if i < n_sessions_holdout:
        print(f'Saving holdout for session {file.split(".")[0]}')
        session_name = file.split('.')[0]
        X_holdout = torch.from_numpy(np.load(f'pipeline/2_windowed/{file}')).float()
        y_holdout = torch.from_numpy(np.load(f'pipeline/3_labels/{file}')).float()
        # torch.save(TensorDataset(X_holdout, y_holdout), f'pipeline/holdouts/{session_name}-holdout.pt')
        continue

    # print(f'{i} - Saving windows of session {file.split(".")[0]}')
    X = np.load(f'pipeline/2_windowed/{file}')
    y = np.load(f'pipeline/3_labels/{file}')

    # Save each window from every session in its own file in 'pipeline/4_all'
    for j, (xi,yi) in enumerate(zip(X,y)):
        xi = torch.from_numpy(xi).float()
        yi = torch.from_numpy(yi).float()
        torch.save((xi,yi), f'pipeline/4_all/{j}.pt')

  0%|          | 0/5 [00:00<?, ?it/s]

Saving holdout for session 58
Saving holdout for session 61


100%|██████████| 5/5 [00:13<00:00,  2.68s/it]


In [11]:
# Get total number of window-files
n = len(os.listdir('pipeline/4_all'))
n

34501

In [12]:
# Train test split window-files into 'pipeline/4_all/train' and 'pipeline/4_all/test'
test_size = 0.3

os.system('mkdir -p pipeline/4_all/train')
os.system('mkdir -p pipeline/4_all/test')

# todo possibly stratify by splitting pos. and neg. samples and taking train/test samples from each
(train_idx, test_idx) = train_test_split(range(n), test_size=test_size, shuffle=True, random_state=0)

for i,idx in enumerate(tqdm(train_idx)):
    os.system(f'mv pipeline/4_all/{idx}.pt pipeline/4_all/train/{i}.pt')

for i,idx in enumerate(tqdm(test_idx)):
    os.system(f'mv pipeline/4_all/{idx}.pt pipeline/4_all/test/{i}.pt')

100%|██████████| 24150/24150 [00:18<00:00, 1286.33it/s]
100%|██████████| 10351/10351 [00:07<00:00, 1308.99it/s]
