# Prepare Dataset
- Read raw session files that are annotated (i.e. include a .json file)
- Decimate session files from 100 Hz to 20 Hz
- Window the data into 100-datapoint-long windows (5 s at 20 Hz) of xyz
- Save xyz acc data and the labels to pytorch dataset files in:
    - `pipeline/datasets/train_dataset` 
    - `pipeline/datasets/test_dataset` with specified `test_size` 
    - `pipeline/datasets/<session_name>-holdout_dataset` with specified `n_sessions_holdout`

In [2]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import seaborn as sns
import plotly.express as px
import json
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
from torch.utils.data import TensorDataset,DataLoader
from tqdm import tqdm

# Process Raw Data

In [3]:
# Make directories
# Create the top-level working directory used by every later pipeline stage.
# os.makedirs is used instead of shelling out to `mkdir -p`: it does not
# depend on a POSIX shell and raises on a real failure instead of returning
# an exit status that nobody checks.
if os.path.isdir('pipeline'):
    # Warn so outputs of a previous run are not silently mixed with new ones.
    print("pipeline directory already exists - delete or rename it")
else:
    os.makedirs('pipeline')

In [4]:
# Scan the raw-data directory for annotated sessions, downsample each one and
# collect its annotations.
#
# Results of this cell:
#   json_labels  - dict mapping session name -> parsed annotation JSON, with an
#                  extra 'length' key holding the downsampled sample count
#   total_length - number of downsampled samples summed over all kept sessions
#   pipeline/1_dm/<session>.csv - downsampled x,y,z samples, one file per session
json_labels = {}

total_length = 0
nfiles = 80     # set to limit number of sessions to read
i = 0           # number of sessions actually read and written so far

raw_dir = './data'

# exist_ok keeps the cell re-runnable; parents are created if missing.
os.makedirs('pipeline/1_dm', exist_ok=True)

for subdir in os.listdir(raw_dir):
    annot_path = f'{raw_dir}/{subdir}/{subdir}_data.json'

    # A session counts as annotated only if its JSON file exists ...
    if not os.path.exists(annot_path):
        print(f'Skipped {subdir}')
        continue

    # Read labels
    with open(annot_path, 'r') as f:
        annot = json.load(f)

    # ... and it contains at least one puff annotation.
    if not annot.get('puffs'):
        print(f'Skipped {subdir}')
        continue

    # Read the raw data file, keeping only columns 2-4 as x, y, z.
    df = pd.read_csv(f'{raw_dir}/{subdir}/raw_data.csv', header=None, usecols=[2,3,4], names=['x','y','z'])

    # 100 Hz -> 20 Hz by keeping every 5th row. This is plain subsampling;
    # no anti-aliasing filter is applied.
    decimated = df[::5]

    annot['length'] = len(decimated)
    json_labels[subdir] = annot
    total_length += len(decimated)
    decimated.to_csv(f'pipeline/1_dm/{subdir}.csv', index=False)

    print(f'{subdir} - Length: {len(decimated)}')

    # Count only sessions that were really processed, so that sessions skipped
    # for lacking puffs do not use up the nfiles budget.
    i += 1
    if i >= nfiles:
        break

27 - Length: 28400
58 - Length: 41100
Skipped 29
67 - Length: 34600
Skipped 13
53 - Length: 31000
Skipped 7
61 - Length: 31200
41 - Length: 26700
Skipped 9
Skipped 46
Skipped 19
Skipped 5
Skipped 10
21 - Length: 38700
50 - Length: 23100
65 - Length: 36000
Skipped 6
Skipped 70
22 - Length: 39100
Skipped 52
Skipped 40
28 - Length: 27000
Skipped 49
Skipped 26
Skipped 0
Skipped 14
23 - Length: 39000
47 - Length: 31200
31 - Length: 23000
17 - Length: 21200
68 - Length: 33500
Skipped 32
35 - Length: 32400
57 - Length: 41300
20 - Length: 32300
43 - Length: 28900
Skipped 2
Skipped 38
18 - Length: 30800
51 - Length: 23800
37 - Length: 28700
45 - Length: 28200
Skipped 55
Skipped 15
16 - Length: 28500
Skipped 8
25 - Length: 31300
33 - Length: 33500
62 - Length: 32700
Skipped 3
63 - Length: 30100
Skipped 24
60 - Length: 32700
64 - Length: 38600
30 - Length: 26100
56 - Length: 41500
Skipped 12
Skipped 34
59 - Length: 41200
42 - Length: 34900
Skipped 11
Skipped 1
66 - Length: 38500
69 - Length: 3600

In [5]:
# Window data
#
# For every downsampled session in pipeline/1_dm, build all overlapping
# windows of `window_size` consecutive samples (stride 1) and save them as one
# float array of shape (n_samples - window_size + 1, 3 * window_size) in
# pipeline/2_windowed/<session>.npy.
# Each row is laid out channel by channel: all x values of the window, then
# all y values, then all z values.

window_size = 100

# exist_ok keeps the cell re-runnable; no shell is needed.
os.makedirs('pipeline/2_windowed', exist_ok=True)

for file in os.listdir('pipeline/1_dm/'):
    df = pd.read_csv(f'pipeline/1_dm/{file}')

    # (n_samples, 3) array with columns x, y, z
    samples = df.to_numpy(dtype=float)

    # The window axis is appended last, giving (n_windows, 3, window_size).
    # This is a zero-copy view and replaces the per-window Python loop.
    # Raises ValueError if the session is shorter than one window.
    windows = np.lib.stride_tricks.sliding_window_view(samples, window_size, axis=0)

    # Flattening the last two axes yields [x..., y..., z...] per row; the
    # reshape materialises the view into a regular array for saving.
    w = windows.reshape(windows.shape[0], -1)

    np.save(f'pipeline/2_windowed/{file.split(".")[0]}.npy', w)

In [6]:
# Make Labels from annotations
#
# For every annotated session, write pipeline/3_labels/<session>.npy holding
# one 0/1 label per window. Window j is labelled 1 when its first sample j
# lies inside a puff interval and inside the annotated session span.
# Annotation offsets are divided by 5 to map them onto the downsampled
# signal (presumably they are raw 100 Hz sample indices - TODO confirm).

os.makedirs('pipeline/3_labels', exist_ok=True)

for key, annot in json_labels.items():
    # A session of N samples yields N - 99 windows of 100 samples.
    labels = np.zeros(annot['length'] - 99)

    session_start = annot['start'] // 5
    session_stop = annot['end'] // 5        # exclusive

    for puff in annot['puffs']:
        # Intersect the puff (end inclusive) with the session span.
        # Clamping at 0 prevents a negative offset from wrapping around to the
        # end of the array; the slice itself clips anything past the last
        # window start, where the element-wise loop raised IndexError.
        first = max(session_start, puff['start'] // 5, 0)
        stop = min(session_stop, puff['end'] // 5 + 1)
        if first < stop:
            labels[first:stop] = 1

    np.save(f'pipeline/3_labels/{key}.npy', labels)

In [None]:
# # visualize true labels on continuous signal
# i = 21
# labels = np.load(f'pipeline/3_labels/{i}.npy')

# df = pd.read_csv(f'pipeline/1_dm/{i}.csv')
# df['label'] = np.pad(labels*10, (0,99), mode='constant', constant_values=0)

# fig = px.line(data_frame=df)
# fig.show(renderer='browser')

In [34]:
# Prepare Pytorch Datasets (one file per window)
#
# Saves the first listed session as a holdout TensorDataset and also writes
# each of its windows, paired with its label, to pipeline/4_all/<index>.pt.
#
# NOTE(review): the loop ends with an unconditional `break`, so only the first
# entry of the directory listing is handled and at most one holdout file is
# written regardless of n_sessions_holdout - looks like work in progress.
# NOTE(review): window files are named by window index only, so they would
# overwrite each other across sessions if the `break` were removed.
# NOTE(review): test_size and the 4_all/train and 4_all/test directories are
# not used by this cell.
n_sessions_holdout = 2
test_size = 0.3

for mkdir_cmd in (
    'mkdir -p pipeline/holdouts',
    'mkdir -p pipeline/4_all/train',
    'mkdir -p pipeline/4_all/test',
):
    os.system(mkdir_cmd)

# X = np.zeros((0,300))
# y = np.zeros((0,1))

for idx, fname in enumerate(os.listdir('pipeline/2_windowed')):
    # Windows and their labels for this session; labels become a column vector.
    X = np.load(f'pipeline/2_windowed/{fname}')
    y = np.load(f'pipeline/3_labels/{fname}').reshape(-1,1)

    # The first <n_sessions_holdout> sessions are stored whole as holdouts.
    if idx < n_sessions_holdout:
        session_name = fname.split('.')[0]
        X_holdout = torch.from_numpy(X).float()
        y_holdout = torch.from_numpy(y).float()
        holdout_path = f'pipeline/holdouts/{session_name}-holdout.pt'
        torch.save(TensorDataset(X_holdout, y_holdout), holdout_path)

    # Write every (window, label) pair to its own file in 'pipeline/4_all'.
    for k in range(min(len(X), len(y))):
        torch.save((X[k], y[k]), f'pipeline/4_all/{k}.pt')

    break

In [None]:
# Prepare Pytorch Datasets
#
# Writes to pipeline/datasets:
#   <session_name>-holdout.pt - one TensorDataset per holdout session (the
#                               first n_sessions_holdout directory entries)
#   train_dataset.pt / test_dataset.pt - stratified random split of the windows
#                               of all remaining sessions, test share test_size
#
# NOTE(review): os.listdir order is not guaranteed, so which sessions become
# holdouts can differ between machines.
# NOTE(review): windows overlap heavily (stride 1), so a random per-window
# split puts near-identical windows in both train and test - confirm this is
# acceptable for the intended evaluation.
n_sessions_holdout = 2
test_size = 0.3

# exist_ok keeps the cell re-runnable; no shell is needed.
os.makedirs('pipeline/datasets', exist_ok=True)

# Seeded with empty arrays so the final concatenate has the expected shape
# and dtype even when no session is left for train/test.
X_parts = [np.zeros((0,300))]
y_parts = [np.zeros((0,1))]

for i,file in enumerate(os.listdir('pipeline/2_windowed')):
    # Labels are stored in pipeline/3_labels under the same file name as the
    # windows; they are reshaped to a column vector.
    session_X = np.load(f'pipeline/2_windowed/{file}')
    session_y = np.load(f'pipeline/3_labels/{file}').reshape(-1,1)

    # First, save <n_sessions_holdout> sessions
    if i < n_sessions_holdout:
        session_name = file.split('.')[0]
        X_holdout = torch.from_numpy(session_X).float()
        y_holdout = torch.from_numpy(session_y).float()
        torch.save(TensorDataset(X_holdout, y_holdout), f'pipeline/datasets/{session_name}-holdout.pt')
        # Keep holdout sessions out of the train/test pool so they remain
        # unseen data.
        continue

    X_parts.append(session_X)
    y_parts.append(session_y)

# Concatenate once instead of re-copying the growing array on every iteration.
X = np.concatenate(X_parts)
y = np.concatenate(y_parts)

(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=test_size, stratify=y, shuffle=True, random_state=0)

train_dataset = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float())
test_dataset = TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float())

torch.save(train_dataset, 'pipeline/datasets/train_dataset.pt')
torch.save(test_dataset, 'pipeline/datasets/test_dataset.pt')