In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, TensorDataset
from tqdm import tqdm
from pathlib import Path
from sklearn.model_selection import train_test_split
import numpy as np
from time import time
import matplotlib.pyplot as plt
from smokingml.datasets.nursing_dataset_v1 import (
    NursingDatasetV1,
    nursingv1_train_dev_test_split,
    load_one_session,
    load_sessions,
    load_one_windowed_session,
    load_windowed_sessions,
    utils
)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
nursingv1_dir = Path('../data/nursingv1_dataset')
np.random.seed(0)  # seed NumPy's global RNG before any dataset is built

# # Using train dev test split function on all sessions
# train_dataset, dev_dataset, test_dataset = nursingv1_train_dev_test_split(nursingv1_dir, 0.5, 0.2, 0.3)

session_ids = utils.get_all_session_ids(nursingv1_dir)


def time_full_pass(make_dataset, label):
    """Build a dataset, iterate over it once, and print the elapsed wall-clock time.

    The timed span covers both dataset construction and one complete pass
    through a default ``DataLoader`` (batch size 1, no shuffling), so the
    up-front cost of loading everything into memory is counted as well.

    Args:
        make_dataset: zero-argument callable returning the dataset to time.
        label: short tag inserted into the printed message to identify the run.

    Returns:
        The dataset produced by ``make_dataset``.
    """
    start_time = time()
    built = make_dataset()
    for _ in DataLoader(built):
        pass
    print(f'Elapsed Time {label}: {time() - start_time}')
    return built


## Using all sessions in fs - takes 13.5 minutes
dataset = time_full_pass(lambda: NursingDatasetV1(nursingv1_dir, session_ids), 'fs')

## Load all sessions into memory - takes 1 minute
# The label used to be a copy-pasted 'fs', which made the two timings indistinguishable.
dataset = time_full_pass(lambda: load_windowed_sessions(nursingv1_dir, session_ids), 'memory')

Elapsed Time fs: 814.0530607700348
Elapsed Time fs: 61.98713684082031


Create a dataset and train a baseline model as follows:
1. Load and individually window the data from 10 participants.
2. Aggregate (concatenate) the windowed sessions.
3. Shuffle the windows (across all 10 participants).
4. Make a train-dev split (80-20; no test set here).
5. Optimize a simple MLP for a number of epochs.
6. Plot the train and dev loss.
7. Print the F1 score for the trainloader and devloader.
8. Plot the confusion matrix for both the train and dev sets.

In [None]:
class MLP(nn.Module):
    """Simple multi-layer perceptron with one hidden layer.

    Every input sample is flattened to a vector before the first linear
    layer, so windows can be passed in without reshaping them first.

    Args:
        in_features: number of values per flattened sample. The default,
            3 * 101, matches the (3, 101) windows this notebook produces.
        hidden_features: width of the hidden layer.
        out_features: number of output values per sample. The default of 1
            assumes a single-logit binary target - TODO confirm against the labels.
    """

    def __init__(self, in_features: int = 3 * 101, hidden_features: int = 64, out_features: int = 1):
        super().__init__()
        # Collapse all non-batch dimensions: (batch, ...) -> (batch, in_features)
        self.flatten = nn.Flatten()
        self.h1 = nn.Linear(in_features=in_features, out_features=hidden_features)
        self.activation = nn.ReLU()
        self.out = nn.Linear(in_features=hidden_features, out_features=out_features)

    def forward(self, x):
        """Return raw (un-activated) outputs of shape (batch, out_features)."""
        hidden = self.activation(self.h1(self.flatten(x)))
        return self.out(hidden)

In [3]:
# Load 10 sessions, window them, and make a shuffled 80/20 train/dev split
n_sessions = 10
train_size, dev_size = 0.8, 0.2   # fractions of all windows; there is no test split here
batch_size = 64

nursingv1_dir = Path('../data/nursingv1_dataset')
# Keep only the first `n_sessions` ids; previously every session was loaded
# even though this cell is meant to work on 10 sessions.
session_ids = utils.get_all_session_ids(nursingv1_dir)[:n_sessions]
dataset = load_windowed_sessions(nursingv1_dir, session_ids)

# random_split draws a random permutation of the windows, which also shuffles
# them across sessions. A seeded generator makes the split reproducible.
# NOTE: passing fractions instead of absolute lengths needs torch >= 1.13.
train_dataset, dev_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, dev_size],
    generator=torch.Generator().manual_seed(0),
)
trainloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation does not depend on sample order, so the dev set is not reshuffled.
devloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)

for X, y in trainloader:
    # Flatten each window to one feature vector per sample for linear layers
    X = X.flatten(start_dim=1)



In [229]:
# WINSIZE = 101   # for this dataset

# class NursingDatasetV1(Dataset):
#     """
#         Dataset class to handle the nursingv1_dataset
#     """

#     def __init__(self, dir: Path, session_ids: list[int], shuffle: bool = False) -> None:
#         super().__init__()

#         # Public attributes
#         self.dir = dir
#         self.session_ids = session_ids

#         # Private attributes
#         self._shuffle = shuffle
        
#         ## Get info from session sizes
        
#         # Save length of each session in dataset - TODO: might be able to replace this with just a sum; the individual lengths don't need to be saved and are useless right now
#         self._lengths = []
        
#         # Save mapping from each possible index to the session that window is in
#         self._idx_to_session = []

#         for session_id in self.session_ids:
#             # Get shape of session from dataset
#             session_shape = torch.load(dir / f'{session_id}' / 'Xshape.pt')
            
#             # Save number of windows, which is session length - winsize + 1
#             self._lengths.append(session_shape[1] - WINSIZE + 1)

#             # Save which indices should map to this session as tuple (<session id>, <idx of window in that session>)
#             self._idx_to_session += zip([session_id]*self._lengths[-1], list(range(self._lengths[-1])))
#             # print(session_id, ':', self._idx_to_session[-1], '---', self._lengths[-1])


#         # Save random mapping of internal window indices to external indices (for shuffling)
#         self._idxs = list(range(sum(self._lengths)))
#         if shuffle:
#             np.random.shuffle(self._idxs)
        

#     def __getitem__(self, index: int) -> torch.Tensor:
#         # Return one single window from one of the sessions and its label
#         # return data in shape for convolution rather than linear input for now

#         # For now, only support positive integer indices
#         if not isinstance(index, int) or index < 0:
#             print("Error: Unsupported index type")
#             return None
        

#         ## Get session to choose window from based on index
#         # Use random mapping to choose random index
#         idx = self._idxs[index]     # Will catch index out of bounds
#         x,y = self._get_one_window_and_label(idx)
#         return (x,y)

#     def _get_one_window_and_label(self, idx: int) -> tuple[torch.Tensor]:
        
#         # Get the session that this idx is in and the idx within that session
#         session_id, window_idx = self._idx_to_session[idx]

#         # Read whole session and label files
#         X = torch.load(self.dir / f'{session_id}' / 'X.pt')
#         y = torch.load(self.dir / f'{session_id}' / 'y.pt')
#         # print(session_id, window_idx, X.shape[1] - WINSIZE +1)

#         # Window session starting at window_idx
#         window = X[:, window_idx:window_idx+WINSIZE]
#         label = y[window_idx]

#         return (window, label)

#     def __len__(self) -> int:
#         # Total number of windows in every session is length of dataset
#         return sum(self._lengths)

#     def get_one_session(self, session_id: int) -> tuple[torch.Tensor, torch.Tensor]:
#         # Get one unwindowed session from session_ids and its labels (labels are padded)
#         # Only return session if it is a part of this dataset

#         if session_id not in self.session_ids:
#             print("Error: Session id not a part of this dataset")
#             return None
        
#         return NursingDatasetV1.get_one_session_static(self.dir, session_id)


#     def get_all_sessions(self) -> list[tuple[torch.Tensor, torch.Tensor]]:
#         # return list of all unwindowed sessions and their labels in this dataset
#         return NursingDatasetV1.get_sessions_static(self.dir, self.session_ids)

#     def get_one_windowed_session(self, id) -> TensorDataset:
#         # Return one windowed session and its labels as tensor dataset
#         return NursingDatasetV1.get_one_windowed_session_static(self.dir, id)

#     def get_all_windowed_sessions(self) -> list[TensorDataset]:
#         # Return all windowed sessions and their labels as list of tensor datasets
#         pass
    
#     @staticmethod
#     def nursingv1_train_dev_test_split(
#         dir: Path, 
#         train_size: float, 
#         dev_size: float, 
#         test_size: float,
#         shuffle: bool = False,
#         session_ids: list[int] = None
#     ) -> tuple:
#         """
#             Creates and returns three NursingDatasetV1 objects for train,
#                 dev, and test purposes. Each of the three objects are given
#                 a subset of the total sessions in the dataset. The number
#                 of sessions given to each dataset is set with train, dev,
#                 and test size parameters, which each represent a percentage 
#                 of the total number of sessions.
#         Args:
#             dir (Path): filepath to nursingv1 dataset in filesystem
#             train_size (float): percent of sessions for train dataset
#             dev_size (float): percent of sessions for dev dataset
#             test_size (float): percent of sessions for test dataset
#             shuffle (bool, optional): shuffle dataset before split. Defaults to False.

#         Returns:
#             tuple: Three NursingDatasetV1 objects (train, dev, test)
#         """

#         ## Check parameters:
#         if not dir.is_dir():
#             print("Error: directory does not exist")
#             return None
        
#         if sum([train_size, dev_size, test_size]) != 1:
#             print("Error: train_size + dev_size + test_size != 1")
#             return None

#         ## Get list of all session ids in dataset or use provided ids
#         if not session_ids:
#             session_ids = NursingDatasetV1.get_all_session_ids(dir)
        
#         ## Split sessions into train, dev, and test
#         # Shuffle first if desired
#         if shuffle:
#             np.random.shuffle(session_ids)

#         # Get size of partitions
#         n_train_sessions = round(train_size * len(session_ids))
#         n_dev_sessions = round(dev_size * len(session_ids))

#         # Split sessions into three parts
#         train_ids, dev_ids, test_ids = np.split(
#             session_ids,
#             [n_train_sessions, n_train_sessions + n_dev_sessions]
#         )

#         return (
#             NursingDatasetV1(dir, train_ids, shuffle),
#             NursingDatasetV1(dir, dev_ids, shuffle),
#             NursingDatasetV1(dir, test_ids, shuffle)
#         )
    
#     @staticmethod
#     def get_all_session_ids(dir: Path) -> list[int]:
#         # Get list of all session ids in dataset

#         session_ids = []
#         for session_id in dir.iterdir():
#             session_ids.append(int(session_id.name))

#         return session_ids

#     @staticmethod
#     def get_one_session_static(dir: Path, session_id: int)  -> tuple[torch.Tensor, torch.Tensor]:
#         # Get one unwindowed session from fs and its labels (labels are padded)

#         # Load session and labels
#         X = torch.load(dir / f'{session_id}' / 'X.pt')
#         y = torch.load(dir / f'{session_id}' / 'y.pt')

#         # Pad labels with half of window size at beginning and end to match length of X
#         y = np.pad(
#             y.flatten(), 
#             (WINSIZE//2, WINSIZE//2), 
#             mode='constant',
#             constant_values=0
#         )

#         return (X,y)
    
#     @staticmethod
#     def get_sessions_static(dir: Path, session_ids) -> list[tuple[torch.Tensor, torch.Tensor]]:
#         # get unwindowed sessions listed in param and their labels (labels are padded)

#         sessions = []
#         for session_id in session_ids:
#             sessions.append(NursingDatasetV1.get_one_session_static(dir, int(session_id)))

#         return sessions
    
#     @staticmethod
#     def get_one_windowed_session_static(dir: Path, session_id: int) -> TensorDataset:
#         # Get one session from dataset, window it, and turn it into a TensorDataset with its labels
#         session = torch.load(dir / f'{session_id}' / 'X.pt')
#         labels = torch.load(dir / f'{session_id}' / 'y.pt')

#         # Window session
#         x_acc = session[0].reshape(-1, 1)
#         y_acc = session[1].reshape(-1, 1)
#         z_acc = session[2].reshape(-1, 1)

#         w = WINSIZE-1

#         xs = [x_acc[:-w]]
#         ys = [y_acc[:-w]]
#         zs = [z_acc[:-w]]

#         for i in range(1,w):
#             xs.append(x_acc[i:i-w])
#             ys.append(y_acc[i:i-w])
#             zs.append(z_acc[i:i-w])

#         xs.append(x_acc[w:])
#         ys.append(y_acc[w:])
#         zs.append(z_acc[w:])

#         xs = torch.cat(xs,axis=1).float()
#         ys = torch.cat(ys,axis=1).float()
#         zs = torch.cat(zs,axis=1).float()

#         X = torch.cat([xs,ys,zs], axis=1).reshape(-1, 3, WINSIZE)

#         # Return X and y as TensorDataset
#         return TensorDataset(X, labels)

#     @staticmethod
#     def get_windowed_sessions_static(dir: Path, session_ids: list[int] = None, shuffle: bool = False):
#         # return concatenated tensor of windowed sessions in list
#         # if no list is provided, all the sessions are returned (might cause memory issues)

#         if not session_ids:
#             session_ids = NursingDatasetV1.get_all_session_ids(dir)
        
#         sessions = []
#         all_labels = []
#         for session_id in session_ids:
#             dataset = NursingDatasetV1.get_one_windowed_session_static(dir, session_id)
#             sessions.append(dataset.tensors[0])
#             all_labels.append(dataset.tensors[1])
        
#         return TensorDataset(torch.cat(sessions), torch.cat(all_labels))

In [233]:
# nursingv1_dir = Path('../data/nursingv1_dataset')
# np.random.seed(0)

# # session_ids = NursingDatasetV1.get_all_session_ids(nursingv1_dir)[:2]
# # dataset = NursingDatasetV1(nursingv1_dir, session_ids, shuffle=True)

# # for X,y in DataLoader(dataset):
# #     print(X.shape, y.shape)

# # train_dataset, dev_dataset, test_dataset = NursingDatasetV1.nursingv1_train_dev_test_split(nursingv1_dir, 0.5, 0.2, 0.3)

# dataset = NursingDatasetV1.get_windowed_sessions_static(nursingv1_dir, [0,1,2])
# dataset.tensors[0].shape

# # dataset = NursingDatasetV1.get_one_windowed_session_static(nursingv1_dir, 0)
# # for X,y in DataLoader(dataset):
#     # print(X.shape, y.shape)

torch.Size([83100, 3, 101])