In [1]:
#from __future__ import annotations

#import random
#import shutil
#from datetime import datetime
from pathlib import Path
from typing import Callable, Dict, Iterator, List, Optional, Sequence, Tuple, Union

import numpy as np
import tensorflow as tf
import torch
from loguru import logger
#from torch.nn.utils.rnn import pad_sequence
#from tqdm import tqdm
from scipy.io import arff

Tensor = torch.Tensor

2022-06-10 12:08:04.989830: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-10 12:08:04.989865: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
def get_eeg(data_dir: Path="../data/raw") -> Path:
    dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00264/EEG%20Eye%20State.arff"  # noqa: E501
    datapath = tf.keras.utils.get_file(
        "eeg_data", origin=dataset_url, untar=False, cache_dir=data_dir
    )

    datapath = Path(datapath)
    logger.info(f"Data is downloaded to {datapath}.")
    return datapath

In [3]:
df = get_eeg()

2022-06-10 12:08:33.975 | INFO     | __main__:get_eeg:8 - Data is downloaded to ../data/raw/datasets/eeg_data.


In [4]:
from __future__ import annotations
from typing import Tuple
from tqdm import tqdm
import random
Tensor = torch.Tensor


class BaseDataset:
    def __init__(self, datapath: Path) -> None:
        self.path = datapath
        self.data =  self.process_data()

    def process_data(self) -> None:
        data = arff.loadarff(self.path)
        cur_label = int(data[0][0][14]) #index 14 = label
        EEG_list = [] #Lege lijst waarin meerdere observaties worden opgeslagen
        EEG_full = [] #Lege lijst waarin meerdere batches in worden samengevoegd.
        for obs in data[0]:
            if int(obs[14]) == cur_label:
                EEG_dim = [] #Lege lijst waarin de EEG_dim van een bepaalde observatie in kunnen worden opgeslagen.
                for index, i in enumerate(obs):
                    if index != 14:
                        EEG_dim.append(i)
                EEG_dim = torch.Tensor(EEG_dim)
                EEG_list.append(EEG_dim)
            else:
                EEG_full_label = (cur_label, torch.stack(EEG_list))
                EEG_full.append(EEG_full_label)
                cur_label = int(obs[14])
                EEG_list = [] #Lege lijst waarin meerdere observaties in kunnen worden opgeslagen.
                EEG_dim = [] #Lege lijst waarin de EEG_dim van een bepaalde observatie in kunnen worden opgeslagen.
                for index, i in enumerate(obs):
                    if index != 14:
                        EEG_dim.append(i)
                EEG_dim = torch.Tensor(EEG_dim)
                EEG_list.append(EEG_dim)
        EEG_full_label = (cur_label, torch.stack(EEG_list))
        EEG_full.append(EEG_full_label)
        return EEG_full

    def __len__(self) -> int:
        return len(self.data)
   
    def __getitem__(self, idx: int) -> Tuple:
        return self.data[idx]

In [37]:
dataloader = BaseDataset(datapath=get_eeg())

2022-06-10 12:23:50.063 | INFO     | __main__:get_eeg:8 - Data is downloaded to ../data/raw/datasets/eeg_data.


In [6]:
dataloader.__len__()

24

In [6]:
 index_list = np.random.permutation(len(dataloader))
 index = 0
x, y = dataloader[int(index_list[index])]

In [63]:
y

tensor([[4329.2300, 4009.2300, 4289.2300,  ..., 4280.5098, 4635.8999,
         4393.8501],
        [4324.6201, 4004.6201, 4293.8501,  ..., 4279.4902, 4632.8198,
         4384.1001],
        [4327.6899, 4006.6699, 4295.3799,  ..., 4282.0498, 4628.7202,
         4389.2300],
        ...,
        [4468.2100, 4044.6201, 4305.1299,  ..., 4367.6899, 4833.8501,
         4571.7900],
        [4461.0298, 4041.0300, 4300.0000,  ..., 4365.1299, 4826.6699,
         4558.4600],
        [4452.8198, 4032.3101, 4295.3799,  ..., 4353.3301, 4808.2100,
         4549.2300]])

In [77]:
class BaseDatastreamer:
    """This datastreamer wil never stop
    The dataset should have a:
        __len__ method
        __getitem__ method

    """

    def __init__(
        self,
        dataset: BaseDataset,
        batchsize: int,
        preprocessor: Optional[Callable] = None,
    ) -> None:
        self.dataset = dataset
        self.data =  self.window()
        self.batchsize = batchsize
        self.preprocessor = preprocessor
        self.size = len(self.dataset)
        self.reset_index()
        

    def __len__(self) -> int:
        return int(len(self.data) / self.batchsize)
    
    def window(self) -> None:
        data = self.dataset
        list2 = []
        for i in range(24):
            n_window = len(data[i][1]) - 5 + 1
            time = torch.arange(0, 5).reshape(1, -1)
            window = torch.arange(0, n_window).reshape(-1, 1)
            idx = time + window
            test = data[i][1][idx]
            test2 = (data[i][0], test)
            list2.append(test2)
        return list2
        

    def reset_index(self) -> None:
        self.index_list = np.random.permutation(self.size)
        self.index = 0

    def batchloop(self) -> Sequence[Tuple]:
        batch = []
        for _ in range(self.batchsize):
            x, y = self.data[int(self.index_list[self.index])]
            batch.append((x, y))
            self.index += 1
        return batch

    def stream(self) -> Iterator:
        while True:
            if self.index > (self.size - self.batchsize):
                self.reset_index()
            batch = self.batchloop()
            if self.preprocessor is not None:
                X, Y = self.preprocessor(batch)  # noqa N806
            else:
                X, Y = zip(*batch)  # noqa N806
            yield X, Y


In [131]:
class BaseDataIterator:
    """This iterator will consume all data and stop automatically.
    The dataset should have a:
        __len__ method
        __getitem__ method

    """

    def __init__(self, dataset: BaseDataset, batchsize: int) -> None:
        self.dataset = dataset
        self.data = self.window()
        self.batchsize = batchsize

    def __len__(self) -> int:
        return int(len(self.data) / self.batchsize)

    def __iter__(self) -> BaseDataIterator:
        self.index = 0
        self.index_list = torch.randperm(len(self.data))
        return self
    
    def window(self) -> None:
        data = self.dataset
        list2 = []
        for i in range(24):
            n_window = len(data[i][1]) - 5 + 1
            time = torch.arange(0, 5).reshape(1, -1)
            window = torch.arange(0, n_window).reshape(-1, 1)
            idx = time + window
            test = data[i][1][idx]
            test2 = (data[i][0], test)
            list2.append(test2)
        return list2

    def batchloop(self) -> Tuple[List, List]:
        X = []  # noqa N806
        Y = []  # noqa N806
        for _ in range(self.batchsize):
            x, y = self.data[int(self.index_list[self.index])]
            X.append(x)
            Y.append(y)
            self.index += 1
        return X, Y

    def __next__(self) -> Tuple[Tensor, Tensor]:
        if self.index <= (len(self.data) - self.batchsize):
            X, Y = self.batchloop()  # noqa N806
            return torch.tensor(X), torch.tensor(Y)
        else:
            raise StopIteration
    
    def __getitem__(self, idx: int) -> Tuple:
        return self.data[idx]

In [128]:
test = BaseDataIterator(dataloader,10)

In [129]:
test.__len__()

2

In [130]:
x, y = test[0]
y.shape

torch.Size([184, 5, 14])

In [142]:
class PaddedDatagenerator(BaseDataIterator):
    """Iterator with additional padding of X

    Args:
        BaseDataIterator (_type_): _description_
    """

    def __init__(self, data: BaseDataset, batchsize: int) -> None:
        super().__init__(data, batchsize)

    def __next__(self) -> Tuple[Tensor, Tensor]:
        if self.index <= (len(self.data) - self.batchsize):
            X, Y = self.batchloop()  # noqa N806
            X_ = pad_sequence(X, batch_first=True, padding_value=0)  # noqa N806
            return X_, torch.tensor(Y)
        else:
            raise StopIteration


In [143]:
test2 = PaddedDatagenerator(test,1)

In [144]:
x, y = test2[0]
y.shape

torch.Size([180, 5, 5, 14])

In [63]:
def window(x: Tensor, n_time: int) -> Tensor:
    """
    Generates and index that can be used to window a timeseries.
    E.g. the single series [0, 1, 2, 3, 4, 5] can be windowed into 4 timeseries with
    length 3 like this:

    [0, 1, 2]
    [1, 2, 3]
    [2, 3, 4]
    [3, 4, 5]

    We now can feed 4 different timeseries into the model, instead of 1, all
    with the same length.
    """
    n_window = len(x) - n_time + 1
    time = torch.arange(0, n_time).reshape(1, -1)
    window = torch.arange(0, n_window).reshape(-1, 1)
    idx = time + window
    return idx

In [49]:
dataloader[0][1]

tensor([[4329.2300, 4009.2300, 4289.2300,  ..., 4280.5098, 4635.8999,
         4393.8501],
        [4324.6201, 4004.6201, 4293.8501,  ..., 4279.4902, 4632.8198,
         4384.1001],
        [4327.6899, 4006.6699, 4295.3799,  ..., 4282.0498, 4628.7202,
         4389.2300],
        ...,
        [4468.2100, 4044.6201, 4305.1299,  ..., 4367.6899, 4833.8501,
         4571.7900],
        [4461.0298, 4041.0300, 4300.0000,  ..., 4365.1299, 4826.6699,
         4558.4600],
        [4452.8198, 4032.3101, 4295.3799,  ..., 4353.3301, 4808.2100,
         4549.2300]])

In [69]:
len(dataloader[0][1])

188

In [120]:
for i in range(len(dataloader)):
    test_list =[]
    n_window = len(dataloader[i][1]) - 5 + 1
    time = torch.arange(0, 5).reshape(1, -1)
    window = torch.arange(0, n_window).reshape(-1, 1)
    idx = time + window
    test = dataloader[i][1][idx]
    #test_list.append(test)
    #test_full = (dataloader[i][0], torch.stack(test))


In [8]:
    def window2(dataloader) -> None:
        for i in range(len(data)):
            x, y = data[int(index_list[index])]
            test_list =[]
            n_window = len(y[i]) - 5 + 1
            time = torch.arange(0, 5).reshape(1, -1)
            window = torch.arange(0, n_window).reshape(-1, 1)
            idx = time + window
            test = y[i][idx]
            test_list.append(test)
            test_full = (x, torch.stack(test_list))
        return test_full

In [None]:
    def window(dataloader) -> None:
        data = dataloader
        for i in range(0, len(data len(data)):
            test_list =[]
            n_window = len(data[i][1]) - 5 + 1
            time = torch.arange(0, 5).reshape(1, -1)
            window = torch.arange(0, n_window).reshape(-1, 1)
            idx = time + window
            test = data[i][1][idx]
            test_list.append(test)
            test_full = (data[i][0], torch.stack(test_list))
        return test_full

In [56]:
def wind(dataloader):
    data = dataloader
    test_list =[]
    for i in range(len(data),len(data[i][1])):    
        n_window = len(data[i][1]) - 5 + 1
        time = torch.arange(0, 5).reshape(1, -1)
        window = torch.arange(0, n_window).reshape(-1, 1)
        idx = time + window
        test.append(i
        test = data[i][1][idx]
        
       

In [57]:
test.shape

torch.Size([184, 5, 14])

In [63]:
list2 = []
for i in range(24):
    n_window = len(dataloader[i][1]) - 5 + 1
    time = torch.arange(0, 5).reshape(1, -1)
    window = torch.arange(0, n_window).reshape(-1, 1)
    idx = time + window
    test = dataloader[i][1][idx]
    test2 = (dataloader[i][0], test)
    list2.append(test2)

    
    


  test = torch.tensor(test)


In [68]:
len(list2)

24

In [39]:
wind(dataloader)

In [40]:
test_list

[]

In [74]:
len(a)
a.shape()

AttributeError: 'tuple' object has no attribute 'shape'

In [115]:

n_window = len(dataloader[0][1]) - 5 + 1
time = torch.arange(0, 5).reshape(1, -1)
window = torch.arange(0, n_window).reshape(-1, 1)
idx = time + window
test = dataloader[0][1][idx]
len(test)

184

In [37]:
len(dataloader)

24

In [78]:
test_1 = BaseDatastreamer(dataloader,5)

In [79]:
len(test_1)

0

In [36]:
test_1

<__main__.BaseDatastreamer at 0x7f388b714910>

In [27]:
test_1.__next__()

AttributeError: 'BaseDatastreamer' object has no attribute '__next__'

In [14]:
def window(x: Tensor, n_time: int) -> Tensor:
    """
    Generates and index that can be used to window a timeseries.
    E.g. the single series [0, 1, 2, 3, 4, 5] can be windowed into 4 timeseries with
    length 3 like this:

    [0, 1, 2]
    [1, 2, 3]
    [2, 3, 4]
    [3, 4, 5]

    We now can feed 4 different timeseries into the model, instead of 1, all
    with the same length.
    """
    n_window = len(x) - n_time + 1
    time = torch.arange(0, n_time).reshape(1, -1)
    window = torch.arange(0, n_window).reshape(-1, 1)
    idx = time + window
    return idx