In [1]:
import numpy as np
import h5py
import sys
import os
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader

### Extract data from the raw `.txt` files into HDF5

In [None]:
def construct_dataset(train_path: str, test_path: str, columns_names: tuple, to_path: str):
    """Convert two whitespace-separated text files into a single HDF5 file.

    One HDF5 group is created per input file, named after the part of the
    file name that precedes the first underscore (``'train_FD001.txt'`` ->
    ``'train'``). Inside a group, consecutive rows sharing the same value in
    their first column are stored together as one float32 dataset named after
    that value. The first two columns of every row are dropped before storing
    (presumably unit id and cycle counter -- TODO confirm against the data
    description).

    Args:
        train_path: Path to the first text file to convert.
        test_path: Path to the second text file to convert.
        columns_names: Labels for the stored columns; written unchanged to the
            ``'columns'`` attribute of the HDF5 file.
        to_path: Destination HDF5 file. It is opened in ``'w'`` mode, so an
            existing file at this path is overwritten.
    """
    with h5py.File(to_path, 'w') as to_file:
        to_file.attrs['columns'] = columns_names

        for path in tqdm((train_path, test_path), file=sys.stdout):
            # 'some/dir/train_FD001.txt' -> 'train'
            group_name = os.path.basename(path).split('_', maxsplit=1)[0]
            group = to_file.create_group(name=group_name)

            unit_rows = []
            last_id = None

            with open(path, 'r') as from_file:
                for line in from_file:
                    # split() without arguments tolerates repeated spaces and
                    # trailing whitespace; blank lines yield no tokens.
                    tokens = line.split()
                    if not tokens:
                        continue

                    unit_id = float(tokens[0]) if '.' in tokens[0] else int(tokens[0])

                    # Explicit None check so that an id of 0 is still treated
                    # as a valid "previous unit".
                    if last_id is not None and unit_id != last_id:
                        group.create_dataset(name=str(last_id),
                                             data=np.array(unit_rows, dtype=np.float32))
                        # Start a fresh buffer; otherwise every dataset would
                        # also contain the rows of all preceding units.
                        unit_rows = []

                    unit_rows.append([float(token) for token in tokens[2:]])
                    last_id = unit_id

            # Flush the last unit with the same dtype as the others. Nothing
            # is written for an empty input file.
            if last_id is not None:
                group.create_dataset(name=str(last_id),
                                     data=np.array(unit_rows, dtype=np.float32))

In [None]:
# Raw text inputs and the HDF5 file they are converted into.
train_path = '../datasets/train_FD001.txt'
test_path = '../datasets/test_FD001.txt'
to_path = '../datasets/NASA.hdf'

# Labels saved alongside the data as the 'columns' attribute of the HDF5 file.
# NOTE(review): '1', '2', '3' look like placeholders (presumably the
# operational settings) -- confirm and give them descriptive names.
columns_names = (
    '1', '2', '3',
    'T2', 'T24', 'T30', 'T50',
    'P2', 'P15', 'P30',
    'Nf', 'Nc', 'epr', 'Ps30', 'phi',
    'NRf', 'NRc', 'BPR', 'farB', 'htBleed',
    'Nf_dmd', 'PCNfR_dmd',
    'W31', 'W32',
)

# Build the HDF5 file only when it is not already on disk.
if not os.path.exists(to_path):
    construct_dataset(train_path, test_path, columns_names, to_path)

### Custom Dataset classes

In [2]:
class NasaDatasset_train(Dataset):
    """Map-style dataset over the ``'train'`` group of the HDF5 file.

    Item ``idx`` is the full array stored under the key ``str(idx + 1)``, the
    same index-to-key mapping that ``NasaDataset_test`` uses.

    Args:
        dataset_path: Path to the HDF5 file; it must contain a ``'train'`` group.
        transform: Optional callable applied to each array before it is returned.
    """

    def __init__(self, dataset_path: str, transform = None):
        self.hdf_file = h5py.File(dataset_path, 'r')['train']
        # The attribute name keeps its original (misspelled) form so existing
        # code that reads it keeps working.
        self.transfrom = transform

    def __len__(self):
        """Return the number of entries in the ``'train'`` group."""
        return len(self.hdf_file)

    def __getitem__(self, idx):
        """Return the array stored for 0-based index ``idx``.

        Raises:
            IndexError: If no entry is stored under the key ``str(idx + 1)``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        try:
            sensors_data = self.hdf_file[str(idx + 1)][:]
        except KeyError as err:
            # IndexError is what ends plain iteration over an object that only
            # defines __getitem__; a KeyError would propagate instead.
            raise IndexError(f'index {idx} is out of range') from err

        if self.transfrom:
            sensors_data = self.transfrom(sensors_data)

        return sensors_data


class NasaDataset_test(Dataset):
    """Map-style dataset over the ``'test'`` group of the HDF5 file.

    Item ``idx`` is the full array stored under the key ``str(idx + 1)``.

    Args:
        dataset_path: Path to the HDF5 file; it must contain a ``'test'`` group.
        transform: Optional callable applied to each array before it is returned.
    """

    def __init__(self, dataset_path: str, transform = None):
        self.hdf_file = h5py.File(dataset_path, 'r')['test']
        # The attribute name keeps its original (misspelled) form so existing
        # code that reads it keeps working.
        self.transfrom = transform

    def __len__(self):
        """Return the number of entries in the ``'test'`` group."""
        return len(self.hdf_file)

    def __getitem__(self, idx):
        """Return the array stored for 0-based index ``idx``.

        Raises:
            IndexError: If no entry is stored under the key ``str(idx + 1)``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        try:
            sensors_data = self.hdf_file[str(idx + 1)][:]
        except KeyError as err:
            # IndexError is what ends plain iteration over an object that only
            # defines __getitem__; a KeyError would make `for x in dataset`
            # crash after the last item instead of stopping.
            raise IndexError(f'index {idx} is out of range') from err

        if self.transfrom:
            sensors_data = self.transfrom(sensors_data)

        return sensors_data

In [4]:
# The second positional parameter of NasaDataset_test is `transform`, so no
# extra argument is passed here: a string such as 'train' would be stored as
# the transform and then called on the first item access, raising TypeError.
dataset = NasaDataset_test('../datasets/NASA.hdf')

# NOTE(review): the default DataLoader collation stacks the items of a batch,
# which requires them to share one shape. If the stored arrays differ in
# length, supply a custom `collate_fn` (e.g. padding) -- confirm against the data.
loader = DataLoader(dataset, batch_size=5, shuffle=True)