#Load data

In [None]:
import h5py

In [4]:
import os

# Location of the HDF5 file. It can be overridden with the HDF5_DATA_PATH
# environment variable so the notebook is not tied to one machine's directory
# layout; the default is the original hard-coded location.
HDF5_PATH = os.environ.get(
    'HDF5_DATA_PATH',
    '/Users/fhy/isep3th/project/data/2018.01/GOLD_XYZ_OSC.0001_1024.hdf5',
)

# Opened read-only and intentionally left open: later cells read the datasets
# from this handle.
hdf5_file = h5py.File(HDF5_PATH, 'r')

# Explore the data

In [5]:
# Collect the names of the members stored at the top level of the HDF5 file.
dataset_names = [member_name for member_name in hdf5_file.keys()]

## Dataset names

In [6]:
# Show the member names read from the HDF5 file's keys.
print(dataset_names)

['X', 'Y', 'Z']


In [7]:
# Look up the three members of the file by key; the handles are indexed
# (and thereby read) in a later cell.
dataset_X, dataset_Y, dataset_Z = (hdf5_file[key] for key in ('X', 'Y', 'Z'))

##Dataset type

In [8]:
# Record the element dtype of each dataset handle.
data_type_x, data_type_y, data_type_z = (
    handle.dtype for handle in (dataset_X, dataset_Y, dataset_Z)
)

In [9]:
# Report the dtype of each dataset. Each line names the dataset it describes
# (previously all three lines were labelled "X").
print(f'Data type of dataset X: {data_type_x}')
print(f'Data type of dataset Y: {data_type_y}')
print(f'Data type of dataset Z: {data_type_z}')

Data type of dataset X: float32
Data type of dataset X: int64
Data type of dataset X: int64


# Convert the HDF5 datasets to tensors

In [10]:
import torch

In [11]:
# Read each dataset fully into memory (`[:]` yields a NumPy array) and wrap it
# as a tensor.
#
# torch.from_numpy shares the NumPy buffer instead of copying it. The legacy
# torch.Tensor(ndarray) constructor used before makes a second full copy, which
# doubles peak memory for the (very large) X array. `.float()` keeps the result
# float32 as before and is a no-op when the data is already float32.
data_tensor_x = torch.from_numpy(dataset_X[:]).float()
data_tensor_y = torch.from_numpy(dataset_Y[:]).long()
data_tensor_z = torch.from_numpy(dataset_Z[:]).long()

# Inspect the tensor data
##For ‘X’

In [13]:
# Summary of the X tensor, emitted as one newline-separated report:
# header, separator, tensor type, shape, element count, then the first two
# entries along the leading dimension.
print(
    "Tensor Dataset_X Information:",
    "--------------------------",
    f"Data type: {data_tensor_x.type()}",
    f"Data shape: {data_tensor_x.shape}",
    f"Number of elements: {data_tensor_x.numel()}",
    "First few data elements:",
    data_tensor_x[:2],
    sep="\n",
)

Tensor Dataset_X Information:
--------------------------
Data type: torch.FloatTensor
Data shape: torch.Size([2555904, 1024, 2])
Number of elements: 5234491392
First few data elements:
tensor([[[ 0.0420,  0.2348],
         [-0.2729,  0.4051],
         [-0.2671,  0.2275],
         ...,
         [-0.7056, -0.2869],
         [-0.4116,  0.6683],
         [ 0.0649,  0.6358]],

        [[ 1.1986,  0.4494],
         [ 0.4854,  0.2827],
         [ 0.8679, -0.3319],
         ...,
         [-1.2129, -0.6429],
         [-0.3810,  0.7936],
         [ 0.0864,  1.0922]]])


##For ‘Y’

In [14]:
# Summary of the Y tensor, emitted as one newline-separated report:
# header, separator, tensor type, shape, element count, then the first two
# entries along the leading dimension.
print(
    "Tensor Dataset_Y Information:",
    "--------------------------",
    f"Data type: {data_tensor_y.type()}",
    f"Data shape: {data_tensor_y.shape}",
    f"Number of elements: {data_tensor_y.numel()}",
    "First few data elements:",
    data_tensor_y[:2],
    sep="\n",
)

Tensor Dataset_Y Information:
--------------------------
Data type: torch.LongTensor
Data shape: torch.Size([2555904, 24])
Number of elements: 61341696
First few data elements:
tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


##For ‘Z’

In [15]:
# Summary of the Z tensor, emitted as one newline-separated report:
# header, separator, tensor type, shape, element count, then the first two
# entries along the leading dimension.
print(
    "Tensor Dataset_Z Information:",
    "--------------------------",
    f"Data type: {data_tensor_z.type()}",
    f"Data shape: {data_tensor_z.shape}",
    f"Number of elements: {data_tensor_z.numel()}",
    "First few data elements:",
    data_tensor_z[:2],
    sep="\n",
)

Tensor Dataset_Z Information:
--------------------------
Data type: torch.LongTensor
Data shape: torch.Size([2555904, 1])
Number of elements: 2555904
First few data elements:
tensor([[-20],
        [-20]])


# Create a PyTorch TensorDataset holding the tensors for training and evaluation

In [18]:
from torch.utils.data import TensorDataset

In [19]:
# Bundle the three tensors so that indexing the dataset returns one
# (x, y, z) triple per position along the first dimension.
# NOTE(review): this wraps the raw `data_tensor_x`; the normalization cells
# further down write to a separate `normalized_data` tensor that this dataset
# does not reference - confirm that is intended.
dataset = TensorDataset(
    data_tensor_x,
    data_tensor_y,
    data_tensor_z,
)

# Split the data into training, validation, and test sets

In [20]:
# 60% / 20% split sizes for training and validation (fractions truncated to
# whole samples); the test split receives whatever remains, so the three
# sizes always add up to the full dataset length.
total_size = len(dataset)
train_size, val_size = (int(fraction * total_size) for fraction in (0.6, 0.2))
test_size = total_size - (train_size + val_size)

#random split

In [21]:
from torch.utils.data import random_split

# Seed for the split so that train/validation/test membership is identical on
# every run (Restart & Run All reproduces the same subsets).
SPLIT_SEED = 42

# Randomly partition the dataset into three non-overlapping subsets of the
# sizes computed above, using a dedicated seeded generator so the global RNG
# state is left untouched.
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Define the batch size and data loaders

In [22]:
from torch.utils.data import DataLoader

# Number of samples per batch, shared by all three loaders.
batch_size = 64

# Validation and test loaders rely on DataLoader's default ordering.
test_loader = DataLoader(test_dataset, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# NOTE(review): shuffling is explicitly disabled for the training loader, so
# every epoch iterates the training subset in the same order - confirm this
# is intended.
train_loader = DataLoader(train_dataset, shuffle=False, batch_size=batch_size)

#Normalization

In [None]:
import torch
from tqdm import tqdm

data = data_tensor_x

# Number of samples reduced per step. This bounds the size of the temporary
# float64 copies while keeping the Python-level loop short.
STATS_CHUNK_SIZE = 4096

# Running sum and sum of squares over every element, held as Python floats
# (double precision) to limit rounding error over a very large element count.
element_sum = 0.0
element_sq_sum = 0.0

# The context manager closes the progress bar even if the loop raises.
with tqdm(total=len(data), unit="samples", desc="Calculating Mean and Std") as progress_bar_calculation:
    for chunk_start in range(0, len(data), STATS_CHUNK_SIZE):
        chunk = data[chunk_start:chunk_start + STATS_CHUNK_SIZE].double()
        element_sum += chunk.sum().item()
        element_sq_sum += chunk.square().sum().item()
        progress_bar_calculation.update(len(chunk))

# Dataset-wide statistics. Averaging per-sample standard deviations (the
# previous approach) does not give the standard deviation of the data, so the
# variance is derived from the global sum and sum of squares instead, with
# Bessel's correction to match torch.std's default.
element_count = data.numel()
if element_count > 1:
    global_mean = element_sum / element_count
    global_variance = (element_sq_sum - element_count * global_mean ** 2) / (element_count - 1)
    # Clamp tiny negative values caused by floating-point cancellation.
    global_std = max(global_variance, 0.0) ** 0.5
else:
    global_mean = element_sum / element_count if element_count else 0.0
    global_std = 0.0

# The next cell divides both accumulators by len(data) to obtain `mean` and
# `std`, so they are stored pre-scaled by the sample count to keep that cell
# working unchanged.
mean_accumulator = global_mean * len(data)
std_accumulator = global_std * len(data)

Calculating Mean and Std:   0%|          | 0/2555904 [00:00<?, ?samples/s]

In [None]:
# Turn the two running totals into per-sample averages by dividing each by
# the number of samples.
mean, std = (
    accumulated / len(data)
    for accumulated in (mean_accumulator, std_accumulator)
)

In [None]:
# Number of samples normalized per step. Working on slices keeps temporaries
# small and avoids one Python-level iteration per sample.
NORMALIZE_CHUNK_SIZE = 4096

# Output buffer with the same shape, dtype and device as the input.
normalized_data = torch.empty_like(data)

# Standardize the data slice by slice: subtract `mean`, divide by `std`.
# The arithmetic is elementwise, so the values match a per-sample loop.
# The context manager closes the progress bar even if the loop raises.
with tqdm(total=len(data), unit="samples", desc="Normalizing Data") as progress_bar_normalization:
    for chunk_start in range(0, len(data), NORMALIZE_CHUNK_SIZE):
        chunk_stop = min(chunk_start + NORMALIZE_CHUNK_SIZE, len(data))
        normalized_data[chunk_start:chunk_stop] = (data[chunk_start:chunk_stop] - mean) / std
        progress_bar_normalization.update(chunk_stop - chunk_start)

In [None]:
# Preview the first two normalized samples beneath a caption line.
print("First few data elements:", normalized_data[:2], sep="\n")