In [1]:
import os

import h5py
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split

#Load dataset

In [2]:
# Location of the HDF5 file to load.
# The original machine-specific path stays as the default so existing runs are
# unaffected; set the DATASET_HDF5_PATH environment variable to point the
# notebook at a copy of the file somewhere else without editing this cell.
# `or` (rather than a .get default) also falls back when the variable is set
# but empty.
file_path = (
    os.environ.get("DATASET_HDF5_PATH")
    or '/Users/fhy/isep3th/project/data/2018.01/GOLD_XYZ_OSC.0001_1024.hdf5'
)

In [3]:
# Open the HDF5 file read-only.
# NOTE(review): the handle stays open; no file_raw.close() is visible in this
# part of the notebook. Close it once the datasets have been read into memory.
file_raw = h5py.File(file_path, mode='r')

#Use a random 10% subset of the data for quick testing (optional)

In [4]:
# Read the 'X', 'Y' and 'Z' datasets of the HDF5 file into NumPy arrays.
# `[:]` materialises each dataset completely, so this needs enough RAM to hold
# all of them at once.
data_raw_x = file_raw['X'][:]
data_raw_y = file_raw['Y'][:]
data_raw_z = file_raw['Z'][:]

# The same row index is applied to all three arrays below, so they must agree
# in length or the X/Y/Z correspondence would silently break.
assert len(data_raw_x) == len(data_raw_y) == len(data_raw_z)

# Fraction of the rows to keep, and a fixed seed so that every
# "Restart & Run All" selects the same subset.
subset_fraction = 0.10
subset_seed = 0

# Calculate the new reduced size (10% of the original)
new_size = int(len(data_raw_x) * subset_fraction)

# Generate random indices; one shared permutation maintains the corresponding
# relationships between X, Y and Z. A dedicated generator is used so torch's
# global RNG state is left untouched.
# (The variable keeps its original spelling in case other cells refer to it.)
subset_generator = torch.Generator().manual_seed(subset_seed)
randon_indices = torch.randperm(len(data_raw_x), generator=subset_generator)[:new_size]

# Index the NumPy arrays with an explicit NumPy index array instead of relying
# on implicit tensor-to-array conversion.
subset_index_array = randon_indices.numpy()
data_raw_x_reduced = data_raw_x[subset_index_array]
data_raw_y_reduced = data_raw_y[subset_index_array]
data_raw_z_reduced = data_raw_z[subset_index_array]

#hdf5->tensor

In [5]:
# Wrap the reduced NumPy arrays as PyTorch tensors:
# X is cast to float32, Y and Z are cast to int64.
# (To work on the full dataset instead, pass data_raw_x / data_raw_y /
# data_raw_z here in place of the *_reduced arrays.)
X_tensor = torch.from_numpy(data_raw_x_reduced).to(torch.float32)
Y_tensor = torch.from_numpy(data_raw_y_reduced).to(torch.int64)
Z_tensor = torch.from_numpy(data_raw_z_reduced).to(torch.int64)

#Shuffle the data with a random index permutation (the split itself happens later)

In [6]:
# Shuffle the rows of X, Y and Z with one shared random permutation, so the
# i-th entries of the three tensors still belong together afterwards.
num_samples = X_tensor.shape[0]
random_indices = torch.randperm(num_samples)

# The tuple of source tensors is evaluated before any name is rebound.
X_tensor, Y_tensor, Z_tensor = (
    tensor[random_indices] for tensor in (X_tensor, Y_tensor, Z_tensor)
)

#Normalize X in consecutive chunks of batch_size samples

In [7]:
# Z-score normalize X_tensor in consecutive chunks of `batch_size` samples.
# Each chunk is standardized with its own mean/std taken over the sample axis
# (dim=0), i.e. separately for every position along the remaining dimensions.
# NOTE(review): the statistics depend on which samples happen to share a chunk,
# and this runs before the train/val/test split further down -- confirm that
# chunk-wise statistics (rather than training-set statistics) are intended.

# Define the batch size for normalization
batch_size = 64

# Lower bound for the standard deviation: a feature that is constant within a
# chunk has std == 0, and dividing by it would produce NaN/inf values.
norm_eps = 1e-8

# Calculate the number of batches needed (ceiling division)
num_samples = X_tensor.size(0)
num_batches = (num_samples + batch_size - 1) // batch_size

# Create a list to store the normalized batches
normalized_batches = []

# Normalize the data in batches; slicing past the end is clamped, so the last
# chunk may simply be shorter than batch_size.
for start_idx in range(0, num_samples, batch_size):
    batch = X_tensor[start_idx:start_idx + batch_size]

    # Calculate batch statistics for normalization
    batch_mean = batch.mean(dim=0)
    if batch.size(0) > 1:
        batch_std = batch.std(dim=0)
    else:
        # The default (unbiased) std of a single sample is NaN; treat the
        # spread as zero so the chunk normalizes to zeros instead.
        batch_std = torch.zeros_like(batch_mean)

    # Normalize the batch using z-score, never dividing by less than norm_eps
    normalized_batches.append((batch - batch_mean) / batch_std.clamp_min(norm_eps))

# Every chunk must have been processed exactly once
assert len(normalized_batches) == num_batches

# Stack the normalized batches back together
X_tensor = torch.cat(normalized_batches, dim=0)

# Verify that the number of samples remains the same
assert X_tensor.size(0) == num_samples

###See the shape of X after normalization

In [8]:
# Report the dimensions of X_tensor
shape_line = f"Data shape: {X_tensor.shape}"
print(shape_line)

Data shape: torch.Size([255590, 1024, 2])


###See information about 'X' after normalization

In [9]:
# Summary of X_tensor: type string, shape, total element count and a preview
# of the first 10 entries. One print call with a newline separator emits the
# same lines as printing each item on its own.
print(
    "Tensor Dataset_X Information:",
    "--------------------------",
    f"Data type: {X_tensor.type()}",
    f"Data shape: {X_tensor.shape}",
    f"Number of elements: {X_tensor.numel()}",
    "First few data elements:",
    X_tensor[:10],
    sep="\n",
)

Tensor Dataset_X Information:
--------------------------
Data type: torch.FloatTensor
Data shape: torch.Size([255590, 1024, 2])
Number of elements: 523448320
First few data elements:
tensor([[[-0.2231, -0.2646],
         [ 0.4982, -0.7664],
         [-0.1687,  0.5066],
         ...,
         [-0.0397, -0.2905],
         [ 0.6986, -0.2619],
         [ 0.4629, -1.0703]],

        [[ 0.2684, -1.1706],
         [ 0.2288, -0.5310],
         [-0.0122, -0.7538],
         ...,
         [-0.0544, -0.5224],
         [ 0.6662, -1.1999],
         [ 0.0125,  0.3357]],

        [[ 0.5383, -0.1015],
         [ 0.3191, -0.5162],
         [ 0.4773, -0.4351],
         ...,
         [ 0.1005, -0.5876],
         [ 0.2164,  0.0614],
         [ 0.5558, -0.3028]],

        ...,

        [[ 0.5037,  0.5626],
         [ 0.0100, -0.4297],
         [ 0.7238, -0.5382],
         ...,
         [-0.1338, -0.4628],
         [-0.0753, -0.9135],
         [-0.0127, -0.3052]],

        [[ 0.0839, -0.1596],
         [ 0.1

###See information about 'Y' (only 'X' was normalized; 'Y' is unchanged)

In [10]:
# Summary of Y_tensor: type string, shape, total element count and a preview
# of the first 10 entries. One print call with a newline separator emits the
# same lines as printing each item on its own.
print(
    "Tensor Dataset_Y Information:",
    "--------------------------",
    f"Data type: {Y_tensor.type()}",
    f"Data shape: {Y_tensor.shape}",
    f"Number of elements: {Y_tensor.numel()}",
    "First few data elements:",
    Y_tensor[:10],
    sep="\n",
)

Tensor Dataset_Y Information:
--------------------------
Data type: torch.LongTensor
Data shape: torch.Size([255590, 24])
Number of elements: 6134160
First few data elements:
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]])


###See information about 'Z' (only 'X' was normalized; 'Z' is unchanged)

In [11]:
# Summary of Z_tensor: type string, shape, total element count and a preview
# of the first 50 entries. One print call with a newline separator emits the
# same lines as printing each item on its own.
print(
    "Tensor Dataset_Z Information:",
    "--------------------------",
    f"Data type: {Z_tensor.type()}",
    f"Data shape: {Z_tensor.shape}",
    f"Number of elements: {Z_tensor.numel()}",
    "First few data elements:",
    Z_tensor[:50],
    sep="\n",
)

Tensor Dataset_Z Information:
--------------------------
Data type: torch.LongTensor
Data shape: torch.Size([255590, 1])
Number of elements: 255590
First few data elements:
tensor([[-12],
        [ -6],
        [ -2],
        [ 20],
        [ 22],
        [ 16],
        [ 28],
        [ -4],
        [  8],
        [ 10],
        [-16],
        [-12],
        [ 20],
        [ 22],
        [ 26],
        [ 26],
        [ -8],
        [ 20],
        [  4],
        [  4],
        [ 30],
        [ 30],
        [ -8],
        [ -2],
        [ 26],
        [ 20],
        [  4],
        [ -4],
        [  0],
        [  4],
        [ -4],
        [ 18],
        [  2],
        [ 28],
        [  8],
        [ -6],
        [ 26],
        [ 12],
        [-14],
        [ -4],
        [ 26],
        [ -8],
        [ -2],
        [  0],
        [ 20],
        [-18],
        [ 16],
        [ 24],
        [  4],
        [ 30]])


#Split the data into train, test and validation

In [12]:
# Combine X_tensor, Y_tensor, Z_tensor into a single dataset
dataset = TensorDataset(X_tensor, Y_tensor, Z_tensor)

# Calculate the sizes of training, validation, and test sets:
# 60% / 20% / whatever remains, so the three sizes always add up to the total.
total_size = len(dataset)

train_size = int(total_size * 0.6)
val_size = int(total_size * 0.2)
test_size = total_size - train_size - val_size

# Split the dataset into contiguous ranges.
# Slicing a TensorDataset does NOT return a Dataset: it returns a plain tuple
# with one sliced tensor per wrapped tensor. Handing that tuple to a DataLoader
# would make it treat the three tensors as three "samples" and fail when it
# tries to collate them. Re-wrapping the slices in a TensorDataset gives each
# split proper per-sample (x, y, z) indexing.
train_dataset = TensorDataset(*dataset[:train_size])
val_dataset = TensorDataset(*dataset[train_size:train_size + val_size])
test_dataset = TensorDataset(*dataset[train_size + val_size:])

# Each split must contain exactly the number of samples computed above
assert len(train_dataset) == train_size
assert len(val_dataset) == val_size
assert len(test_dataset) == test_size

# Define batch size
batch_size = 64

# Create DataLoaders for each set.
# NOTE(review): shuffle=False keeps the sample order fixed for all three
# loaders, including training -- confirm that no per-epoch reshuffling of the
# training set is wanted.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)