In [9]:
import h5py
import os

# Directory that is scanned for embedding files; only entries whose name ends
# in '.h5' are parsed by the loop at the bottom of this cell.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
path = "/export/c09/lavanya/languageIdentification/zinglish/large/embeddingLarge"

# Function to parse an H5 file
def parse_h5_file(file_path):
    """Print the name and shape of every top-level entry of an HDF5 file.

    The shape is taken from the dataset's metadata (``.shape``) instead of
    slicing the dataset with ``[:]``, so no array contents are read from disk.
    Assumes every top-level key refers to a dataset (something exposing
    ``.shape``) — TODO confirm for files that contain groups.

    Args:
        file_path: Path of the HDF5 file; it is opened read-only and closed
            again when the function returns.
    """
    with h5py.File(file_path, 'r') as h5_file:
        print(f"Parsing {file_path}...")
        # Walk the top-level keys of the file
        for key in h5_file.keys():
            print(f"Dataset: {key}")
            # Metadata lookup only; `h5_file[key][:]` would load the whole array.
            print(f"Data Shape: {h5_file[key].shape}")
            # To peek at a few values, slice lazily, e.g. h5_file[key][:5]

# Loop over each file in the directory and parse if it's an H5 file
for filename in os.listdir(path):
    # Ignore every directory entry that is not an HDF5 file
    if not filename.endswith('.h5'):
        continue
    file_path = os.path.join(path, filename)
    parse_h5_file(file_path)


Parsing /export/c09/lavanya/languageIdentification/zinglish/large/embeddingLarge/embed_1728062711.h5...
Dataset: LID
Data Shape: (200,)
Dataset: Layer_1
Data Shape: (315, 200, 192)
Dataset: Layer_2
Data Shape: (315, 200, 256)
Dataset: Layer_3
Data Shape: (315, 200, 384)
Dataset: Layer_4
Data Shape: (315, 200, 512)
Dataset: Layer_5
Data Shape: (315, 200, 384)
Dataset: Layer_6
Data Shape: (315, 200, 256)
Parsing /export/c09/lavanya/languageIdentification/zinglish/large/embeddingLarge/embed_1728062733.h5...
Dataset: LID
Data Shape: (200,)
Dataset: Layer_1
Data Shape: (292, 200, 192)
Dataset: Layer_2
Data Shape: (292, 200, 256)
Dataset: Layer_3


KeyboardInterrupt: 

In [None]:

import os
import h5py
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import torch.optim as optim
import argparse
import matplotlib.pyplot as plt
from torch.nn.utils.rnn import pad_sequence


class HDF5Dataset(Dataset):
    """Map-style dataset over embeddings spread across several HDF5 files.

    Every file is read through two datasets: ``layer``, indexed as
    ``[:, sample, :]``, and ``'LID'``, indexed as ``[sample]``. The samples of
    all files are exposed as one flat index range, in the order given by
    ``hdf5_file_paths``.

    Args:
        hdf5_file_paths: Sequence of paths to the HDF5 files.
        layer: Name of the dataset to read embeddings from (e.g. 'Layer_3').

    Attributes:
        num_files: Total number of samples over all files (despite the name);
            this is the value returned by ``len()``.
    """

    # Maps the label strings stored under 'LID' to integer class indices.
    LABEL_MAP = {'English': 0, 'Mandarin': 1}

    def __init__(self, hdf5_file_paths, layer):
        self.hdf5_file_paths = hdf5_file_paths
        self.layer = layer
        # Read each file's sample count (second axis of the layer dataset)
        # once, closing the handle immediately so no file descriptors leak.
        self._samples_per_file = []
        for file_path in hdf5_file_paths:
            with h5py.File(file_path, 'r') as hf:
                self._samples_per_file.append(hf[self.layer].shape[1])
        self.num_files = sum(self._samples_per_file)
        # Note from an earlier run: num_files 49672, 249 files, layer Layer_3.

    def __len__(self):
        """Return the total number of samples over all files."""
        return self.num_files

    def __getitem__(self, idx):
        """Return ``(X, y)`` for the flat sample index ``idx``.

        Returns:
            ``X`` as a ``torch.float32`` tensor holding ``[:, sample, :]`` of
            the layer dataset, and ``y`` as the integer class index of the
            sample's 'LID' label.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
            KeyError: If the stored label is not a key of ``LABEL_MAP``.
        """
        total = self.num_files
        local_idx = idx + total if idx < 0 else idx
        if not 0 <= local_idx < total:
            raise IndexError(f"index {idx} is out of range for a dataset of size {total}")

        # Locate the owning file from the cached counts; no file is opened
        # until we know which one holds the sample.
        for hdf5_file_path, file_size in zip(self.hdf5_file_paths, self._samples_per_file):
            if local_idx < file_size:
                break
            local_idx -= file_size

        with h5py.File(hdf5_file_path, 'r') as hf:
            X = hf[self.layer][:, local_idx, :]
            y = hf['LID'][local_idx]

        # Labels may come back as bytes or str depending on how they were stored.
        if isinstance(y, bytes):
            y = y.decode('utf-8')
        X = torch.tensor(X, dtype=torch.float32)
        return X, self.LABEL_MAP[y]

def collate_fn(batch):
    """Pad a batch of variable-length sequences and stack their labels.

    Args:
        batch: Iterable of ``(X, y)`` pairs, where ``X`` is a tensor whose
            first dimension may differ between samples and ``y`` is an
            integer class index.

    Returns:
        Tuple ``(X_padded, y_tensor)``: the sequences padded by
        ``pad_sequence`` with the batch dimension first, and the labels as a
        ``torch.long`` tensor.
    """
    X, y = zip(*batch)
    X_padded = pad_sequence(X, batch_first=True)
    y_tensor = torch.tensor(y, dtype=torch.long)
    print("_______________________in collate ______________________________")
    # print("X_padded",X_padded)
    print("X_padded shape",X_padded.shape)
    print("y_tensor",y_tensor)
    # Fixed: this line is labelled "shape" but used to print the tensor itself.
    print("y_tensor shape",y_tensor.shape)

    return X_padded, y_tensor

def main(hdf5_dir, save_dir_plot, layer, batch_size):
    """Build the embedding dataset for one layer and print its first batch.

    Args:
        hdf5_dir: Directory containing the ``.h5`` embedding files.
        save_dir_plot: Directory intended for plots; kept for interface
            compatibility, this function does not use it.
        layer: Name of the HDF5 dataset to read (e.g. 'Layer_1').
        batch_size: Number of samples per batch.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the mapping
    # from flat sample index to (file, position) identical between runs.
    hdf5_file_paths = sorted(
        os.path.join(hdf5_dir, f) for f in os.listdir(hdf5_dir) if f.endswith('.h5')
    )
    # Note from an earlier run: len of hdf5_file_paths is 249

    dataset = HDF5Dataset(hdf5_file_paths, layer)
    # Notes from earlier interactive inspection of this dataset:
    #   len(dataset) is 49672; type is <class '__main__.HDF5Dataset'>
    #   dataset[0] is a 2-tuple (float tensor of shape torch.Size([315, 192]), 0)
    #   dataset[9] is likewise (float tensor, 0)

    print("*****************************************************************")
    print("starting train_loader")

    # Iterating the loader fetches samples via dataset.__getitem__ and merges
    # them into a batch with collate_fn.
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, num_workers=4)

    print("\nInspecting the first batch of data:")
    for batch_X, batch_y in train_loader:
        print("Batch X:", batch_X)  # Display the first batch of embeddings
        print("Batch X shape:", batch_X.shape)  # Shape of the padded embeddings
        print("Batch y:", batch_y)    # Display the corresponding labels
        print("Batch y shape:", batch_y.shape)    # Shape of the label tensor
        break  # Exit the loop after the first iteration



if __name__ == "__main__":
    # Command-line entry point: every option has a default, so the cell also
    # runs without any arguments.
    parser = argparse.ArgumentParser(description='Train a language identification model.')
    parser.add_argument('--hdf5_dir', type=str, default='/export/c09/lavanya/languageIdentification/zinglish/large/embeddingLarge/', help='Path to the HDF5 dir')
    parser.add_argument('--save_dir_plot', type=str, default='/export/c09/lavanya/languageIdentification/zinglish/large/compareLarge', help='save plots')
    # change here for layer
    parser.add_argument('--layer', type=str, default='Layer_1', help='Layer to include (e.g., Layer_3).')
    # %(default)s keeps the help text in sync with the real default; the text
    # previously claimed 32 while the default is 10.
    parser.add_argument('--batch_size', type=int, default=10, help='Batch size for training (default: %(default)s).')
    # Inside a Jupyter kernel sys.argv carries the kernel's own arguments
    # (e.g. "-f <connection file>"), which parse_args() rejects with
    # SystemExit. parse_known_args() ignores them; note that misspelled
    # options are therefore ignored as well instead of raising an error.
    args, _unknown_args = parser.parse_known_args()
    main(args.hdf5_dir, args.save_dir_plot, args.layer, args.batch_size)


In [1]:
import h5py
import os

# Directory that is scanned for embedding files; only entries whose name ends
# in '.h5' are parsed by the loop at the bottom of this cell.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
path = "/export/c09/lavanya/languageIdentification/seame/embed/conversation"

# Function to parse an H5 file
def parse_h5_file(file_path):
    """Print the name and shape of every top-level entry of an HDF5 file.

    The shape is taken from the dataset's metadata (``.shape``) instead of
    slicing the dataset with ``[:]``, so no array contents are read from disk.
    Assumes every top-level key refers to a dataset (something exposing
    ``.shape``) — TODO confirm for files that contain groups.

    Args:
        file_path: Path of the HDF5 file; it is opened read-only and closed
            again when the function returns.
    """
    with h5py.File(file_path, 'r') as h5_file:
        print(f"Parsing {file_path}...")
        # Walk the top-level keys of the file
        for key in h5_file.keys():
            print(f"Dataset: {key}")
            # Metadata lookup only; `h5_file[key][:]` would load the whole array.
            print(f"Data Shape: {h5_file[key].shape}")
            # To peek at a few values, slice lazily, e.g. h5_file[key][:5]

# Loop over each file in the directory and parse if it's an H5 file
for filename in os.listdir(path):
    # Ignore every directory entry that is not an HDF5 file
    if not filename.endswith('.h5'):
        continue
    file_path = os.path.join(path, filename)
    parse_h5_file(file_path)


Parsing /export/c09/lavanya/languageIdentification/seame/embed/conversation/embed_1732049581.h5...
Dataset: LID
Data Shape: (128,)
Dataset: Layer_1
Data Shape: (233, 128, 192)
Dataset: Layer_2
Data Shape: (233, 128, 256)
Dataset: Layer_3
Data Shape: (233, 128, 384)
Dataset: Layer_4
Data Shape: (233, 128, 512)
Dataset: Layer_5
Data Shape: (233, 128, 384)
Dataset: Layer_6
Data Shape: (233, 128, 256)
Parsing /export/c09/lavanya/languageIdentification/seame/embed/conversation/embed_1732049679.h5...
Dataset: LID
Data Shape: (128,)
Dataset: Layer_1
Data Shape: (224, 128, 192)
Dataset: Layer_2
Data Shape: (224, 128, 256)
Dataset: Layer_3
Data Shape: (224, 128, 384)
Dataset: Layer_4
Data Shape: (224, 128, 512)
Dataset: Layer_5
Data Shape: (224, 128, 384)
Dataset: Layer_6
Data Shape: (224, 128, 256)
Parsing /export/c09/lavanya/languageIdentification/seame/embed/conversation/embed_1732049768.h5...
Dataset: LID
Data Shape: (128,)
Dataset: Layer_1
Data Shape: (201, 128, 192)
Dataset: Layer_2


KeyboardInterrupt: 