In [None]:
import os
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader

# Colab-only setup: mount Google Drive at /content/drive so the project files
# referenced below are reachable through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

# Metadata CSV handed to AudioDataset; its "path" column is read per sample.
csv_path = "/content/drive/My Drive/APS360_Project/train_metadata.csv"

# Folder in which AudioDataset looks up "<base name of the CSV path>.npy".
npy_folder = "/content/drive/My Drive/APS360_Project/tensors_train"

class AudioDataset(Dataset):
    """Map-style dataset pairing each metadata CSV row with a saved ``.npy`` array.

    Every sample is looked up through the row's ``"path"`` column: the file
    name found there, stripped of its directory and extension, names the
    ``.npy`` file inside ``npy_folder`` that is loaded for that row.
    """

    def __init__(self, csv_path, npy_folder):
        """Read the metadata table and remember where the arrays are stored.

        Args:
            csv_path: Path of the metadata CSV, read once with ``pd.read_csv``.
            npy_folder: Directory searched for ``<base name>.npy`` files.
        """
        self.npy_folder = npy_folder
        # Rows are addressed by position later on, so sample i is CSV row i.
        self.df = pd.read_csv(csv_path)

    def __len__(self):
        """Return the number of rows in the metadata table."""
        return self.df.shape[0]

    def _npy_path_for(self, mp3_path):
        """Return the ``.npy`` path inside ``npy_folder`` that matches ``mp3_path``."""
        file_name = os.path.basename(mp3_path)
        stem, _extension = os.path.splitext(file_name)
        return os.path.join(self.npy_folder, stem + ".npy")

    def __getitem__(self, idx):
        """Return ``(audio_tensor, mp3_path, metadata)`` for the row at ``idx``.

        Args:
            idx: Positional row index into the metadata table.

        Returns:
            A 3-tuple of the loaded array as a ``torch.float32`` tensor, the
            row's ``"path"`` value (the key linking the row to its array),
            and the whole row converted to a plain dict.
        """
        record = self.df.iloc[idx]
        source_path = record["path"]

        # The array on disk may have any dtype; it is copied into float32.
        array = np.load(self._npy_path_for(source_path))
        return (
            torch.tensor(array, dtype=torch.float32),
            source_path,
            record.to_dict(),
        )

if __name__ == "__main__":

    # Build the dataset. Samples are indexed by CSV row position, so each
    # tensor stays paired with the MP3 path recorded in its own row.
    dataset = AudioDataset(csv_path, npy_folder)

    # shuffle=False makes the loader walk the rows in their CSV order.
    dataloader = DataLoader(dataset, shuffle=False, batch_size=4)

    # Walk through every batch and report what it contains; the metadata
    # is unpacked alongside the other two fields but not printed.
    for batch_idx, batch in enumerate(dataloader):
        audio_tensors, mp3_paths, metadata = batch
        print(f"Batch {batch_idx}:")
        print("  MP3 file paths:", mp3_paths)
        print("  Audio tensor shape:", audio_tensors.shape)
