In [35]:
import torchaudio
import torch

# Paths to the lossless and lossy songs
lossless_path = "/home/j597s263/scratch/j597s263/Datasets/Audio/Lossless/44"
lossy_path = "/home/j597s263/scratch/j597s263/Datasets/Audio/Lossy/44"

# Load the lossless and lossy waveforms
lossless_waveform, lossless_sample_rate = torchaudio.load(lossless_path)
lossy_waveform, lossy_sample_rate = torchaudio.load(lossy_path)

# Print shapes and sample rates
print(f"Lossless waveform shape: {lossless_waveform.shape}, Sample rate: {lossless_sample_rate}")
print(f"Lossy waveform shape: {lossy_waveform.shape}, Sample rate: {lossy_sample_rate}")

Lossless waveform shape: torch.Size([2, 33836963]), Sample rate: 88200
Lossy waveform shape: torch.Size([2, 16918482]), Sample rate: 44100


In [36]:
# Segment duration in seconds
segment_duration = 0.001  # Each segment corresponds to 0.1 seconds

# Calculate segment size in samples
lossless_segment_size = int(lossless_sample_rate * segment_duration)
lossy_segment_size = int(lossy_sample_rate * segment_duration)

print(f"Lossless segment size: {lossless_segment_size} samples")
print(f"Lossy segment size: {lossy_segment_size} samples")

Lossless segment size: 88 samples
Lossy segment size: 44 samples


In [37]:
# Split the lossless waveform into fixed-size segments
lossless_segments = [
    lossless_waveform[:, i:i + lossless_segment_size]
    for i in range(0, lossless_waveform.shape[1], lossless_segment_size)
    if lossless_waveform[:, i:i + lossless_segment_size].shape[1] == lossless_segment_size
]

# Split the lossy waveform into fixed-size segments
lossy_segments = [
    lossy_waveform[:, i:i + lossy_segment_size]
    for i in range(0, lossy_waveform.shape[1], lossy_segment_size)
    if lossy_waveform[:, i:i + lossy_segment_size].shape[1] == lossy_segment_size
]

# Pad each lossy segment to match the size of the lossless segments
padded_lossy_segments = []
for lossy_segment, lossless_segment in zip(lossy_segments, lossless_segments):
    current_size = lossy_segment.shape[1]
    target_size = lossless_segment.shape[1]
    if current_size < target_size:
        # Calculate random padding
        front_pad = torch.randint(0, target_size - current_size + 1, (1,)).item()
        back_pad = target_size - current_size - front_pad
        padded_segment = torch.nn.functional.pad(lossy_segment, (front_pad, back_pad))
    else:
        padded_segment = lossy_segment
    padded_lossy_segments.append(padded_segment)

print(f"Aligned number of segments: {len(lossless_segments)} == {len(padded_lossy_segments)}")

Aligned number of segments: 384510 == 384510


In [38]:
# Calculate memory usage per segment
segment_memory_lossless = lossless_segments[0].nelement() * torch.finfo(lossless_segments[0].dtype).bits / 8  # bytes
segment_memory_lossy = padded_lossy_segments[0].nelement() * torch.finfo(padded_lossy_segments[0].dtype).bits / 8  # bytes

# Memory per pair of segments
pair_memory = segment_memory_lossless + segment_memory_lossy
print(f"Memory usage per pair of segments: {pair_memory / 1024:.2f} KB")

Memory usage per pair of segments: 1.38 KB


In [39]:
# Total number of segments
total_segments = len(lossless_segments)

# Total memory usage for the song pair
total_memory = total_segments * pair_memory
print(f"Total memory usage for one pair of songs: {total_memory / (1024 ** 2):.2f} MB")

Total memory usage for one pair of songs: 516.31 MB


In [1]:
import os
import torch
import torchaudio
import random
import torchaudio.transforms as T

class AudioDataset:
    def __init__(self, lossless_dir, lossy_dir, segment_duration=0.1):
        """
        Initializes the dataset and processes songs one by one, adding valid pairs to the dataset.
        """
        self.lossless_files = sorted(
            [os.path.join(lossless_dir, f) for f in os.listdir(lossless_dir) if os.path.isfile(os.path.join(lossless_dir, f))]
        )
        self.lossy_files = sorted(
            [os.path.join(lossy_dir, f) for f in os.listdir(lossy_dir) if os.path.isfile(os.path.join(lossy_dir, f))]
        )

        assert len(self.lossless_files) == len(self.lossy_files), "Mismatch in number of lossless and lossy files!"

        self.segment_duration = segment_duration
        self.data = []  # Store valid segment pairs in memory

    def process_and_add(self):
        """
        Processes each song and adds valid pairs (with matching segment counts) to the dataset.
        """
        for idx, (lossless_path, lossy_path) in enumerate(zip(self.lossless_files, self.lossy_files)):
            song_data = self.process_pair(lossless_path, lossy_path)
            if song_data:  # Only add if the song pair is valid
                self.data.extend(song_data)
            if (idx + 1) % 10 == 0:
                print(f"Processed {idx + 1}/{len(self.lossless_files)} songs...")

        print(f"Dataset created with {len(self.data)} valid segment pairs.")

    def process_pair(self, lossless_path, lossy_path):
        """
        Processes a pair of lossless and lossy files into aligned stereo segments.
        Excludes the pair if the number of segments is unequal.
        """
        lossless_segments, lossless_segment_size = self.preprocess(lossless_path)
        lossy_segments, lossy_segment_size = self.preprocess(lossy_path)

        # Exclude songs with unequal segment counts
        if len(lossless_segments) != len(lossy_segments):
            print(f"Skipping {lossless_path} and {lossy_path} due to unequal segments.")
            return []

        # Randomly pad lossy segments to match lossless size
        padded_lossy_segments = [
            self.random_pad(lossy_segment, lossless_segment.shape[1])
            for lossy_segment, lossless_segment in zip(lossy_segments, lossless_segments)
        ]

        return list(zip(padded_lossy_segments, lossless_segments))

    def preprocess(self, file_path):
        """
        Loads an audio file, calculates dynamic segment size, and splits into stereo segments.
        """
        waveform, sample_rate = torchaudio.load(file_path)

        # Calculate segment size dynamically
        segment_size = int(sample_rate * self.segment_duration)

        # Split waveform into fixed-size segments
        segments = [
            waveform[:, i:i + segment_size]
            for i in range(0, waveform.shape[1], segment_size)
            if waveform[:, i:i + segment_size].shape[1] == segment_size
        ]
        return segments, segment_size

    def random_pad(self, segment, target_size):
        """
        Randomly pads the input segment to match the target size.
        """
        current_size = segment.shape[1]
        if current_size >= target_size:
            return segment  # No padding needed

        # Calculate padding size
        padding_size = target_size - current_size
        front_pad = random.randint(0, padding_size)
        back_pad = padding_size - front_pad

        return torch.nn.functional.pad(segment, (front_pad, back_pad))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """
        Returns a single pair of lossy and lossless stereo segments.
        """
        return self.data[idx]


In [8]:
lossless_dir = "/home/j597s263/scratch/j597s263/Datasets/Audio/Lossless/temp/"
lossy_dir = "/home/j597s263/scratch/j597s263/Datasets/Audio/Lossy/temp"

# Create the dataset processor
dataset = AudioDataset(lossless_dir, lossy_dir, segment_duration=0.01)

# Process songs one by one
dataset.process_and_add()

# Check the size of the dataset
print(f"Total number of segment pairs: {len(dataset)}")


Processed 10/36 songs...
Processed 20/36 songs...
Processed 30/36 songs...
Dataset created with 675015 valid segment pairs.
Total number of segment pairs: 675015


In [6]:
dataset[1][1].shape

torch.Size([2, 480])

In [9]:
# Memory usage of a single data point
lossy_segment, lossless_segment = dataset[0]
single_pair_memory = (lossy_segment.element_size() * lossy_segment.nelement() +
                      lossless_segment.element_size() * lossless_segment.nelement())  # In bytes

# Total memory usage
total_memory = single_pair_memory * len(dataset)  # In bytes
total_memory_mb = total_memory / (1024 ** 2)  # Convert to MB
print(f"Total memory usage of the dataset: {total_memory_mb:.2f} MB")


Total memory usage of the dataset: 4943.96 MB


In [10]:
dataset[0][0].shape

torch.Size([2, 480])

In [11]:
import pickle

# Path to save the dataset
output_file = '/home/j597s263/scratch/j597s263/Datasets/Audio/Dataset/data_0.01.pkl'

# Save the dataset to a file
with open(output_file, 'wb') as f:
    pickle.dump(dataset.data, f)

print(f"Dataset saved to {output_file}.")


FileNotFoundError: [Errno 2] No such file or directory: '/home/j597s263/scratch/j597s263/Datasets/Audio/Dataset/data_0.01.pkl'