In [None]:
import torchaudio

# Open the "train-clean-360" LibriSpeech subset rooted at ./clean,
# downloading it first when it is not already there.
train_clean_360 = torchaudio.datasets.LIBRISPEECH(
    "./clean",
    url="train-clean-360",
    download=True,
)

95.3%


KeyboardInterrupt: 

In [None]:
import torch
import random

def get_random_utterance(dataset):
    """Return one uniformly chosen item of ``dataset``, trimmed to three fields.

    Args:
        dataset: Indexable collection supporting ``len()`` whose items unpack
            into exactly six values; only the first three are used.

    Returns:
        Tuple ``(waveform, sample_rate, utterance)`` taken from the chosen item.
    """
    last_index = len(dataset) - 1
    chosen_item = dataset[random.randint(0, last_index)]

    # The item must have exactly six fields; the last three are discarded.
    waveform, sample_rate, utterance, _, _, _ = chosen_item
    return waveform, sample_rate, utterance

def get_random_pair(dataset):
    """Pick a source utterance and a *different* utterance to act as noise.

    Two independent draws can land on the same item, in which case the
    "noise" is the source itself. To rule that out, two distinct indices are
    sampled here directly (``get_random_utterance`` does not expose the index
    it picked, so it cannot be used to enforce distinctness).

    Args:
        dataset: Indexable collection supporting ``len()`` whose items unpack
            into exactly six values; the first three are treated as
            ``(waveform, sample_rate, utterance)``.

    Returns:
        Tuple ``(source_waveform, noise_waveform, sample_rate,
        source_utterance)``. ``sample_rate`` and ``source_utterance`` come from
        the source item; the noise item's sample rate is not checked.

    Raises:
        ValueError: If ``dataset`` is empty.
    """
    num_items = len(dataset)
    if num_items == 0:
        raise ValueError("cannot draw an utterance pair from an empty dataset")

    if num_items == 1:
        # Only one utterance exists, so a distinct noise item is impossible;
        # fall back to pairing the item with itself.
        source_index = noise_index = 0
    else:
        # Sampling without replacement guarantees the two indices differ.
        source_index, noise_index = random.sample(range(num_items), 2)

    source_waveform, sample_rate, source_utterance, _, _, _ = dataset[source_index]
    noise_waveform, _, _, _, _, _ = dataset[noise_index]
    return source_waveform, noise_waveform, sample_rate, source_utterance

def mix_utterances(source_waveform, noise_waveform):
    """Average a randomly placed window of the source with the noise signal.

    A window length between 0% and 80% of the source length is drawn at
    random. The noise is cropped or zero-padded on the right to that length,
    then averaged with the source over a randomly positioned window that
    starts no earlier than one tenth of the way into the source. Samples
    outside the window are copied from the source unchanged.

    Args:
        source_waveform: Tensor indexed as ``[:, samples]``; not modified.
        noise_waveform: Tensor indexed as ``[:, samples]``.
            # NOTE(review): assumes the leading dimension is compatible with
            # the source's (e.g. both mono) - confirm against callers.

    Returns:
        A new tensor with the same shape as ``source_waveform``.
    """
    num_source_samples = source_waveform.shape[1]

    # Draw the end fraction in [0.1, 0.9]; the window spans the part past 0.1.
    lead_in = 0.1
    drawn_end = random.uniform(lead_in, 1 - lead_in)
    window_len = int((drawn_end - lead_in) * num_source_samples)

    # Bring the noise to exactly `window_len` samples.
    noise_len = noise_waveform.shape[1]
    if noise_len > window_len:
        noise_segment = noise_waveform[:, :window_len]
    elif noise_len < window_len:
        noise_segment = torch.nn.functional.pad(noise_waveform, (0, window_len - noise_len))
    else:
        noise_segment = noise_waveform

    # Place the window: start in the first tenth is excluded, and the window
    # must end inside the source.
    first = random.randint(num_source_samples // 10, num_source_samples - window_len)
    window = slice(first, first + window_len)

    # Averaging (rather than summing) keeps the result within the input range.
    mixed_waveform = source_waveform.clone()
    mixed_waveform[:, window] = (mixed_waveform[:, window] + noise_segment) / 2
    return mixed_waveform


### Training Data

In [None]:
import os
import torchaudio

# Open LibriSpeech rooted at the current directory, downloading it if absent.
# No `url` is passed here, so torchaudio's default subset is used.
dataset_path = "."
librispeech_dataset = torchaudio.datasets.LIBRISPEECH(dataset_path, download=True)

# Output tree: ./LIBRISPEECH2mix/train receives the generated training files.
output_dir = os.path.join(".", "LIBRISPEECH2mix")
train_output_dir = os.path.join(output_dir, "train")
os.makedirs(output_dir, exist_ok=True)
os.makedirs(train_output_dir, exist_ok=True)

# Write 100 (mixture, clean source) pairs, numbered 0-99. Each pair is drawn
# at random, so re-running the cell overwrites the files with new samples.
for i in range(100):
    source_waveform, noise_waveform, sample_rate, _ = get_random_pair(librispeech_dataset)
    mixed_waveform = mix_utterances(source_waveform, noise_waveform)

    mixed_path = os.path.join(train_output_dir, f"mixed_train_{i}.wav")
    source_path = os.path.join(train_output_dir, f"source_train_{i}.wav")

    torchaudio.save(mixed_path, mixed_waveform, sample_rate)
    torchaudio.save(source_path, source_waveform, sample_rate)


### Eval Data

In [None]:
# Create a directory for evaluation samples
eval_output_dir = os.path.join(".", "LIBRISPEECH2mix", "eval")
os.makedirs(eval_output_dir, exist_ok=True)

# Write 100 (mixture, transcript) pairs, numbered 0-99. Only the mixture audio
# and the source utterance's transcript are stored; the clean source waveform
# is not saved for evaluation samples.
# NOTE(review): samples are drawn from `librispeech_dataset`, the same dataset
# object the training cell draws from, so evaluation utterances may overlap
# with training ones - confirm whether a held-out subset is intended.
for i in range(100):  # assuming 100 evaluation samples
    source_waveform, noise_waveform, sample_rate, source_utterance = get_random_pair(librispeech_dataset)
    mixed_waveform = mix_utterances(source_waveform, noise_waveform)

    mixed_path = os.path.join(eval_output_dir, f"mixed_eval_{i}.wav")
    torchaudio.save(mixed_path, mixed_waveform, sample_rate)

    # Pin the encoding so the transcript file does not depend on the
    # platform's locale default.
    text_path = os.path.join(eval_output_dir, f"mixed_eval_{i}.txt")
    with open(text_path, "w", encoding="utf-8") as f:
        f.write(source_utterance)
