<a href="https://colab.research.google.com/github/Maya7991/gsc_classification/blob/main/syn_dataset_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import random
import subprocess
import numpy as np
import soundfile as sf
from TTS.api import TTS
from audiomentations import Compose, AddBackgroundNoise
import librosa

# === CONFIG ===
# Words to synthesise; the generation loop creates one sub-directory of BASE_DIR
# per keyword.
KEYWORDS = ["mask", "frame"]
# How many speaker indices (0 .. NUM_SPEAKERS - 1) are rendered for each keyword.
NUM_SPEAKERS = 5
# Root folder for the generated dataset.
BASE_DIR = "gsc_compatible_keywords"
# Folder handed to AddBackgroundNoise as its pool of noise recordings.
BACKGROUND_NOISE_DIR = "background_noises"
# Sample rate every written clip is resampled to.
TARGET_SR = 16000  # GSC-compatible
# Model identifier passed to the Coqui TTS constructor.
TTS_MODEL = "tts_models/en/vctk/vits"

# Augmentation toggles
APPLY_AUGMENTATIONS = True
# Arguments forwarded to the sox `pitch` effect.
# NOTE(review): sox documents this value as cents, not semitone steps - confirm.
PITCH_SHIFT_STEPS = [-100, 100]
# Arguments forwarded to the sox `speed` effect.
SPEED_FACTORS = [0.9, 1.1]
APPLY_NOISE = True

# === SETUP ===
# Instantiate the synthesiser once; it is reused for every keyword/speaker pair.
tts = TTS(model_name=TTS_MODEL)

# Build the background-noise pipeline only when noise augmentation is enabled;
# otherwise `augment` is left as None.
if APPLY_NOISE:
    augment = Compose([
        AddBackgroundNoise(
            sounds_path=BACKGROUND_NOISE_DIR,
            min_snr_in_db=5.0,
            max_snr_in_db=20.0,
            p=1.0,  # always apply the noise when the pipeline is invoked
        ),
    ])
else:
    augment = None

def convert_to_gsc_format(input_path, output_path):
    """Re-encode an audio file as mono 16-bit PCM at ``TARGET_SR``.

    Args:
        input_path: Path of the audio file to read.
        output_path: Path the converted file is written to.
    """
    # sr=None keeps the file's native rate so the resample step below is the
    # only rate conversion; mono=True downmixes on load.
    waveform, native_sr = librosa.load(input_path, sr=None, mono=True)
    sf.write(
        output_path,
        librosa.resample(waveform, orig_sr=native_sr, target_sr=TARGET_SR),
        TARGET_SR,
        subtype='PCM_16',
    )

def apply_noise_and_convert(input_wav_path, output_path):
    """Mix background noise into a clip and write it as 16-bit PCM at ``TARGET_SR``.

    Args:
        input_wav_path: Path of the clean clip to read.
        output_path: Path the noisy clip is written to.

    Raises:
        RuntimeError: If the module-level ``augment`` pipeline was not built
            (i.e. ``APPLY_NOISE`` was disabled at setup time).
    """
    if augment is None:
        raise RuntimeError(
            "Noise augmentation pipeline is not configured (APPLY_NOISE is disabled)."
        )

    # Read as float32, the sample format audiomentations expects, rather than
    # soundfile's float64 default.
    samples, sr = sf.read(input_wav_path, dtype="float32")

    # soundfile returns (frames, channels) for multi-channel files; collapse to
    # mono so the resample and noise steps operate along the time axis.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    if sr != TARGET_SR:
        # Keyword arguments: recent librosa releases reject positional sample
        # rates, and this matches the call in convert_to_gsc_format.
        samples = librosa.resample(samples, orig_sr=sr, target_sr=TARGET_SR)

    noisy = augment(samples=samples, sample_rate=TARGET_SR)
    sf.write(output_path, noisy, TARGET_SR, subtype='PCM_16')

# === GENERATE SAMPLES ===
# For every keyword/speaker pair: synthesise the word, convert it to the target
# format, optionally derive pitch- and speed-shifted variants with sox, then
# optionally write a "_noisy" copy of every clean clip.

# Multi-speaker Coqui models select a voice by speaker *name*, not by integer
# index, so map the indices 0 .. NUM_SPEAKERS - 1 onto the model's speaker list.
speaker_names = list(tts.speakers or [])
if len(speaker_names) < NUM_SPEAKERS:
    raise ValueError(
        f"Model {TTS_MODEL!r} provides {len(speaker_names)} speakers, "
        f"but NUM_SPEAKERS is {NUM_SPEAKERS}."
    )

# (sox effect name, list of effect arguments) pairs; empty when augmentations
# are switched off so the inner loop simply does nothing.
sox_effects = (
    (("pitch", PITCH_SHIFT_STEPS), ("speed", SPEED_FACTORS))
    if APPLY_AUGMENTATIONS
    else ()
)

for keyword in KEYWORDS:
    keyword_dir = os.path.join(BASE_DIR, keyword)
    os.makedirs(keyword_dir, exist_ok=True)

    for speaker_id in range(NUM_SPEAKERS):
        base_filename = f"{keyword}_speaker{speaker_id}"
        raw_output = f"{base_filename}_raw.wav"
        print(f"Generating: {raw_output}")
        tts.tts_to_file(
            text=keyword,
            speaker=speaker_names[speaker_id],
            file_path=raw_output,
        )

        gsc_output = os.path.join(keyword_dir, f"{base_filename}.wav")
        convert_to_gsc_format(raw_output, gsc_output)
        os.remove(raw_output)  # Clean up raw file

        # Clean clips that will receive a noisy counterpart further down.
        paths_to_augment = [gsc_output]

        # === Pitch & Speed ===
        for effect, values in sox_effects:
            for value in values:
                # sox writes to a scratch file in the working directory; the
                # converted result lands in keyword_dir under the same name.
                aug_path = f"{base_filename}_{effect}{value}.wav"
                # check=True surfaces a failing sox call here instead of as a
                # confusing missing-file error in the conversion step.
                subprocess.run(
                    ["sox", gsc_output, aug_path, effect, str(value)],
                    check=True,
                )
                final_path = os.path.join(keyword_dir, aug_path)
                convert_to_gsc_format(aug_path, final_path)
                os.remove(aug_path)
                paths_to_augment.append(final_path)

        # === Add Noise ===
        if APPLY_NOISE:
            for clean_path in paths_to_augment:
                # Only touch the extension; str.replace would also rewrite any
                # ".wav" occurring earlier in the path.
                stem, ext = os.path.splitext(clean_path)
                noisy_path = f"{stem}_noisy{ext}"
                apply_noise_and_convert(clean_path, noisy_path)

print("✅ Dataset is ready in GSC-compatible format.")
