In [None]:
# Import required libraries
import torch
import os
from glob import glob
import soundfile as sf
from tqdm import tqdm 

In [None]:
# Load the Silero Voice Activity Detection (VAD) model from torch hub.
# This model detects speech regions in audio files.
# force_reload=False reuses the copy torch.hub has already cached, so a
# Restart & Run All does not re-download the repository every time (and
# still works offline once cached). Set it to True only when you
# explicitly want to refresh the cached copy.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=False)

# `utils` is a tuple of helper functions; only two of them are used here:
#   position 0 -> get_speech_timestamps: finds speech segments in audio
#   position 2 -> read_audio: loads audio files in the format the VAD expects
# The remaining helpers are discarded.
(get_speech_timestamps, _, read_audio, *_) = utils

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to C:\Users\ganes/.cache\torch\hub\master.zip


In [None]:
# VAD parameters configuration
sampling_rate = 16000  # Standard rate for speech models (16kHz)
buffer_time = 0.3      # Add 300ms padding before/after speech segments
# Convert the padding from seconds to samples. round() is used instead of
# int() because int() truncates: a float product such as 4639.999... would
# silently lose one sample if buffer_time is tuned later.
buffer_samples = round(buffer_time * sampling_rate)

In [None]:
# Where the audio is read from and where the processed copies are written.
# Both paths are relative, so they resolve against the current working directory.
audio_dir = "Dataset/Training/Audio"
output_dir = "Dataset/Training/Audio Denoise VAD"

# Create the destination folder (and any missing parents); an already
# existing folder is not an error.
os.makedirs(name=output_dir, exist_ok=True)

In [None]:
# Merge overlapping or close speech segments
def merge_speech_segments(segments, buffer_samples, total_samples=None):
    """
    Pad each speech segment and combine segments that touch or overlap.

    Every segment is widened by ``buffer_samples`` on both sides. If the
    padded start of a segment falls at or before the end of the previous
    padded segment, the two are fused into one. This prevents choppy audio
    by keeping nearby segments together.

    Parameters
    ----------
    segments : list of dict
        Each dict must have integer ``"start"`` and ``"end"`` keys, expressed
        in the same unit as ``buffer_samples``. Segments are assumed to be
        ordered by start position (only neighbouring entries are compared).
        The input list and its dicts are not modified.
    buffer_samples : int
        Padding added before and after every segment.
    total_samples : int, optional
        Length of the audio the segments refer to. When given, padded ends
        are clamped to this value. When omitted, ends are not clamped, so a
        returned ``"end"`` may exceed the audio length by up to
        ``buffer_samples``.

    Returns
    -------
    list of dict
        New ``{"start": ..., "end": ...}`` dicts describing the padded,
        merged regions; an empty list when ``segments`` is empty.
    """
    if not segments:
        return []

    merged = []
    for segment in segments:
        # Pad the segment; the start never goes below zero.
        start = max(0, segment["start"] - buffer_samples)
        end = segment["end"] + buffer_samples
        # The upper bound comes from the caller rather than from a global
        # variable, so the function does not depend on notebook state.
        if total_samples is not None:
            end = min(end, total_samples)

        if merged and start <= merged[-1]["end"]:
            # Touches or overlaps the previous region: extend that region.
            merged[-1]["end"] = max(merged[-1]["end"], end)
        else:
            # Gap before this segment (or first segment): start a new region.
            merged.append({"start": start, "end": end})

    return merged

In [None]:
# Process all WAV files in the directory
# Sorted so that every run visits the files in the same order.
audio_paths = sorted(glob(os.path.join(audio_dir, "*.wav")))
skipped_files = []  # inputs for which the VAD reported no speech at all

for audio_path in tqdm(audio_paths, desc="Processing Audio Files"):

    # Load audio file at specified sampling rate
    wav = read_audio(audio_path, sampling_rate=sampling_rate)

    # Detect speech segments using VAD model
    speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sampling_rate)

    # No speech detected: nothing is written for this file, but remember it
    # so the omission shows up in the summary instead of passing silently.
    if not speech_timestamps:
        skipped_files.append(audio_path)
        continue

    # Pad the detected segments and merge those that are close together
    merged_speech_timestamps = merge_speech_segments(speech_timestamps, buffer_samples)

    # Keep only the speech regions and join them into one tensor.
    # (At least one segment exists here, so torch.cat never sees an empty list.)
    speech_audio = torch.cat(
        [wav[segment["start"]:segment["end"]] for segment in merged_speech_timestamps]
    )

    # Save to output directory with same filename
    sf.write(os.path.join(output_dir, os.path.basename(audio_path)),
             speech_audio.numpy(), samplerate=sampling_rate)

if not audio_paths:
    # Avoid reporting success when there was nothing to process.
    print(f"No .wav files found in {audio_dir!r}; nothing was processed.")
else:
    print("Speech regions extracted and saved successfully!")

if skipped_files:
    print(f"{len(skipped_files)} file(s) had no detected speech and were not written:")
    for skipped_path in skipped_files:
        print(f"  {skipped_path}")

Processing Audio Files: 100%|██████████| 908/908 [02:14<00:00,  6.74it/s]

Speech regions extracted and saved successfully!



