# WebRTC

In [1]:
# !apt-get install ffmpeg

In [5]:
import webrtcvad
import collections
import contextlib
import wave
import os
import subprocess
from pydub import AudioSegment

# Convert MP3 to WAV
def convert_mp3_to_wav(mp3_path, wav_path):
    """Decode an MP3 file and write it out as a 16 kHz mono WAV file.

    Args:
        mp3_path: Path of the MP3 file to read.
        wav_path: Path of the WAV file to create.
    """
    mono_16k = (
        AudioSegment.from_mp3(mp3_path)
        .set_frame_rate(16000)  # resample first, as the VAD below expects 16 kHz
        .set_channels(1)        # then downmix to a single channel
    )
    mono_16k.export(wav_path, format="wav")

# Frame class to hold audio data
class Frame:
    """A slice of raw audio together with its position in the stream.

    Attributes are set directly from the constructor arguments:
    ``bytes`` (the raw audio data), ``timestamp`` and ``duration``.
    """

    def __init__(self, bytes, timestamp, duration):
        # The parameter name `bytes` is part of the public signature and is
        # kept even though it shadows the builtin inside this method.
        self.bytes, self.timestamp, self.duration = bytes, timestamp, duration

def read_wave(path):
    """Read a WAV file and return its raw PCM bytes and sample rate.

    Only mono, 16-bit audio sampled at 8000, 16000, 32000 or 48000 Hz is
    accepted.

    Args:
        path: Path of the WAV file to read.

    Returns:
        A ``(pcm_data, sample_rate)`` tuple: the frames of the file as a
        ``bytes`` object and the sample rate in Hz.

    Raises:
        ValueError: If the channel count, sample width or sample rate is not
            one of the accepted values. The checks are explicit raises rather
            than ``assert`` statements so they still run under ``python -O``.
    """
    with wave.open(path, 'rb') as wf:
        num_channels = wf.getnchannels()
        if num_channels != 1:
            raise ValueError(
                f"Expected mono audio, got {num_channels} channels: {path}"
            )
        sample_width = wf.getsampwidth()
        if sample_width != 2:
            raise ValueError(
                f"Expected 16-bit samples, got {8 * sample_width}-bit: {path}"
            )
        sample_rate = wf.getframerate()
        if sample_rate not in (8000, 16000, 32000, 48000):
            raise ValueError(
                f"Unsupported sample rate {sample_rate} Hz: {path}"
            )
        pcm_data = wf.readframes(wf.getnframes())
    return pcm_data, sample_rate

def frame_generator(frame_duration_ms, audio, sample_rate):
    """Yield consecutive fixed-length ``Frame`` objects cut from PCM audio.

    The frame size is computed assuming 2 bytes per sample (16-bit audio).
    A trailing remainder shorter than one full frame is discarded.

    Args:
        frame_duration_ms: Length of each frame in milliseconds.
        audio: Raw PCM data (a bytes-like object that supports slicing).
        sample_rate: Sample rate of ``audio`` in Hz.

    Yields:
        ``Frame`` objects whose ``timestamp`` and ``duration`` are in seconds.

    Raises:
        ValueError: If the requested frame length works out to zero bytes
            (the loop could otherwise never advance).
    """
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    if n <= 0:
        raise ValueError(
            f"Frame size must be positive, got {n} bytes "
            f"(frame_duration_ms={frame_duration_ms}, sample_rate={sample_rate})"
        )
    offset = 0
    timestamp = 0.0
    duration = (float(n) / sample_rate) / 2.0
    # `<=` so that a final frame that fits exactly is yielded too; a strict
    # `<` silently dropped it when len(audio) was a multiple of n.
    while offset + n <= len(audio):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
        offset += n

def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
    """Group per-frame VAD decisions into voiced time intervals.

    A sliding window of the most recent frames is kept. While idle, an
    interval starts once more than 90% of the window is voiced; its start is
    the timestamp of the oldest frame in the window. While inside an
    interval, it ends once more than 90% of the window is unvoiced; its end
    is the end of the current frame. The window is emptied on every switch.

    Args:
        sample_rate: Sample rate in Hz, passed through to ``vad.is_speech``.
        frame_duration_ms: Duration of one frame in milliseconds.
        padding_duration_ms: Window length in milliseconds; the window holds
            ``int(padding_duration_ms / frame_duration_ms)`` frames. If that
            is 0 the threshold can never be exceeded and no interval is
            produced.
        vad: Object exposing ``is_speech(frame_bytes, sample_rate)``.
        frames: Iterable of objects with ``bytes``, ``timestamp`` and
            ``duration`` attributes.

    Returns:
        A list of ``(start, end)`` tuples in the units of the frame
        timestamps. An interval still open when the frames run out is closed
        at the end of the last frame.
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    threshold = 0.9 * num_padding_frames

    intervals = []
    segment_start = None  # None while idle, start timestamp while in speech
    frame = None

    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)
        ring_buffer.append((frame, is_speech))

        if segment_start is None:
            num_voiced = sum(1 for _, speech in ring_buffer if speech)
            if num_voiced > threshold:
                # Back-date the start to the oldest buffered frame.
                segment_start = ring_buffer[0][0].timestamp
                ring_buffer.clear()
        else:
            num_unvoiced = sum(1 for _, speech in ring_buffer if not speech)
            if num_unvoiced > threshold:
                intervals.append((segment_start, frame.timestamp + frame.duration))
                segment_start = None
                ring_buffer.clear()

    # Close an interval left open at the end of the stream. `frame` is
    # necessarily set here, because an interval can only open inside the loop.
    if segment_start is not None:
        intervals.append((segment_start, frame.timestamp + frame.duration))

    return intervals

def save_audio_with_intervals(wav_path, intervals, output_with_voice, output_with_silence):
    """Write two WAV files derived from the detected intervals.

    ``output_with_voice`` contains only the audio inside the intervals
    (silence everywhere else); ``output_with_silence`` contains the original
    audio with the intervals blanked out.

    Args:
        wav_path: Path of the WAV file to read.
        intervals: Iterable of ``(start, end)`` pairs in seconds.
        output_with_voice: Destination path for the voice-only file.
        output_with_silence: Destination path for the voice-removed file.
    """
    audio = AudioSegment.from_wav(wav_path)

    # Same frame rate as the source, so overlaying does not need to resample.
    audio_with_voice = AudioSegment.silent(duration=len(audio), frame_rate=audio.frame_rate)
    audio_with_silence = audio

    for start, end in intervals:
        start_ms = int(start * 1000)
        end_ms = int(end * 1000)
        segment = audio[start_ms:end_ms]
        if end_ms <= start_ms or len(segment) == 0:
            continue

        audio_with_voice = audio_with_voice.overlay(segment, position=start_ms)

        # Overlaying silence only mixes zeros into the signal and leaves it
        # audible, so the span is replaced by zeroed audio of identical
        # byte length instead.
        blank = AudioSegment(
            data=bytes(len(segment.raw_data)),
            sample_width=segment.sample_width,
            frame_rate=segment.frame_rate,
            channels=segment.channels,
        )
        audio_with_silence = (
            audio_with_silence[:start_ms] + blank + audio_with_silence[end_ms:]
        )

    audio_with_voice.export(output_with_voice, format="wav")
    audio_with_silence.export(output_with_silence, format="wav")

# Paths
mp3_path = os.path.join("data","XC240120 - Soundscape.mp3")  # Input recording
wav_path = "temp_audio.wav"  # Intermediate file, removed at the end
output_with_voice = "output_with_voice_WebRTC.wav"
output_with_silence = "output_with_silence_WebRTC.wav"

# VAD settings
VAD_MODE = 3               # value passed to webrtcvad.Vad
FRAME_DURATION_MS = 30     # length of each analysed frame
PADDING_DURATION_MS = 300  # length of the smoothing window in vad_collector

try:
    # Convert MP3 to WAV
    convert_mp3_to_wav(mp3_path, wav_path)

    # Read WAV file
    audio, sample_rate = read_wave(wav_path)

    # Initialize VAD
    vad = webrtcvad.Vad(VAD_MODE)

    # Generate frames
    frames = list(frame_generator(FRAME_DURATION_MS, audio, sample_rate))

    # Collect voiced segments
    intervals = vad_collector(sample_rate, FRAME_DURATION_MS, PADDING_DURATION_MS, vad, frames)

    # Print intervals
    for start, end in intervals:
        print(f"Start: {start:.2f}s, End: {end:.2f}s")

    # Save new audio files with intervals
    save_audio_with_intervals(wav_path, intervals, output_with_voice, output_with_silence)
finally:
    # Cleanup temporary WAV file, also when one of the steps above failed
    if os.path.exists(wav_path):
        os.remove(wav_path)

Start: 10.41s, End: 14.61s
Start: 15.57s, End: 17.31s
Start: 19.95s, End: 22.80s
Start: 63.36s, End: 64.62s
Start: 65.97s, End: 69.69s
Start: 70.68s, End: 72.45s
Start: 73.11s, End: 75.27s
Start: 75.81s, End: 76.80s
Start: 80.43s, End: 82.17s
Start: 82.32s, End: 83.82s
Start: 86.07s, End: 87.09s
Start: 88.65s, End: 91.62s
Start: 93.45s, End: 95.85s
Start: 97.38s, End: 98.94s
Start: 100.08s, End: 102.18s
Start: 102.51s, End: 103.41s
Start: 104.49s, End: 105.12s
Start: 114.93s, End: 116.67s
Start: 121.05s, End: 121.86s
Start: 133.65s, End: 134.61s
Start: 137.19s, End: 138.30s
Start: 138.48s, End: 139.74s
Start: 140.64s, End: 144.33s
Start: 145.02s, End: 147.66s
Start: 148.23s, End: 150.15s
Start: 153.84s, End: 156.57s
Start: 157.02s, End: 159.60s
Start: 159.87s, End: 160.62s
Start: 161.07s, End: 163.05s
Start: 163.56s, End: 165.18s
Start: 180.66s, End: 181.26s
Start: 188.85s, End: 190.74s
Start: 191.46s, End: 192.45s
Start: 192.90s, End: 196.86s
Start: 201.63s, End: 204.45s
Start: 204.45

# Hugging Face Transformers

In [None]:
from pyannote.audio import Pipeline
from pydub import AudioSegment
import numpy as np
import os
from huggingface_hub import login
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import pipeline as hf_pipeline

# Read your Hugging Face token from environment. Do NOT hardcode secrets.
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
if not HF_TOKEN:
    raise RuntimeError(
        "Missing Hugging Face token. Set HF_TOKEN or HUGGINGFACE_HUB_TOKEN in your environment."
    )

# Register the token for this session. A failure here is reported but not
# fatal, because the token is also passed explicitly to the loader below.
try:
    login(HF_TOKEN, add_to_git_credential=False)
except Exception as e:
    print(f"Warning: Hugging Face login failed; continuing with the explicit token: {e}")

# Model identifier for VAD.
# NOTE(review): confirm this repository id exists on the Hub and that access
# to it has been granted for the token's account.
model_name = "pyannote/voice-activity-detection-3.1"

# Load pyannote pipeline with token.
# NOTE(review): the keyword is `token` here; other pyannote.audio releases
# appear to expect `use_auth_token` instead — confirm against the installed version.
pipeline = Pipeline.from_pretrained(model_name, token=HF_TOKEN)
if pipeline is None:
    # NOTE(review): presumably some releases return None rather than raising
    # when the download fails; fail here instead of at the first call.
    raise RuntimeError(f"Could not load pipeline '{model_name}'.")

# Function to load and preprocess audio
def load_and_preprocess_audio(audio_path):
    """Load an audio file and prepare it for VAD.

    The audio is converted to mono, 16 kHz, 16-bit samples and then scaled
    to float32 by dividing by the int16 maximum.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        A ``(samples, audio, frame_rate)`` tuple: the float32 NumPy array of
        samples, the converted ``AudioSegment`` and its frame rate. If the
        file cannot be loaded, an error is printed and
        ``(None, None, None)`` is returned.
    """
    try:
        audio = AudioSegment.from_file(audio_path)
    except Exception as e:
        print(f"Error loading audio file {audio_path}: {e}")
        return None, None, None

    audio = audio.set_channels(1)  # Convert to mono
    audio = audio.set_frame_rate(16000)  # Resample to 16kHz
    # Force 16-bit samples so that dividing by the int16 maximum below is
    # valid whatever the bit depth of the source file.
    audio = audio.set_sample_width(2)
    samples = np.array(audio.get_array_of_samples())
    samples = samples.astype(np.float32) / np.iinfo(np.int16).max  # Normalize
    return samples, audio, audio.frame_rate

# Function to perform VAD
def detect_speech_intervals(samples, sample_rate, pipeline):
    """Run voice activity detection on in-memory audio.

    Args:
        samples: 1-D NumPy array of audio samples; it is wrapped into a
            tensor with a leading dimension of size 1 before being passed on.
        sample_rate: Sample rate of ``samples`` in Hz.
        pipeline: Callable that accepts a dict with ``"waveform"`` and
            ``"sample_rate"`` keys and returns an object exposing
            ``get_timeline()``.

    Returns:
        An ``(intervals, duration)`` tuple. ``intervals`` is a list of
        ``(start, end)`` pairs taken from the timeline segments; it is empty
        when the pipeline call fails (the error is printed). ``duration`` is
        ``len(samples) / sample_rate`` in seconds, or 0.0 when that cannot be
        computed.
    """
    # Previously a constant 0.0 was returned; derive the real length instead.
    if samples is not None and sample_rate:
        duration = len(samples) / float(sample_rate)
    else:
        duration = 0.0

    try:
        # The pipeline expects a dictionary with 'waveform' and 'sample_rate'
        input_data = {"waveform": torch.from_numpy(samples).unsqueeze(0), "sample_rate": sample_rate}
        vad_result = pipeline(input_data)
    except Exception as e:
        print(f"Error during VAD: {e}")
        return [], duration

    intervals = [(segment.start, segment.end) for segment in vad_result.get_timeline()]
    return intervals, duration

# Function to save audio with intervals
def save_audio_with_intervals(audio, intervals, output_with_voice, output_with_silence, duration):
    """Write two WAV files derived from the detected intervals.

    ``output_with_voice`` contains only the audio inside the intervals
    (silence everywhere else); ``output_with_silence`` contains the original
    audio with the intervals blanked out.

    Args:
        audio: The ``AudioSegment`` to cut from.
        intervals: Iterable of ``(start, end)`` pairs in seconds.
        output_with_voice: Destination path for the voice-only file.
        output_with_silence: Destination path for the voice-removed file.
        duration: Unused; kept so existing calls keep working.
    """
    # Same frame rate as the source, so overlaying does not need to resample.
    audio_with_voice = AudioSegment.silent(duration=len(audio), frame_rate=audio.frame_rate)
    audio_with_silence = audio

    for start, end in intervals:
        start_ms = int(start * 1000)
        end_ms = int(end * 1000)
        segment = audio[start_ms:end_ms]
        if end_ms <= start_ms or len(segment) == 0:
            continue

        audio_with_voice = audio_with_voice.overlay(segment, position=start_ms)

        # Overlaying silence only mixes zeros into the signal and leaves it
        # audible, so the span is replaced by zeroed audio of identical
        # byte length instead.
        blank = AudioSegment(
            data=bytes(len(segment.raw_data)),
            sample_width=segment.sample_width,
            frame_rate=segment.frame_rate,
            channels=segment.channels,
        )
        audio_with_silence = (
            audio_with_silence[:start_ms] + blank + audio_with_silence[end_ms:]
        )

    audio_with_voice.export(output_with_voice, format="wav")
    audio_with_silence.export(output_with_silence, format="wav")

# Paths
# Update this to a valid local path on your machine
mp3_path = os.path.join("data","XC240120 - Soundscape.mp3")

# Validate input path early
if not os.path.isfile(mp3_path):
    raise FileNotFoundError(
        f"Audio file not found: {mp3_path}. Update 'mp3_path' to a valid local file."
    )
output_with_voice = "output_with_voice_huggingface.wav"
output_with_silence = "output_with_silence_huggingface.wav"

# Load and preprocess audio
samples, audio_segment, sample_rate = load_and_preprocess_audio(mp3_path)
if audio_segment is None:
    # The loader reports decoding problems by returning None values; stop
    # here rather than failing later with an unrelated error.
    raise RuntimeError(f"Could not decode audio file: {mp3_path}")

# Detect speech intervals
speech_intervals, duration = detect_speech_intervals(samples, sample_rate, pipeline)
# Take the duration from the decoded audio (pydub lengths are in milliseconds)
duration = len(audio_segment) / 1000.0

print(speech_intervals)

# Save new audio files with intervals
save_audio_with_intervals(audio_segment, speech_intervals, output_with_voice, output_with_silence, duration)

  from .autonotebook import tqdm as notebook_tqdm
torchcodec is not installed correctly so built-in audio decoding will fail. Solutions are:
* use audio preloaded in-memory as a {'waveform': (channel, time) torch.Tensor, 'sample_rate': int} dictionary;
* fix torchcodec installation. Error message was:

Could not load libtorchcodec. Likely causes:
          1. FFmpeg is not properly installed in your environment. We support
             versions 4, 5, 6 and 7.
          2. The PyTorch version (2.8.0+cpu) is not compatible with
             this version of TorchCodec. Refer to the version compatibility
             table:
             https://github.com/pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec.
          3. Another runtime dependency; see exceptions below.
        The following exceptions were raised as we tried to load libtorchcodec:
        
[start of libtorchcodec loading traceback]
FFmpeg version 7: Could not find module 'C:\Users\USER\anaconda3\envs\torchenv\Lib\s

ValueError: Revisions must be passed with `revision` keyword argument.

## Old version performance evaluation

In [None]:
from pyannote.audio import Pipeline
from pydub import AudioSegment
import numpy as np
import os  # For file existence check

In [None]:
# Load the pre-trained VAD pipeline
try:
    pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=True)
except Exception as e:
    print(f"Error loading pipeline: {e}")
    # `exit()` is an interactive helper and reports success; raise
    # SystemExit with a non-zero status so the failure is visible.
    raise SystemExit(1) from e

# Function to load audio
def load_audio(audio_path):
    """Return the file at ``audio_path`` decoded as an ``AudioSegment``."""
    return AudioSegment.from_file(audio_path)

# Function to perform VAD
def detect_speech_intervals(audio_path, pipeline):
    """Run the VAD pipeline on a file and collect its speech segments.

    Args:
        audio_path: Path of the audio file; used both as the ``"uri"`` and
            the ``"audio"`` entry handed to the pipeline.
        pipeline: Callable returning an object that exposes ``get_timeline()``.

    Returns:
        An ``(intervals, duration)`` tuple: a list of ``(start, end)`` pairs
        taken from the timeline segments, and the length of the decoded
        audio in seconds.
    """
    # pydub reports lengths in milliseconds.
    duration = len(load_audio(audio_path)) / 1000.0
    vad_result = pipeline({"uri": audio_path, "audio": audio_path})

    intervals = [(segment.start, segment.end) for segment in vad_result.get_timeline()]
    return intervals, duration

# Function to load manual labels from file
def load_manual_labels(label_file):
    """Load manually annotated intervals from a tab-separated text file.

    Each non-blank line must hold at least a start and an end time,
    separated by tabs; any further columns (such as the label text) are
    ignored. Blank lines are skipped.

    Args:
        label_file: Path of the label file.

    Returns:
        A list of ``(start, end)`` float tuples, or ``None`` (after printing
        an error) if the file is missing or a line cannot be parsed.
    """
    labels = []
    try:
        with open(label_file, 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    continue  # tolerate empty lines, e.g. at the end of the file
                fields = stripped.split('\t')
                # A row with an empty label loses its trailing tab in strip(),
                # leaving two fields, so only start and end are required.
                if len(fields) < 2:
                    raise ValueError(f"too few columns: {stripped!r}")
                labels.append((float(fields[0]), float(fields[1])))
    except FileNotFoundError:
        print(f"Error: Label file not found: {label_file}")
        return None
    except ValueError:
        print(f"Error: Invalid format in label file {label_file}.  Expecting tab-separated start, end, label.")
        return None
    return labels

# Function to calculate precision and recall
def calculate_precision_recall_f1(vad_intervals, manual_intervals):
    """Score detected intervals against reference intervals.

    Both interval lists are discretised into integer ticks of 1/100 second
    (``int(t * 100)``, end exclusive) and compared as sets.

    Args:
        vad_intervals: Detected ``(start, end)`` pairs in seconds.
        manual_intervals: Reference ``(start, end)`` pairs in seconds.

    Returns:
        A ``(precision, recall, f1)`` tuple; any value whose denominator
        would be zero is returned as 0.
    """

    def intervals_to_set(intervals):
        # One entry per 10 ms tick covered by any interval.
        ticks = set()
        for begin, finish in intervals:
            ticks.update(range(int(begin * 100), int(finish * 100)))
        return ticks

    predicted = intervals_to_set(vad_intervals)
    reference = intervals_to_set(manual_intervals)

    hits = len(predicted & reference)
    n_predicted = len(predicted)
    n_reference = len(reference)

    if n_predicted > 0:
        precision = hits / n_predicted
    else:
        precision = 0

    if n_reference > 0:
        recall = hits / n_reference
    else:
        recall = 0

    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0

    return precision, recall, f1

# Function to save VAD intervals to a file, including performance metrics
def save_vad_intervals(vad_intervals, manual_intervals, output_file):
    """Write the detected intervals and their scores to a text file.

    The file starts with ``#``-prefixed header lines carrying precision,
    recall and F1 (computed against ``manual_intervals``), followed by one
    tab-separated ``start  end  VAD`` line per detected interval. The same
    scores are printed once the file has been written.

    Args:
        vad_intervals: Detected ``(start, end)`` pairs.
        manual_intervals: Reference ``(start, end)`` pairs used for scoring.
        output_file: Path of the text file to create or overwrite.

    Any exception raised along the way is caught and reported with a
    printed message.
    """
    try:
        with open(output_file, 'w') as f:
            # Scores are computed after the file has been opened.
            precision, recall, f1 = calculate_precision_recall_f1(vad_intervals, manual_intervals)

            # Header block: metrics first, then the column names.
            header = [
                f"# Precision: {precision:.4f}\n",
                f"# Recall: {recall:.4f}\n",
                f"# F1-score: {f1:.4f}\n",
                "# start\tend\tVAD\n",
            ]
            f.writelines(header)

            # One row per detected interval.
            f.writelines(f"{start}\t{end}\tVAD\n" for start, end in vad_intervals)

        print(f"VAD intervals and performance metrics saved to {output_file}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-score: {f1:.4f}")
    except Exception as e:
        print(f"Error saving VAD intervals: {e}")

# Function to save audio with intervals
def save_audio_with_intervals(audio, intervals, output_with_voice, output_with_silence, duration):
    """Write two WAV files derived from the detected intervals.

    ``output_with_voice`` contains only the audio inside the intervals
    (silence everywhere else); ``output_with_silence`` contains the original
    audio with the intervals blanked out.

    Args:
        audio: The ``AudioSegment`` to cut from.
        intervals: Iterable of ``(start, end)`` pairs in seconds.
        output_with_voice: Destination path for the voice-only file.
        output_with_silence: Destination path for the voice-removed file.
        duration: Unused; kept so existing calls keep working.
    """
    # Same frame rate as the source, so overlaying does not need to resample.
    audio_with_voice = AudioSegment.silent(duration=len(audio), frame_rate=audio.frame_rate)
    audio_with_silence = audio

    for start, end in intervals:
        start_ms = int(start * 1000)
        end_ms = int(end * 1000)
        segment = audio[start_ms:end_ms]
        if end_ms <= start_ms or len(segment) == 0:
            continue

        audio_with_voice = audio_with_voice.overlay(segment, position=start_ms)

        # Overlaying silence only mixes zeros into the signal and leaves it
        # audible, so the span is replaced by zeroed audio of identical
        # byte length instead.
        blank = AudioSegment(
            data=bytes(len(segment.raw_data)),
            sample_width=segment.sample_width,
            frame_rate=segment.frame_rate,
            channels=segment.channels,
        )
        audio_with_silence = (
            audio_with_silence[:start_ms] + blank + audio_with_silence[end_ms:]
        )

    audio_with_voice.export(output_with_voice, format="wav")
    audio_with_silence.export(output_with_silence, format="wav")

# Paths
mp3_path = "drive/MyDrive/datasets/audio_ambiental_narracion/XC237810_1 - Soundscape.mp3"  # Replace with your path
manual_label_file = "drive/MyDrive/datasets/audio_ambiental_narracion/Labels_XC237810_1.txt"  # Replace with your manual label file path
output_with_voice = "output_with_voice_huggingface.wav"
output_with_silence = "output_with_silence_huggingface.wav"
vad_output_file = "Labels_XC237810_1_detection.txt"  # The file to save the VAD intervals

# Check if the input file exists.
# Failure paths raise SystemExit(1) instead of calling `exit()`, which is an
# interactive helper and reports success.
if not os.path.exists(mp3_path):
    print(f"Error: Audio file not found at {mp3_path}")
    raise SystemExit(1)

# Detect speech intervals
speech_intervals, duration = detect_speech_intervals(mp3_path, pipeline)
print(f"VAD Intervals (Pipeline): {speech_intervals}")

# Load manual labels (the loader prints the reason and returns None on failure)
manual_intervals = load_manual_labels(manual_label_file)
if manual_intervals is None:
    raise SystemExit(1)
print(f"Manual Intervals: {manual_intervals}")

# Save the VAD intervals to file, including performance metrics
save_vad_intervals(speech_intervals, manual_intervals, vad_output_file)

# Load audio
audio_segment = load_audio(mp3_path)

# Save new audio files with intervals
save_audio_with_intervals(audio_segment, speech_intervals, output_with_voice, output_with_silence, duration)
print("Finished!")

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/utilities/migration/migration.py:208: You have multiple `ModelCheckpoint` callback states in this checkpoint, but we found state keys that would end up colliding with each other after an upgrade, which means we can't differentiate which of your checkpoint callbacks needs which states. At least one of your `ModelCheckpoint` callbacks will not be able to reload the state.
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.1.3 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/059e96f964841d40f1a5e755bb7223f76666bba4/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.7.1, yours is 2.5.1+cu124. Bad things might happen unless you revert torch to 1.x.
VAD Intervals (Pipeline): [(1.16159375, 4.840343750000001), (5.24534375, 9.329093750000002), (10.52721875, 15.809093750000002), (17.395343750000002, 20.669093750000002), (21.17534375, 23.21721875), (24.48284375, 28.515968750000003), (29.309093750000002, 37.25721875), (39.36659375, 41.03721875), (42.06659375, 45.188468750000006), (47.98971875, 48.985343750000006), (50.70659375, 52.73159375), (166.84034375000002, 168.86534375000002), (238.01909375000002, 241.00596875000002), (256.56471875, 259.50096875), (259.72034375000004, 262.74096875000004), (265.25534375, 265.72784375000003), (270.11534375, 272.20784375), (272.41034375000004, 282.34971875), (494.33346875, 495.46409375), (523.71284375, 526.76721875), (527.8640937499999, 529.9734687499999), (530.445