In [1]:
import os
import tempfile

import torch
import torchaudio
import whisper

from vad import VAD, collect_chunks

# Model setup: the VAD runs on the GPU when one is visible, otherwise on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
vad_model = VAD(device)
whisper_model = whisper.load_model("base")

# Input audio to process (.wav, .mp3, etc.).
AUDIO_PATH = "your_audio_file.wav"

# 1. Run Voice Activity Detection (Silero)
def get_speech_segments(audio_path):
    """Load an audio file and extract its speech regions with the VAD model.

    Audio that is not sampled at 16 kHz is resampled to 16 kHz before the
    VAD runs, instead of being rejected.

    Args:
        audio_path: Path of the audio file, passed to ``torchaudio.load``.

    Returns:
        A ``(chunks, sr)`` tuple: the result of ``collect_chunks`` for the
        detected speech timestamps, and the sample rate of the waveform the
        chunks were taken from (always 16000).
    """
    target_sr = 16000
    wav, sr = torchaudio.load(audio_path)

    # An ``assert`` is stripped under ``python -O`` and would let audio at
    # the wrong rate through silently; resampling makes any rate usable.
    if sr != target_sr:
        wav = torchaudio.functional.resample(
            wav, orig_freq=sr, new_freq=target_sr
        )
        sr = target_sr

    speech_timestamps = vad_model(wav, sampling_rate=sr)
    chunks = collect_chunks(speech_timestamps, wav)
    return chunks, sr

# 2. Transcribe chunks using Whisper
def transcribe_chunks(chunks, sr):
    """Transcribe each speech chunk with Whisper and join the results.

    Every chunk is written to a WAV file inside a private temporary
    directory, transcribed from that file, and the directory is removed
    afterwards, even when saving or transcription raises.

    Args:
        chunks: Iterable of audio chunks accepted by ``torchaudio.save``.
        sr: Sample rate used when writing each chunk to disk.

    Returns:
        The transcripts of all chunks joined by single spaces, with
        surrounding whitespace removed. Chunks that yield no text are
        skipped. Returns ``""`` when ``chunks`` is empty.
    """
    pieces = []
    # A private directory avoids name clashes with other runs or with files
    # already in the working directory, and guarantees cleanup on error.
    with tempfile.TemporaryDirectory(prefix="whisper_chunks_") as tmp_dir:
        for i, chunk in enumerate(chunks):
            # torchaudio.save requires a 2-D (channels, frames) tensor;
            # treat a 1-D chunk as mono.
            if chunk.ndim == 1:
                chunk = chunk.unsqueeze(0)

            chunk_path = os.path.join(tmp_dir, f"temp_chunk_{i}.wav")
            torchaudio.save(chunk_path, chunk, sr)

            # Transcribe with Whisper
            result = whisper_model.transcribe(chunk_path)

            # Strip per-chunk whitespace so joining never yields double
            # spaces between segments.
            text = result['text'].strip()
            if text:
                pieces.append(text)

    return " ".join(pieces)

if __name__ == "__main__":
    # Script entry point: detect speech in AUDIO_PATH, then transcribe it.
    print(f"Running VAD and Whisper on {AUDIO_PATH}...")
    speech_chunks, sample_rate = get_speech_segments(AUDIO_PATH)
    transcript = transcribe_chunks(speech_chunks, sample_rate)
    # Header and transcript on separate lines, exactly as two print calls.
    print("\n--- Transcript ---\n", transcript, sep="\n")


ModuleNotFoundError: No module named 'vad'