In [None]:
pip install numpy soundfile




In [None]:
import numpy as np
import soundfile as sf
from IPython.display import Audio, display

# ====== SETTINGS ======
duration = 3         # length of the tone in seconds
sample_rate = 16000  # samples per second (16 kHz)
frequency = 440      # tone frequency in Hz (A4; can be changed)

# ====== GENERATE SINE WAVE ======
# One timestamp per sample over [0, duration); the end point is excluded.
num_samples = int(sample_rate * duration)
t = np.linspace(0, duration, num_samples, endpoint=False)
# Sine wave at `frequency` Hz, scaled by 0.5.
omega = 2 * np.pi * frequency
audio_data = 0.5 * np.sin(omega * t)

# ====== SAVE AUDIO FILE ======
output_file = "generated_audio.wav"
sf.write(output_file, audio_data, sample_rate)

# ====== PRINT OUTPUT ======
print("Audio file created successfully!\nSaved as:", output_file)

# ====== PLAY AUDIO IN COLAB ======
display(Audio(output_file, autoplay=True))


Audio file created successfully!
Saved as: generated_audio.wav


In [None]:
!pip install soundfile mutagen


Collecting mutagen
  Downloading mutagen-1.47.0-py3-none-any.whl.metadata (1.7 kB)
Downloading mutagen-1.47.0-py3-none-any.whl (194 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.4/194.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mutagen
Successfully installed mutagen-1.47.0


In [None]:
import numpy as np
import soundfile as sf
from IPython.display import Audio, display
from mutagen.wave import WAVE
from mutagen.id3 import TextFrame

# ====== SETTINGS ======
duration = 3         # seconds
sample_rate = 16000  # samples per second (16 kHz)
frequency = 440      # sine tone frequency in Hz

# ====== GENERATE SINE WAVE ======
num_samples = int(sample_rate * duration)
t = np.linspace(0, duration, num_samples, endpoint=False)
audio_data = 0.5 * np.sin(2 * np.pi * frequency * t)

# ====== SAVE AUDIO FILE ======
output_file = "generated_audio_with_info.wav"
sf.write(output_file, audio_data, sample_rate)

# ====== ADD METADATA ======
metadata = WAVE(output_file)

# Tag key -> text value; each value is wrapped in a TextFrame below.
tag_values = {
    "INAM": "Sample Tone Audio",  # title or name of the audio
    "IART": "Your Name",          # artist
    "ICMT": "This is a generated audio tone with embedded metadata.",  # comments
    "ICRD": "2025-02-14",         # creation date
}
for tag_key, tag_text in tag_values.items():
    metadata[tag_key] = TextFrame(encoding=3, text=[tag_text])
metadata.save()

# ====== PLAY AUDIO ======
print("Audio file created with metadata!\n")
display(Audio(output_file, autoplay=False))

# ====== READ AND PRINT METADATA ======
# NOTE(review): this loop reads the in-memory `metadata` object populated above,
# not a fresh read of the saved file - confirm the tags round-trip through disk.
print("=== Embedded Metadata in WAV File ===")
for key, value in metadata.items():
    # Values that expose `.text` print their first string; anything else prints as-is.
    shown = value.text[0] if hasattr(value, "text") else value
    print(f"{key}: {shown}")

Audio file created with metadata!



=== Embedded Metadata in WAV File ===
INAM: Sample Tone Audio
IART: Your Name
ICMT: This is a generated audio tone with embedded metadata.
ICRD: 2025-02-14


In [None]:
!pip install gTTS soundfile # google text to speech


Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting click<8.2,>=7.1 (from gTTS)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: click, gTTS
  Attempting uninstall: click
    Found existing installation: click 8.3.0
    Uninstalling click-8.3.0:
      Successfully uninstalled click-8.3.0
Successfully installed click-8.1.8 gTTS-2.5.4


In [None]:
from gtts import gTTS
from IPython.display import Audio, display
import librosa
import soundfile as sf
import numpy as np

# ====== TEXT TO SPEECH INPUT ======
text = "Hello! This is an automatically generated audio message created in Google Colab."

# ====== GENERATE AUDIO FROM TEXT ======
tts = gTTS(text=text, lang='en')
tts.save("text_audio.mp3")

# ====== CONVERT MP3 TO WAV ======
# Not optional: the player below reads text_audio.wav.
# The MP3 is loaded at a 16 kHz sample rate and written back out as WAV.
audio_data, sr = librosa.load("text_audio.mp3", sr=16000)
sf.write("text_audio.wav", audio_data, sr)

# ====== PLAY AUDIO ======
print("Text converted to speech and saved as 'text_audio.wav'")
display(Audio("text_audio.wav", autoplay=False))


Text converted to speech and saved as 'text_audio.wav'


In [None]:
# ============================
# Short Vosk vs Whisper Script
# ============================

!pip install -q vosk gTTS faster-whisper pydub soundfile sentencepiece
!apt -qq install -y ffmpeg

from google.colab import files
from gtts import gTTS
from IPython.display import Audio
import os, wave, json, subprocess

# ---- AUDIO PREP ----
print("Upload audio (optional). Skip to auto-create TTS sample.")
up = files.upload()

if up:
    audio = list(up.keys())[0]
else:
    tts = gTTS("Hello! This is a test audio for comparing Vosk and Whisper.", lang="en")
    audio = "sample.mp3"
    tts.save(audio)

# Convert → WAV 16khz mono
wav = "audio.wav"
subprocess.run(["ffmpeg","-y","-i",audio,"-ar","16000","-ac","1",wav],
               stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

Audio(wav)

# ---- VOSK ----
if not os.path.exists("vosk-model"):
    !wget -q https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip -O model.zip
    !unzip -q model.zip
    !mv vosk-model-small-en-us-0.15 vosk-model
    !rm model.zip

from vosk import Model, KaldiRecognizer
wf = wave.open(wav, "rb")
rec = KaldiRecognizer(Model("vosk-model"), wf.getframerate())
vosk_text = ""

while True:
    data = wf.readframes(4000)
    if not data: break
    if rec.AcceptWaveform(data):
        vosk_text += json.loads(rec.Result()).get("text"," ") + " "
vosk_text += json.loads(rec.FinalResult()).get("text"," ")

print("\n>>> VOSK:\n", vosk_text.strip())

# ---- WHISPER ----
import torch
from faster_whisper import WhisperModel

device = "cuda" if torch.cuda.is_available() else "cpu"
wmodel = WhisperModel("small", device=device)

segments, _ = wmodel.transcribe(wav)
whisper_text = " ".join([s.text for s in segments]).strip()

print("\n>>> WHISPER:\n", whisper_text)


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.0/38.0 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?


>>> VOSK:
 hello this is a test audio for comparing va skin whisper


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocabulary.txt: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.bin:   0%|          | 0.00/484M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]


>>> WHISPER:
 Hello, this is a test audio for comparing Vosk and Whisper.


In [None]:
!pip install sounddevice vosk faster-whisper soundfile
!apt-get install -y portaudio19-dev

Collecting sounddevice
  Using cached sounddevice-0.5.3-py3-none-any.whl.metadata (1.6 kB)
Downloading sounddevice-0.5.3-py3-none-any.whl (32 kB)
Installing collected packages: sounddevice
Successfully installed sounddevice-0.5.3
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
portaudio19-dev is already the newest version (19.6.0-1.1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


Convert text to MP3 audio


In [None]:
from gtts import gTTS
from IPython.display import Audio

# Synthesize one English test sentence and store it as an MP3.
test_mp3 = "test_audio.mp3"
tts = gTTS("Hello, this is a test audio for speech to text models.", lang="en")
tts.save(test_mp3)

# Last expression of the cell, so the inline player is rendered.
Audio(test_mp3)


In [None]:
# Convert to WAV


In [None]:
!ffmpeg -i test_audio.mp3 -ar 16000 -ac 1 test.wav -y


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

Run speech-to-text with Vosk

In [None]:
!wget -q https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip
!unzip -q vosk-model-small-en-us-0.15.zip

from vosk import Model, KaldiRecognizer
import wave, json

wf = wave.open("test.wav", "rb")
rec = KaldiRecognizer(Model("vosk-model-small-en-us-0.15"), 16000)

result = ""
while True:
    data = wf.readframes(4000)
    if not data:
        break
    if rec.AcceptWaveform(data):
        result += json.loads(rec.Result())["text"] + " "

result += json.loads(rec.FinalResult())["text"]
print("VOSK:", result)


VOSK: hello this is a test audio for speech to text models 


Run speech-to-text with Whisper


In [None]:
!pip install -q faster-whisper

from faster_whisper import WhisperModel
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = WhisperModel("small", device=device)

segments, info = model.transcribe("test.wav")
text = " ".join([s.text for s in segments])

print("WHISPER:", text)


WHISPER:  Hello, this is a test audio for speech-to-text models.


In [None]:
# ================================
# Single-Cell STT Pipeline (Colab)
# TTS → WAV → Vosk → Whisper
# ================================

# Install dependencies
!pip install -q gTTS vosk faster-whisper soundfile sentencepiece
!apt-get -qq install -y ffmpeg

from gtts import gTTS
from IPython.display import Audio
import subprocess, wave, json, os
import torch

# 1) Generate Speech Audio using TTS
text_input = "Hello, this is an automatic speech recognition test using Vosk and Whisper."
tts = gTTS(text_input, lang="en")
tts.save("tts.mp3")
print("Generated audio from text:", text_input)
Audio("tts.mp3")

# 2) Convert MP3 → WAV (16 kHz mono)
subprocess.run(["ffmpeg", "-y", "-i", "tts.mp3", "-ar", "16000", "-ac", "1", "audio.wav"],
               stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
print("Converted to audio.wav (16k mono)")

# 3) --- VOSK STT ---
if not os.path.exists("vosk-model"):
    !wget -q https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip -O model.zip
    !unzip -q model.zip
    !mv vosk-model-small-en-us-0.15 vosk-model
    !rm model.zip

from vosk import Model, KaldiRecognizer

wf = wave.open("audio.wav", "rb")
rec = KaldiRecognizer(Model("vosk-model"), 16000)

vosk_text = ""
while True:
    data = wf.readframes(4000)
    if not data: break
    if rec.AcceptWaveform(data):
        vosk_text += json.loads(rec.Result())["text"] + " "
vosk_text += json.loads(rec.FinalResult())["text"]

# 4) --- WHISPER STT ---
from faster_whisper import WhisperModel

device = "cuda" if torch.cuda.is_available() else "cpu"
whisper = WhisperModel("small", device=device)

segments, _ = whisper.transcribe("audio.wav")
whisper_text = " ".join([s.text for s in segments]).strip()

# 5) Results
print("\n===== VOSK OUTPUT =====")
print(vosk_text)

print("\n===== WHISPER OUTPUT =====")
print(whisper_text)

print("\n===== ORIGINAL TEXT =====")
print(text_input)


Generated audio from text: Hello, this is an automatic speech recognition test using Vosk and Whisper.
Converted to audio.wav (16k mono)

===== VOSK OUTPUT =====
hello this is an automatic speech recognition test using va can whisper

===== WHISPER OUTPUT =====
Hello, this is an automatic speech recognition test using Voskin Whisper.

===== ORIGINAL TEXT =====
Hello, this is an automatic speech recognition test using Vosk and Whisper.


DOWNLOAD AMI SAMPLE


In [None]:
!mkdir -p ami_sample
!wget -q https://groups.inf.ed.ac.uk/ami/AMICorpusSamples/ES2002a.Mix-Headset.wav -O ami_sample/ES2002a.wav

print("Downloaded AMI sample file:")
!ls -lh ami_sample


Downloaded AMI sample file:
total 0
-rw-r--r-- 1 root root 0 Nov 15 08:45 ES2002a.wav


Load the AMI sample audio




In [None]:
from IPython.display import Audio

# Inline player for the downloaded AMI recording (last expression, so it is rendered).
ami_recording = "ami_sample/ES2002a.wav"
Audio(ami_recording)


Download a public speech dataset (LibriSpeech dev-clean subset)

In [None]:
!mkdir -p librispeech_sample
!wget -q https://www.openslr.org/resources/12/dev-clean.tar.gz -O dev-clean.tar.gz
!tar -xzf dev-clean.tar.gz --directory librispeech_sample --wildcards "*.flac"

print("Sample files:")
!find librispeech_sample -name "*.flac" | head


Sample files:
librispeech_sample/LibriSpeech/dev-clean/6313/66129/6313-66129-0027.flac
librispeech_sample/LibriSpeech/dev-clean/6313/66129/6313-66129-0006.flac
librispeech_sample/LibriSpeech/dev-clean/6313/66129/6313-66129-0022.flac
librispeech_sample/LibriSpeech/dev-clean/6313/66129/6313-66129-0021.flac
librispeech_sample/LibriSpeech/dev-clean/6313/66129/6313-66129-0009.flac
librispeech_sample/LibriSpeech/dev-clean/6313/66129/6313-66129-0018.flac
librispeech_sample/LibriSpeech/dev-clean/6313/66129/6313-66129-0026.flac
librispeech_sample/LibriSpeech/dev-clean/6313/66129/6313-66129-0015.flac
librispeech_sample/LibriSpeech/dev-clean/6313/66129/6313-66129-0024.flac
librispeech_sample/LibriSpeech/dev-clean/6313/66129/6313-66129-0023.flac


In [None]:
!mkdir -p librispeech_sample
!wget -q https://www.openslr.org/resources/12/dev-clean.tar.gz -O dev-clean.tar.gz
!tar -xzf dev-clean.tar.gz --directory librispeech_sample --wildcards "*.flac"

print("Sample files:")
!find librispeech_sample -name "*.flac" | head


Sample files:
librispeech_sample/LibriSpeech/dev-clean/6313/66129/6313-66129-0027.flac
librispeech_sample/LibriSpeech/dev-clean/6313/66129/6313-66129-0006.flac
librispeech_sample/LibriSpeech/dev-clean/6313/66129/6313-66129-0022.flac
librispeech_sample/LibriSpeech/dev-clean/6313/66129/6313-66129-0021.flac
librispeech_sample/LibriSpeech/dev-clean/6313/66129/6313-66129-0009.flac
librispeech_sample/LibriSpeech/dev-clean/6313/66129/6313-66129-0018.flac
librispeech_sample/LibriSpeech/dev-clean/6313/66129/6313-66129-0026.flac
librispeech_sample/LibriSpeech/dev-clean/6313/66129/6313-66129-0015.flac
librispeech_sample/LibriSpeech/dev-clean/6313/66129/6313-66129-0024.flac
librispeech_sample/LibriSpeech/dev-clean/6313/66129/6313-66129-0023.flac


Convert dataset audio (FLAC → WAV, 16k mono)

In [None]:
import subprocess, os

source = "librispeech_sample/LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac"
target = "sample.wav"

# check=True raises CalledProcessError when ffmpeg fails (e.g. the source file is
# missing), so the success message is only printed after a real conversion.
# -hide_banner / -loglevel error limit ffmpeg's output to actual errors.
subprocess.run(
    ["ffmpeg", "-hide_banner", "-loglevel", "error", "-y",
     "-i", source, "-ar", "16000", "-ac", "1", target],
    check=True,
)

print("Converted to:", target)


Converted to: sample.wav


Create a synthetic dataset automatically

In [None]:
from gtts import gTTS
import os

os.makedirs("synthetic_dataset", exist_ok=True)

sentences = [
    "Hello, welcome to the speech recognition test.",
    "This is a synthetic dataset created using text to speech.",
    "Speech models must be evaluated for accuracy.",
    "Different speakers and accents should be tested.",
    "Background noise can affect transcription quality.",
    "We will benchmark whisper and vosk models.",
    "This sentence is intentionally longer to test robustness.",
    "The quick brown fox jumps over the lazy dog.",
    "Artificial intelligence is transforming industries.",
    "Thank you for participating in this project."
]

for i, text in enumerate(sentences):
    tts = gTTS(text=text, lang="en")
    path = f"synthetic_dataset/audio_{i}.mp3"
    tts.save(path)

print("Synthetic dataset created:")
!ls -1 synthetic_dataset


Synthetic dataset created:
audio_0.mp3
audio_1.mp3
audio_2.mp3
audio_3.mp3
audio_4.mp3
audio_5.mp3
audio_6.mp3
audio_7.mp3
audio_8.mp3
audio_9.mp3


Convert synthetic audios to WAV

In [None]:
import subprocess
import os

files = os.listdir("synthetic_dataset")

for f in files:
    if f.endswith(".mp3"):
        mp3_path = f"synthetic_dataset/{f}"
        wav_path = mp3_path.replace(".mp3", ".wav")
        subprocess.run(["ffmpeg", "-y", "-i", mp3_path, "-ar", "16000", "-ac", "1", wav_path])

print("Converted WAV files:")
!ls synthetic_dataset/*.wav


Converted WAV files:
synthetic_dataset/audio_0.wav  synthetic_dataset/audio_5.wav
synthetic_dataset/audio_1.wav  synthetic_dataset/audio_6.wav
synthetic_dataset/audio_2.wav  synthetic_dataset/audio_7.wav
synthetic_dataset/audio_3.wav  synthetic_dataset/audio_8.wav
synthetic_dataset/audio_4.wav  synthetic_dataset/audio_9.wav


In [None]:
!pip install -q vosk faster-whisper soundfile
!apt-get -qq install -y ffmpeg


In [None]:
import json, wave, os, soundfile as sf
from vosk import Model, KaldiRecognizer
from faster_whisper import WhisperModel
import torch

# Load both models once so every transcription call below reuses them.
# The Vosk model folder must already exist in the working directory.
vosk_model = Model("vosk-model-small-en-us-0.15")

# Run Whisper on the GPU when torch reports CUDA, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
whisper_model = WhisperModel("small", device=device)

def transcribe_vosk(path):
    """Transcribe a WAV file with the module-level ``vosk_model``.

    Args:
        path: Path to a WAV file that the ``wave`` module can open.

    Returns:
        The recognized text as one string, with surrounding whitespace removed.
    """
    pieces = []
    # The context manager closes the WAV handle even if recognition raises.
    with wave.open(path, "rb") as wf:
        rec = KaldiRecognizer(vosk_model, wf.getframerate())
        while True:
            data = wf.readframes(4000)
            if not data:
                break
            # A true return value means a finished result can be read.
            if rec.AcceptWaveform(data):
                pieces.append(json.loads(rec.Result())["text"])
        # Collect whatever is still buffered after the last chunk.
        pieces.append(json.loads(rec.FinalResult())["text"])
    # Empty pieces are skipped so the transcript has no doubled spaces.
    return " ".join(piece for piece in pieces if piece).strip()

def transcribe_whisper(path):
    """Transcribe an audio file with the module-level ``whisper_model``.

    Args:
        path: Path of the audio file handed to ``whisper_model.transcribe``.

    Returns:
        The text of all segments joined by single spaces, whitespace-stripped.
    """
    segments, _ = whisper_model.transcribe(path)
    joined = " ".join(segment.text for segment in segments)
    return joined.strip()

def benchmark_file(path):
    """Print the Vosk and Whisper transcripts of one audio file.

    Args:
        path: Path of the audio file passed to both transcription functions.
    """
    print(f"\n===== FILE: {path} =====")
    # Same order as the printed sections: Vosk first, then Whisper. The
    # transcribers are looked up by name at call time.
    for label in ("VOSK", "WHISPER"):
        print(f"\n{label}:")
        if label == "VOSK":
            print(transcribe_vosk(path))
        else:
            print(transcribe_whisper(path))


In [None]:
import subprocess

input_folder = "synthetic_dataset"
output_folder = "dataset_wav"
os.makedirs(output_folder, exist_ok=True)

for f in os.listdir(input_folder):
    if f.lower().endswith((".mp3", ".wav", ".flac")):
        input_path = f"{input_folder}/{f}"
        base = os.path.splitext(f)[0]
        output_path = f"{output_folder}/{base}.wav"

        subprocess.run(["ffmpeg", "-y", "-i", input_path, "-ar", "16000", "-ac", "1", output_path])

print("Converted files:")
!ls dataset_wav


Converted files:
audio_0.wav  audio_2.wav  audio_4.wav  audio_6.wav  audio_8.wav
audio_1.wav  audio_3.wav  audio_5.wav  audio_7.wav  audio_9.wav


In [None]:
folder = "dataset_wav"

# os.listdir() returns entries in arbitrary order; sorted() makes the report
# run through the clips in a stable, readable order.
for f in sorted(os.listdir(folder)):
    if f.endswith(".wav"):
        benchmark_file(folder + "/" + f)



===== FILE: dataset_wav/audio_0.wav =====

VOSK:
hello welcome to the speech recognition test

WHISPER:
Hello, welcome to the speech recognition test.

===== FILE: dataset_wav/audio_6.wav =====

VOSK:
this sentences intentionally longer to test robustness

WHISPER:
This sentence is intentionally longer to test robustness.

===== FILE: dataset_wav/audio_7.wav =====

VOSK:
the quick brown fox jumps over the lazy dog

WHISPER:
The quick brown fox jumps over the lazy dog.

===== FILE: dataset_wav/audio_5.wav =====

VOSK:
we will benchmark whisper and waske models

WHISPER:
We will benchmark Whisper and Vosk models.

===== FILE: dataset_wav/audio_2.wav =====

VOSK:
speech models must be evaluated for accuracy

WHISPER:
Speech models must be evaluated for accuracy.

===== FILE: dataset_wav/audio_1.wav =====

VOSK:
this is a synthetic dataset created using text to speech

WHISPER:
This is a synthetic dataset created using text to speech.

===== FILE: dataset_wav/audio_9.wav =====

VOSK:
than

Audio Data Samples for Testing (Practical Code)

In [None]:
from gtts import gTTS
import os

os.makedirs("sample_audio", exist_ok=True)

sentences = [
    "Welcome to the meeting. Today we will discuss performance metrics.",
    "We need to finish this project by next week.",
    "Can someone summarize the current status of development?",
    "The Whisper model generally gives better accuracy than Vosk."
]

for i, text in enumerate(sentences):
    tts = gTTS(text, lang="en")
    tts.save(f"sample_audio/audio_{i}.mp3")

print("Audio samples created!")
!ls sample_audio


Audio samples created!
audio_0.mp3  audio_1.mp3  audio_2.mp3  audio_3.mp3


Download AMI Dataset Sample and Convert to WAV

In [None]:
!mkdir -p ami_sample
!wget -q https://groups.inf.ed.ac.uk/ami/AMICorpusSamples/ES2002a.Mix-Headset.wav -O ami_sample/ES2002a.wav

import subprocess
subprocess.run(["ffmpeg","-y","-i","ami_sample/ES2002a.wav","-ar","16000","-ac","1","ami_sample/ami_16k.wav"])
print("AMI sample ready:", "ami_sample/ami_16k.wav")

AMI sample ready: ami_sample/ami_16k.wav


Run STT with Whisper and Vosk on AMI

In [None]:
!pip install -q vosk faster-whisper jiwer
!apt-get -qq install -y ffmpeg


Load models:

In [None]:
from vosk import Model, KaldiRecognizer
from faster_whisper import WhisperModel
import json, wave, torch

# Whisper model
device = "cuda" if torch.cuda.is_available() else "cpu"
whisper = WhisperModel("small", device=device)

# Vosk model
!wget -q https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip -O vosk_small.zip
!unzip -q vosk_small.zip
vosk_model = Model("vosk-model-small-en-us-0.15")


replace vosk-model-small-en-us-0.15/am/final.mdl? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


Transcribe AMI with Both Models + Compare

In [None]:
# Path of the 16 kHz mono AMI clip written by the download/convert cells.
AMI_FILE = "ami_sample/ami_16k.wav"

def transcribe_vosk(path):
    """Transcribe a WAV file with the module-level ``vosk_model``.

    Args:
        path: Path to a WAV file that the ``wave`` module can open.

    Returns:
        The recognized text as one string, with surrounding whitespace removed.
    """
    pieces = []
    # The context manager closes the WAV handle even if recognition raises.
    with wave.open(path, "rb") as wf:
        rec = KaldiRecognizer(vosk_model, wf.getframerate())
        while True:
            data = wf.readframes(4000)
            if not data:
                break
            # A true return value means a finished result can be read.
            if rec.AcceptWaveform(data):
                pieces.append(json.loads(rec.Result())["text"])
        # Collect whatever is still buffered after the last chunk.
        pieces.append(json.loads(rec.FinalResult())["text"])
    # Empty pieces are skipped so the transcript has no doubled or trailing spaces.
    return " ".join(piece for piece in pieces if piece).strip()

def transcribe_whisper(path):
    """Transcribe an audio file with the module-level ``whisper`` model.

    Args:
        path: Path of the audio file handed to ``whisper.transcribe``.

    Returns:
        The text of all segments joined by single spaces, with surrounding
        whitespace removed (matching the Vosk helper's stripped output).
    """
    segments, _ = whisper.transcribe(path)
    return " ".join(seg.text for seg in segments).strip()


In [None]:
# =============================
# FIXED: AMI DOWNLOAD + CONVERT
# =============================

import os, subprocess

# 1. Create folder
os.makedirs("ami_sample", exist_ok=True)

# 2. Download AMI sample
print("Downloading AMI audio...")
!wget -q https://groups.inf.ed.ac.uk/ami/AMICorpusSamples/ES2002a.Mix-Headset.wav -O ami_sample/ES2002a.wav

# 3. Convert to 16k mono WAV
print("Converting to 16 kHz WAV...")
subprocess.run([
    "ffmpeg", "-y",
    "-i", "ami_sample/ES2002a.wav",
    "-ar", "16000",
    "-ac", "1",
    "ami_sample/ami_16k.wav"
])

# 4. Check if file exists
print("\nFiles in ami_sample:")
!ls -lh ami_sample


Downloading AMI audio...
Converting to 16 kHz WAV...

Files in ami_sample:
total 0
-rw-r--r-- 1 root root 0 Nov 15 09:04 ES2002a.wav


In [None]:
# =============================
# INSTALL MODELS
# =============================
!pip install -q vosk faster-whisper
!apt-get -qq install -y ffmpeg
!wget -q https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip -O vosk_small.zip
!unzip -q vosk_small.zip



replace vosk-model-small-en-us-0.15/am/final.mdl? [y]es, [n]o, [A]ll, [N]one, [r]ename: 