<a href="https://colab.research.google.com/github/Julfa-11/git/blob/main/whisper_to_transcribe_audio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# ============================
# Short Vosk vs Whisper Script
# ============================

!pip install -q vosk gTTS faster-whisper pydub soundfile sentencepiece
!apt -qq install -y ffmpeg

from google.colab import files
from gtts import gTTS
from IPython.display import Audio
import os, wave, json, subprocess

# ---- AUDIO PREP ----
# Obtain an input recording: either a user upload or, when the upload mapping
# comes back empty, a short synthesized sentence produced with gTTS.
from IPython.display import display  # explicit: needed to render output mid-cell

print("Upload audio (optional). Skip to auto-create TTS sample.")
up = files.upload()

if up:
    # The first key of the upload mapping is used as the input path.
    audio = next(iter(up))
else:
    tts = gTTS("Hello! This is a test audio for comparing Vosk and Whisper.", lang="en")
    audio = "sample.mp3"
    tts.save(audio)

# Convert → WAV 16khz mono
wav = "audio.wav"
if os.path.abspath(audio) == os.path.abspath(wav):
    # ffmpeg cannot convert a file onto itself, so write the converted audio
    # to a different name when the upload is already called "audio.wav".
    wav = "audio_16k.wav"

conversion = subprocess.run(
    ["ffmpeg", "-y", "-i", audio, "-ar", "16000", "-ac", "1", wav],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.PIPE,
    text=True,
    errors="replace",
)
if conversion.returncode != 0:
    # Fail here with ffmpeg's own diagnostics instead of letting a missing or
    # stale WAV file surface later as a confusing transcription error.
    raise RuntimeError(
        f"ffmpeg failed to convert {audio!r} to {wav!r}:\n{conversion.stderr[-2000:]}"
    )

# A bare `Audio(wav)` expression is only rendered when it is the last line of
# a cell; display() shows the player even though more code follows.
display(Audio(wav))

# ---- VOSK ----
if not os.path.exists("vosk-model"):
    !wget -q https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip -O model.zip
    !unzip -q model.zip
    !mv vosk-model-small-en-us-0.15 vosk-model
    !rm model.zip

from vosk import Model, KaldiRecognizer
wf = wave.open(wav, "rb")
rec = KaldiRecognizer(Model("vosk-model"), wf.getframerate())
vosk_text = ""

while True:
    data = wf.readframes(4000)
    if not data: break
    if rec.AcceptWaveform(data):
        vosk_text += json.loads(rec.Result()).get("text"," ") + " "
vosk_text += json.loads(rec.FinalResult()).get("text"," ")

print("\n>>> VOSK:\n", vosk_text.strip())

# ---- WHISPER ----
# Transcribe the same WAV file with faster-whisper's "small" model, running on
# the GPU when torch reports one and on the CPU otherwise.
import torch
from faster_whisper import WhisperModel

device = "cuda" if torch.cuda.is_available() else "cpu"
wmodel = WhisperModel("small", device=device)

# The second element returned by transcribe() is not needed here.
segments, _ = wmodel.transcribe(wav)
# Strip each segment before joining: any leading/trailing whitespace a segment
# carries would otherwise combine with the " " separator into doubled spaces.
whisper_text = " ".join(seg.text.strip() for seg in segments).strip()

print("\n>>> WHISPER:\n", whisper_text)


ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
Upload audio (optional). Skip to auto-create TTS sample.


Saving audio.wav.wav to audio.wav (1).wav

>>> VOSK:
 hello this is an automatically generated audio message created in google coal lab


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

vocabulary.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.bin:   0%|          | 0.00/484M [00:00<?, ?B/s]


>>> WHISPER:
 Hello, this is an automatically generated audio message created in Google Colab.
