<a href="https://colab.research.google.com/github/Julfa-11/git/blob/main/audio_to_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Environment setup (IPython shell magics): install the Python packages this
# cell relies on, plus the ffmpeg binary used below to convert the input audio.
# NOTE(review): pydub, soundfile and sentencepiece are not imported anywhere in
# the visible cell — confirm they are still needed.
!pip install -q vosk gTTS faster-whisper pydub soundfile sentencepiece
!apt -qq install -y ffmpeg

from google.colab import files
from gtts import gTTS
from IPython.display import Audio
import os, wave, json, subprocess

# ---- AUDIO PREP ----
# Use the first uploaded file if the user provides one; otherwise synthesize a
# short English sample with gTTS so the rest of the cell always has an input.
from IPython.display import display

print("Upload audio (optional). Skip to auto-create TTS sample.")
up = files.upload()

if up:
    # Only the first uploaded file is transcribed; any others are ignored.
    audio = next(iter(up))
else:
    tts = gTTS("Hello! This is a test audio for comparing Vosk and Whisper.", lang="en")
    audio = "sample.mp3"
    tts.save(audio)

# Convert → WAV 16khz mono
wav = "audio.wav"
conversion = subprocess.run(
    ["ffmpeg", "-y", "-i", audio, "-ar", "16000", "-ac", "1", wav],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.PIPE,  # keep ffmpeg's diagnostics instead of discarding them
    text=True,
    errors="replace",
)
if conversion.returncode != 0:
    # Fail here rather than later when the WAV is opened: a missing or stale
    # output file would otherwise produce a confusing error (or transcribe
    # audio left over from a previous run).
    stderr_tail = "\n".join(conversion.stderr.strip().splitlines()[-5:])
    raise RuntimeError(
        f"ffmpeg could not convert {audio!r} to {wav!r} "
        f"(exit code {conversion.returncode}):\n{stderr_tail}"
    )

# A bare `Audio(wav)` expression is only rendered when it is the last statement
# of a cell, so display the player explicitly.
display(Audio(wav))

# ---- VOSK ----
if not os.path.exists("vosk-model"):
    !wget -q https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip -O model.zip
    !unzip -q model.zip
    !mv vosk-model-small-en-us-0.15 vosk-model
    !rm model.zip

from vosk import Model, KaldiRecognizer

wf = wave.open(wav, "rb")
vosk_model = Model("vosk-model")
rec = KaldiRecognizer(vosk_model, wf.getframerate())

vosk_text = ""

while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        res = json.loads(rec.Result())
        vosk_text += res.get("text", "") + " "
    else:
        res = json.loads(rec.PartialResult())
        vosk_text += res.get("partial", "") + " "

vosk_text += json.loads(rec.FinalResult()).get("text", "")
vosk_text = vosk_text.strip()

print("\n>>> VOSK:\n", vosk_text)

# ---- WHISPER ----
# Transcribe the same 16 kHz mono WAV with faster-whisper so the result can be
# compared with the Vosk output above.
import torch
from faster_whisper import WhisperModel

# Use the GPU when the runtime has one; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
wmodel = WhisperModel("small.en", device=device)

# The second return value (transcription info) is not needed here.
segments, _ = wmodel.transcribe(wav)

# Strip each segment before joining: segment text normally starts with a
# space, so joining the raw strings with " " would double the spacing at
# segment boundaries. Empty segments are skipped.
segment_texts = (s.text.strip() for s in segments)
whisper_text = " ".join(t for t in segment_texts if t)

print("\n>>> WHISPER:\n", whisper_text)


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.0/38.0 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m85.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[

Saving audio.wav.wav to audio.wav (1).wav

>>> VOSK:
 hello hello hello hello this is hello this is an hello this is an hello this is an automatic hello this is an automatically hello this is an automatically hello this is an automatically hello this is an automatically generated hello this is an automatically generated hello this is an automatically generated audio hello this is an automatically generated audio message hello this is an automatically generated audio message hello this is an automatically generated audio message hello this is an automatically generated audio message created and hello this is an automatically generated audio message created in google hello this is an automatically generated audio message created in google hello this is an automatically generated audio message created in google hello this is an automatically generated audio message created in google hello this is an automatically generated audio message created in google coal lab hello this is an automati

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocabulary.txt: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.bin:   0%|          | 0.00/484M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]


>>> WHISPER:
 Hello! This is an automatically generated audio message created in Google Colab.
