# Installing Whisper

The commands below will install the Python packages needed to use Whisper models and evaluate the transcription results.

In [None]:
! pip install git+https://github.com/openai/whisper.git
! pip install jiwer

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-oxh_ci_l
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-oxh_ci_l
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20240930)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-

# Loading the LibriSpeech dataset

The following will load the test-clean split of the LibriSpeech corpus using torchaudio.

In [None]:
import os
import numpy as np

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
import whisper
import torchaudio

from tqdm.notebook import tqdm


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
class LibriSpeech(torch.utils.data.Dataset):
    """
    A simple class to wrap LibriSpeech and trim/pad the audio to 30 seconds.
    It will drop the last few seconds of a very small portion of the utterances.
    """
    def __init__(self, split="test-clean", device=DEVICE):
        self.dataset = torchaudio.datasets.LIBRISPEECH(
            root=os.path.expanduser("~/.cache"),
            url=split,
            download=True,
        )
        self.device = device

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, item):
        audio, sample_rate, text, _, _, _ = self.dataset[item]
        assert sample_rate == 16000
        audio = whisper.pad_or_trim(audio.flatten()).to(self.device)
        mel = whisper.log_mel_spectrogram(audio)

        return (mel, text)

In [None]:
dataset = LibriSpeech("test-clean")
loader = torch.utils.data.DataLoader(dataset, batch_size=16)

In [None]:
! pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [4]:
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import whisper
import requests
from datasets import load_dataset

# Load the Whisper model (use 'medium' or 'large' for better accuracy)
model = whisper.load_model("medium")

# Load the Vaani dataset from Hugging Face
dataset = load_dataset("ARTPARK-IISc/Vaani","Telangana_Karimnagar", split="train")

# Get an audio sample (Modify index for different samples)
audio_sample = dataset[0]["audio"]["url"]

# Download the audio file
audio_path = "telugu_audio.wav"
response = requests.get(audio_sample)
with open(audio_path, "wb") as f:
    f.write(response.content)

# Transcribe the audio
result = model.transcribe(audio_path, language="te")

# Print the Telugu text
print("Transcription:", result["text"])


Resolving data files:   0%|          | 0/57 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/47 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/47 [00:00<?, ?files/s]

train-00000-of-00047.parquet:   0%|          | 0.00/462M [00:00<?, ?B/s]

train-00001-of-00047.parquet:   0%|          | 0.00/413M [00:00<?, ?B/s]

train-00002-of-00047.parquet:   0%|          | 0.00/402M [00:00<?, ?B/s]

train-00003-of-00047.parquet:   0%|          | 0.00/431M [00:00<?, ?B/s]

train-00004-of-00047.parquet:   0%|          | 0.00/410M [00:00<?, ?B/s]

train-00005-of-00047.parquet:   0%|          | 0.00/437M [00:00<?, ?B/s]

train-00006-of-00047.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

train-00007-of-00047.parquet:   0%|          | 0.00/393M [00:00<?, ?B/s]

train-00008-of-00047.parquet:   0%|          | 0.00/406M [00:00<?, ?B/s]

train-00009-of-00047.parquet:   0%|          | 0.00/340M [00:00<?, ?B/s]

train-00010-of-00047.parquet:   0%|          | 0.00/388M [00:00<?, ?B/s]

train-00011-of-00047.parquet:   0%|          | 0.00/432M [00:00<?, ?B/s]

train-00012-of-00047.parquet:   0%|          | 0.00/506M [00:00<?, ?B/s]

train-00013-of-00047.parquet:   0%|          | 0.00/543M [00:00<?, ?B/s]

train-00014-of-00047.parquet:   0%|          | 0.00/509M [00:00<?, ?B/s]

train-00015-of-00047.parquet:   0%|          | 0.00/512M [00:00<?, ?B/s]

train-00016-of-00047.parquet:   0%|          | 0.00/423M [00:00<?, ?B/s]

train-00017-of-00047.parquet:   0%|          | 0.00/411M [00:00<?, ?B/s]

train-00018-of-00047.parquet:   0%|          | 0.00/336M [00:00<?, ?B/s]

train-00019-of-00047.parquet:   0%|          | 0.00/417M [00:00<?, ?B/s]

train-00020-of-00047.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

train-00021-of-00047.parquet:   0%|          | 0.00/408M [00:00<?, ?B/s]

train-00022-of-00047.parquet:   0%|          | 0.00/428M [00:00<?, ?B/s]

train-00023-of-00047.parquet:   0%|          | 0.00/469M [00:00<?, ?B/s]

train-00024-of-00047.parquet:   0%|          | 0.00/459M [00:00<?, ?B/s]

train-00025-of-00047.parquet:   0%|          | 0.00/442M [00:00<?, ?B/s]

train-00026-of-00047.parquet:   0%|          | 0.00/740M [00:00<?, ?B/s]

train-00027-of-00047.parquet:   0%|          | 0.00/740M [00:00<?, ?B/s]

train-00028-of-00047.parquet:   0%|          | 0.00/717M [00:00<?, ?B/s]

train-00029-of-00047.parquet:   0%|          | 0.00/638M [00:00<?, ?B/s]

train-00030-of-00047.parquet:   0%|          | 0.00/642M [00:00<?, ?B/s]

train-00031-of-00047.parquet:   0%|          | 0.00/685M [00:00<?, ?B/s]

train-00032-of-00047.parquet:   0%|          | 0.00/659M [00:00<?, ?B/s]

train-00033-of-00047.parquet:   0%|          | 0.00/725M [00:00<?, ?B/s]

train-00034-of-00047.parquet:   0%|          | 0.00/659M [00:00<?, ?B/s]

train-00035-of-00047.parquet:   0%|          | 0.00/650M [00:00<?, ?B/s]

train-00036-of-00047.parquet:   0%|          | 0.00/690M [00:00<?, ?B/s]

train-00037-of-00047.parquet:   0%|          | 0.00/686M [00:00<?, ?B/s]

train-00038-of-00047.parquet:   0%|          | 0.00/701M [00:00<?, ?B/s]

train-00039-of-00047.parquet:   0%|          | 0.00/664M [00:00<?, ?B/s]

train-00040-of-00047.parquet:   0%|          | 0.00/684M [00:00<?, ?B/s]

train-00041-of-00047.parquet:   0%|          | 0.00/630M [00:00<?, ?B/s]

train-00042-of-00047.parquet:   0%|          | 0.00/712M [00:00<?, ?B/s]

train-00043-of-00047.parquet:   0%|          | 0.00/716M [00:00<?, ?B/s]

train-00044-of-00047.parquet:   0%|          | 0.00/658M [00:00<?, ?B/s]

train-00045-of-00047.parquet:   0%|          | 0.00/679M [00:00<?, ?B/s]

train-00046-of-00047.parquet:   0%|          | 0.00/672M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/123541 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/51 [00:00<?, ?it/s]

KeyError: 'url'

# Running inference on the dataset using a base Whisper model

The following will take a few minutes to transcribe all utterances in the dataset.

In [None]:
model = whisper.load_model("base")
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
)

RuntimeError: Model base.te not found; available models = ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large-v2', 'large-v3', 'large', 'large-v3-turbo', 'turbo']

In [None]:
# predict without timestamps for short-form transcription
options = whisper.DecodingOptions(language="en", without_timestamps=True)

In [None]:
hypotheses = []
references = []

for mels, texts in tqdm(loader):
    results = model.decode(mels, options)
    hypotheses.extend([result.text for result in results])
    references.extend(texts)

  0%|          | 0/164 [00:00<?, ?it/s]

In [None]:
data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))
data

Unnamed: 0,hypothesis,reference
0,"He hoped there would be stew for dinner, turni...",HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...
1,"Stuffered into you, his belly counseled him.",STUFF IT INTO YOU HIS BELLY COUNSELLED HIM
2,After early nightfall the yellow lamps would l...,AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...
3,"Hello Bertie, any good in your mind?",HELLO BERTIE ANY GOOD IN YOUR MIND
4,Number 10. Fresh Nelly is waiting on you. Good...,NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...
...,...,...
2615,"Oh, to shoot my soul's full meaning into futur...",OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE...
2616,"Then I, long tried by natural ills, received t...",THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE...
2617,I love thee freely as men strive for right. I ...,I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L...
2618,"I love thee with the passion put to use, in my...",I LOVE THEE WITH THE PASSION PUT TO USE IN MY ...


# Calculating the word error rate

Now, we use our English normalizer implementation to standardize the transcription and calculate the WER.

In [None]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()

In [None]:
data["hypothesis_clean"] = [normalizer(text) for text in data["hypothesis"]]
data["reference_clean"] = [normalizer(text) for text in data["reference"]]
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,"He hoped there would be stew for dinner, turni...",HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...,he hoped there would be stew for dinner turnip...,he hoped there would be stew for dinner turnip...
1,"Stuffered into you, his belly counseled him.",STUFF IT INTO YOU HIS BELLY COUNSELLED HIM,stuffered into you his belly counseled him,stuff it into you his belly counseled him
2,After early nightfall the yellow lamps would l...,AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...,after early nightfall the yellow lamps would l...,after early nightfall the yellow lamps would l...
3,"Hello Bertie, any good in your mind?",HELLO BERTIE ANY GOOD IN YOUR MIND,hello bertie any good in your mind,hello bertie any good in your mind
4,Number 10. Fresh Nelly is waiting on you. Good...,NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...,number 10 fresh nelly is waiting on you good n...,number 10 fresh nelly is waiting on you good n...
...,...,...,...,...
2615,"Oh, to shoot my soul's full meaning into futur...",OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE...,0 to shoot my soul is full meaning into future...,0 to shoot my soul is full meaning into future...
2616,"Then I, long tried by natural ills, received t...",THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE...,then i long tried by natural ills received the...,then i long tried by natural ills received the...
2617,I love thee freely as men strive for right. I ...,I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L...,i love thee freely as men strive for right i l...,i love thee freely as men strive for right i l...
2618,"I love thee with the passion put to use, in my...",I LOVE THEE WITH THE PASSION PUT TO USE IN MY ...,i love thee with the passion put to use in my ...,i love thee with the passion put to use in my ...


In [None]:
wer = jiwer.wer(list(data["reference_clean"]), list(data["hypothesis_clean"]))

print(f"WER: {wer * 100:.2f} %")

WER: 4.26 %


VOSK_TELUGU

In [1]:
!pip install vosk datasets
!apt install -y ffmpeg

Collecting vosk
  Downloading vosk-0.3.45-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (1.8 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting srt (from vosk)
  Downloading srt-3.5.3.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading vosk-0.3.45-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[

In [2]:
!wget https://alphacephei.com/vosk/models/vosk-model-small-te-0.42.zip
!unzip -q vosk-model-small-te-0.42.zip

--2025-04-08 09:50:46--  https://alphacephei.com/vosk/models/vosk-model-small-te-0.42.zip
Resolving alphacephei.com (alphacephei.com)... 188.40.21.16, 2a01:4f8:13a:279f::2
Connecting to alphacephei.com (alphacephei.com)|188.40.21.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 60544249 (58M) [application/zip]
Saving to: ‘vosk-model-small-te-0.42.zip’


2025-04-08 09:50:50 (16.8 MB/s) - ‘vosk-model-small-te-0.42.zip’ saved [60544249/60544249]



In [5]:
# Step 3: Load Vaani dataset and get audio URL
from datasets import load_dataset
import requests

dataset = load_dataset("ARTPARK-IISc/Vaani", "Telangana_Karimnagar", split="train")


Resolving data files:   0%|          | 0/57 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/47 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/47 [00:00<?, ?files/s]

train-00000-of-00047.parquet:   0%|          | 0.00/462M [00:00<?, ?B/s]

train-00001-of-00047.parquet:   0%|          | 0.00/413M [00:00<?, ?B/s]

train-00002-of-00047.parquet:   0%|          | 0.00/402M [00:00<?, ?B/s]

train-00003-of-00047.parquet:   0%|          | 0.00/431M [00:00<?, ?B/s]

train-00004-of-00047.parquet:   0%|          | 0.00/410M [00:00<?, ?B/s]

train-00005-of-00047.parquet:   0%|          | 0.00/437M [00:00<?, ?B/s]

train-00006-of-00047.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

train-00007-of-00047.parquet:   0%|          | 0.00/393M [00:00<?, ?B/s]

train-00008-of-00047.parquet:   0%|          | 0.00/406M [00:00<?, ?B/s]

train-00009-of-00047.parquet:   0%|          | 0.00/340M [00:00<?, ?B/s]

train-00010-of-00047.parquet:   0%|          | 0.00/388M [00:00<?, ?B/s]

train-00011-of-00047.parquet:   0%|          | 0.00/432M [00:00<?, ?B/s]

train-00012-of-00047.parquet:   0%|          | 0.00/506M [00:00<?, ?B/s]

train-00013-of-00047.parquet:   0%|          | 0.00/543M [00:00<?, ?B/s]

train-00014-of-00047.parquet:   0%|          | 0.00/509M [00:00<?, ?B/s]

train-00015-of-00047.parquet:   0%|          | 0.00/512M [00:00<?, ?B/s]

train-00016-of-00047.parquet:   0%|          | 0.00/423M [00:00<?, ?B/s]

train-00017-of-00047.parquet:   0%|          | 0.00/411M [00:00<?, ?B/s]

train-00018-of-00047.parquet:   0%|          | 0.00/336M [00:00<?, ?B/s]

train-00019-of-00047.parquet:   0%|          | 0.00/417M [00:00<?, ?B/s]

train-00020-of-00047.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

train-00021-of-00047.parquet:   0%|          | 0.00/408M [00:00<?, ?B/s]

train-00022-of-00047.parquet:   0%|          | 0.00/428M [00:00<?, ?B/s]

train-00023-of-00047.parquet:   0%|          | 0.00/469M [00:00<?, ?B/s]

train-00024-of-00047.parquet:   0%|          | 0.00/459M [00:00<?, ?B/s]

train-00025-of-00047.parquet:   0%|          | 0.00/442M [00:00<?, ?B/s]

train-00026-of-00047.parquet:   0%|          | 0.00/740M [00:00<?, ?B/s]

train-00027-of-00047.parquet:   0%|          | 0.00/740M [00:00<?, ?B/s]

train-00028-of-00047.parquet:   0%|          | 0.00/717M [00:00<?, ?B/s]

train-00029-of-00047.parquet:   0%|          | 0.00/638M [00:00<?, ?B/s]

train-00030-of-00047.parquet:   0%|          | 0.00/642M [00:00<?, ?B/s]

train-00031-of-00047.parquet:   0%|          | 0.00/685M [00:00<?, ?B/s]

train-00032-of-00047.parquet:   0%|          | 0.00/659M [00:00<?, ?B/s]

train-00033-of-00047.parquet:   0%|          | 0.00/725M [00:00<?, ?B/s]

train-00034-of-00047.parquet:   0%|          | 0.00/659M [00:00<?, ?B/s]

train-00035-of-00047.parquet:   0%|          | 0.00/650M [00:00<?, ?B/s]

train-00036-of-00047.parquet:   0%|          | 0.00/690M [00:00<?, ?B/s]

train-00037-of-00047.parquet:   0%|          | 0.00/686M [00:00<?, ?B/s]

train-00038-of-00047.parquet:   0%|          | 0.00/701M [00:00<?, ?B/s]

train-00039-of-00047.parquet:   0%|          | 0.00/664M [00:00<?, ?B/s]

train-00040-of-00047.parquet:   0%|          | 0.00/684M [00:00<?, ?B/s]

train-00041-of-00047.parquet:   0%|          | 0.00/630M [00:00<?, ?B/s]

train-00042-of-00047.parquet:   0%|          | 0.00/712M [00:00<?, ?B/s]

train-00043-of-00047.parquet:   0%|          | 0.00/716M [00:00<?, ?B/s]

train-00044-of-00047.parquet:   0%|          | 0.00/658M [00:00<?, ?B/s]

train-00045-of-00047.parquet:   0%|          | 0.00/679M [00:00<?, ?B/s]

train-00046-of-00047.parquet:   0%|          | 0.00/672M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/123541 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/51 [00:00<?, ?it/s]

In [30]:
import soundfile as sf
audio_sample = dataset[1]["audio"]

# Save raw audio from dataset
raw_audio_path = "raw_audio.wav"
sf.write(raw_audio_path, audio_sample["array"], audio_sample["sampling_rate"])

# Step 4: Convert audio to Vosk-compatible format: mono, 16kHz
converted_audio_path = "converted_audio.wav"

In [25]:
# Convert to required format
!ffmpeg -y -i {raw_audio_path} -ar 16000 -ac 1 {converted_audio_path}

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [28]:
# Step 5: Transcribe with Vosk
from vosk import Model, KaldiRecognizer
import wave
import json

# Load Vosk model
model = Model("vosk-model-small-te-0.42")

# Open converted audio file
wf = wave.open(converted_audio_path, "rb")
rec = KaldiRecognizer(model, wf.getframerate())

results = []
while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        results.append(json.loads(rec.Result()))

results.append(json.loads(rec.FinalResult()))

In [29]:
# Step 6: Combine and print transcription
transcription = " ".join([res.get("text", "") for res in results])
print("Vosk Telugu Transcription:\n", transcription)

Vosk Telugu Transcription:
  


In [None]:
!pip install transformers torchaudio librosa

# Load model + processor
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import librosa


In [32]:
# Install required packages

# Load Wav2Vec2 model trained for Telugu
processor = Wav2Vec2Processor.from_pretrained("anuragshas/wav2vec2-large-xlsr-53-telugu")
model = Wav2Vec2ForCTC.from_pretrained("anuragshas/wav2vec2-large-xlsr-53-telugu")

# Convert dataset audio array to 16kHz (if not already)
speech_array = audio_sample["array"]
orig_sr = audio_sample["sampling_rate"]

# Resample if needed
if orig_sr != 16000:
    speech_array = librosa.resample(speech_array, orig_sr, 16000)

# Prepare input for the model
inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt", padding=True)

# Get model output
with torch.no_grad():
    logits = model(inputs.input_values).logits

# Decode predicted ids to text
pred_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(pred_ids)[0]

# Print result
print("🗣️ Telugu Transcription (Wav2Vec2):", transcription)


preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

🗣️ Telugu Transcription (Wav2Vec2): కిద్యంటీయార్స్క్యాచ్చు ఇలి కరిణారులోం జేతని కరాణమటిత్పైపస్త్ రోత్ జవునై సిరివిల్లా బస్పచిప్టలోవు మూదక్రం్కా ఇశ్యరఫ్తన్నాజు
