In [None]:
!pip install torch torchvision torchaudio
!pip install torchcodec
!pip install transformers datasets accelerate


Collecting torchcodec
  Downloading torchcodec-0.7.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.4 kB)
Downloading torchcodec-0.7.0-cp312-cp312-manylinux_2_28_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchcodec
Successfully installed torchcodec-0.7.0


In [None]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import torch

# Hub id of the custom Urdu Whisper checkpoint used for both model and processor
model_id = "Abdul145/whisper-medium-urdu-custom"

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Processor and seq2seq model are loaded from the same checkpoint id;
# the model is moved to the chosen device as part of the same expression
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id).to(device)

print(f"✅ Model loaded on {device}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

✅ Model loaded on cuda


In [None]:
from datasets import load_dataset, Audio

# Fetch the processed Urdu Common Voice dataset and, in the same expression,
# re-cast its "audio" column to 16 kHz (required for Whisper models)
ds = load_dataset("UmarRamzan/common-voice-urdu-processed").cast_column(
    "audio", Audio(sampling_rate=16000)
)

print(ds)


README.md:   0%|          | 0.00/529 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/234M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/112M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9425 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4056 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'sentence', 'variant'],
        num_rows: 9425
    })
    test: Dataset({
        features: ['path', 'audio', 'sentence', 'variant'],
        num_rows: 4056
    })
})


In [None]:
def prepare_dataset(batch):
    """Attach model input features to a single dataset row.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries),
    runs them through the global ``processor`` requesting PyTorch tensors, and
    stores the first element of the returned ``input_features`` under
    ``batch["input_features"]``.

    Returns the same ``batch`` mapping, mutated in place.
    """
    clip = batch["audio"]
    encoded = processor(
        clip["array"],
        sampling_rate=clip["sampling_rate"],
        return_tensors="pt",
    )
    # The processor output carries a leading batch axis; keep only entry 0
    batch["input_features"] = encoded.input_features[0]
    return batch

# Add input features column; remove unnecessary columns.
# `ds` is overwritten here, so on a second run of this cell "path"/"variant"
# are already gone. Only columns still present in every split are passed to
# remove_columns, which keeps the cell re-runnable instead of failing on
# already-removed columns.
ds = ds.map(
    prepare_dataset,
    remove_columns=[
        column
        for column in ("path", "variant")
        if all(column in names for names in ds.column_names.values())
    ],
)
print("✅ Audio features prepared successfully!")


Map:   0%|          | 0/9425 [00:00<?, ? examples/s]

Map:   0%|          | 0/4056 [00:00<?, ? examples/s]

✅ Audio features prepared successfully!


In [None]:
from tqdm import tqdm
import torch

def transcribe(batch):
    """Transcribe one dataset row and store the text under ``"prediction"``.

    ``batch["input_features"]`` is turned into a tensor, given a leading batch
    axis of size 1, moved to the global ``device`` and passed to
    ``model.generate`` with gradients disabled. The generated ids are decoded
    by the global ``processor`` (special tokens skipped) and the first decoded
    string is written to ``batch["prediction"]``.

    Returns the same ``batch`` mapping, mutated in place.
    """
    # Stored features come back as nested lists; rebuild a tensor and add the batch axis
    features = torch.tensor(batch["input_features"])[None].to(device)

    with torch.no_grad():
        token_ids = model.generate(features)

    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    batch["prediction"] = decoded[0]
    return batch

# Take the first 5 rows of the test split and transcribe each of them
results = (
    ds["test"]
    .select(range(5))
    .map(transcribe)
)

# Show reference and prediction side by side, one example per print call
for i, example in enumerate(results):
    print(
        f"\n🔊 Example {i+1}",
        f"Reference : {example['sentence']}",
        f"Predicted : {example['prediction']}",
        sep="\n",
    )


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



🔊 Example 1
Reference : بے ذوق نہیں اگرچہ فطرت
Predicted : بے ذوق نہیں اگر چی فطرت

🔊 Example 2
Reference : یہی تناسب یوتھ کا بھی ہے
Predicted : یہی تناسب یوت کا بھی ہے۔

🔊 Example 3
Reference : اس کا مخمصہ یہ ہے کہ عوامی تائید نوازشریف کے ساتھ ہے
Predicted : اس کا مخمصہ یہ ہے کہ عوامی تعید نواز شریف کے ساتھ ہے۔

🔊 Example 4
Reference : یہاں سیکڑوں کارواں اور بھی ہیں
Predicted : یہاں سیکڑوں کاروان اور بھی ہیں۔

🔊 Example 5
Reference : پاکستان کیلئےبیل اٹ پیکج امریکا نے ائی ایم ایف کو خبردارکردیا
Predicted : پاکستان کے لیے بیلٹ پیکس امریکہ نے ائی ایم ایف خبردار کردیا


In [None]:
# Install dependencies
!pip install evaluate jiwer

import evaluate

# Gather the hypothesis and reference strings from the transcribed sample rows
preds = [row["prediction"] for row in results]
refs = [row["sentence"] for row in results]

# Load both metrics
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

# Score the raw strings; no text normalisation is applied before scoring.
# NOTE(review): several predictions printed by the previous cell end in "۔"
# while their references do not — presumably this is counted as an error;
# consider normalising punctuation before computing WER/CER.
wer_score = wer_metric.compute(predictions=preds, references=refs)
cer_score = cer_metric.compute(predictions=preds, references=refs)

# Display results, one metric per line
print(
    f"✅ Word Error Rate (WER): {wer_score:.2%}",
    f"✅ Character Error Rate (CER): {cer_score:.2%}",
    sep="\n",
)


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m106.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer, evaluate
Successfully installed evaluate-0.4.6 jiwer-4.0.0 rapidfuzz-3.14.1


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

✅ Word Error Rate (WER): 42.50%
✅ Character Error Rate (CER): 11.64%


In [None]:
from tqdm import tqdm
import torch

# Batch size (adjust for GPU memory)
BATCH_SIZE = 4

def batched_transcribe(dataset, batch_size=BATCH_SIZE):
    """Transcribe every row of ``dataset`` in fixed-size batches.

    Args:
        dataset: Object supporting ``len()``, ``.select(indices)`` and column
            access by name; the ``"input_features"`` and ``"sentence"``
            columns are read.
        batch_size: Number of rows sent to ``model.generate`` per call.
            Must be at least 1.

    Returns:
        A ``(preds, refs)`` tuple of lists: the decoded transcriptions and the
        matching reference sentences, both in dataset order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step makes range() empty, which would silently return no
    # transcriptions at all; reject invalid sizes up front instead.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    preds, refs = [], []
    n = len(dataset)

    for start in tqdm(range(0, n, batch_size), desc="🚀 Transcribing full test set"):
        # The last batch may be shorter than batch_size
        batch = dataset.select(range(start, min(start + batch_size, n)))

        # Stack per-row features into one batch tensor; torch.stack requires
        # every row's feature tensor to have the same shape
        features = torch.stack(
            [torch.tensor(row_features) for row_features in batch["input_features"]]
        ).to(device)

        with torch.no_grad():
            predicted_ids = model.generate(features)

        # Decode predicted text and keep references aligned with predictions
        preds.extend(processor.batch_decode(predicted_ids, skip_special_tokens=True))
        refs.extend(batch["sentence"])

    return preds, refs


# 🕐 Transcribe the whole test split (this may take some time)
preds, refs = batched_transcribe(ds["test"], BATCH_SIZE)

print(
    "✅ Transcription completed.",
    f"Total samples processed: {len(preds)}",
    sep="\n",
)

# Optional: persist reference/prediction pairs so the transcription
# does not have to be rerun
import pandas as pd
pd.DataFrame(
    {"reference": refs, "prediction": preds}
).to_csv(
    "urdu_transcriptions.csv",
    index=False,
)
print("💾 Transcriptions saved to 'urdu_transcriptions.csv'")


🚀 Transcribing full test set: 100%|██████████| 1014/1014 [3:16:51<00:00, 11.65s/it]

✅ Transcription completed.
Total samples processed: 4056
💾 Transcriptions saved to 'urdu_transcriptions.csv'





In [None]:
!pip install evaluate jiwer
import evaluate

# Load the character- and word-level error-rate metrics
cer_metric = evaluate.load("cer")
wer_metric = evaluate.load("wer")

# Score the full-test-set predictions against their references
cer_score = cer_metric.compute(predictions=preds, references=refs)
wer_score = wer_metric.compute(predictions=preds, references=refs)

# Report both metrics under a single heading, one per line
print(
    "\n📊 Evaluation Results on Full Test Set",
    f"✅ Word Error Rate (WER): {wer_score:.2%}",
    f"✅ Character Error Rate (CER): {cer_score:.2%}",
    sep="\n",
)



📊 Evaluation Results on Full Test Set
✅ Word Error Rate (WER): 26.76%
✅ Character Error Rate (CER): 8.83%
