Package Installs

In [1]:
!pip install -q transformers[torch] datasets[audio] accelerate evaluate jiwer librosa torchcodec


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.1/2.1 MB[0m [31m93.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.2/3.2 MB[0m [31m123.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m66.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install ffmpeg

Collecting ffmpeg
  Downloading ffmpeg-1.4.tar.gz (5.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ffmpeg
  Building wheel for ffmpeg (setup.py) ... [?25l[?25hdone
  Created wheel for ffmpeg: filename=ffmpeg-1.4-py3-none-any.whl size=6083 sha256=953c42ffaed57dd9de529365851f11875a4c8b8b8c446357ca5b30df5cf79a87
  Stored in directory: /root/.cache/pip/wheels/26/21/0c/c26e09dff860a9071683e279445262346e008a9a1d2142c4ad
Successfully built ffmpeg
Installing collected packages: ffmpeg
Successfully installed ffmpeg-1.4


Load G drive and change directory

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/Pronunciation-model/

/content/drive/MyDrive/Pronunciation-model


In [5]:
!pip install soundfile



Dataset Loader

In [6]:
# dataset_loader.py
import os
import json
import torch
import soundfile as sf

class SinhalaDataset(torch.utils.data.Dataset):
    def __init__(self, root):
        self.samples = []

        with open(f"{root}/phonemes.json", "r", encoding="utf-8") as f:
            self.data_map = json.load(f)

        for folder_name, info in self.data_map.items():
            folder_path = os.path.join(root, folder_name)
            if os.path.exists(folder_path):
                for fname in os.listdir(folder_path):
                    if fname.endswith(".wav"):
                        self.samples.append(
                            (os.path.join(folder_path, fname), info["phonemes"])
                        )

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        wav_path, phonemes = self.samples[idx]

        audio, sr = sf.read(wav_path, dtype="float32")

        # Convert to torch
        wav = torch.from_numpy(audio)

        # Handle stereo
        if wav.ndim == 2:
            wav = wav.mean(dim=1)

        # Resampling
        if sr != 16000:
            wav = torch.nn.functional.interpolate(
                wav.unsqueeze(0).unsqueeze(0),
                scale_factor=16000 / sr,
                mode="linear",
                align_corners=False,
            ).squeeze()

        return wav, phonemes


Test data loading

In [8]:
root = "/content/drive/MyDrive/Pronunciation-model/dataset"

dataset = SinhalaDataset(root)
print(len(dataset))

wav, phones = dataset[0]
print(wav.shape)
print(phones)


1028
torch.Size([11200])
['ba', 'l', 'la:']


**Vocabulary**

In [9]:
# vocab_builder.py
import json

phonemes = [
"ba", "l", "la:", "i", "b", "ba:", "da", "th", "po", "d", "k", "sa", "ma", "na", "la", "ya:", "du", "ru", "wa:", "lu:", "nu", "a", "n", "na:", "si", "ha:", "ya", "thu", "ra", "tha", "wa", "o", "lu", "ha", "ga", "ka", "s", "kri:", "m", "pa", "pra", "sh", "da:"
]

vocab = {"[BLANK]": 0}

for i, p in enumerate(phonemes, start=1):
    vocab[p] = i

vocab["[PAD]"] = len(vocab)
vocab["[UNK]"] = len(vocab)

with open("vocab.json", "w", encoding="utf-8") as f:
    json.dump(vocab, f, ensure_ascii=False, indent=2)

print("✅ Vocab size:", len(vocab))


✅ Vocab size: 46


Processor + Model

In [10]:
from transformers import (
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    Wav2Vec2Config
)

tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token=None
)

processor = Wav2Vec2Processor(
    feature_extractor=Wav2Vec2FeatureExtractor(
        sampling_rate=16000,
        do_normalize=True,
        return_attention_mask=True
    ),
    tokenizer=tokenizer
)

config = Wav2Vec2Config.from_pretrained(
    "facebook/wav2vec2-xls-r-300m",
    vocab_size=len(tokenizer),
    ctc_blank_id=tokenizer.convert_tokens_to_ids("[BLANK]"),
    pad_token_id=tokenizer.pad_token_id,
    ctc_zero_infinity=True
)

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-xls-r-300m",
    config=config,
    ignore_mismatched_sizes=True
)

model.freeze_feature_encoder()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/422 [00:00<?, ?it/s]

Wav2Vec2ForCTC LOAD REPORT from: facebook/wav2vec2-xls-r-300m
Key                          | Status     | 
-----------------------------+------------+-
quantizer.weight_proj.bias   | UNEXPECTED | 
project_q.bias               | UNEXPECTED | 
project_hid.bias             | UNEXPECTED | 
quantizer.weight_proj.weight | UNEXPECTED | 
quantizer.codevectors        | UNEXPECTED | 
project_q.weight             | UNEXPECTED | 
project_hid.weight           | UNEXPECTED | 
lm_head.bias                 | MISSING    | 
lm_head.weight               | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Data Collator

In [11]:
def collate_fn(batch):
    audio = [b[0].numpy() for b in batch]
    phonemes = [b[1] for b in batch]

    inputs = processor(audio, sampling_rate=16000,
                       return_tensors="pt", padding=True)

    labels = processor.tokenizer(
        phonemes,
        is_split_into_words=True,
        return_tensors="pt",
        padding=True
    ).input_ids

    labels[labels == tokenizer.pad_token_id] = -100
    return inputs.input_values, labels


Training Logic

In [16]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.nn.utils import clip_grad_norm_
from torch.cuda.amp import autocast, GradScaler

# -----------------------------
# CONFIG (COLAB FREE SAFE)
# -----------------------------
EPOCHS = 15              # upper bound (early stopping will stop earlier)
BATCH_SIZE = 4
LR = 1e-5
MAX_GRAD_NORM = 1.0
UNFREEZE_EPOCH = 4
PATIENCE = 2             # aggressive early stopping
MIN_DELTA = 1.0
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# -----------------------------
# SETUP
# -----------------------------
model.to(DEVICE)
model.config.ctc_zero_infinity = True

loader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
    pin_memory=True
)

optimizer = AdamW(model.parameters(), lr=LR)
scaler = GradScaler()

# Freeze feature extractor
for param in model.wav2vec2.feature_extractor.parameters():
    param.requires_grad = False

# -----------------------------
# TRAINING
# -----------------------------
best_loss = float("inf")
no_improve = 0

print("🚀 Starting Colab-safe training...")

for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0

    # 🔓 Unfreeze feature extractor
    if epoch == UNFREEZE_EPOCH:
        for param in model.wav2vec2.feature_extractor.parameters():
            param.requires_grad = True
        print("🔓 Feature extractor unfrozen")

    for step, (inputs, labels) in enumerate(loader):
        inputs = inputs.to(DEVICE)
        labels = labels.to(DEVICE)

        optimizer.zero_grad()

        # AMP forward
        with autocast():
            outputs = model(input_values=inputs, labels=labels)
            loss = outputs.loss

        if torch.isnan(loss):
            print(f"⚠️ NaN at step {step}, skipping")
            continue

        # Backward
        scaler.scale(loss).backward()

        # Gradient clipping
        scaler.unscale_(optimizer)
        clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)

        scaler.step(optimizer)
        scaler.update()

        epoch_loss += loss.item()

        if step % 20 == 0:
            print(f"Epoch {epoch} | Step {step} | Loss {loss.item():.2f}")

    avg_loss = epoch_loss / len(loader)
    print(f"✅ Epoch {epoch} | Avg Loss: {avg_loss:.4f}")

    # -----------------------------
    # EARLY STOPPING
    # -----------------------------
    if avg_loss < best_loss - MIN_DELTA:
        best_loss = avg_loss
        no_improve = 0

        model.save_pretrained("sinhala-pronunciation-model")
        processor.save_pretrained("sinhala-pronunciation-model")
        print("### Best model saved")

    else:
        no_improve += 1
        print(f"⏸️ No improvement ({no_improve}/{PATIENCE})")

        if no_improve >= PATIENCE:
            print("### Early stopping triggered")
            break

print("🎉 Training finished")


  scaler = GradScaler()
  with autocast():


🚀 Starting Colab-safe training...
Epoch 0 | Step 0 | Loss 68.18
Epoch 0 | Step 20 | Loss 53.83
Epoch 0 | Step 40 | Loss 43.99
Epoch 0 | Step 60 | Loss 41.28
Epoch 0 | Step 80 | Loss 55.32
Epoch 0 | Step 100 | Loss 47.00
Epoch 0 | Step 120 | Loss 52.26
Epoch 0 | Step 140 | Loss 50.17
Epoch 0 | Step 160 | Loss 54.88
Epoch 0 | Step 180 | Loss 51.76
Epoch 0 | Step 200 | Loss 57.34
Epoch 0 | Step 220 | Loss 61.72
Epoch 0 | Step 240 | Loss 59.04
✅ Epoch 0 | Avg Loss: 53.4988


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

### Best model saved
Epoch 1 | Step 0 | Loss 58.19
Epoch 1 | Step 20 | Loss 47.58
Epoch 1 | Step 40 | Loss 62.09
Epoch 1 | Step 60 | Loss 43.79
Epoch 1 | Step 80 | Loss 65.11
Epoch 1 | Step 100 | Loss 58.70
Epoch 1 | Step 120 | Loss 45.24
Epoch 1 | Step 140 | Loss 49.97
Epoch 1 | Step 160 | Loss 59.62
Epoch 1 | Step 180 | Loss 40.24
Epoch 1 | Step 200 | Loss 51.55
Epoch 1 | Step 220 | Loss 45.92
Epoch 1 | Step 240 | Loss 46.46
✅ Epoch 1 | Avg Loss: 52.9695
⏸️ No improvement (1/2)
Epoch 2 | Step 0 | Loss 46.12
Epoch 2 | Step 20 | Loss 45.13
Epoch 2 | Step 40 | Loss 53.86
Epoch 2 | Step 60 | Loss 34.42
Epoch 2 | Step 80 | Loss 56.22
Epoch 2 | Step 100 | Loss 45.95
Epoch 2 | Step 120 | Loss 58.36
Epoch 2 | Step 140 | Loss 47.67
Epoch 2 | Step 160 | Loss 53.88
Epoch 2 | Step 180 | Loss 57.07
Epoch 2 | Step 200 | Loss 52.37
Epoch 2 | Step 220 | Loss 55.18
Epoch 2 | Step 240 | Loss 59.53
✅ Epoch 2 | Avg Loss: 52.1833


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

### Best model saved
Epoch 3 | Step 0 | Loss 57.62
Epoch 3 | Step 20 | Loss 52.13
Epoch 3 | Step 40 | Loss 53.18
Epoch 3 | Step 60 | Loss 49.07
Epoch 3 | Step 80 | Loss 55.30
Epoch 3 | Step 100 | Loss 46.97
Epoch 3 | Step 120 | Loss 50.69
Epoch 3 | Step 140 | Loss 55.60
Epoch 3 | Step 160 | Loss 50.16
Epoch 3 | Step 180 | Loss 50.47
Epoch 3 | Step 200 | Loss 54.08
Epoch 3 | Step 220 | Loss 51.52
Epoch 3 | Step 240 | Loss 54.70
✅ Epoch 3 | Avg Loss: 50.7421


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

### Best model saved
🔓 Feature extractor unfrozen
Epoch 4 | Step 0 | Loss 50.28
Epoch 4 | Step 20 | Loss 49.54
Epoch 4 | Step 40 | Loss 52.19
Epoch 4 | Step 60 | Loss 56.98
Epoch 4 | Step 80 | Loss 41.49
Epoch 4 | Step 100 | Loss 59.24
Epoch 4 | Step 120 | Loss 45.67
Epoch 4 | Step 140 | Loss 42.66
Epoch 4 | Step 160 | Loss 42.05
Epoch 4 | Step 180 | Loss 48.62
Epoch 4 | Step 200 | Loss 49.22
Epoch 4 | Step 220 | Loss 43.83
Epoch 4 | Step 240 | Loss 46.79
✅ Epoch 4 | Avg Loss: 48.9811


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

### Best model saved
Epoch 5 | Step 0 | Loss 47.55
Epoch 5 | Step 20 | Loss 45.86
Epoch 5 | Step 40 | Loss 56.75
Epoch 5 | Step 60 | Loss 44.33
Epoch 5 | Step 80 | Loss 43.16
Epoch 5 | Step 100 | Loss 38.40
Epoch 5 | Step 120 | Loss 47.14
Epoch 5 | Step 140 | Loss 50.55
Epoch 5 | Step 160 | Loss 48.32
Epoch 5 | Step 180 | Loss 39.14
Epoch 5 | Step 200 | Loss 43.72
Epoch 5 | Step 220 | Loss 49.69
Epoch 5 | Step 240 | Loss 41.26
✅ Epoch 5 | Avg Loss: 47.1917


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

### Best model saved
Epoch 6 | Step 0 | Loss 43.06
Epoch 6 | Step 20 | Loss 48.03
Epoch 6 | Step 40 | Loss 41.39
Epoch 6 | Step 60 | Loss 39.75
Epoch 6 | Step 80 | Loss 38.15
Epoch 6 | Step 100 | Loss 45.58
Epoch 6 | Step 120 | Loss 44.55
Epoch 6 | Step 140 | Loss 42.91
Epoch 6 | Step 160 | Loss 40.40
Epoch 6 | Step 180 | Loss 48.14
Epoch 6 | Step 200 | Loss 42.81
Epoch 6 | Step 220 | Loss 46.38
Epoch 6 | Step 240 | Loss 37.71
✅ Epoch 6 | Avg Loss: 44.8555


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

### Best model saved
Epoch 7 | Step 0 | Loss 39.93
Epoch 7 | Step 20 | Loss 37.86
Epoch 7 | Step 40 | Loss 46.22
Epoch 7 | Step 60 | Loss 40.34
Epoch 7 | Step 80 | Loss 48.19
Epoch 7 | Step 100 | Loss 43.41
Epoch 7 | Step 120 | Loss 40.97
Epoch 7 | Step 140 | Loss 59.45
Epoch 7 | Step 160 | Loss 36.46
Epoch 7 | Step 180 | Loss 48.58
Epoch 7 | Step 200 | Loss 43.14
Epoch 7 | Step 220 | Loss 44.14
Epoch 7 | Step 240 | Loss 43.12
✅ Epoch 7 | Avg Loss: 42.1374


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

### Best model saved
Epoch 8 | Step 0 | Loss 34.82
Epoch 8 | Step 20 | Loss 33.81
Epoch 8 | Step 40 | Loss 38.69
Epoch 8 | Step 60 | Loss 41.97
Epoch 8 | Step 80 | Loss 38.82
Epoch 8 | Step 100 | Loss 39.37
Epoch 8 | Step 120 | Loss 40.20
Epoch 8 | Step 140 | Loss 38.01
Epoch 8 | Step 160 | Loss 41.49
Epoch 8 | Step 180 | Loss 44.68
Epoch 8 | Step 200 | Loss 40.51
Epoch 8 | Step 220 | Loss 43.76
Epoch 8 | Step 240 | Loss 33.59
✅ Epoch 8 | Avg Loss: 39.1210


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

### Best model saved
Epoch 9 | Step 0 | Loss 39.16
Epoch 9 | Step 20 | Loss 35.98
Epoch 9 | Step 40 | Loss 33.27
Epoch 9 | Step 60 | Loss 36.27
Epoch 9 | Step 80 | Loss 37.13
Epoch 9 | Step 100 | Loss 40.10
Epoch 9 | Step 120 | Loss 33.19
Epoch 9 | Step 140 | Loss 39.08
Epoch 9 | Step 160 | Loss 39.82
Epoch 9 | Step 180 | Loss 33.32
Epoch 9 | Step 200 | Loss 35.38
Epoch 9 | Step 220 | Loss 25.74
Epoch 9 | Step 240 | Loss 27.75
✅ Epoch 9 | Avg Loss: 36.5121


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

### Best model saved
Epoch 10 | Step 0 | Loss 33.92
Epoch 10 | Step 20 | Loss 31.50
Epoch 10 | Step 40 | Loss 32.49
Epoch 10 | Step 60 | Loss 32.93
Epoch 10 | Step 80 | Loss 35.01
Epoch 10 | Step 100 | Loss 34.92
Epoch 10 | Step 120 | Loss 33.90
Epoch 10 | Step 140 | Loss 25.29
Epoch 10 | Step 160 | Loss 34.53
Epoch 10 | Step 180 | Loss 21.18
Epoch 10 | Step 200 | Loss 33.04
Epoch 10 | Step 220 | Loss 30.16
Epoch 10 | Step 240 | Loss 25.72
✅ Epoch 10 | Avg Loss: 34.4903


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

### Best model saved
Epoch 11 | Step 0 | Loss 41.39
Epoch 11 | Step 20 | Loss 34.52
Epoch 11 | Step 40 | Loss 34.23
Epoch 11 | Step 60 | Loss 36.57
Epoch 11 | Step 80 | Loss 33.43
Epoch 11 | Step 100 | Loss 27.03
Epoch 11 | Step 120 | Loss 44.68
Epoch 11 | Step 140 | Loss 30.22
Epoch 11 | Step 160 | Loss 27.81
Epoch 11 | Step 180 | Loss 30.83
Epoch 11 | Step 200 | Loss 34.74
Epoch 11 | Step 220 | Loss 34.56
Epoch 11 | Step 240 | Loss 32.40
✅ Epoch 11 | Avg Loss: 32.1268


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

### Best model saved
Epoch 12 | Step 0 | Loss 40.94
Epoch 12 | Step 20 | Loss 30.88
Epoch 12 | Step 40 | Loss 25.05
Epoch 12 | Step 60 | Loss 20.92
Epoch 12 | Step 80 | Loss 36.27
Epoch 12 | Step 100 | Loss 25.40
Epoch 12 | Step 120 | Loss 27.22
Epoch 12 | Step 140 | Loss 22.51
Epoch 12 | Step 160 | Loss 39.88
Epoch 12 | Step 180 | Loss 36.44
Epoch 12 | Step 200 | Loss 26.81
Epoch 12 | Step 220 | Loss 28.12
Epoch 12 | Step 240 | Loss 28.93
✅ Epoch 12 | Avg Loss: 29.6334


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

### Best model saved
Epoch 13 | Step 0 | Loss 21.55
Epoch 13 | Step 20 | Loss 18.53
Epoch 13 | Step 40 | Loss 23.46
Epoch 13 | Step 60 | Loss 28.33
Epoch 13 | Step 80 | Loss 35.24
Epoch 13 | Step 100 | Loss 27.44
Epoch 13 | Step 120 | Loss 31.31
Epoch 13 | Step 140 | Loss 20.30
Epoch 13 | Step 160 | Loss 28.56
Epoch 13 | Step 180 | Loss 14.35
Epoch 13 | Step 200 | Loss 24.21
Epoch 13 | Step 220 | Loss 29.36
Epoch 13 | Step 240 | Loss 28.02
✅ Epoch 13 | Avg Loss: 26.9560


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

### Best model saved
Epoch 14 | Step 0 | Loss 19.99
Epoch 14 | Step 20 | Loss 22.89
Epoch 14 | Step 40 | Loss 38.77
Epoch 14 | Step 60 | Loss 19.06
Epoch 14 | Step 80 | Loss 21.77
Epoch 14 | Step 100 | Loss 21.93
Epoch 14 | Step 120 | Loss 17.87
Epoch 14 | Step 140 | Loss 19.87
Epoch 14 | Step 160 | Loss 34.66
Epoch 14 | Step 180 | Loss 29.05
Epoch 14 | Step 200 | Loss 24.15
Epoch 14 | Step 220 | Loss 29.94
Epoch 14 | Step 240 | Loss 25.40
✅ Epoch 14 | Avg Loss: 24.6749


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

### Best model saved
🎉 Training finished


Testing

Load the model

In [20]:
import torch
import json
import torchaudio
import torchaudio.functional as F
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_path = "./sinhala-pronunciation-model"

processor = Wav2Vec2Processor.from_pretrained(model_path)
model = Wav2Vec2ForCTC.from_pretrained(model_path).to(device)
model.eval()

# Load vocab
with open(f"{model_path}/vocab.json", "r", encoding="utf-8") as f:
    training_vocab = json.load(f)

# Set blank token
model.config.ctc_blank_id = processor.tokenizer.pad_token_id

print("✅ Model ready")


Loading weights:   0%|          | 0/424 [00:00<?, ?it/s]

✅ Model ready


GOP normalization

In [21]:
def normalize_gop(log_prob, min_lp=-12.0, max_lp=-5.0):
    log_prob = max(min(log_prob, max_lp), min_lp)
    return (log_prob - min_lp) / (max_lp - min_lp)

def get_sinhala_feedback(score):
    if score >= 0.75:
        return "ඉතා හොඳයි! 🌟"
    elif score >= 0.55:
        return "හොඳයි. තව ටිකක් පුහුණු වෙමු 👍"
    elif score >= 0.35:
        return "නැවත උත්සාහ කරමු 👂"
    else:
        return "නැවත පැහැදිලිව පවසන්න ❌"


Pronunciation evaluation

In [28]:
def evaluate_pronunciation(
    wav_path,
    target_phonemes,
    model,
    processor,
    training_vocab,
    device
):
    import soundfile as sf
    import librosa

    wav, sr = sf.read(wav_path)

    if wav.ndim > 1:
        wav = wav.mean(axis=1)  # mono

    if sr != 16000:
        wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)
        sr = 16000

    wav = torch.tensor(wav).unsqueeze(0)


    if sr != 16000:
        wav = torchaudio.functional.resample(wav, sr, 16000)

    inputs = processor(
        wav.squeeze().numpy(),
        sampling_rate=16000,
        return_tensors="pt"
    ).input_values.to(device)

    with torch.no_grad():
        emissions = model(inputs).logits.log_softmax(-1)

    target_ids = torch.tensor(
        [training_vocab[p] for p in target_phonemes],
        device=device
    ).unsqueeze(0)

    blank_id = model.config.ctc_blank_id

    alignment, scores = F.forced_align(
        emissions,
        target_ids,
        blank=blank_id
    )

    results = []
    avg_score = 0.0

    for i, p in enumerate(target_phonemes):
        raw_gop = scores[0][i].item()
        score = normalize_gop(raw_gop)
        avg_score += score

        results.append({
            "phoneme": p,
            "score": round(score, 2),
            "feedback": get_sinhala_feedback(score)
        })

    avg_score /= len(target_phonemes)

    return {
        "average_score": round(avg_score, 2),
        "verdict": "good" if avg_score >= 0.6 else "needs_practice",
        "details": results
    }


Utterance confidence

In [32]:
def ctc_confidence(wav_path, model, processor, device):
    import librosa
    import torch

    audio, _ = librosa.load(wav_path, sr=16000)
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt").to(device)

    with torch.no_grad():
        log_probs = model(**inputs).logits.log_softmax(-1)

    # mean max probability over time
    confidence = log_probs.max(dim=-1).values.mean().exp().item()
    return confidence


Pipeline

In [33]:
def run_pronunciation_pipeline(
    wav_path,
    target_phonemes,
    model,
    processor,
    training_vocab,
    device,
    confidence_threshold=0.35
):
    conf = ctc_confidence(wav_path, model, processor, device)

    if conf < confidence_threshold:
        return {
            "status": "invalid_input",
            "confidence": round(conf, 2),
            "message": "වචනය හඳුනාගත නොහැක. කරුණාකර පැහැදිලිව පවසන්න."
        }

    result = evaluate_pronunciation(
        wav_path,
        target_phonemes,
        model,
        processor,
        training_vocab,
        device
    )

    result["confidence"] = round(conf, 2)
    return result

Test with a sample

In [34]:
test_file = "/content/drive/MyDrive/1.wav"

# Sinhala: balla → /b a l l a:/
test_targets = ["ba", "l", "la:"]

output = run_pronunciation_pipeline(
    wav_path=test_file,
    target_phonemes=test_targets,
    model=model,
    processor=processor,
    training_vocab=training_vocab,
    device=device
)

print("🟢 Average score:", output["average_score"])
print("📌 Verdict:", output["verdict"])
print("\n--- Phoneme feedback ---")

for r in output["details"]:
    print(f"{r['phoneme']} → {r['score']} | {r['feedback']}")


🟢 Average score: 1.0
📌 Verdict: good

--- Phoneme feedback ---
ba → 1.0 | ඉතා හොඳයි! 🌟
l → 1.0 | ඉතා හොඳයි! 🌟
la: → 1.0 | ඉතා හොඳයි! 🌟


  alignment, scores = F.forced_align(


In [25]:
!pip install torchcodec

Collecting torchcodec
  Downloading torchcodec-0.10.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (11 kB)
Downloading torchcodec-0.10.0-cp312-cp312-manylinux_2_28_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchcodec
Successfully installed torchcodec-0.10.0


In [27]:
!pip install soundfile librosa


