## Creating Testing Dataset for Baseline Whisper

In [1]:
!pip install jiwer openai-whisper torchcodec
from transformers import WhisperForConditionalGeneration, WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor
from transformers import AutoFeatureExtractor, WhisperModel
from transformers import LlamaTokenizer
from datasets import load_dataset
import torch, torchaudio
from torch import nn
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from jiwer import wer as calculate_wer
import pickle
from datasets import Dataset, Audio, Value
import os, random
from typing import Optional
from whisper.normalizers import EnglishTextNormalizer
import math
from sentencepiece import SentencePieceProcessor, SentencePieceTrainer
from pathlib import Path
import whisper
import copy, heapq
import pandas as pd
import torch.nn.functional as F


Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting torchcodec
  Downloading torchcodec-0.9.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading torchcodec-0.9.0-cp312-cp312-manylinux_2_28_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from google.colab import drive
from google.colab import userdata
userdata.get('HF_TOKEN')
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def data_preparation(csv_path, file_list_path, feature_extractor, tokenizer=None, base_audio_dir="", max_label_length=None):
    df_text = pd.read_csv(csv_path)

    with open(file_list_path, "r") as f:
        file_paths = [line.strip() for line in f if line.strip()]

    dataset = []
    i = 0
    for file_path in file_paths:
        row = df_text[df_text["path"] == file_path]
        if row.empty:
            print(f"[WARN] No transcript found for: {file_path}")
            continue
        text = str(row["sentence"].values[0]).lower().strip()

        full_audio_path = os.path.join(base_audio_dir, file_path)

        audio, sr = torchaudio.load(full_audio_path)
        if sr != 16000:
            audio = torchaudio.functional.resample(audio, sr, 16000)

        feats = feature_extractor(
            audio.squeeze(0).numpy(),
            sampling_rate=16000,
            return_tensors="pt"
        )["input_features"]

        item = {"input_features": feats, "text": text}

        if tokenizer is not None:
            labels = tokenizer(
                text,
                return_tensors="pt",
                padding="longest",
                truncation=True,
                max_length=max_label_length
            )["input_ids"][0]
            item["labels"] = labels
        i += 1
        print("Missing:", i / len(file_paths) * 100, "%")
        dataset.append(item)

    return dataset

In [4]:
feature_extractor = WhisperFeatureExtractor.from_pretrained('openai/whisper-small', language='en', task='transcribe')
tokenizer = WhisperTokenizer.from_pretrained('openai/whisper-small', language='en', task='transcribe')
processor = WhisperProcessor.from_pretrained('openai/whisper-small', language='en', task='transcribe')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
normalizer = EnglishTextNormalizer()

preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

In [5]:
test_data_csv = '/content/drive/MyDrive/data/final_test.csv'
test_audio_list = '/content/drive/MyDrive/data/test_files.txt'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

initial_model = WhisperForConditionalGeneration.from_pretrained('openai/whisper-small')
max_label_length = initial_model.config.max_target_positions
del initial_model



cuda


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

In [9]:
df_test_set = pd.read_csv('/content/drive/MyDrive/data/final_test.csv')
df_train_set = pd.read_csv('/content/drive/MyDrive/data/final_train.csv')
train_audio_list = '/content/drive/MyDrive/data/train_files.txt'
accents = df_test_set['accents'].unique().tolist() + df_train_set['accents'].unique().tolist()
accent = [list(set(accents))[1]]
print(accent)
for i in accent:
    new_df_test = df_test_set[df_test_set['accents']==i]
    new_df_test.to_csv(f'/content/drive/MyDrive/data/{i}/test.csv', index=False)

    df_prep = data_preparation(f'/content/drive/MyDrive/data/{i}/test.csv', test_audio_list, feature_extractor, tokenizer, base_audio_dir='/content/drive/MyDrive/data/test_data', max_label_length=max_label_length)
    torch.save(df_prep, f'/content/drive/MyDrive/data/{i}/test.pt')

    new_df_train = df_train_set[df_train_set['accents']==i]
    new_df_train.to_csv(f'/content/drive/MyDrive/data/{i}/train.csv', index=False)

    df_prep_train = data_preparation(f'/content/drive/MyDrive/data/{i}/train.csv', train_audio_list, feature_extractor, tokenizer, base_audio_dir='/content/drive/MyDrive/data/train_data', max_label_length=max_label_length)
    torch.save(df_prep_train, f'/content/drive/MyDrive/data/{i}/train.pt')




[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
[WARN] No transcript found for: common_voice_en_39575170.mp3
[WARN] No transcript found for: common_voice_en_531143.mp3
[WARN] No transcript found for: common_voice_en_17271350.mp3
[WARN] No transcript found for: common_voice_en_21670469.mp3
[WARN] No transcript found for: common_voice_en_16048127.mp3
[WARN] No transcript found for: common_voice_en_18488099.mp3
[WARN] No transcript found for: common_voice_en_19291064.mp3
[WARN] No transcript found for: common_voice_en_14871.mp3
[WARN] No transcript found for: common_voice_en_22371871.mp3
[WARN] No transcript found for: common_voice_en_317115.mp3
[WARN] No transcript found for: common_voice_en_22946937.mp3
[WARN] No transcript found for: common_voice_en_19388570.mp3
[WARN] No transcript found for: common_voice_en_21862091.mp3
[WARN] No transcript found for: common_voice_en_125779.mp3
[WARN] No transcript found for: common_voice_en_27284972.mp3
[WARN] No transcript 

## Creating Unlabaled dataset

In [10]:
def generate_pseudo_labels(df_unlabeled, model_frozen, feature_extractor, tokenizer, device, base_dirs):
    pseudo_records = []

    for path in df_unlabeled["path"].tolist():
        full_path = None
        for d in base_dirs:
            candidate = os.path.join(d, path)
            if os.path.exists(candidate):
                full_path = candidate
                break

        if full_path is None:
            if os.path.exists(path):
                full_path = path
            else:
                print(f"[WARN] audio file not found for: {path}")
                continue

        audio, sr = torchaudio.load(full_path)
        if sr != 16000:
            audio = torchaudio.functional.resample(audio, sr, 16000)

        audio_np = audio.squeeze(0).numpy()

        inputs = feature_extractor(
            audio_np,
            sampling_rate=16000,
            return_tensors="pt"
        )
        input_features = inputs["input_features"].to(device)

        with torch.no_grad():
            outputs = model_frozen.generate(
                input_features,
                output_scores=True,
                return_dict_in_generate=True
            )
            generated_ids = outputs.sequences

        transcription = tokenizer.batch_decode(
            generated_ids,
            skip_special_tokens=True
        )[0].lower().strip()

        pseudo_records.append({"path": path, "sentence": transcription})

    df_pseudo = pd.DataFrame(pseudo_records)
    df_pseudo["source"] = "pseudo"

    return df_pseudo

In [11]:
model_frozen = WhisperForConditionalGeneration.from_pretrained('openai/whisper-small')
model_frozen.to(device)
accent = ['India and South Asia (India, Pakistan, Sri Lanka)']
for i in accent:
    print(i)
    df = pd.read_csv('/content/drive/MyDrive/data/final_train.csv')
    df_train = pd.read_csv(f'/content/drive/MyDrive/data/{i}/train.csv')
    df_unlabeled = df_train

    base_dirs = ['/content/drive/MyDrive/data/train_data']

    pseudo_labels_df = generate_pseudo_labels(df_unlabeled, model_frozen, feature_extractor, tokenizer, device, base_dirs)

    pseudo_labels_df.to_csv(f'/content/drive/MyDrive/data/{i}/df_pseudo_training.csv', index=False)

    df_pseudo_training = pd.read_csv(f'/content/drive/MyDrive/data/{i}/df_pseudo_training.csv')


The following generation flags are not valid and may be ignored: ['output_scores']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.


India and South Asia (India, Pakistan, Sri Lanka)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [13]:
model_config_for_len = WhisperForConditionalGeneration.from_pretrained('openai/whisper-small')
max_label_length = model_config_for_len.config.max_target_positions
del model_config_for_len
accent = ['India and South Asia (India, Pakistan, Sri Lanka)']
for i in accent:
    print(i)
    df_pseudo_training = pd.read_csv(f'/content/drive/MyDrive/data/{i}/df_pseudo_training.csv')
    train_dataset = data_preparation(f'/content/drive/MyDrive/data/{i}/df_pseudo_training.csv', train_audio_list, feature_extractor, tokenizer, base_audio_dir='/content/drive/MyDrive/data/train_data', max_label_length=max_label_length)
    torch.save(train_dataset, f'/content/drive/MyDrive/data/{i}/df_pseudo_training.pt')

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
[WARN] No transcript found for: common_voice_en_39575170.mp3
[WARN] No transcript found for: common_voice_en_531143.mp3
[WARN] No transcript found for: common_voice_en_17271350.mp3
[WARN] No transcript found for: common_voice_en_21670469.mp3
[WARN] No transcript found for: common_voice_en_16048127.mp3
[WARN] No transcript found for: common_voice_en_18488099.mp3
[WARN] No transcript found for: common_voice_en_19291064.mp3
[WARN] No transcript found for: common_voice_en_14871.mp3
[WARN] No transcript found for: common_voice_en_22371871.mp3
[WARN] No transcript found for: common_voice_en_317115.mp3
[WARN] No transcript found for: common_voice_en_22946937.mp3
[WARN] No transcript found for: common_voice_en_19388570.mp3
[WARN] No transcript found for: common_voice_en_21862091.mp3
[WARN] No transcript found for: common_voice_en_125779.mp3
[WARN] No transcript found for: common_voice_en_27284972.mp3
[WARN] No transcript 