## Creating Testing Dataset for Baseline Whisper

In [1]:
!pip install jiwer openai-whisper torchcodec tqdm
from transformers import WhisperForConditionalGeneration, WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor
from transformers import AutoFeatureExtractor, WhisperModel
from transformers import LlamaTokenizer
from datasets import load_dataset
import torch, torchaudio
from torch import nn
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from jiwer import wer as calculate_wer
import pickle
from datasets import Dataset, Audio, Value
import os, random
from typing import Optional
from whisper.normalizers import EnglishTextNormalizer
import math
from sentencepiece import SentencePieceProcessor, SentencePieceTrainer
from pathlib import Path
import whisper
import copy, heapq
import pandas as pd
import torch.nn.functional as F
from tqdm import tqdm
import gc


Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting torchcodec
  Downloading torchcodec-0.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading torchcodec-0.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m80.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.14.3-cp312-cp31

In [2]:
from google.colab import drive
from google.colab import userdata

# Export the Colab secret as an environment variable so Hugging Face libraries
# can pick it up. Previously the token was fetched and immediately discarded,
# which made the call a no-op. `os` is imported in the first cell.
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

# Mount Google Drive; every data path used below lives under /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def data_preparation(csv_path, file_list_path, feature_extractor, tokenizer=None, base_audio_dir="", max_label_length=None):
    """Build an in-memory list of feature/transcript pairs for a list of audio files.

    Args:
        csv_path: CSV file with at least a ``path`` and a ``sentence`` column.
        file_list_path: Text file with one audio path per line (blank lines are
            ignored). Each path is matched against the CSV ``path`` column.
        feature_extractor: Callable invoked as
            ``feature_extractor(array, sampling_rate=16000, return_tensors="pt")``
            whose result is indexed with ``"input_features"``.
        tokenizer: Optional tokenizer. When given, the transcript is tokenized
            and the first row of ``input_ids`` is stored under ``"labels"``.
        base_audio_dir: Directory joined in front of every listed path.
        max_label_length: Forwarded to the tokenizer as ``max_length``.

    Returns:
        A list of dicts with keys ``"input_features"`` and ``"text"`` (lower-cased,
        stripped transcript), plus ``"labels"`` when a tokenizer is supplied.
        Files without a transcript are reported with a warning and skipped.
    """
    target_sr = 16000

    df_text = pd.read_csv(csv_path)
    # One-time path -> sentence lookup instead of scanning the whole frame for
    # every file. keep="first" preserves the original "first matching row wins".
    transcripts = (
        df_text.drop_duplicates(subset="path", keep="first")
        .set_index("path")["sentence"]
        .to_dict()
    )

    with open(file_list_path, "r") as f:
        file_paths = [line.strip() for line in f if line.strip()]

    total = len(file_paths)
    # Report roughly once per percent so large file lists do not flood the output.
    report_every = max(1, total // 100)

    dataset = []
    for idx, file_path in enumerate(file_paths, start=1):
        if file_path not in transcripts:
            print(f"[WARN] No transcript found for: {file_path}")
            continue
        text = str(transcripts[file_path]).lower().strip()

        full_audio_path = os.path.join(base_audio_dir, file_path)

        audio, sr = torchaudio.load(full_audio_path)
        if sr != target_sr:
            audio = torchaudio.functional.resample(audio, sr, target_sr)
        # torchaudio returns (channels, frames). Downmix multi-channel audio so
        # the extractor always receives a 1-D waveform; squeeze(0) alone leaves
        # a stereo clip two-dimensional.
        if audio.dim() > 1 and audio.shape[0] > 1:
            audio = audio.mean(dim=0, keepdim=True)

        feats = feature_extractor(
            audio.squeeze(0).numpy(),
            sampling_rate=target_sr,
            return_tensors="pt"
        )["input_features"]

        item = {"input_features": feats, "text": text}

        if tokenizer is not None:
            labels = tokenizer(
                text,
                return_tensors="pt",
                padding="longest",
                truncation=True,
                max_length=max_label_length
            )["input_ids"][0]
            item["labels"] = labels

        dataset.append(item)

        if idx % report_every == 0 or idx == total:
            print(f"Progress: {idx}/{total} ({idx / total * 100:.1f}%)")

    return dataset

In [3]:
# Load the three Whisper front-end components from the same checkpoint, all
# configured for English transcription.
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    'openai/whisper-tiny',
    language='en',
    task='transcribe',
)
tokenizer = WhisperTokenizer.from_pretrained(
    'openai/whisper-tiny',
    language='en',
    task='transcribe',
)
processor = WhisperProcessor.from_pretrained(
    'openai/whisper-tiny',
    language='en',
    task='transcribe',
)

# Run on the GPU when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Text normalizer from the openai-whisper package (whisper.normalizers).
normalizer = EnglishTextNormalizer()

preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

In [4]:
from transformers import WhisperConfig

# Locations of the held-out test transcripts and the list of test audio files.
test_data_csv = '/content/drive/MyDrive/data/final_test.csv'
test_audio_list = '/content/drive/MyDrive/data/test_files.txt'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Only the decoder's maximum target length is needed here, so read it from the
# checkpoint's config instead of instantiating the full model and deleting it.
max_label_length = WhisperConfig.from_pretrained('openai/whisper-tiny').max_target_positions



cuda


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

## Creating Unlabeled Dataset

In [5]:
def _greedy_decode(model, encoder_hidden, prompt_ids, eos_token_id, max_new_tokens, device):
    """Greedy-decode from precomputed encoder states; returns a (1, T) id tensor that starts with the prompt."""
    decoder_input_ids = torch.tensor([prompt_ids], device=device)
    for _ in range(max_new_tokens):
        outputs = model(
            encoder_outputs=(encoder_hidden,),
            decoder_input_ids=decoder_input_ids,
            # Attention maps and the KV cache are never read here, so do not
            # ask the model to return them on every step.
            output_attentions=False,
            use_cache=False,
            return_dict=True
        )
        # Softmax is monotonic, so the argmax of the raw logits is the greedy token.
        next_token = outputs.logits[0, -1, :].argmax(dim=-1).reshape(1, 1)
        decoder_input_ids = torch.cat((decoder_input_ids, next_token), dim=-1)
        if next_token.item() == eos_token_id:
            break
    return decoder_input_ids


def _remove_temp_files(paths):
    """Delete every path in `paths` that still exists."""
    for path in paths:
        if os.path.exists(path):
            os.remove(path)


def prepare_pseudo_dataset_chunked(model, raw_dataset, tokenizer, feature_extractor, device, num_chunks=4,
                                   output_dir='/content/drive/MyDrive/data/pseudo', max_new_tokens=100):
    """
    Generate greedy pseudo-labels for `raw_dataset`, processing it in chunks to save RAM.

    Each chunk is written to `<output_dir>/temp_chunk_<i>.pt`. After the last chunk
    the temporary files are loaded back, merged, saved as
    `<output_dir>/pseudo_data.pt`, and then deleted.

    Args:
        model: Whisper-style seq2seq model exposing `model.model.encoder` and
            accepting `encoder_outputs` / `decoder_input_ids`. It is switched to
            eval mode and `model.config.output_attentions` is set to True.
        raw_dataset: Sliceable sequence of items with `item['audio']['array']`
            and an optional `'text'` entry. NOTE(review): slicing must yield a
            sequence of items (e.g. a plain list) — confirm for other containers.
        tokenizer: Only `tokenizer.eos_token_id` is read, to stop decoding.
        feature_extractor: Called as
            `feature_extractor(audio, sampling_rate=16000, return_tensors="pt")`
            and expected to expose `.input_features`.
        device: Device the features and decoder inputs are moved to.
        num_chunks: Number of chunks to split the dataset into (values < 1 are
            treated as 1).
        output_dir: Directory for temporary and final files; created if missing.
        max_new_tokens: Upper bound on generated tokens per sample.

    Returns:
        List of dicts with `'input_features'`, `'pseudo_label_ids'` (prompt,
        generated tokens and the EOS token when reached) and `'text'`; an empty
        list when no sample could be processed. Empty clips and samples that
        raise are skipped, and the counts are printed per chunk.
    """
    model.eval()
    # Kept for backward compatibility: callers may rely on this flag being set
    # on the model afterwards. The decoding loop itself does not need attentions.
    model.config.output_attentions = True

    # Decoder prompt. Presumably <|startoftranscript|><|en|><|transcribe|><|notimestamps|>
    # for the multilingual Whisper vocabulary — TODO confirm against the tokenizer.
    prompt_ids = [50258, 50259, 50359, 50363]

    os.makedirs(output_dir, exist_ok=True)

    # Calculate chunk size
    total_size = len(raw_dataset)
    num_chunks = max(1, int(num_chunks))
    chunk_size = (total_size + num_chunks - 1) // num_chunks  # Ceiling division

    temp_files = []

    print(f" Phase 1: Generating Pseudo-labels (Split into {num_chunks} chunks)")

    # --- MAIN CHUNK LOOP ---
    for chunk_idx in range(num_chunks):
        start = chunk_idx * chunk_size
        end = min(start + chunk_size, total_size)

        if start >= total_size:
            break

        print(f"\nProcessing Chunk {chunk_idx + 1}/{num_chunks} (Samples {start} to {end})...")

        # Slice the raw dataset
        current_chunk_raw = raw_dataset[start:end]
        chunk_processed_data = []
        n_empty = 0
        n_failed = 0
        first_errors = []

        for item in tqdm(current_chunk_raw):
            try:
                # --- SAFE AUDIO LOADING ---
                audio_data = item['audio']['array']
                if isinstance(audio_data, list):
                    audio_data = np.array(audio_data)
                if len(audio_data) == 0:
                    n_empty += 1
                    continue

                # NOTE(review): the clip is assumed to already be sampled at 16 kHz.
                inputs = feature_extractor(audio_data, sampling_rate=16000, return_tensors="pt")
                mel = inputs.input_features.to(device)

                with torch.no_grad():
                    hidden_states = model.model.encoder(mel).last_hidden_state
                    decoder_input_ids = _greedy_decode(
                        model, hidden_states, prompt_ids,
                        tokenizer.eos_token_id, max_new_tokens, device
                    )

                chunk_processed_data.append({
                    'input_features': mel.squeeze(0).detach().cpu(),
                    'pseudo_label_ids': decoder_input_ids.squeeze(0).detach().cpu(),
                    'text': item.get('text', '')
                })

            except Exception as e:
                # Best effort: one bad sample must not abort the run, but it is
                # counted and reported instead of vanishing silently.
                n_failed += 1
                if len(first_errors) < 3:
                    first_errors.append(repr(e))
                continue

        if n_empty or n_failed:
            print(f"[WARN] Chunk {chunk_idx + 1}: skipped {n_empty} empty and {n_failed} failed samples")
            for message in first_errors:
                print(f"[WARN]   {message}")

        # --- SAVE CHUNK TO DISK ---
        temp_filename = os.path.join(output_dir, f"temp_chunk_{chunk_idx}.pt")
        print(f"Saving temporary chunk to {temp_filename}...")
        torch.save(chunk_processed_data, temp_filename)
        temp_files.append(temp_filename)

        # --- CLEAR RAM ---
        del chunk_processed_data
        del current_chunk_raw
        torch.cuda.empty_cache()
        gc.collect()

    # --- MERGE ---
    print("\nProcessing complete. Merging chunks...")
    full_dataset = []

    for f in temp_files:
        print(f"Loading {f}...")
        # weights_only=False unpickles arbitrary objects; acceptable here only
        # because these files were written by this function moments ago.
        full_dataset.extend(torch.load(f, weights_only=False))

    if len(full_dataset) == 0:
        print("WARNING: Dataset empty.")
        _remove_temp_files(temp_files)
        return []

    print(f"Total samples processed: {len(full_dataset)}")

    # Save the merged dataset before removing the per-chunk files.
    save_path_full = os.path.join(output_dir, "pseudo_data.pt")
    print(f"Saving full dataset to {save_path_full}")
    torch.save(full_dataset, save_path_full)

    # Cleanup temp files
    _remove_temp_files(temp_files)

    return full_dataset

In [9]:
# Baseline Whisper used only to produce pseudo-labels; `.to()` returns the same
# module, so the load and the device move are chained.
model_frozen = WhisperForConditionalGeneration.from_pretrained(
    'openai/whisper-tiny',
    attn_implementation='eager',
).to(device)

# Training samples previously serialized with torch.save. weights_only=False
# unpickles arbitrary objects, so only load files you created yourself.
df = torch.load(
    '/content/drive/MyDrive/data/train_all.pt',
    weights_only=False,
)

pseudo_labels_df = prepare_pseudo_dataset_chunked(
    model=model_frozen,
    raw_dataset=df,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    device=device,
)

 Phase 1: Generating Pseudo-labels (Split into 4 chunks)

Processing Chunk 1/4 (Samples 0 to 2000)...


100%|██████████| 2000/2000 [04:42<00:00,  7.07it/s]


Saving temporary chunk to /content/drive/MyDrive/data/pseudo//temp_chunk_0.pt...

Processing Chunk 2/4 (Samples 2000 to 4000)...


100%|██████████| 2000/2000 [04:33<00:00,  7.31it/s]


Saving temporary chunk to /content/drive/MyDrive/data/pseudo//temp_chunk_1.pt...

Processing Chunk 3/4 (Samples 4000 to 6000)...


100%|██████████| 2000/2000 [04:32<00:00,  7.35it/s]


Saving temporary chunk to /content/drive/MyDrive/data/pseudo//temp_chunk_2.pt...

Processing Chunk 4/4 (Samples 6000 to 8000)...


100%|██████████| 2000/2000 [04:44<00:00,  7.04it/s]


Saving temporary chunk to /content/drive/MyDrive/data/pseudo//temp_chunk_3.pt...

Processing complete. Merging chunks...
Loading /content/drive/MyDrive/data/pseudo//temp_chunk_0.pt...
Loading /content/drive/MyDrive/data/pseudo//temp_chunk_1.pt...
Loading /content/drive/MyDrive/data/pseudo//temp_chunk_2.pt...
Loading /content/drive/MyDrive/data/pseudo//temp_chunk_3.pt...
Total samples processed: 8000
Saving full dataset to /content/drive/MyDrive/data/pseudo/pseudo_data.pt
