In [1]:
from google.colab import drive
import os
drive.mount('/content/drive')
ROOT_FOLDER = '/content/drive/My Drive/Project/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install ffmpeg
!apt-get update
!apt-get install -y ffmpeg

# # Verify installation
# !ffmpeg -version
!pip install evaluate
!pip install jiwer
!pip install torchcodec

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (91.189.92.23)] [Connecting to security.ub                                                                               Get:2 https://cli.github.com/packages stable InRelease [3,917 B]
0% [Connecting to archive.ubuntu.com (91.189.92.23)] [Connecting to security.ub                                                                               Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,556 kB]
Hit:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:10 https://ppa.launc

In [3]:
!pip install --upgrade transformers accelerate



In [3]:
from transformers import WhisperForConditionalGeneration
from transformers import WhisperProcessor
import os
import pandas as pd
import json
from datasets import Dataset, Audio
from typing import Any, List, Dict, Union
import torch
import jiwer

from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

import evaluate

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small",
    language="English",      # sets English as decoding language
    task="transcribe",        # task can be 'transcribe' or 'translate'
    feature_extractor_type="whisper",  # Explicitly set feature extractor type
    feature_size=80 # Set feature size to 80
)
model.generation_config.task = 'transcribe'
model.generation_config.forced_decoder_ids = None
# from peft import LoraConfig, PeftModel

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
from datasets import load_from_disk
data_set_path = os.path.join(ROOT_FOLDER, "data", "data_sets",'prepared_val')
val_dataset = load_from_disk(data_set_path)

In [5]:
from datasets import load_from_disk, concatenate_datasets
base_dir = os.path.join(ROOT_FOLDER, "data", "data_sets")

parts = []
n_parts = 20

for i in range(1, n_parts + 1):
    data_set_path = os.path.join(base_dir, f"prepared_train_part{i}")
    if not os.path.isdir(data_set_path):
        print(f"Warning: missing directory {data_set_path}")
        continue
    ds = load_from_disk(data_set_path)
    parts.append(ds)
    print(f"Loaded part {i}: {data_set_path} with {len(ds)} rows")

# Optionally concatenate into a single dataset
if parts:
    train_dataset = concatenate_datasets(parts)
    print(f"Concatenated dataset rows: {len(train_dataset)}")
else:
    train_dataset = None
    print("No parts loaded.")


Loaded part 1: /content/drive/My Drive/Project/data/data_sets/prepared_train_part1 with 124 rows
Loaded part 2: /content/drive/My Drive/Project/data/data_sets/prepared_train_part2 with 124 rows
Loaded part 3: /content/drive/My Drive/Project/data/data_sets/prepared_train_part3 with 124 rows
Loaded part 4: /content/drive/My Drive/Project/data/data_sets/prepared_train_part4 with 124 rows
Loaded part 5: /content/drive/My Drive/Project/data/data_sets/prepared_train_part5 with 124 rows
Loaded part 6: /content/drive/My Drive/Project/data/data_sets/prepared_train_part6 with 124 rows
Loaded part 7: /content/drive/My Drive/Project/data/data_sets/prepared_train_part7 with 124 rows
Loaded part 8: /content/drive/My Drive/Project/data/data_sets/prepared_train_part8 with 124 rows
Loaded part 9: /content/drive/My Drive/Project/data/data_sets/prepared_train_part9 with 124 rows
Loaded part 10: /content/drive/My Drive/Project/data/data_sets/prepared_train_part10 with 124 rows
Loaded part 11: /content/dri

In [6]:
class DataCollatorSpeechSeq2SeqWithPadding:

    def __init__(self, processor: Any, decoder_start_token_id: int):
        self.processor = processor
        self.decoder_start_token_id = decoder_start_token_id

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Use processor.feature_extractor to extract features
        input_features = [{'input_features': self.processor.feature_extractor(feature['path']['array'], sampling_rate=feature['path']['sampling_rate']).input_features[0]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors='pt')

        label_features = [{'input_ids': feature['labels'][0:448]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors='pt')

        labels = labels_batch['input_ids'].masked_fill(labels_batch.attention_mask.ne(1), -100)

        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch['labels'] = labels

        return batch


data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

# metric = evaluate.load('wer')

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Concatenate for MICRO / WEIGHTED WER
    concatenated_pred = " ".join(pred_str)
    concatenated_ref = " ".join(label_str)

    weighted_wer = 100 * jiwer.wer(
        concatenated_ref,
        concatenated_pred
    )

    return {'wer': weighted_wer}

# def compute_metrics(pred):
#     pred_ids = pred.predictions
#     label_ids = pred.label_ids

#     # Replace -100 with pad token
#     label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

#     # Decode
#     pred_str = processor.tokenizer.batch_decode(
#         pred_ids, skip_special_tokens=True
#     )
#     label_str = processor.tokenizer.batch_decode(
#         label_ids, skip_special_tokens=True
#     )

#     total_edits = 0
#     total_ref_words = 0

#     for hyp, ref in zip(pred_str, label_str):
#         hyp_words = hyp.strip().split()
#         ref_words = ref.strip().split()

#         # Skip empty references to avoid divide-by-zero
#         if len(ref_words) == 0:
#             continue

#         # Compute word-level edit distance
#         measures = jiwer.compute_measures(ref, hyp)

#         total_edits += measures["substitutions"] \
#                      + measures["deletions"] \
#                      + measures["insertions"]

#         total_ref_words += len(ref_words)

#     weighted_wer = 100.0 * total_edits / total_ref_words

#     return {
#         "wer": weighted_wer
#     }


In [7]:
#parameters
out_dir = os.path.join(ROOT_FOLDER, 'finetuned_model_files', 'Small_full_training')
batch_size = 16
gradient_accum = 1
epochs = 45
dataloader_num = 16


#save/eval strategy at every 3 epoch
step_size = int(train_dataset.shape[0]/(batch_size*gradient_accum) + 1)*3



In [8]:
training_args = Seq2SeqTrainingArguments(
    output_dir=out_dir,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps= gradient_accum,
    learning_rate=1*10**-6,
    # warmup_steps=300,

    #add regularization
    warmup_ratio=0.1,
    weight_decay=0.01,
    lr_scheduler_type="constant",

    bf16=False,
    fp16=True,
    num_train_epochs=epochs,
    logging_strategy='epoch',
    #save_strategy='epoch',
    predict_with_generate=True,
    generation_max_length=448,
    report_to=['tensorboard'],
    load_best_model_at_end=False,
    metric_for_best_model='wer',
    greater_is_better=False,
    dataloader_num_workers= dataloader_num,
    save_total_limit=4,
    seed=42,
    data_seed=42,
    remove_unused_columns=False, # Add this line
    eval_strategy="steps",
    eval_steps=step_size,
    save_strategy='steps',
    save_steps=step_size

)

In [9]:
# from peft import LoraConfig, get_peft_model
# lora_config = LoraConfig(
#     r=64,             # Rank
#     lora_alpha=128,   # Scaling
#     target_modules= ["q_proj","k_proj","v_proj","out_proj"],  # Attention projection modules (important for Whisper)
#     lora_dropout=0.1,
#     bias="none",
#     # task_type="SEQ_2_SEQ_LM"
# )

# model = get_peft_model(model, lora_config)
# model.print_trainable_parameters()   # Debug: shows % of params tuned


for param in model.parameters():
    param.requires_grad = True

trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"Trainable: {trainable_params}")
print(f"Total: {total_params}")


Trainable: 241734912
Total: 241734912


In [10]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

  trainer = Seq2SeqTrainer(


In [11]:
trainer.train()

You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenize

Step,Training Loss,Validation Loss,Wer
465,1.3098,1.129399,185.231774
930,0.9098,0.967227,83.93217
1395,0.7395,0.900942,84.759191
1860,0.6172,0.866672,87.837932
2325,0.5171,0.854393,79.658072
2790,0.4281,0.854228,50.441309
3255,0.3469,0.851015,130.71096
3720,0.2682,0.869292,112.134269
4185,0.2131,0.894555,110.174439
4650,0.1695,0.922345,82.639516


You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenize

KeyboardInterrupt: 


Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/jax/_src/lib/__init__.py", line 127, in _xla_gc_callback
    def _xla_gc_callback(*args):


In [None]:
trainer.train(
    resume_from_checkpoint=os.path.join(out_dir, "checkpoint-4650")
)


You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenize

Step,Training Loss,Validation Loss,Wer
5115,0.0515,0.957288,23.406769
5580,0.0494,0.960312,22.982834
6045,0.0489,0.951249,23.865453
6510,0.0486,0.952604,23.330322
6975,0.0506,0.973605,22.920286


You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenize

TrainOutput(global_step=6975, training_loss=0.016593539330267137, metrics={'train_runtime': 1473.7368, 'train_samples_per_second': 75.665, 'train_steps_per_second': 4.733, 'total_flos': 3.44532132274176e+19, 'train_loss': 0.016593539330267137, 'epoch': 45.0})

In [None]:
trainer.save_model(os.path.join(out_dir, "best_model"))

#to get latest checkpoint use , go to latest check_point
# To find the actual best checkpoint, you need to look at the trainer_state.json file generated in your output directory (or inside any checkpoint-XXX folder).
# Open output_dir/checkpoint-XXX/trainer_state.json.
