In [1]:
import os, signal
import whisper
import torch
import gc
from datasets import load_dataset, DatasetDict, Audio
from transformers import (
    WhisperFeatureExtractor,
    DataCollatorForSeq2Seq,
    WhisperTokenizer,
    WhisperProcessor,
    AutoModelForCausalLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    pipeline,
    SequenceFeatureExtractor,
)
from torch.utils.data import DataLoader
from dataclasses import dataclass
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from typing import Any, Dict, List, Union
import evaluate
from torch.nn import DataParallel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# The Hub access token is read from the environment instead of being embedded in
# the notebook.  When HF_TOKEN is unset, `token=None` is passed and `datasets`
# uses its default credential lookup (presumably a cached Hub login — confirm
# for the installed version).
# NOTE: any token that was ever committed in this notebook should be revoked.
HF_TOKEN = os.environ.get("HF_TOKEN")

data = DatasetDict()

# Only the train and test splits are loaded; add "validation" to this tuple if a
# separate validation split is needed.
for split_name in ("train", "test"):
    data[split_name] = load_dataset(
        "mozilla-foundation/common_voice_8_0",
        "sw",
        token=HF_TOKEN,
        split=split_name,
        trust_remote_code=True,
    )

In [3]:
data

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 19606
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 8941
    })
})

In [4]:
# Drop the speaker/clip metadata from every split; the remaining columns are
# "audio" and "sentence".
data = data.remove_columns(
    column_names=[
        # speaker attributes
        "accent", "age", "client_id",
        # vote counts and clip bookkeeping
        "down_votes", "gender", "path",
        "locale", "segment", "up_votes",
    ]
)

In [5]:
data

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 19606
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 8941
    })
})

In [6]:
data["train"].features

{'audio': Audio(sampling_rate=48000, mono=True, decode=True, id=None),
 'sentence': Value(dtype='string', id=None)}

In [7]:
# Re-declare the audio column with a 16 kHz sampling rate (the column was
# declared at 48 kHz), then display the train split's schema to confirm.
data = data.cast_column(column="audio", feature=Audio(sampling_rate=16000))
data["train"].features

{'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
 'sentence': Value(dtype='string', id=None)}

In [8]:
# Checkpoint shared by the feature extractor, tokenizer and processor below.
# NOTE: despite the variable name, this is the *medium* Whisper checkpoint.
large_model_name = "openai/whisper-medium"

# Language/task settings passed to both the tokenizer and the processor.
language_task_kwargs = {"language": "Swahili", "task": "transcribe"}

# Feature extractor for the audio side.
feature_extractor = WhisperFeatureExtractor.from_pretrained(large_model_name)

# Tokenizer for the text side; left padding is requested at load time and then
# set again explicitly on the instance.
tokenizer = WhisperTokenizer.from_pretrained(
    large_model_name, padding_side="left", **language_task_kwargs
)
tokenizer.padding_side = "left"

# Processor built from the same checkpoint and language/task settings.
processor = WhisperProcessor.from_pretrained(large_model_name, **language_task_kwargs)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# Sanity check: encode the first training sentence and decode it again, once
# dropping special tokens and once keeping them.
input_str = data["train"][0]["sentence"]
labels = tokenizer(input_str)["input_ids"]
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)

In [10]:
# Report the round-trip result, one item per line.
print(
    f"Input:                 {input_str}",
    f"Decoded w/ special:    {decoded_with_special}",
    f"Decoded w/out special: {decoded_str}",
    f"Are equal:             {input_str == decoded_str}",
    sep="\n",
)

Input:                 Ilhali walipata chakula kidogo mno, wengi walikuwa wagonjwa na kuuawa baadaye.
Decoded w/ special:    <|startoftranscript|><|sw|><|transcribe|><|notimestamps|>Ilhali walipata chakula kidogo mno, wengi walikuwa wagonjwa na kuuawa baadaye.<|endoftext|>
Decoded w/out special: Ilhali walipata chakula kidogo mno, wengi walikuwa wagonjwa na kuuawa baadaye.
Are equal:             True


In [11]:
def data_preprocessing(batch):
    """Add model inputs and targets to a single dataset example.

    Args:
        batch: One example holding an "audio" entry (with "array" and
            "sampling_rate") and a "sentence" string.

    Returns:
        The same mapping, updated in place with three extra keys:
        "input_features" (first element of the feature extractor's
        ``input_features`` for the clip) and "input_ids" / "labels" (the
        tokenizer's ids for the sentence, stored as two separate lists).
    """
    # The clip is used as provided; no resampling is performed in this function.
    audio = batch["audio"]

    # Features computed from the raw waveform at its declared sampling rate.
    batch["input_features"] = feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_features[0]

    # Tokenize the transcript once and reuse the ids for both fields.
    token_ids = tokenizer(batch["sentence"]).input_ids
    batch["input_ids"] = token_ids
    # Copy so "labels" and "input_ids" never alias the same list object.
    batch["labels"] = list(token_ids)
    # NOTE(review): "input_ids" carries exactly the target ids. If the model
    # receives both fields it can see its own targets — confirm this is intended.

    return batch

In [12]:
# Try the preprocessing on the first training example before mapping it over
# the whole dataset.
output = data_preprocessing(batch=data["train"][0])

In [13]:
# Run the preprocessing over every split and drop the raw audio/text columns
# once the derived features exist.
data = data.map(
    function=data_preprocessing,
    remove_columns=["audio", "sentence"],
    num_proc=None,
)

In [14]:
output

{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/bdfd92e203ffb325b8ffd8c7a4b9a5b132bc12c50e5dfb2e59d99b855edddec0/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_28660672.mp3',
  'array': array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         1.24327983e-07, 1.93533054e-07, 1.68759641e-07]),
  'sampling_rate': 16000},
 'sentence': 'Ilhali walipata chakula kidogo mno, wengi walikuwa wagonjwa na kuuawa baadaye.',
 'input_features': array([[-0.56316686, -0.56316686, -0.56316686, ..., -0.56316686,
         -0.56316686, -0.56316686],
        [-0.56316686, -0.56316686, -0.56316686, ..., -0.56316686,
         -0.56316686, -0.56316686],
        [-0.56316686, -0.56316686, -0.56316686, ..., -0.56316686,
         -0.56316686, -0.56316686],
        ...,
        [-0.56316686, -0.56316686, -0.56316686, ..., -0.56316686,
         -0.56316686, -0.56316686],
        [-0.56316686, -0.56316686, -0.56316686, ..., -0.56316686,
         -0.56316686, -0.56316686],
   

In [15]:
# Collator that pads each batch using the tokenizer.
# The `model` argument is intentionally omitted: it expects a model object, and
# passing the checkpoint *name* (a plain string) has no useful effect.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

In [16]:
# Wrap a single split: `data` is a DatasetDict keyed by split name, so it cannot
# be indexed by integer position the way DataLoader's default sampler requires.
dataloader = DataLoader(data["train"], collate_fn=DataCollatorForSeq2Seq(tokenizer))

In [17]:
# training_args = Seq2SeqTrainingArguments(
#     output_dir="./whisper-small-hi", # Change to a repo name of your choice
#     per_device_train_batch_size=16, # Adjust based on your GPU memory
#     gradient_accumulation_steps=1, # Increase by 2x for every 2x decrease in batch size
#     learning_rate=1e-5, # Start with a lower learning rate for fine-tuning
#     #warmup_steps=500, # Adjust based on your training schedule
#     max_steps=5000, # Increase for more training steps
#     num_train_epochs=3, # Specify the number of epochs for training
#     gradient_checkpointing=True, # Use gradient checkpointing for memory efficiency
#     fp16=True, # Use mixed precision training for faster training
#     evaluation_strategy="steps", # Evaluate model at regular intervals
#     per_device_eval_batch_size=8, # Adjust based on your GPU memory
#     predict_with_generate=True, # Use generate method for prediction
#     generation_max_length=225, # Set the maximum length for generated sequences
#     save_steps=1000, # Save model checkpoints every 1000 steps
#     eval_steps=1000, # Evaluate model every 1000 steps
#     logging_steps=25, # Log training progress every 25 steps
#     report_to=["tensorboard"], # Use TensorBoard for logging
#     load_best_model_at_end=True, # Load the best model at the end of training
#     metric_for_best_model="wer", # Use Word Error Rate (WER) as the metric for best model
#     greater_is_better=False, # Since lower WER is better
#     #push_to_hub=True, # Push model checkpoints to Hugging Face Hub
# )

training_args = Seq2SeqTrainingArguments(
    # --- output / checkpointing ---
    output_dir="./fine-tuned-model",
    save_steps=500,
    # push_to_hub=True,
    # --- optimisation ---
    num_train_epochs=2,  # adjust to the desired number of epochs
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=2,
    fp16=True,
    # gradient_checkpointing=True,
    # --- evaluation (eval_steps is not set, so the library default applies) ---
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
)

In [18]:
# Word error rate metric, used by compute_metrics during evaluation.
metric = evaluate.load(path="wer")

In [19]:
def compute_metrics(pred):
    """Compute the word error rate, as a percentage, for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (predicted token ids) and
            ``label_ids`` (reference token ids, where -100 marks positions
            to ignore).  ``label_ids`` is left unmodified.

    Returns:
        dict: ``{"wer": value}`` where ``value`` is 100 times the score
        returned by the module-level ``metric``.
    """
    pred_ids = pred.predictions
    # Presumably some configurations return a tuple whose first element holds
    # the ids; unwrap it so decoding sees token ids either way.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    # Work on a copy (tensor `.clone()` or array `.copy()`) so the caller's
    # labels keep their -100 markers.
    label_ids = pred.label_ids
    label_ids = label_ids.clone() if hasattr(label_ids, "clone") else label_ids.copy()

    # Swap the -100 ignore markers for the pad token id so the ids can be decoded.
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [20]:
torch.cuda.is_available()

True

## Implementing LoRA for more efficient fine-tuning of Whisper

**LoRA** (Low-Rank Adaptation) is a training technique that reduces the number of trainable parameters.
It inserts a small number of new weights into the model, and only these are trained.

This makes training with LoRA much faster, memory-efficient, and produces smaller model weights which are easier to store and share.

LoRA can also be combined with other training techniques, such as **DreamBooth**, to speed up the training process.

**Note**: Look into DreamBooth as well to see how training can be made faster.


In [21]:
#model

In [None]:
class ClearCache:
    """Context manager that calls ``torch.cuda.empty_cache()`` on entry and exit.

    ``__enter__`` returns ``None``, so ``with ClearCache() as x`` binds ``None``.
    ``__exit__`` also returns ``None``, so exceptions raised inside the ``with``
    body propagate to the caller.
    """

    @staticmethod
    def _release_cached_memory():
        # Single place where the cache is emptied; used by both hooks below.
        torch.cuda.empty_cache()

    def __enter__(self):
        self._release_cached_memory()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._release_cached_memory()

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
def make_inputs_require_grad(module, input, output):
    """Forward-hook callback that switches on gradient tracking for ``output``.

    The signature has the (module, input, output) shape of a forward hook;
    ``module`` and ``input`` are accepted but not used.  Returns ``None``.
    """
    output.requires_grad_(True)

In [22]:
# Use the context manager
# Run model construction and training inside ClearCache so the CUDA cache is
# emptied before the run starts and again when it ends (or fails).
with ClearCache():
    # NOTE(review): the training log reports "A decoder-only architecture is being
    # used", and data_preprocessing feeds the target ids as `input_ids`.  The very
    # low WER may therefore reflect the text being visible to the model rather than
    # transcription of audio — confirm whether an encoder-decoder Whisper class fed
    # with `input_features` was intended.
    model = AutoModelForCausalLM.from_pretrained(large_model_name, use_cache=False).to(
        device
    )

    model = prepare_model_for_kbit_training(model)

    # Attach the hook to the input-embedding module, which is actually called in
    # the forward pass.  `model.model.decoder.layers` is only a container of
    # layers (presumably an nn.ModuleList) that is iterated rather than called,
    # so a forward hook registered on it never fires.
    model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

    # LoRA adapters on the attention query/value projections.
    lora_config = LoraConfig(
        r=32,
        lora_alpha=64,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    trainer = Seq2SeqTrainer(
        args=training_args,
        model=model,
        train_dataset=data["train"],
        eval_dataset=data["test"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

    trainer.train()



trainable params: 6,291,456 || all params: 462,932,992 || trainable%: 1.3590424767133469


Step,Training Loss,Validation Loss,Wer
500,3.5257,0.675676,3.204477
1000,0.2579,0.218122,0.003673
1500,0.1033,0.172991,0.002449


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

In [None]:
# Persist the trained model, the tokenizer and the feature extractor, each to its
# own directory.  `from_pt` is a loading-time option and is not passed when saving.
model.save_pretrained("./fine-tuned-medium-sw")
tokenizer.save_pretrained("./fine-tuned-tokenizer")
# NOTE: the directory name below contains a typo ("ectractor"); it is kept as-is so
# anything already reading from that path keeps working.
feature_extractor.save_pretrained("./fine-tuned-feature-ectractor")

In [None]:
# model2 = whisper.from_pretrained('./whisper_model_saved_2')
# Reload the saved model from disk.  AutoModelForCausalLM is already imported in
# the notebook's import cell, so it is not imported again here.
# NOTE(review): the directory was written by the PEFT-wrapped model, so it
# presumably contains adapter weights only — confirm that the installed
# transformers/peft versions can load it directly like this.
model = AutoModelForCausalLM.from_pretrained("./fine-tuned-medium-sw")

In [None]:
model