In [2]:
# @title Install dependencies
!pip install transformers datasets torch torchaudio soundfile jiwer librosa



In [3]:
!pip install hf_xet
!pip install peft



In [45]:
# @title Import Necessary Librarires
import os
import random
import numpy as np
import torch
from datasets import load_dataset, load_from_disk, Audio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Trainer, TrainingArguments
from jiwer import wer, cer
import librosa
from sklearn.metrics import confusion_matrix
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [5]:
# @title Device Set Up
if torch.cuda.is_available():
    print("CUDA is available! You can use GPU acceleration.")
else:
    print("CUDA is not available. You will be using the CPU.")

CUDA is available! You can use GPU acceleration.


In [6]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [7]:
# @title Configuration Set up
# Configuration
class Config:
    # Dataset paths
    COMMON_VOICE_PATH = "mozilla-foundation/common_voice_11_0"
    FLEURS_PATH = "google/fleurs"

    # Pre-trained model
    MODEL_NAME = "facebook/wav2vec2-base"

    # Training parameters
    BATCH_SIZE = 2  # Reduced batch size for CPU
    NUM_EPOCHS = 5
    MAX_AUDIO_LEN = 16_000 * 10  # 10 seconds of audio
    SAMPLING_RATE = 16_000
    MAX_PREDICTION_LENGTH = 100
    WEIGHT_DECAY = 0.01

    # Output directories
    OUTPUT_DIR = "./results"
    OUTPUT_DIR1 = "./results_fleurs"

    LOG_DIR = "./logs"
    LOG_DIR1 = "./logs_fleurs"



In [8]:
# @title Data augmentation functions
def time_stretch(audio, rate=1.1):
    """Apply time stretching to the audio."""
    return librosa.effects.time_stretch(audio, rate=rate)

def pitch_shift(audio, sr, n_steps=2):
    """Apply pitch shifting to the audio."""
    return librosa.effects.pitch_shift(audio, sr=sr, n_steps=n_steps)

def augment_audio(audio, sr):
    """Apply random augmentations to the audio."""
    if random.random() > 0.5:  # 50% chance to apply time stretching
        audio = time_stretch(audio)
    if random.random() > 0.5:  # 50% chance to apply pitch shifting
        audio = pitch_shift(audio, sr)
    return audio


In [9]:
# @title Dataset Statistics
def print_dataset_statistics(dataset, name="Dataset"):
    """
    Prints the number of samples and total audio duration for a dataset.
    Args:
        dataset: Preprocessed dataset (train or test split).
        name (str): Name of the dataset (e.g., "Common Voice Train").
    """
    # Number of samples
    num_samples = len(dataset)

    # Total audio duration (in hours)
    total_duration = sum(len(batch["audio"]["array"]) / batch["audio"]["sampling_rate"] for batch in dataset) / 3600

    print(f"{name}:")
    print(f"  - Number of samples: {num_samples}")
    print(f"  - Total audio duration: {total_duration:.2f} hours")

In [20]:
# @title Preprocessing & Download Dataset function
def prepare_dataset(batch, processor, sr=16000, augment=False):
    """
    Preprocesses the dataset with optional data augmentation.
    Args:
        batch: A single example from the dataset.
        processor: Wav2Vec2Processor for feature extraction.
        sr: Sampling rate (default: 16kHz).
        augment (bool): Whether to apply data augmentation.
    Returns:
        dict: Processed batch with augmented or clean audio.
    """
    # Extract raw audio
    audio = batch["audio"]["array"]
    sampling_rate = batch["audio"]["sampling_rate"]

    # Resample audio if necessary
    if sampling_rate != sr:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=sr)

    # Apply data augmentation (only if augment=True)
    if augment:
        audio = augment_audio(audio, sr)

    # Process audio
    inputs = processor(
        audio,
        sampling_rate=sr,
        max_length=Config.MAX_AUDIO_LEN,
        truncation=True
    )
    # Handle different column names for text transcription
    if "sentence" in batch:
        text_column = "sentence"
    elif "transcription" in batch:
        text_column = "transcription"
    else:
        raise KeyError("Dataset does not contain 'sentence' or 'transcription' column.")
    batch["input_values"] = torch.tensor(inputs.input_values[0]).to(device)
    batch["labels"] = torch.tensor(processor.tokenizer(batch[text_column]).input_ids).to(device)
    return batch

# Download and preprocess datasets
def download_and_preprocess_datasets(dset_name, language="hi", language_code="hi_in"):
    """
    Downloads and preprocesses Common Voice and FLEURS datasets.
    Args:
        dset_name (str): Dataset name (e.g., "C" for Common Voice, "F" for FLEURS).
        language (str): Language code for Common Voice (e.g., "hi").
        language_code (str): Language code for FLEURS (e.g., "hi_in").
    Returns:
        Tuple: Preprocessed train and test datasets.
    """
    # Load processor
    processor = Wav2Vec2Processor.from_pretrained(Config.MODEL_NAME)

    # Download and preprocess Common Voice
    if dset_name == 'C':
        asr_dset = load_dataset(Config.COMMON_VOICE_PATH, language, trust_remote_code=True)
    if dset_name == 'F':
        asr_dset = load_dataset(Config.FLEURS_PATH, language_code, trust_remote_code=True)

    asr_dset = asr_dset.cast_column("audio", Audio(sampling_rate=16000))
    if dset_name == 'C':
        print("Common Voice columns:", asr_dset["train"].column_names)
        print("Common Voice sample:", asr_dset["train"][0])

        # Print dataset statistics
        print("Common Voice Dataset Statistics:")
        print_dataset_statistics(asr_dset["train"], name="Common Voice Train")
        print_dataset_statistics(asr_dset["test"], name="Common Voice Test")
    if dset_name == 'F':
        print("FLEURS columns:", asr_dset["train"].column_names)
        print("FLEURS sample:", asr_dset["train"][0])

        # Print dataset statistics
        print("FLEURS Dataset Statistics:")
        print_dataset_statistics(asr_dset["train"], name="FLEURS Train")
        print_dataset_statistics(asr_dset["test"], name="FLEURS Test")
    if dset_name == 'C':
        asr_train = asr_dset["train"].map(
        lambda x: prepare_dataset(x, processor, augment=True),
        remove_columns=[col for col in asr_dset["train"].column_names if col not in ["audio", "sentence"]]
        )
        asr_test = asr_dset["test"].map(
        lambda x: prepare_dataset(x, processor, augment=False),
        remove_columns=[col for col in asr_dset["test"].column_names if col not in ["audio", "sentence"]]
        )
    if dset_name == 'F':
        asr_train = asr_dset["train"].map(
        lambda x: prepare_dataset(x, processor, augment=True),
        remove_columns=[col for col in asr_dset["train"].column_names if col not in ["audio", "transcription"]]
        )
        asr_test = asr_dset["test"].map(
        lambda x: prepare_dataset(x, processor, augment=False),
        remove_columns=[col for col in asr_dset["test"].column_names if col not in ["audio", "transcription"]]
        )

    return asr_train, asr_test, processor


In [18]:
from torch.cuda.amp import autocast

def evaluate_pretrained_model(model, processor, test_dataset):
    """
    Evaluates the pre-trained model on the test dataset.
    Args:
        model: Pre-trained Wav2Vec 2.0 model.
        processor: Wav2Vec2Processor for decoding predictions.
        test_dataset: Preprocessed test dataset.
    """
    references = []
    hypotheses = []

    # Determine the text column name dynamically
    if "sentence" in test_dataset.column_names:
        text_column = "sentence"
    elif "transcription" in test_dataset.column_names:
        text_column = "transcription"
    elif "text" in test_dataset.column_names:  # Add support for 'text' column
        text_column = "text"
    else:
        raise KeyError("Dataset does not contain 'sentence' or 'transcription' or 'text' column.")

    for batch in test_dataset:
        inputs = torch.tensor(batch["input_values"]).unsqueeze(0).to(device)
        with autocast():
            with torch.no_grad():
                logits = model(inputs).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(
            predicted_ids[0],
            skip_special_tokens=True)
        transcription = transcription[:Config.MAX_PREDICTION_LENGTH]
        # Remove repetitive patterns
        def remove_repetitions(text):
            return ''.join([text[i] for i in range(len(text)) if i == 0 or text[i] != text[i - 1]])

        transcription = remove_repetitions(transcription)
        references.append(batch[text_column])
        hypotheses.append(transcription)

    # Calculate Word Error Rate (WER)
    word_error_rate = wer(references, hypotheses)
    # Calculate Character Error Rate (CER)
    char_error_rate = cer(references, hypotheses)
    def normalize_text(text):
        return " ".join(text.strip().lower().split())

    references = [normalize_text(ref) for ref in references]
    hypotheses = [normalize_text(hyp) for hyp in hypotheses]
    print(f"Pre-trained Model Word Error Rate (WER): {word_error_rate:.2f}")
    print(f"Pre-trained Model Character Error Rate (CER): {char_error_rate:.2f}")


In [12]:
def tokenize(texts):
    """
    Tokenizes a list of texts into individual characters or words.
    Args:
        texts (list): List of strings to tokenize.
    Returns:
        list: Flattened list of tokens.
    """
    # Tokenize into characters
    return list("".join(texts))

In [14]:
!pip show transformers

Name: transformers
Version: 4.50.3
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.11/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft, sentence-transformers


In [13]:
class CustomDataCollatorCTCWithPadding:
    def __init__(self, processor: Wav2Vec2Processor, padding=True):
        self.processor = processor
        self.padding = padding

    def __call__(self, features):
        # Split inputs and labels since they need different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Pad input values
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Pad labels
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # Replace labels with -100 where padding tokens are present
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # Update batch with padded labels
        batch["labels"] = labels

        return batch

In [35]:
class CustomWav2Vec2ForCTC(Wav2Vec2ForCTC):
    def get_input_embeddings(self):
        import torch.nn as nn
        return nn.Identity()

In [33]:
# @title Function for Fine-tune the model
def fine_tune_model(train_dataset, test_dataset, processor):
    """
    Fine-tunes the Wav2Vec 2.0 Base model.
    Args:
        train_dataset: Preprocessed training dataset.
        test_dataset: Preprocessed test dataset.
        processor: Wav2Vec2Processor for feature extraction.
    """
    # Load pre-trained model
    model = Wav2Vec2ForCTC.from_pretrained(Config.MODEL_NAME)
    model.to(device)
    model.gradient_checkpointing_enable()

    NUM_TRAIN_EXAMPLES = len(train_dataset)

    # Calculate total steps
    total_steps = (NUM_TRAIN_EXAMPLES / Config.BATCH_SIZE) * Config.NUM_EPOCHS

    # Define warmup steps (10% of total steps)
    WARMUP_STEPS = int(0.1 * total_steps)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=Config.OUTPUT_DIR,
        num_train_epochs=Config.NUM_EPOCHS,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=Config.BATCH_SIZE,
        evaluation_strategy="epoch",
        logging_dir=Config.LOG_DIR,
        logging_steps=10,
        save_steps=500,
        save_total_limit=2,
        gradient_checkpointing=True,
        gradient_accumulation_steps=2,
        fp16=True,
        learning_rate=5e-5,
        warmup_steps=WARMUP_STEPS,
        weight_decay=Config.WEIGHT_DECAY,
        push_to_hub=False,
    )
    data_collator = CustomDataCollatorCTCWithPadding(processor=processor, padding=True)


    # Create Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=processor.tokenizer,
        data_collator=data_collator,  # Use data collator for padding
    )

    # Start training
    trainer.train()
    trainer.save_model(Config.OUTPUT_DIR)

!pip install peft
from peft import LoraConfig, get_peft_model


# Evaluate the model
def evaluate_model(model, processor, test_dataset):
    """
    Evaluates the fine-tuned model using Word Error Rate (WER).
    Args:
        model: Fine-tuned Wav2Vec 2.0 model.
        processor: Wav2Vec2Processor for decoding predictions.
        test_dataset: Preprocessed test dataset.
    """
    references = []
    hypotheses = []

        # Determine the text column name dynamically
    if "sentence" in test_dataset.column_names:
        text_column = "sentence"
    elif "transcription" in test_dataset.column_names:
        text_column = "transcription"
    else:
        raise KeyError("Dataset does not contain 'sentence' or 'transcription' column.")

    for batch in test_dataset:
        inputs = torch.tensor(batch["input_values"]).unsqueeze(0).to(device)
        with autocast():
            with torch.no_grad():
                logits = model(inputs).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])
        references.append(batch[text_column])
        hypotheses.append(transcription)

    # Calculate Word Error Rate (WER)
    word_error_rate = wer(references, hypotheses)
    # Calculate Character Error Rate (CER)
    char_error_rate = cer(references, hypotheses)
    print(f"Fine Tuned Model - Word Error Rate (WER): {word_error_rate:.2f}")
    print(f"Fine Tuned Model - Character Error Rate (CER): {char_error_rate:.2f}")




In [39]:

# Function for Fine-tune the model on fleurs with LoRA
def fine_tune_model_fleurs(train_dataset, test_dataset, processor):
    """
    Fine-tunes the Wav2Vec 2.0 Base model.
    Args:
        train_dataset: Preprocessed training dataset.
        test_dataset: Preprocessed test dataset.
        processor: Wav2Vec2Processor for feature extraction.
    """
    # Load pre-trained model
    lora_config = LoraConfig(
    r=8,  # Rank of the low-rank matrices
    lora_alpha=16,  # Scaling factor
    target_modules=["q_proj", "v_proj"],  # Target attention layers
    lora_dropout=0.1,  # Dropout rate
    )
    model = CustomWav2Vec2ForCTC.from_pretrained(Config.MODEL_NAME)
    model = get_peft_model(model, lora_config)
    # Check if the model has input embeddings (for debugging purposes)

    model.to(device)
    model.gradient_checkpointing = False

    NUM_TRAIN_EXAMPLES = len(train_dataset)

    # Calculate total steps
    total_steps = (NUM_TRAIN_EXAMPLES / Config.BATCH_SIZE) * Config.NUM_EPOCHS

    # Define warmup steps (10% of total steps)
    WARMUP_STEPS = int(0.1 * total_steps)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=Config.OUTPUT_DIR1,
        num_train_epochs=Config.NUM_EPOCHS,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=Config.BATCH_SIZE,
        evaluation_strategy="epoch",
        logging_dir=Config.LOG_DIR1,
        logging_steps=10,
        save_steps=500,
        save_total_limit=2,
        gradient_checkpointing=True,
        gradient_accumulation_steps=2,
        fp16=True,
        learning_rate=5e-5,
        warmup_steps=WARMUP_STEPS,
        weight_decay=Config.WEIGHT_DECAY,
        push_to_hub=False,
    )
    data_collator = CustomDataCollatorCTCWithPadding(processor=processor, padding=True)
    # Create Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=processor.tokenizer,
        data_collator=data_collator,  # Use data collator for padding
    )

    # Start training
    trainer.train()
    trainer.save_model(Config.OUTPUT_DIR1)


In [17]:
# @title Common Voice Dataset
# Step 1: Download and preprocess Common Voice Dataset
dset_name = 'C' #Common Voice
common_voice_train , common_voice_test , processor = download_and_preprocess_datasets(dset_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/14.4k [00:00<?, ?B/s]

common_voice_11_0.py:   0%|          | 0.00/8.13k [00:00<?, ?B/s]

languages.py:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

release_stats.py:   0%|          | 0.00/60.9k [00:00<?, ?B/s]

n_shards.json:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

audio/hi/train/hi_train_0.tar:   0%|          | 0.00/114M [00:00<?, ?B/s]

audio/hi/dev/hi_dev_0.tar:   0%|          | 0.00/61.9M [00:00<?, ?B/s]

audio/hi/test/hi_test_0.tar:   0%|          | 0.00/92.2M [00:00<?, ?B/s]

audio/hi/other/hi_other_0.tar:   0%|          | 0.00/113M [00:00<?, ?B/s]

audio/hi/invalidated/hi_invalidated_0.ta(…):   0%|          | 0.00/23.4M [00:00<?, ?B/s]

transcript/hi/train.tsv:   0%|          | 0.00/1.30M [00:00<?, ?B/s]

transcript/hi/dev.tsv:   0%|          | 0.00/627k [00:00<?, ?B/s]

transcript/hi/test.tsv:   0%|          | 0.00/824k [00:00<?, ?B/s]

transcript/hi/other.tsv:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

transcript/hi/invalidated.tsv:   0%|          | 0.00/201k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 4361it [00:00, 80540.00it/s]


Generating validation split: 0 examples [00:00, ? examples/s]


Reading metadata...: 2179it [00:00, 67354.42it/s]


Generating test split: 0 examples [00:00, ? examples/s]


Reading metadata...: 2894it [00:00, 70300.62it/s]


Generating other split: 0 examples [00:00, ? examples/s]


Reading metadata...: 3328it [00:00, 72213.45it/s]


Generating invalidated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 680it [00:00, 87755.05it/s]


Common Voice columns: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment']
Common Voice sample: {'client_id': '0f018a99663f33afbb7d38aee281fb1afcfd07f9e7acd00383f604e1e17c38d6ed8adf1bd2ccbf927a52c5adefb8ac4b158ce27a7c2ed9581e71202eb302dfb3', 'path': '/root/.cache/huggingface/datasets/downloads/extracted/1bfc12b9ee30f73bf143fa237d4ba38488008883c25816876e1a35295c9575d3/hi_train_0/common_voice_hi_26008353.mp3', 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/1bfc12b9ee30f73bf143fa237d4ba38488008883c25816876e1a35295c9575d3/hi_train_0/common_voice_hi_26008353.mp3', 'array': array([ 3.81639165e-17,  2.42861287e-17, -1.73472348e-17, ...,
       -1.30981789e-07,  2.63096808e-07,  4.77157300e-08]), 'sampling_rate': 16000}, 'sentence': 'हमने उसका जन्मदिन मनाया।', 'up_votes': 2, 'down_votes': 0, 'age': '', 'gender': '', 'accent': '', 'locale': 'hi', 'segment': ''}
Common Voice Dataset Statistics:
Common Voice

Map:   0%|          | 0/4361 [00:00<?, ? examples/s]

Map:   0%|          | 0/2894 [00:00<?, ? examples/s]

In [19]:
common_voice_train.save_to_disk("./common_voice_train")
common_voice_test.save_to_disk("./common_voice_test")


Saving the dataset (0/4 shards):   0%|          | 0/4361 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/2894 [00:00<?, ? examples/s]

In [20]:
# Step 2: Load PreTrained model
pretrained_model = Wav2Vec2ForCTC.from_pretrained(Config.MODEL_NAME).to(device)
pretrained_model.gradient_checkpointing_enable()
print(f"Model is on device: {next(pretrained_model.parameters()).device}")




pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on device: cuda:0


In [None]:
# Load preprocessed Common Voice dataset from disk when needed
common_voice_train = load_from_disk("./common_voice_train")
common_voice_test = load_from_disk("./common_voice_test")

In [21]:
# Step 3: Evaluate pre-trained model on Common Voice datasets
print("Evaluating pre-trained model on Common Voice:")
evaluate_pretrained_model(pretrained_model, processor, common_voice_test)


Evaluating pre-trained model on Common Voice:


  with autocast():


Pre-trained Model Word Error Rate (WER): 1.01
Pre-trained Model Character Error Rate (CER): 1.83


In [22]:
# Step 4: Fine-tune and evaluate on Common Voice
print("Fine-tuning on Common Voice...")
fine_tune_model(common_voice_train, common_voice_test, processor)
fine_tuned_model = Wav2Vec2ForCTC.from_pretrained(Config.OUTPUT_DIR)
fine_tuned_model.to(device)
print("Evaluating fine-tuned model on Common Voice:")
evaluate_model(fine_tuned_model, processor, common_voice_test)

Fine-tuning on Common Voice...


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mm23csa520[0m ([33mm23csa520-iit-jodhpur[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss
1,49.1574,46.326904
2,46.1511,42.220951
3,46.7354,39.906254
4,37.3798,33.357956




Evaluating fine-tuned model on Common Voice:


  with autocast():


Fine Tuned Model - Word Error Rate (WER): 1.09
Fine Tuned Model - Character Error Rate (CER): 2.79


In [23]:
# Step 5 : Clears RAM
# Save preprocessed Common Voice dataset to disk
common_voice_train.save_to_disk("./common_voice_train")
common_voice_test.save_to_disk("./common_voice_test")
del common_voice_train
del common_voice_test
gc.collect()

Saving the dataset (0/4 shards):   0%|          | 0/4361 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/2894 [00:00<?, ? examples/s]

5348

In [24]:
gc.collect()

52

In [21]:
# @title Fleurs Dataset
#Step 1: Load Fleurs Datset
dset_name = 'F' #Fleurs Dataset
fleurs_train , fleurs_test , processor = download_and_preprocess_datasets(dset_name)





FLEURS columns: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id']
FLEURS sample: {'id': 93, 'num_samples': 138240, 'path': '/root/.cache/huggingface/datasets/downloads/extracted/d84cd21d7e2a1e02acc33f0f542109eae3f14413bd96886bc87b1e60e3f08f64/10002503286825416621.wav', 'audio': {'path': 'train/10002503286825416621.wav', 'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00179154,
        0.0009104 , -0.00036222]), 'sampling_rate': 16000}, 'transcription': 'राजनीतिज्ञों ने कहा कि उन्होंने निर्णायक मत को अनावश्यक रूप से निर्धारित करने के लिए अफ़गान संविधान में काफी अस्पष्टता पाई थी', 'raw_transcription': 'राजनीतिज्ञों ने कहा कि उन्होंने निर्णायक मत को अनावश्यक रूप से निर्धारित करने के लिए अफ़गान संविधान में काफी अस्पष्टता पाई थी.', 'gender': 1, 'lang_id': 32, 'language': 'Hindi', 'lang_group_id': 4}
FLEURS Dataset Statistics:
FLEURS Train:
  - Number of samples: 2120
  - Total audio duration: 6.66 ho

Map:   0%|          | 0/2120 [00:00<?, ? examples/s]

Map:   0%|          | 0/418 [00:00<?, ? examples/s]

In [22]:
# Step 2: Load PreTrained model
pretrained_model = Wav2Vec2ForCTC.from_pretrained(Config.MODEL_NAME).to(device)
pretrained_model.gradient_checkpointing_enable()


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
#Step 3 : Evaluate pre-trained model on Fleurs datasets
print("Evaluating pre-trained model on FLEURS:")
evaluate_pretrained_model(pretrained_model, processor, fleurs_test)

Evaluating pre-trained model on FLEURS:


  with autocast():


Pre-trained Model Word Error Rate (WER): 1.00
Pre-trained Model Character Error Rate (CER): 1.05


In [None]:
# @title Fine Tuning using LoRA on Fleurs Dataset
# Step 4: Fine-tune and evaluate on FLEURS
Config.adapter_attn_dim = 128
print("Fine-tuning on FLEURS...")
fine_tune_model_fleurs(fleurs_train, fleurs_test, processor)


Fine-tuning on FLEURS...


Some weights of CustomWav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,635.925,No log


In [44]:
fine_tuned_model_fleurs = CustomWav2Vec2ForCTC.from_pretrained(Config.OUTPUT_DIR1, ignore_mismatched_sizes=True)
fine_tuned_model_fleurs.to(device)
print("Evaluating fine-tuned model with LoRA on FLEURS:")
evaluate_model(fine_tuned_model_fleurs, processor, fleurs_test)

Some weights of CustomWav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: Cannot load_adapter for ./results_fleurs if `config.adapter_attn_dim` is not defined.

In [46]:
print("Fine-tuning on Fleurs...")
fine_tune_model(fleurs_train, fleurs_test, processor)
fine_tuned_model = Wav2Vec2ForCTC.from_pretrained(Config.OUTPUT_DIR)
fine_tuned_model.to(device)
print("Evaluating fine-tuned model on Fleurs :")
evaluate_model(fine_tuned_model, processor, fleurs_test)

Fine-tuning on Fleurs...


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,171.3832,inf
2,121.6809,inf
3,150.9246,inf
4,132.335,inf
5,133.3208,inf


Evaluating fine-tuned model on Common Voice:
Fine Tuned Model - Word Error Rate (WER): 1.00
Fine Tuned Model - Character Error Rate (CER): 1.00


In [None]:
# Save preprocessed Fleurs dataset to disk
fleurs_train.save_to_disk("./fleurs_train")
fleurs_test.save_to_disk("./fleurs_test")
del fleurs_train
del fleurs_test
gc.collect()