In [1]:
# @title Install dependencies
!pip install transformers datasets torch torchaudio soundfile jiwer librosa

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collec

In [2]:
!pip install hf_xet

Collecting hf_xet
  Downloading hf_xet-1.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Downloading hf_xet-1.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hf_xet
Successfully installed hf_xet-1.0.3


In [3]:
# @title Import Necessary Librarires
import os
import random
import numpy as np
import torch
from datasets import load_dataset, load_from_disk, Audio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Trainer, TrainingArguments
from jiwer import wer, cer
import librosa
from sklearn.metrics import confusion_matrix
import gc
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
if torch.cuda.is_available():
    print("CUDA is available! You can use GPU acceleration.")
else:
    print("CUDA is not available. You will be using the CPU.")

CUDA is available! You can use GPU acceleration.


In [5]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [56]:
# @title Configuration Set up
# Configuration
class Config:
    # Dataset paths
    COMMON_VOICE_PATH = "mozilla-foundation/common_voice_11_0"
    FLEURS_PATH = "google/fleurs"

    # Pre-trained model
    MODEL_NAME = "facebook/wav2vec2-base"

    # Training parameters
    BATCH_SIZE = 2  # Reduced batch size for CPU
    NUM_EPOCHS = 5
    MAX_AUDIO_LEN = 16_000 * 10  # 10 seconds of audio
    SAMPLING_RATE = 16_000
    MAX_PREDICTION_LENGTH = 100
    WEIGHT_DECAY = 0.01

    # Output directories
    OUTPUT_DIR = "./results"
    LOG_DIR = "./logs"


In [7]:
# @title Data augmentation functions
def time_stretch(audio, rate=1.1):
    """Apply time stretching to the audio."""
    return librosa.effects.time_stretch(audio, rate=rate)

def pitch_shift(audio, sr, n_steps=2):
    """Apply pitch shifting to the audio."""
    return librosa.effects.pitch_shift(audio, sr=sr, n_steps=n_steps)

def augment_audio(audio, sr):
    """Apply random augmentations to the audio."""
    if random.random() > 0.5:  # 50% chance to apply time stretching
        audio = time_stretch(audio)
    if random.random() > 0.5:  # 50% chance to apply pitch shifting
        audio = pitch_shift(audio, sr)
    return audio


In [8]:
# @title Dataset Statistics
def print_dataset_statistics(dataset, name="Dataset"):
    """
    Prints the number of samples and total audio duration for a dataset.
    Args:
        dataset: Preprocessed dataset (train or test split).
        name (str): Name of the dataset (e.g., "Common Voice Train").
    """
    # Number of samples
    num_samples = len(dataset)

    # Total audio duration (in hours)
    total_duration = sum(len(batch["audio"]["array"]) / batch["audio"]["sampling_rate"] for batch in dataset) / 3600

    print(f"{name}:")
    print(f"  - Number of samples: {num_samples}")
    print(f"  - Total audio duration: {total_duration:.2f} hours")

In [17]:
# @title Preprocessing & Download Dataset function
def prepare_dataset(batch, processor, sr=16000, augment=False):
    """
    Preprocesses the dataset with optional data augmentation.
    Args:
        batch: A single example from the dataset.
        processor: Wav2Vec2Processor for feature extraction.
        sr: Sampling rate (default: 16kHz).
        augment (bool): Whether to apply data augmentation.
    Returns:
        dict: Processed batch with augmented or clean audio.
    """
    # Extract raw audio
    audio = batch["audio"]["array"]
    sampling_rate = batch["audio"]["sampling_rate"]

    # Resample audio if necessary
    if sampling_rate != sr:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=sr)

    # Apply data augmentation (only if augment=True)
    if augment:
        audio = augment_audio(audio, sr)

    # Process audio
    inputs = processor(
        audio,
        sampling_rate=sr,
        max_length=Config.MAX_AUDIO_LEN,
        truncation=True
    )
    # Handle different column names for text transcription
    if "sentence" in batch:
        text_column = "sentence"
    elif "transcription" in batch:
        text_column = "transcription"
    else:
        raise KeyError("Dataset does not contain 'sentence' or 'transcription' column.")
    batch["input_values"] = torch.tensor(inputs.input_values[0]).to(device)
    batch["labels"] = torch.tensor(processor.tokenizer(batch[text_column]).input_ids).to(device)
    return batch

# Download and preprocess datasets
def download_and_preprocess_datasets(dset_name, language="hi", language_code="hi_in"):
    """
    Downloads and preprocesses Common Voice and FLEURS datasets.
    Args:
        dset_name (str): Dataset name (e.g., "C" for Common Voice, "F" for FLEURS).
        language (str): Language code for Common Voice (e.g., "hi").
        language_code (str): Language code for FLEURS (e.g., "hi_in").
    Returns:
        Tuple: Preprocessed train and test datasets.
    """
    # Load processor
    processor = Wav2Vec2Processor.from_pretrained(Config.MODEL_NAME)

    # Download and preprocess Common Voice
    if dset_name == 'C':
        asr_dset = load_dataset(Config.COMMON_VOICE_PATH, language, trust_remote_code=True)
    if dset_name == 'F':
        asr_dset = load_dataset(Config.FLEURS_PATH, language_code, trust_remote_code=True)

    asr_dset = asr_dset.cast_column("audio", Audio(sampling_rate=16000))
    if dset_name == 'C':
        print("Common Voice columns:", asr_dset["train"].column_names)
        print("Common Voice sample:", asr_dset["train"][0])

        # Print dataset statistics
        print("Common Voice Dataset Statistics:")
        print_dataset_statistics(asr_dset["train"], name="Common Voice Train")
        print_dataset_statistics(asr_dset["test"], name="Common Voice Test")
    if dset_name == 'F':
        print("FLEURS columns:", asr_dset["train"].column_names)
        print("FLEURS sample:", asr_dset["train"][0])

        # Print dataset statistics
        print("FLEURS Dataset Statistics:")
        print_dataset_statistics(asr_dset["train"], name="FLEURS Train")
        print_dataset_statistics(asr_dset["test"], name="FLEURS Test")

    asr_train = asr_dset["train"].map(
        lambda x: prepare_dataset(x, processor, augment=True),
        remove_columns=[col for col in asr_dset["train"].column_names if col not in ["audio", "sentence"]]
    )
    asr_test = asr_dset["test"].map(
        lambda x: prepare_dataset(x, processor, augment=False),
        remove_columns=[col for col in asr_dset["test"].column_names if col not in ["audio", "sentence"]]
    )

    return asr_train, asr_test, processor


In [47]:
from torch.cuda.amp import autocast

def evaluate_pretrained_model(model, processor, test_dataset):
    """
    Evaluates the pre-trained model on the test dataset.
    Args:
        model: Pre-trained Wav2Vec 2.0 model.
        processor: Wav2Vec2Processor for decoding predictions.
        test_dataset: Preprocessed test dataset.
    """
    references = []
    hypotheses = []

    # Determine the text column name dynamically
    if "sentence" in test_dataset.column_names:
        text_column = "sentence"
    elif "transcription" in test_dataset.column_names:
        text_column = "transcription"
    else:
        raise KeyError("Dataset does not contain 'sentence' or 'transcription' column.")

    for batch in test_dataset:
        inputs = torch.tensor(batch["input_values"]).unsqueeze(0).to(device)
        with autocast():
            with torch.no_grad():
                logits = model(inputs).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(
            predicted_ids[0],
            skip_special_tokens=True)
        transcription = transcription[:Config.MAX_PREDICTION_LENGTH]
        # Remove repetitive patterns
        def remove_repetitions(text):
            return ''.join([text[i] for i in range(len(text)) if i == 0 or text[i] != text[i - 1]])

        transcription = remove_repetitions(transcription)
        references.append(batch[text_column])
        hypotheses.append(transcription)

    # Calculate Word Error Rate (WER)
    word_error_rate = wer(references, hypotheses)
    # Calculate Character Error Rate (CER)
    char_error_rate = cer(references, hypotheses)
    def normalize_text(text):
        return " ".join(text.strip().lower().split())

    references = [normalize_text(ref) for ref in references]
    hypotheses = [normalize_text(hyp) for hyp in hypotheses]
    print(f"Pre-trained Model Word Error Rate (WER): {word_error_rate:.2f}")
    print(f"Pre-trained Model Character Error Rate (CER): {char_error_rate:.2f}")


In [11]:
def tokenize(texts):
    """
    Tokenizes a list of texts into individual characters or words.
    Args:
        texts (list): List of strings to tokenize.
    Returns:
        list: Flattened list of tokens.
    """
    # Tokenize into characters
    return list("".join(texts))

In [12]:
def plot_confusion_matrix(conf_matrix, label_set, title="Confusion Matrix"):
    """
    Plots the confusion matrix as a heatmap.
    Args:
        conf_matrix (np.ndarray): Confusion matrix.
        label_set (list): List of unique labels (tokens).
        title (str): Title of the plot.
    """
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=label_set, yticklabels=label_set)
    plt.title(title)
    plt.xlabel("Predicted Tokens")
    plt.ylabel("True Tokens")
    plt.show()



In [13]:
def plot_metric_trends(metrics, metric_name="WER"):
    """
    Plots the trend of a metric over epochs.
    Args:
        metrics (list): List of metric values for each epoch.
        metric_name (str): Name of the metric (e.g., "WER").
    """
    plt.figure(figsize=(10, 6))
    plt.plot(metrics, marker="o", color="orange")
    plt.title(f"{metric_name} Trend Over Epochs")
    plt.xlabel("Epoch")
    plt.ylabel(metric_name)
    plt.grid(linestyle="--", alpha=0.7)
    plt.show()

In [61]:
!pip show transformers

Name: transformers
Version: 4.50.3
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.11/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft, sentence-transformers


In [65]:
class CustomDataCollatorCTCWithPadding:
    def __init__(self, processor: Wav2Vec2Processor, padding=True):
        self.processor = processor
        self.padding = padding

    def __call__(self, features):
        # Split inputs and labels since they need different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Pad input values
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Pad labels
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # Replace labels with -100 where padding tokens are present
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # Update batch with padded labels
        batch["labels"] = labels

        return batch

In [66]:
# @title Function for Fine-tune the model
def fine_tune_model(train_dataset, test_dataset, processor):
    """
    Fine-tunes the Wav2Vec 2.0 Base model.
    Args:
        train_dataset: Preprocessed training dataset.
        test_dataset: Preprocessed test dataset.
        processor: Wav2Vec2Processor for feature extraction.
    """
    # Load pre-trained model
    model = Wav2Vec2ForCTC.from_pretrained(Config.MODEL_NAME)
    model.to(device)
    model.gradient_checkpointing_enable()

    NUM_TRAIN_EXAMPLES = len(train_dataset)

    # Calculate total steps
    total_steps = (NUM_TRAIN_EXAMPLES / Config.BATCH_SIZE) * Config.NUM_EPOCHS

    # Define warmup steps (10% of total steps)
    WARMUP_STEPS = int(0.1 * total_steps)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=Config.OUTPUT_DIR,
        num_train_epochs=Config.NUM_EPOCHS,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=Config.BATCH_SIZE,
        evaluation_strategy="epoch",
        logging_dir=Config.LOG_DIR,
        save_steps=500,
        save_total_limit=2,
        gradient_checkpointing=True,
        fp16=True,
        warmup_steps=WARMUP_STEPS,
        weight_decay=Config.WEIGHT_DECAY,
        push_to_hub=False,
    )
    data_collator = CustomDataCollatorCTCWithPadding(processor=processor, padding=True)


    # Create Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=processor.tokenizer,
        data_collator=data_collator,  # Use data collator for padding
    )

    # Start training
    trainer.train()
    trainer.save_model(Config.OUTPUT_DIR)

# Evaluate the model
def evaluate_model(model, processor, test_dataset):
    """
    Evaluates the fine-tuned model using Word Error Rate (WER).
    Args:
        model: Fine-tuned Wav2Vec 2.0 model.
        processor: Wav2Vec2Processor for decoding predictions.
        test_dataset: Preprocessed test dataset.
    """
    references = []
    hypotheses = []

        # Determine the text column name dynamically
    if "sentence" in test_dataset.column_names:
        text_column = "sentence"
    elif "transcription" in test_dataset.column_names:
        text_column = "transcription"
    else:
        raise KeyError("Dataset does not contain 'sentence' or 'transcription' column.")

    for batch in test_dataset:
        inputs = torch.tensor(batch["input_values"]).unsqueeze(0).to(device)
        with autocast():
            with torch.no_grad():
                logits = model(inputs).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])
        references.append(batch[text_column])
        hypotheses.append(transcription)

    # Calculate Word Error Rate (WER)
    word_error_rate = wer(references, hypotheses)
    # Calculate Character Error Rate (CER)
    char_error_rate = cer(references, hypotheses)
    print(f"Fine Tuned Model - Word Error Rate (WER): {word_error_rate:.2f}")
    print(f"Fine Tuned Model - Character Error Rate (CER): {char_error_rate:.2f}")


In [18]:
# @title Common Voice Dataset
# Step 1: Download and preprocess Common Voice Dataset
dset_name = 'C' #Common Voice
common_voice_train , common_voice_test , processor = download_and_preprocess_datasets(dset_name)



Common Voice columns: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment']
Common Voice sample: {'client_id': '0f018a99663f33afbb7d38aee281fb1afcfd07f9e7acd00383f604e1e17c38d6ed8adf1bd2ccbf927a52c5adefb8ac4b158ce27a7c2ed9581e71202eb302dfb3', 'path': '/root/.cache/huggingface/datasets/downloads/extracted/1bfc12b9ee30f73bf143fa237d4ba38488008883c25816876e1a35295c9575d3/hi_train_0/common_voice_hi_26008353.mp3', 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/1bfc12b9ee30f73bf143fa237d4ba38488008883c25816876e1a35295c9575d3/hi_train_0/common_voice_hi_26008353.mp3', 'array': array([ 3.81639165e-17,  2.42861287e-17, -1.73472348e-17, ...,
       -1.30981789e-07,  2.63096808e-07,  4.77157300e-08]), 'sampling_rate': 16000}, 'sentence': 'हमने उसका जन्मदिन मनाया।', 'up_votes': 2, 'down_votes': 0, 'age': '', 'gender': '', 'accent': '', 'locale': 'hi', 'segment': ''}
Common Voice Dataset Statistics:
Common Voice

Map:   0%|          | 0/4361 [00:00<?, ? examples/s]

Map:   0%|          | 0/2894 [00:00<?, ? examples/s]

In [19]:
common_voice_train.save_to_disk("./common_voice_train")
common_voice_test.save_to_disk("./common_voice_test")


Saving the dataset (0/4 shards):   0%|          | 0/4361 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/2894 [00:00<?, ? examples/s]

In [20]:
# Step 2: Load PreTrained model
pretrained_model = Wav2Vec2ForCTC.from_pretrained(Config.MODEL_NAME).to(device)
pretrained_model.gradient_checkpointing_enable()
print(f"Model is on device: {next(pretrained_model.parameters()).device}")


pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on device: cuda:0


In [48]:
# Step 3: Evaluate pre-trained model on Common Voice datasets
print("Evaluating pre-trained model on Common Voice:")
evaluate_pretrained_model(pretrained_model, processor, common_voice_test)


Evaluating pre-trained model on Common Voice:


  with autocast():


Pre-trained Model Word Error Rate (WER): 1.00
Pre-trained Model Character Error Rate (CER): 1.76


In [2]:
# Step 4: Fine-tune and evaluate on Common Voice
print("Fine-tuning on Common Voice...")
fine_tune_model(common_voice_train, common_voice_test, processor)
fine_tuned_model = Wav2Vec2ForCTC.from_pretrained(Config.OUTPUT_DIR)
fine_tuned_model.to(device)
print("Evaluating fine-tuned model on Common Voice:")
evaluate_model(fine_tuned_model, processor, common_voice_test)

Fine-tuning on Common Voice...


NameError: name 'fine_tune_model' is not defined

In [None]:
# Step 5 : Clears RAM
# Save preprocessed Common Voice dataset to disk
common_voice_train.save_to_disk("./common_voice_train")
common_voice_test.save_to_disk("./common_voice_test")
del common_voice_train
del common_voice_test
del common_voice
gc.collect()

In [None]:
# @title Fleurs Dataset
#Step 1: Load Fleurs Datset
dset_name = 'F' #Fleurs Dataset
fleurs_train , fleurs_test , processor = download_and_preprocess_datasets(dset_name)



In [None]:
# Step 2: Load PreTrained model
pretrained_model = Wav2Vec2ForCTC.from_pretrained(Config.MODEL_NAME).to(device)
pretrained_model.gradient_checkpointing_enable()


In [None]:
#Step 3 : Evaluate pre-trained model on Fleurs datasets
print("Evaluating pre-trained model on FLEURS:")
evaluate_pretrained_model(pretrained_model, processor, fleurs_test)

In [None]:
# Step 4: Fine-tune and evaluate on FLEURS
print("Fine-tuning on FLEURS...")
fine_tune_model(fleurs_train, fleurs_test, processor)
fine_tuned_model = Wav2Vec2ForCTC.from_pretrained(Config.OUTPUT_DIR)
fine_tuned_model.to(device)
print("Evaluating fine-tuned model on FLEURS:")
evaluate_model(fine_tuned_model, processor, fleurs_test)