## Basic Setup

In [1]:
import sys
sys.path.append('..')

from scripts.data_loaders.common import show_sample

# importing datasets from dataloader 
from scripts.data_loaders.L2ARCTIC import L2ArcticDataset, all_arctic_speaker_splits, SPEAKERS
from scripts.data_loaders.TIMIT import TIMITDataset
from scripts.data_loaders.PSST import PSSTDataset
# <TODO> add buckeye and doreco 

from scripts.eval.evaluate import evaluate
from scripts.eval.metrics import per, fer
from scripts.ipa_transcription.wav2vec2 import transcribe_batch

from transformers import AutoProcessor, AutoModelForCTC

from IPython.display import clear_output
clear_output()

In [2]:
# set espeak library path for macOS
if sys.platform == "darwin":
    import os
    import warnings

    from phonemizer.backend.espeak.wrapper import EspeakWrapper

    # Default is the Homebrew location of espeak 1.48.04_1. Set the
    # PHONEMIZER_ESPEAK_LIBRARY environment variable to point at a different
    # install instead of editing this cell.
    _ESPEAK_LIBRARY = os.environ.get(
        "PHONEMIZER_ESPEAK_LIBRARY",
        "/opt/homebrew/Cellar/espeak/1.48.04_1/lib/libespeak.1.1.48.dylib",
    )
    if os.path.exists(_ESPEAK_LIBRARY):
        EspeakWrapper.set_library(_ESPEAK_LIBRARY)
    else:
        # Registering a path that does not exist only postpones the failure to the
        # first phonemizer call; warn here so the cause is obvious.
        warnings.warn(
            f"espeak library not found at {_ESPEAK_LIBRARY}; "
            "set PHONEMIZER_ESPEAK_LIBRARY to the correct .dylib path"
        )

In [3]:

# Hugging Face model id of the pretrained checkpoint; the tokenizer, feature
# extractor and fine-tuning cells below all load from it.
PRE_TRAINED_ID = "facebook/wav2vec2-lv-60-espeak-cv-ft"

# Load Data

In [None]:
# PSST "train" split, requested with speaker info and with force_offline=True.
train_psst = PSSTDataset(split="train", include_speaker_info=True, force_offline=True)

In [6]:
# TIMIT "train" split, requested with speaker info.
train_timit = TIMITDataset(split="train", include_speaker_info=True)

In [8]:
# NOTE(review): despite the variable name, this holds the L2-ARCTIC dataset, not TIMIT.
L2_timit = L2ArcticDataset(include_speaker_info=True)

# Extend Phoneme Vocab

In [16]:
# Character-level vocabulary: every distinct character found in the 'ipa' column.
# NOTE(review): `train_df` is not defined in any cell visible in this notebook
# (presumably it is derived from `train_timit` - confirm). Define it above this
# cell so the notebook survives Restart & Run All.
timit_vocab = set("".join(train_df['ipa']))
print(timit_vocab)

{'k', 'o', 'h', 'ŋ', 'p', 'ɡ', 'm', 'v', 'f', 'ʌ', 'e', 'ɛ', 'u', 'a', 'ɹ', 'n', 'ɑ', 'ð', 'ə', 't', 'z', 'ʒ', 'ʊ', 'ɔ', 'b', 's', 'w', 'd', 'æ', 'θ', 'ɾ', 'ʔ', 'ɪ', 'l', 'j', 'i', 'ʃ'}


In [17]:
# AutoTokenizer is not among the names imported in the setup cell, so import it
# here; without this the cell raises NameError on a fresh kernel.
from transformers import AutoTokenizer

# Tokenizer that ships with the pretrained checkpoint, and its token -> id mapping.
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_ID)
vocab = tokenizer.get_vocab()
# you will see how large the vocab is, we will resize our linear layer later to make it work for our smaller vocab
print(vocab)

{'A': 267, 'E': 45, 'H': 72, 'I': 20, 'O': 305, 'U': 292, '[PAD]': 310, '[UNK]': 309, '_': 182, 'a': 53, 'ã': 125, 'b': 113, 'b̪': 177, 'b̪͡v': 289, 'b͡ꞵ': 287, 'c': 25, 'cʼ': 229, 'c͡ç': 66, 'd': 176, 'd̼': 118, 'd͡z': 263, 'd͡ð': 244, 'd͡ɮ': 18, 'd͡ʑ': 94, 'd͡ʒ': 280, 'e': 175, 'ẽ': 211, 'e̞': 114, 'ẽ̞': 265, 'f': 139, 'fʼ': 249, 'h': 179, 'i': 245, 'j': 215, 'k': 282, 'kxʼ': 281, 'kǀ': 41, 'kǁ': 61, 'kǂ': 21, 'kǃ': 70, 'kʘ': 79, 'kʼ': 39, 'k̚': 89, 'k͡p': 48, 'k͡x': 198, 'l': 303, 'm': 75, 'm̥': 172, 'n': 102, 'n̥': 28, 'n̼': 169, 'o': 117, 'õ': 119, 'o̞': 62, 'õ̞': 234, 'p': 259, 'pʼ': 286, 'p̚': 149, 'p̪': 273, 'p̪͡f': 105, 'p͡f': 225, 'p͡ɸ': 103, 'q': 130, 'qǀ': 302, 'qǁ': 126, 'qǂ': 299, 'qǃ': 37, 'qʘ': 261, 'qʼ': 58, 'q͡ʡ': 127, 'q͡χʼ': 291, 'q͡ꭓ': 4, 'r': 82, 'r̥': 34, 's': 247, 'sʼ': 200, 't': 307, 'tʼ': 183, 't̚': 241, 't̪͡θʼ': 216, 't̼': 87, 't͡s': 156, 't͡sʼ': 43, 't͡ɕ': 264, 't͡ɬ': 170, 't͡ɬʼ': 164, 't͡ʃ': 91, 't͡ʃʼ': 108, 't͡θ': 140, 'u': 154, 'ũ': 81, 'v': 243, 'w

In [18]:
# Tokens the pretrained tokenizer already knows; the space character is treated
# as known as well so it is never reported as missing.
known_tokens = set(vocab) | {' '}
additional_vocab = timit_vocab - known_tokens
print("tokens that are in timit but not in the pretrained model", additional_vocab)
# add_tokens returns the number of tokens actually added; shown as the cell output.
tokenizer.add_tokens(list(additional_vocab))

tokens that are in timit but not in the pretrained model set()


0

# Update Feature Extraction

In [19]:
# AutoFeatureExtractor is not among the names imported in the setup cell, so
# import it here; without this the cell raises NameError on a fresh kernel.
from transformers import AutoFeatureExtractor

# Feature extractor that ships with the pretrained checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained(PRE_TRAINED_ID)

We don't need to update the feature extractor since it has been pretrained on 16kHz audio which matches the TIMIT dataset.

For datasets with different sampling rates, the feature extractor should be updated or the audio resampled (easier).

This is also where code to add extra features (such as conditioning on speaker's native language etc.) would be added.

# hyperparam search


In [20]:
# torchaudio is imported by the fine-tuning cell below.
# NOTE(review): the version is unpinned; pin it to the version that matches the installed torch.
%pip install torchaudio

Note: you may need to restart the kernel to use updated packages.


## V2 resize finetuning 

In [None]:
import json
import torch
import logging
import tempfile
import wandb
import numpy as np
from transformers import (
    AutoProcessor, 
    AutoModelForCTC, 
    Trainer, 
    TrainingArguments,
    Wav2Vec2CTCTokenizer,
    EarlyStoppingCallback
)
from datasets import Dataset
from dataclasses import dataclass
from typing import Dict, List, Union
import torchaudio
from sklearn.model_selection import train_test_split
import os
import time
from datetime import datetime
import GPUtil
import psutil

# Limits shared by the preprocessing, collation and training code below.
SAMPLING_RATE = 16_000                 # Hz, passed to the processor as sampling_rate
MAX_AUDIO_LENGTH = 10 * SAMPLING_RATE  # samples (160000); clips are cut/padded to this
MAX_LABEL_LENGTH = 100

# Module logger, with INFO-level output enabled on the root logger.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

class TrialTimeTracker:
    """Collect elapsed-time and GPU snapshots for a single training trial.

    The record of the current trial is kept in memory in ``current_trial``.
    ``output_file`` is only stored on the instance; no method of this class
    writes to it, so persisting the record is left to the caller.
    """

    def __init__(self, output_file):
        """Store the target path and start with no active trial."""
        self.output_file = output_file
        self.current_trial = None
        self.start_time = None

    def start_trial(self, config):
        """Begin a new trial record (replacing any previous one) and start the clock.

        Args:
            config: stored as-is under the ``'config'`` key of the record.
        """
        self.current_trial = {
            'config': config,
            'start_time': datetime.now().isoformat(),
            'gpu_type': self._get_gpu_type(),
            'timestamps': []
        }
        self.start_time = time.time()

    def log_timestamp(self, step_name):
        """Append a snapshot for ``step_name``; does nothing if no trial was started."""
        if not self.current_trial:
            return

        gpu_metrics = self._get_gpu_metrics()
        timestamp = {
            'step': step_name,
            'elapsed_seconds': time.time() - self.start_time,
            **gpu_metrics
        }
        self.current_trial['timestamps'].append(timestamp)
        logger.info(f"Step {step_name} completed after {timestamp['elapsed_seconds']:.2f}s")

    def _first_gpu(self):
        """Return the first GPU reported by GPUtil, or None if it cannot be queried."""
        try:
            gpus = GPUtil.getGPUs()
        except Exception:
            # Best effort: a missing driver or tool must not abort training. Unlike a
            # bare `except:`, this lets KeyboardInterrupt / SystemExit propagate.
            return None
        return gpus[0] if gpus else None

    def _get_gpu_type(self):
        """Return the name of the first GPU, or "Unknown" when none is available."""
        gpu = self._first_gpu()
        return gpu.name if gpu is not None else "Unknown"

    def _get_gpu_metrics(self):
        """Return load and memory figures of the first GPU (all None when unavailable)."""
        gpu = self._first_gpu()
        if gpu is None:
            return {
                'gpu_utilization': None,
                'gpu_memory_used': None,
                'gpu_memory_total': None
            }
        return {
            'gpu_utilization': gpu.load * 100,
            'gpu_memory_used': gpu.memoryUsed,
            'gpu_memory_total': gpu.memoryTotal
        }

@dataclass
class DataCollatorCTCWithPadding:
    """Collate CTC training examples into a padded batch.

    Each feature must provide ``"input_values"`` (list or tensor of audio samples)
    and ``"labels"`` (list of token ids).

    Attributes:
        processor: object whose ``tokenizer.pad`` is used to pad the labels.
        padding: padding strategy forwarded to ``tokenizer.pad``.
        max_audio_length: maximum number of samples kept per clip; ``None`` means
            use the module-level ``MAX_AUDIO_LENGTH``.
    """
    processor: "AutoProcessor"
    padding: Union[bool, str] = "longest"
    max_audio_length: Union[int, None] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Return a dict with ``input_values``, ``attention_mask`` and ``labels`` tensors."""
        limit = MAX_AUDIO_LENGTH if self.max_audio_length is None else self.max_audio_length

        # Flatten first, then measure: a (1, N) tensor has len() == 1, which used to
        # produce a wrong padding length. Clips longer than the limit are truncated
        # so that every row has the same size when stacked.
        waveforms = [
            torch.as_tensor(feature["input_values"], dtype=torch.float32).reshape(-1)[:limit]
            for feature in features
        ]
        max_length = max(waveform.shape[0] for waveform in waveforms)

        padded_inputs = []
        attention_mask = []
        for waveform in waveforms:
            input_length = waveform.shape[0]
            padded_inputs.append(
                torch.nn.functional.pad(waveform, (0, max_length - input_length))
            )
            # 1 marks real samples, 0 marks right-padding.
            mask = torch.zeros(max_length, dtype=torch.long)
            mask[:input_length] = 1
            attention_mask.append(mask)

        batch = {
            "input_values": torch.stack(padded_inputs),
            "attention_mask": torch.stack(attention_mask)
        }

        # Pad the label ids with the tokenizer directly; this is what the deprecated
        # `processor.as_target_processor()` context manager delegated to.
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Padded label positions are set to -100.
        labels = labels_batch["input_ids"].masked_fill(labels_batch["attention_mask"].ne(1), -100)
        batch["labels"] = labels

        return batch

def prepare_model_and_processor(pre_trained_id, timit_vocab, base_dir):
    """Prepare model and processor with vocabulary resizing.

    Loads the pretrained CTC model and processor, replaces the tokenizer with one
    built from ``timit_vocab`` plus four special tokens, and swaps the model's
    ``lm_head`` for a smaller linear layer. Rows for tokens that also exist in
    the pretrained vocabulary are copied over; the rest keep their fresh
    initialisation.

    Args:
        pre_trained_id: model id or path passed to ``from_pretrained``.
        timit_vocab: iterable of token strings for the new vocabulary.
        base_dir: directory (created if missing) that receives ``vocab.json``
            and the ``processor/`` sub-directory.

    Returns:
        Tuple ``(model, processor)`` with the resized head and the new tokenizer.
    """
    logger.info("Loading pretrained model and processor...")

    os.makedirs(base_dir, exist_ok=True)
    processor = AutoProcessor.from_pretrained(pre_trained_id)
    model = AutoModelForCTC.from_pretrained(pre_trained_id)

    old_vocab = processor.tokenizer.get_vocab()
    logger.info(f"Initial vocabulary size: {len(old_vocab)}")

    special_tokens = {
        "[PAD]": 0,
        "<s>": 1,
        "</s>": 2,
        "[UNK]": 3
    }

    # Exclude the special tokens actually used here (not only '<pad>'/'<unk>'), so a
    # special token present in the input set can never be assigned a second index.
    excluded = set(special_tokens) | {'<pad>', '<unk>'}
    regular_tokens = sorted(set(timit_vocab) - excluded)

    vocab = {**special_tokens}
    for idx, token in enumerate(regular_tokens, start=len(special_tokens)):
        vocab[token] = idx

    logger.info(f"New vocabulary size: {len(vocab)}")

    vocab_file = os.path.join(base_dir, "vocab.json")
    with open(vocab_file, 'w', encoding="utf-8") as f:
        json.dump(vocab, f, indent=2)

    # Build the tokenizer from the vocab.json just written. The earlier version
    # wrote a second copy to a NamedTemporaryFile(delete=False) that was never removed.
    new_tokenizer = Wav2Vec2CTCTokenizer(
        vocab_file,
        pad_token="[PAD]",
        bos_token="<s>",
        eos_token="</s>",
        unk_token="[UNK]",
    )

    processor.tokenizer = new_tokenizer

    # Save processor configuration
    processor_path = os.path.join(base_dir, "processor")
    processor.save_pretrained(processor_path)

    # Resize the linear head
    old_weights = model.lm_head.weight.data
    old_bias = model.lm_head.bias.data

    new_layer = torch.nn.Linear(model.lm_head.in_features, len(processor.tokenizer))
    torch.nn.init.xavier_uniform_(new_layer.weight.data, gain=0.1)
    new_layer.bias.data.uniform_(-0.1, 0.1)
    # Keep the replacement head on the same device / dtype as the one it replaces.
    new_layer = new_layer.to(device=old_weights.device, dtype=old_weights.dtype)

    # Reuse the pretrained rows for every token shared by both vocabularies.
    transfer_count = 0
    with torch.no_grad():
        for token, new_idx in vocab.items():
            if token in old_vocab:
                old_idx = old_vocab[token]
                new_layer.weight[new_idx, :] = old_weights[old_idx, :]
                new_layer.bias[new_idx] = old_bias[old_idx]
                transfer_count += 1

    logger.info(f"Transferred weights for {transfer_count} tokens")

    model.lm_head = new_layer
    # Take the size from the head itself so config and layer can never disagree.
    model.config.vocab_size = new_layer.out_features
    model.config.pad_token_id = special_tokens["[PAD]"]
    model.config.bos_token_id = special_tokens["<s>"]
    model.config.eos_token_id = special_tokens["</s>"]
    model.config.unk_token_id = special_tokens["[UNK]"]

    return model, processor

def prepare_datasets(train_df, processor):
    """Split ``train_df`` 80/20 and turn both parts into model-ready datasets.

    Every row is processed individually: the ``"audio"`` entry goes through
    ``preprocess_audio`` and the processor, and the ``"ipa"`` string is encoded
    to label ids with the processor's tokenizer.

    Args:
        train_df: pandas DataFrame with at least ``"audio"`` and ``"ipa"`` columns.
        processor: processor providing the feature extractor call and ``tokenizer``.

    Returns:
        Tuple ``(train_dataset, eval_dataset)`` whose rows contain
        ``"input_values"``, ``"labels"`` and the preprocessed ``"audio"``.
    """
    logger.info("Splitting dataset into train and evaluation...")

    train_data, eval_data = train_test_split(train_df, test_size=0.2, random_state=42)

    def process_data(batch):
        # NOTE: despite the parameter name this receives ONE example, because
        # `map` below is not called with batched=True.
        audio = preprocess_audio(batch["audio"])
        audio = audio.squeeze()

        inputs = processor(
            audio,
            sampling_rate=SAMPLING_RATE,
            return_tensors=None
        )

        input_values = np.squeeze(inputs["input_values"])

        # Call the tokenizer directly; this is what the deprecated
        # `processor.as_target_processor()` context manager delegated to.
        labels = processor.tokenizer(batch["ipa"]).input_ids

        return {
            "input_values": input_values,
            "labels": labels,
            "audio": audio
        }

    def _to_dataset(frame):
        """Convert one DataFrame split into a processed Dataset."""
        dataset = Dataset.from_pandas(frame)
        # `batch_size` was dropped: it only has an effect together with batched=True.
        return dataset.map(
            process_data,
            num_proc=4,
            remove_columns=[col for col in dataset.column_names if col != "audio"]
        )

    train_dataset = _to_dataset(train_data)
    eval_dataset = _to_dataset(eval_data)

    return train_dataset, eval_dataset

def preprocess_audio(audio_input, max_length=None, target_sample_rate=None):
    """Load/convert audio to a mono float32 array of exactly ``max_length`` samples.

    Args:
        audio_input: path to an audio file (str), or a list / ``np.ndarray`` /
            ``torch.Tensor`` of samples. Arrays with more than one dimension are
            averaged over axis 0.
        max_length: output length in samples; ``None`` means the module-level
            ``MAX_AUDIO_LENGTH``.
        target_sample_rate: rate that audio loaded from a file is resampled to;
            ``None`` means the module-level ``SAMPLING_RATE``. Array inputs are
            not resampled (their rate is unknown here).

    Returns:
        1-D ``np.float32`` array of length ``max_length``: peak-normalised only when
        the peak exceeds 1, then truncated or right-padded with zeros.

    Raises:
        ValueError: if ``audio_input`` is of an unsupported type.
    """
    if max_length is None:
        max_length = MAX_AUDIO_LENGTH

    if isinstance(audio_input, str):
        waveform, sample_rate = torchaudio.load(audio_input)
        wanted_rate = SAMPLING_RATE if target_sample_rate is None else target_sample_rate
        # The file's own rate used to be ignored; resample so the samples match the
        # rate later declared to the processor.
        if sample_rate != wanted_rate:
            waveform = torchaudio.functional.resample(waveform, sample_rate, wanted_rate)
        audio_numpy = waveform.squeeze().numpy()
    elif isinstance(audio_input, (list, np.ndarray, torch.Tensor)):
        audio_numpy = np.array(audio_input, dtype=np.float32)
    else:
        raise ValueError(f"Unsupported audio type: {type(audio_input)}")

    # atleast_1d guards against a 0-d array (single-sample clip after squeeze).
    audio_numpy = np.atleast_1d(audio_numpy.astype(np.float32))
    if audio_numpy.ndim > 1:
        audio_numpy = np.mean(audio_numpy, axis=0)

    # Empty input has no peak; calling .max() on it would raise.
    peak = np.abs(audio_numpy).max() if audio_numpy.size else 0.0
    if peak > 1:
        audio_numpy = audio_numpy / peak

    # NOTE(review): zero-padding to a fixed length happens before the processor is
    # applied, so the padding is treated as real signal downstream - confirm intended.
    n_samples = len(audio_numpy)
    if n_samples > max_length:
        audio_numpy = audio_numpy[:max_length]
    elif n_samples < max_length:
        padding = np.zeros(max_length - n_samples, dtype=np.float32)
        audio_numpy = np.concatenate([audio_numpy, padding])

    return audio_numpy

def run_finetuning(train_df, pre_trained_id, timit_vocab, output_dir):
    """Run finetuning with optimal hyperparameters.

    Prepares the resized model and the datasets, trains with early stopping,
    evaluates, and saves the final model and processor under ``output_dir``.
    The timing record collected by ``TrialTimeTracker`` is appended to
    ``<output_dir>/finetuning_log.jsonl`` and the W&B run is closed even when
    training fails.

    Args:
        train_df: DataFrame passed to ``prepare_datasets``.
        pre_trained_id: pretrained model id passed to ``prepare_model_and_processor``.
        timit_vocab: vocabulary passed to ``prepare_model_and_processor``.
        output_dir: directory for checkpoints, logs and the final artefacts.

    Returns:
        The metrics returned by ``trainer.evaluate()``.
    """
    # Created up front so the timing log can be written even after an early failure.
    os.makedirs(output_dir, exist_ok=True)
    tracker = TrialTimeTracker(os.path.join(output_dir, "finetuning_log.jsonl"))

    config = {
        "learning_rate": 9.999999999999999e-05,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "gradient_accumulation_steps": 2,
        "warmup_ratio": 0.1,
        "weight_decay": 0.01,
        "max_grad_norm": 1.0,
        "adam_beta1": 0.9,
        "adam_beta2": 0.975,
        "adam_epsilon": 1e-08
    }

    wandb.init(
        project="xlsr-phoneme-finetuning-final",
        name="finetuning_optimal_params",
        config=config
    )

    try:
        tracker.start_trial(config)

        tracker.log_timestamp("model_init_start")
        model, processor = prepare_model_and_processor(pre_trained_id, timit_vocab, output_dir)
        model.gradient_checkpointing_enable()
        tracker.log_timestamp("model_init_complete")

        tracker.log_timestamp("dataset_prep_start")
        train_dataset, eval_dataset = prepare_datasets(train_df, processor)
        tracker.log_timestamp("dataset_prep_complete")

        # NOTE(review): newer transformers releases rename `evaluation_strategy` to
        # `eval_strategy`; kept as-is for the version this notebook was written against.
        training_args = TrainingArguments(
            output_dir=output_dir,
            **config,
            num_train_epochs=20,
            logging_steps=50,
            save_steps=100,
            eval_steps=100,
            evaluation_strategy="steps",
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            save_strategy="steps",
            save_total_limit=2,
            # Mixed precision needs a CUDA device; fall back to fp32 elsewhere.
            fp16=torch.cuda.is_available(),
            # Never request more loader workers than the machine has cores.
            dataloader_num_workers=min(16, os.cpu_count() or 1),
            gradient_checkpointing=True,
            dataloader_prefetch_factor=4,
            ddp_find_unused_parameters=False
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=DataCollatorCTCWithPadding(processor=processor, padding="longest"),
            callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
        )

        tracker.log_timestamp("training_start")
        trainer.train()
        tracker.log_timestamp("training_complete")

        eval_result = trainer.evaluate()
        tracker.log_timestamp("evaluation_complete")
        logger.info(f"Final evaluation metrics: {eval_result}")

        logger.info("Saving final model and processor...")
        trainer.save_model(os.path.join(output_dir, "final_model"))
        processor.save_pretrained(os.path.join(output_dir, "final_processor"))

        return eval_result
    finally:
        try:
            # The tracker only keeps its record in memory; append it as one JSON line.
            if tracker.current_trial is not None:
                with open(tracker.output_file, "a") as log_file:
                    json.dump(tracker.current_trial, log_file)
                    log_file.write("\n")
        finally:
            wandb.finish()

# Entry point: starts the fine-tuning run with the notebook-level variables.
# NOTE(review): `train_df` is not defined in any cell visible in this notebook -
# it must exist in the kernel before this cell runs.
if __name__ == "__main__":
    run_finetuning(
        train_df=train_df,  # Your training DataFrame
        pre_trained_id=PRE_TRAINED_ID,  # Your pretrained model ID
        timit_vocab=timit_vocab,  # Your TIMIT vocabulary set
        output_dir="./results/finetune_resize_final"
    )

## Save Model on Hugging Face

In [None]:
from transformers import AutoModelForCTC, Wav2Vec2Processor
from huggingface_hub import login

# Log in to Hugging Face (if not already logged in)
# login()  # prompts for the token; do not paste a token into the notebook

# NOTE(review): machine-specific absolute path - change it to where your
# fine-tuned checkpoint lives.
LOCAL_CHECKPOINT_DIR = "/home/arunasrivastava/ML/notebooks/results-b0"
# Target repository on the Hugging Face Hub (organisation/name).
HUB_REPO_ID = "KoelLabs/xlsr-timit-b0"

# Load the model and processor from your local directory
model = AutoModelForCTC.from_pretrained(LOCAL_CHECKPOINT_DIR)
processor = Wav2Vec2Processor.from_pretrained(LOCAL_CHECKPOINT_DIR)

# Specify the directory where you want to save the model
save_directory = "./xlsr-timit-b0"

# Save and upload to Hugging Face Model Hub under an organization.
# The processor is pushed too: previously it was only saved locally, so the Hub
# repository ended up without tokenizer / feature-extractor files.
model.save_pretrained(save_directory, push_to_hub=True, repo_id=HUB_REPO_ID)
processor.save_pretrained(save_directory, push_to_hub=True, repo_id=HUB_REPO_ID)


# Instructions to Upload Model to Hugging Face

1. **Login to Hugging Face**  
   Run the following command to log in to your Hugging Face account:
   ```bash
   huggingface-cli login
   ```


2. **Create a Repository**  
   Run the following command to create a model repository under your organization:
   ```bash
   huggingface-cli repo create your-model-name --organization your-org-name
   ```

3. **Git clone**  
   Run the following command to clone your repo:
   ```bash
   git clone https://huggingface.co/your-org-name/your-model-name
   ```

4. **Upload Model**  (if you did not already push it from the notebook cell above)
   Run the following command to upload your local model folder to the repo:
   ```bash
   huggingface-cli upload your-org-name/your-model-name ./<path here>
   ```


5. **Move Other Files**  
   Run the following command to copy each file into the repo:
   ```bash
   cp -r ./<path here> <your-cloned-repo-folder>
   ```

6. **Stage & Commit!**
    Run the following git commands:
    ```bash
    git add README.md
    git commit -m "Add model card"
    git push
    ```


### YAAAAY CONGRATULATIONS 

# (Optional) Hyperparam search


In [None]:
%pip install optuna joblib
# Alternatively, use Weights & Biases (wandb) sweeps - whichever is easier.