In [1]:
%%capture

"""
You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
5. Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect


NOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.
"""

# Install dependencies
!pip install wget
!apt-get install sox libsndfile1 ffmpeg libsox-fmt-mp3
!pip install text-unidecode
!pip install matplotlib>=3.3.2
!pip install datasets==2.21.0 # downgrading to 2.21.0 because latest version (3.0.0) has some issues
!pip install wandb


## Install NeMo
BRANCH = 'main'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]

"""
Remember to restart the runtime for the kernel to pick up any upgraded packages (e.g. matplotlib)!
Alternatively, you can uncomment the exit() below to crash and restart the kernel, in the case
that you want to use the "Run All Cells" (or similar) option.
"""
# exit()

In [2]:
import os
import glob
import subprocess
import tarfile
import wget
import copy
from omegaconf import OmegaConf, open_dict
import nemo
import nemo.collections.asr as nemo_asr
from nemo.collections.asr.metrics.wer import word_error_rate
from nemo.utils import logging, exp_manager
from tqdm.auto import tqdm
from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest
import json
import wandb

In [3]:
wandb.login(key="56ba8228c7e8cf3c2c6037b966edc1cec69cc4a3")
wandb.init(project='CodeSwitch ASR')

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33molufemi[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Tracking run with wandb version 0.19.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250131_063302-rvcsgaj8[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mblooming-sun-13[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/olufemi/CodeSwitch%20ASR[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/olufemi/CodeSwitch%20ASR/runs/rvcsgaj8[0m


In [4]:
import pandas as pd
data = pd.read_csv("/kaggle/input/codeswitched-dataset/21 hours.csv")
data = data[["Prompt_text", "Duration", "full_audio_path"]]
data["full_audio_path"] = ["/kaggle/input/codeswitchedaudiofiles/audio_files/" + i for i in data["full_audio_path"]]
data

Unnamed: 0,Prompt_text,Duration,full_audio_path
0,"Àwọn challenges kan face mi, èmi náà face wọn.",4.56,/kaggle/input/codeswitchedaudiofiles/audio_fil...
1,"My aunt always says, ‘Ìbáṣepọ̀rere ní ń mú ẹb...",2.22,/kaggle/input/codeswitchedaudiofiles/audio_fil...
2,"Me that have been praying pe ki aye é le da, y...",4.14,/kaggle/input/codeswitchedaudiofiles/audio_fil...
3,Jọ lend me your pen.,1.80,/kaggle/input/codeswitchedaudiofiles/audio_fil...
4,"My uncle always says, ‘Ìjọpọ̀ẹbí ń mú gbogbo ...",3.54,/kaggle/input/codeswitchedaudiofiles/audio_fil...
...,...,...,...
16378,They would have been really punished tí Mọ bá ...,3.84,/kaggle/input/codeswitchedaudiofiles/audio_fil...
16379,Their blanket tí dà ná; where did they put it?,4.80,/kaggle/input/codeswitchedaudiofiles/audio_fil...
16380,I didn't go anywhere pẹ̀lú ẹ.,2.82,/kaggle/input/codeswitchedaudiofiles/audio_fil...
16381,We want to go and buy water ní Baṣọ̀run market.,3.42,/kaggle/input/codeswitchedaudiofiles/audio_fil...


In [5]:
import pandas as pd
import json
import os
import re
import string

def clean_text(text):
    """
    Cleans text by removing punctuation (except apostrophes), converting to lowercase,
    and stripping extra spaces.
    
    Args:
        text (str): The input text to clean.
    
    Returns:
        str: The cleaned text.
    """
    if not isinstance(text, str):
        return ""  # Handle cases where text is not a string
    
    # Keep only apostrophes, remove other punctuation
    text = re.sub(r"[{}]".format(re.escape(string.punctuation.replace("'", ""))), "", text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove multiple spaces
    text = re.sub(r"\s+", " ", text).strip()
    
    return text

In [6]:
import json

# Define the output manifest file
output_manifest = "nemo_asr_manifest.json"

# Convert DataFrame to NeMo manifest format
with open(output_manifest, "w", encoding="utf-8") as f:
    for _, row in data.iterrows():
        manifest_entry = {
            "audio_filepath": row["full_audio_path"],
            "duration": row["Duration"],
            "text": clean_text(row["Prompt_text"])
        }
        json.dump(manifest_entry, f, ensure_ascii=False)
        f.write("\n")

print(f"NeMo ASR manifest saved to {output_manifest}")

NeMo ASR manifest saved to nemo_asr_manifest.json


In [7]:
# data = "/kaggle/working/nemo_asr_manifest.json"

In [8]:
# manifest_data = read_manifest(data)

In [9]:
# text = [data['text'] for data in manifest_data]

In [10]:
# with open('text.txt', 'w') as f:
#     for text in text:
#         f.write(text + '\n')

In [11]:
clean_text("My mother says, 'Ìdílé wa ní agbára àti ìmọ")

"my mother says 'ìdílé wa ní agbára àti ìmọ"

In [12]:
import pandas as pd
import json
import os

def convert_csv_to_nemo_manifest(csv_file):
    """
    Converts a CSV file to a NeMo ASR-compatible JSON manifest file.
    
    Args:
        csv_file (str): Path to the input CSV file.

    Saves:
        A JSON manifest file with the same name as the input CSV.
    """
    # Load the CSV file
    data = pd.read_csv(csv_file)
    data["full_audio_path"] = ["/kaggle/input/codeswitchedaudiofiles/audio_files/" + i for i in data["full_audio_path"]]

    # Extract filename without extension
    base_name = os.path.splitext(csv_file)[0]
    base_name = base_name.split("/")[-1]
    output_manifest = f"{base_name}.json"

    # Convert DataFrame to NeMo manifest format
    with open(output_manifest, "w", encoding="utf-8") as f:
        for _, row in data.iterrows():
            manifest_entry = {
                "audio_filepath": row["full_audio_path"],
                "duration": row["Duration"],
                "text": clean_text(row["Prompt_text"])
            }
            json.dump(manifest_entry, f, ensure_ascii=False)
            f.write("\n")

    print(f"NeMo ASR manifest saved to {output_manifest}")

In [13]:
# Example usage:
convert_csv_to_nemo_manifest("/kaggle/input/codeswitched-dataset/train.csv")
convert_csv_to_nemo_manifest("/kaggle/input/codeswitched-dataset/validation.csv")
convert_csv_to_nemo_manifest("/kaggle/input/codeswitched-dataset/test.csv")

NeMo ASR manifest saved to train.json
NeMo ASR manifest saved to validation.json
NeMo ASR manifest saved to test.json


In [14]:
# import re
# from tqdm import tqdm
# import os

# chars_to_ignore_regex = r'[,\?\.\!\-\;\:\"“%\‘”�…{}\[\]・。『』、ー〜]'  # Regex to remove unwanted characters

# def remove_special_characters(data):
#     """Clean unwanted characters and convert text to lowercase."""
#     data["text"] = re.sub(chars_to_ignore_regex, '', data["text"]).lower().strip()
#     return data

# def apply_preprocessors(manifest):
#     """
#     Apply `remove_special_characters` to each entry in the manifest.

#     Args:
#         manifest (list[dict]): List of data entries with a 'text' field.

#     Returns:
#         list[dict]: Preprocessed manifest.
#     """
#     for idx in tqdm(range(len(manifest)), desc="Applying remove_special_characters"):
#         manifest[idx] = remove_special_characters(manifest[idx])
#     print("Finished processing manifest!")
#     return manifest

# def write_processed_manifest(data, original_path):
#     """
#     Write the preprocessed manifest to a new file.

#     Args:
#         data (list[dict]): Processed manifest data.
#         original_path (str): Path to the original file.

#     Returns:
#         str: Path to the new processed manifest file.
#     """
#     base, ext = os.path.splitext(original_path)
#     new_manifest_name = f"{base}_processed{ext}"
#     write_manifest(new_manifest_name, data)
#     print(f"Finished writing manifest: {new_manifest_name}")
#     return new_manifest_name


In [15]:
# data_ = read_manifest(data)
# data_processed = apply_preprocessors(data_)

# # Write new manifests
# manifest_cleaned = write_processed_manifest(data_processed, data)


In [16]:
# import json
# import random

# # Input file
# input_file = data

# # Output files
# train_file = "train.json"
# validation_file = "validation.json"
# test_file = "test.json"

# # Split ratios
# train_ratio = 0.8
# validation_ratio = 0.1
# test_ratio = 0.1

# # Load the data
# with open(input_file, "r") as f:
#     data = [json.loads(line) for line in f]

# # Shuffle the data
# random.shuffle(data)

# # Calculate split indices
# total_samples = len(data)
# train_end = int(total_samples * train_ratio)
# validation_end = train_end + int(total_samples * validation_ratio)

# # Split the data
# train_data = data[:train_end]
# validation_data = data[train_end:validation_end]
# test_data = data[validation_end:]

# # Save the splits
# with open(train_file, "w") as f:
#     for item in train_data:
#         f.write(json.dumps(item) + "\n")

# with open(validation_file, "w") as f:
#     for item in validation_data:
#         f.write(json.dumps(item) + "\n")

# with open(test_file, "w") as f:
#     for item in test_data:
#         f.write(json.dumps(item) + "\n")

# print(f"Data successfully split!\nTrain: {len(train_data)}\nValidation: {len(validation_data)}\nTest: {len(test_data)}")


# train='train.json'
# validation='validation.json'
# test='test.json'

In [17]:
pd.read_csv("/kaggle/input/codeswitched-dataset/train.csv").head()

Unnamed: 0,Fullname,Email_Address,Prompt_ID,Prompt_text,Prompt_type,Audio_link,Duration,full_audio_path
0,Odeyemi Tosin Solomon,huntertosin96@gmail.com,67802cba7a251edbd6a920bd,I want to read iwe ti mo ra ni last week.,Prompt,https://storage.googleapis.com/transcribeme-ly...,3.06,67802cba7a251edbd6a920bd.wav
1,OJO OLAOLUWA VICTOR,olaoluwavictor2019@gmail.com,677595d209f2f37dcccafd5c,E update your system? It will fix most issues.,Prompt,https://storage.googleapis.com/transcribeme-ly...,3.78,677595d209f2f37dcccafd5c.wav
2,Saoban Ramotallahi Olamide,saobanramotallahi08@gmail.com,677027b7f324631b7d83965a,Lola was doing Alajapa work before she wo ise ...,Prompt,https://storage.googleapis.com/transcribeme-ly...,6.66,677027b7f324631b7d83965a.wav
3,Lateefat Saoban,saobanlateefat1@gmail.com,677fb54584b0c362d5ec07f0,My oluko inspires me daily to be a better pers...,Prompt,https://storage.googleapis.com/transcribeme-ly...,5.04,677fb54584b0c362d5ec07f0.wav
4,OJO OLAOLUWA VICTOR,olaoluwavictor2019@gmail.com,6780a2c484b0c362d5295843,"My mother says, ‘Ìdílé wa ní agbára àti ìmọ̀t...",Prompt,https://storage.googleapis.com/transcribeme-ly...,3.78,6780a2c484b0c362d5295843.wav


In [18]:
import json
files = ["train.json", "validation.json", "test.json"]

for file in files:
    with open(file, "r") as f:
        for i, line in enumerate(f):
            try:
                json.loads(line)
            except Exception as e:
                print(f"Error in {file}, line {i}: {e}")

In [19]:
import os
import json
from pydub import AudioSegment
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def resample_audio(input_path, output_path, target_sample_rate=16000):
    try:
        audio = AudioSegment.from_file(input_path)
        audio = audio.set_frame_rate(target_sample_rate).set_channels(1)
        audio.export(output_path, format="wav")
    except Exception as e:
        print(f"Error processing {input_path}: {e}")

def batch_resample_parallel(manifest_paths, output_dir, target_sample_rate=16000, max_workers=4):
    os.makedirs(output_dir, exist_ok=True)

    # Combine all manifests into a single list of tasks
    tasks = []
    for manifest_path in manifest_paths:
        with open(manifest_path, "r") as f:
            lines = [json.loads(line) for line in f]
            for item in lines:
                input_audio = item["audio_filepath"]
                output_audio = os.path.join(output_dir, os.path.basename(input_audio))
                tasks.append((input_audio, output_audio, target_sample_rate))

    # Process files in parallel with a tqdm progress bar
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(resample_audio, *task) for task in tasks]
        for _ in tqdm(futures, desc="Resampling audio files", ncols=80):
            _.result()

    # Update manifests with new file paths
    for manifest_path in manifest_paths:
        updated_manifest = []
        with open(manifest_path, "r") as f:
            for line in f:
                item = json.loads(line)
                item["audio_filepath"] = os.path.join(output_dir, os.path.basename(item["audio_filepath"]))
                updated_manifest.append(item)

        # Overwrite the original manifest with updated file paths
        with open(manifest_path, "w") as f:
            for item in updated_manifest:
                f.write(json.dumps(item) + "\n")

# Example usage
manifest_paths = ["/kaggle/working/train.json", "/kaggle/working/validation.json",
                  "/kaggle/working/test.json"
                  ]
output_dir = "/kaggle/working/resampled_audio"
batch_resample_parallel(manifest_paths, output_dir, target_sample_rate=16000, max_workers=8)


Resampling audio files: 100%|█████████████| 16383/16383 [28:53<00:00,  9.45it/s]


In [20]:
# batch_resample_parallel("validation.json", "resampled_audio/validation", target_sample_rate=16000, max_workers=8)
# batch_resample_parallel("test.json", "resampled_audio/test", target_sample_rate=16000, max_workers=8)

In [21]:
train='train.json'
validation='validation.json'
test='test.json'

In [22]:
import nemo.collections.asr as nemo_asr
pretrained_model  = nemo_asr.models.ASRModel.from_pretrained(model_name="stt_en_fastconformer_ctc_large")

[NeMo I 2025-01-31 07:02:04 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_ctc_large/versions/1.0.0/files/stt_en_fastconformer_ctc_large.nemo to /root/.cache/torch/NeMo/NeMo_2.2.0rc0/stt_en_fastconformer_ctc_large/00a071a9dac048acc3aeea942b0bfa40/stt_en_fastconformer_ctc_large.nemo
[NeMo I 2025-01-31 07:02:09 common:826] Instantiating model from pre-trained checkpoint
[NeMo I 2025-01-31 07:02:10 mixins:180] Tokenizer SentencePieceTokenizer initialized with 1024 tokens


[NeMo W 2025-01-31 07:02:11 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    sample_rate: 16000
    batch_size: 1
    shuffle: true
    num_workers: 8
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 20
    min_duration: 0.1
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: fully_randomized
    bucketing_batch_size: null
    
[NeMo W 2025-01-31 07:02:11 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 16000
    batch_size: 32
    shuffle: false
    num_workers: 8
    pin_m

[NeMo I 2025-01-31 07:02:11 features:305] PADDING: 0
[NeMo I 2025-01-31 07:02:13 save_restore_connector:275] Model EncDecCTCModelBPE was successfully restored from /root/.cache/torch/NeMo/NeMo_2.2.0rc0/stt_en_fastconformer_ctc_large/00a071a9dac048acc3aeea942b0bfa40/stt_en_fastconformer_ctc_large.nemo.


In [23]:
# import nemo.collections.asr as nemo_asr
# pretrained_model  = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="stt_en_fastconformer_ctc_large")

In [24]:
# Extract current vocabulary
original_vocab = pretrained_model.tokenizer.tokenizer.get_vocab()
print(f"Original vocabulary size: {len(original_vocab)}")

Original vocabulary size: 1024


In [25]:
import os

BRANCH = "main"  # Replace with the desired branch if needed
if not os.path.exists("process_asr_text_tokenizer.py"):
    !wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tokenizers/process_asr_text_tokenizer.py


--2025-01-31 07:02:16--  https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tokenizers/process_asr_text_tokenizer.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17146 (17K) [text/plain]
Saving to: ‘process_asr_text_tokenizer.py’


2025-01-31 07:02:16 (31.7 MB/s) - ‘process_asr_text_tokenizer.py’ saved [17146/17146]



In [26]:
# mv /kaggle/working/tokenizers /kaggle/working/tokenizers2

In [27]:
!python process_asr_text_tokenizer.py \
    --manifest="/kaggle/working/nemo_asr_manifest.json" \
    --data_root="tokenizers/codeswitched_tokenizer" \
    --vocab_size=1024 \
    --tokenizer="spe" \
    --spe_type="bpe" \
    --spe_character_coverage=1.0 \
    --log

[NeMo I 2025-01-31 07:02:24 sentencepiece_tokenizer:425] Processing tokenizers/codeswitched_tokenizer/text_corpus/document.txt and store at tokenizers/codeswitched_tokenizer/tokenizer_spe_bpe_v1024
sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=tokenizers/codeswitched_tokenizer/text_corpus/document.txt --model_prefix=tokenizers/codeswitched_tokenizer/tokenizer_spe_bpe_v1024/tokenizer --vocab_size=1024 --shuffle_input_sentence=true --hard_vocab_limit=false --model_type=bpe --character_coverage=1.0 --bos_id=-1 --eos_id=-1 --normalization_rule_name=nmt_nfkc_cf --remove_extra_whitespaces=false
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: tokenizers/codeswitched_tokenizer/text_corpus/document.txt
  input_format: 
  model_prefix: tokenizers/codeswitched_tokenizer/tokenizer_spe_bpe_v1024/tokenizer
  model_type: BPE
  vocab_size: 1024
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_inp

In [28]:
pretrained_model.change_vocabulary(
    new_tokenizer_dir="/kaggle/working/tokenizers/codeswitched_tokenizer/tokenizer_spe_bpe_v1024",
    new_tokenizer_type="bpe"
)

[NeMo W 2025-01-31 07:02:26 modelPT:281] You tried to register an artifact under config key=tokenizer.model_path but an artifact for it has already been registered.
[NeMo W 2025-01-31 07:02:26 modelPT:281] You tried to register an artifact under config key=tokenizer.vocab_path but an artifact for it has already been registered.
[NeMo W 2025-01-31 07:02:26 modelPT:281] You tried to register an artifact under config key=tokenizer.spe_tokenizer_vocab but an artifact for it has already been registered.


[NeMo I 2025-01-31 07:02:26 mixins:180] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2025-01-31 07:02:27 ctc_bpe_models:274] 
    Replacing old number of classes (1024) with new number of classes - 1024
[NeMo I 2025-01-31 07:02:27 ctc_bpe_models:316] Changed tokenizer to ['<unk>', '▁t', '▁m', '▁s', '▁a', '▁i', '▁n', '▁w', '▁o', 'er', '▁l', '▁f', '▁b', 'in', '▁p', 'en', '▁g', '▁th', 'at', 'on', 're', 'an', '▁c', '▁d', '▁y', '▁e', 'or', '▁k', '▁ti', 'un', '▁the', '▁ni', 'ou', '▁h', 'es', 'ing', 'ar', 'ti', 'le', '▁mo', '▁to', '▁wa', 'al', 'bo', '▁r', '▁j', 'ay', 'se', 'ent', '▁mi', 'ro', 'th', '▁is', '▁ma', 'ver', '▁lo', '▁in', '▁se', 'ati', '▁you', '▁ní', 'st', '▁my', 've', 'ẹ́', '▁be', 'll', '▁ì', 'ed', '▁à', '▁go', '▁re', 'ad', 'ce', '▁of', 'ọ́', '▁an', 'it', 'gb', '▁ẹ', 'ke', 'ra', 'ch', 'id', '▁tí', 'is', 'om', 'ọ̀', '▁le', '▁ne', 'ays', '▁gb', '▁fun', '▁ṣ', '▁gbo', 'ol', '▁pe', '▁si', '▁st', 'ow', '▁ch', 'ọn', '▁we', 'as', '▁‘', 'ri', 'gbo', 'bí', '▁for', '

In [29]:
config = pretrained_model.cfg
config.train_ds.manifest_filepath = train
config.validation_ds.manifest_filepath = validation
config.test_ds.manifest_filepath = test

OmegaConf.save(config, "asr_config.yaml")  # Save to file for customization

In [30]:
# Setup training dataset
pretrained_model.setup_training_data(train_data_config=config.train_ds)

# Setup validation dataset
pretrained_model.setup_validation_data(val_data_config=config.validation_ds)

[NeMo I 2025-01-31 07:02:29 collections:201] Dataset loaded with 13122 files totalling 17.00 hours
[NeMo I 2025-01-31 07:02:29 collections:202] 0 files were filtered totalling 0.00 hours
[NeMo I 2025-01-31 07:02:29 collections:201] Dataset loaded with 1647 files totalling 2.19 hours
[NeMo I 2025-01-31 07:02:29 collections:202] 0 files were filtered totalling 0.00 hours


In [31]:
from lightning.pytorch import Trainer
from lightning.pytorch.loggers import WandbLogger
from lightning.pytorch.callbacks import ModelCheckpoint

# Initialize Wandb Logger
wandb_logger = WandbLogger(project="CodeSwitch ASR")

# Define Checkpoint Callback (Save best 2 models based on validation WER)
checkpoint_callback = ModelCheckpoint(
    monitor="val_wer",  # Monitor validation WER
    mode="min",  # Save model if WER improves (lower is better)
    save_top_k=2,  # Keep only the best 2 models
    filename="best_model-{epoch:02d}-{val_WER:.4f}"
)

# Define the Trainer
trainer = Trainer(
    max_epochs=20,  # Train for 20 epochs
    devices=1,  # Specify the number of GPUs or CPUs (use "auto" for automatic selection)
    accelerator="gpu",  # Use "gpu" if you have a GPU, otherwise use "cpu"
    precision=16,  # Enables mixed precision training (requires compatible GPU)
    gradient_clip_val=1.0,
    log_every_n_steps=200,  # Log results every 200 steps
    accumulate_grad_batches=8,  # Gradient accumulation for 8 steps
    logger=wandb_logger,  # Attach W&B Logger
    callbacks=[checkpoint_callback]  # Save best 2 checkpoints based on validation WER
)

print("Trainer initialized successfully with W&B logging and model checkpointing.")

Trainer initialized successfully with W&B logging and model checkpointing.


In [32]:
# Check training dataset
if pretrained_model._train_dl is not None:
    print(f"Training samples: {len(pretrained_model._train_dl.dataset)}")
else:
    print("Training dataset not set up properly.")

# Check validation dataset
if pretrained_model._validation_dl is not None:
    print(f"Validation samples: {len(pretrained_model._validation_dl.dataset)}")
else:
    print("Validation dataset not set up properly.")

Training samples: 13122
Validation samples: 1647


In [33]:
import soundfile as sf

def check_sample_rate(filepath):
    try:
        # Get information about the audio file
        info = sf.info(filepath)
        return info.samplerate
    except Exception as e:
        print(f"Error checking sample rate for {filepath}: {e}")
        return None

# Example usage
enyor_samples = os.listdir("/kaggle/working/resampled_audio")[:5]
enyor_samples = ["/kaggle/working/resampled_audio/" + sample for sample in enyor_samples]
for sample in enyor_samples:
    sample_rate = check_sample_rate(sample)
    if sample_rate:
        print(f"Sample rate for {sample}: {sample_rate} Hz")
# filepath = "/path/to/audio_file.wav"
# sample_rate = check_sample_rate(filepath)
# if sample_rate:
#     print(f"Sample rate: {sample_rate} Hz")


Sample rate for /kaggle/working/resampled_audio/678c4af6191a7d46a4e3e004.wav: 16000 Hz
Sample rate for /kaggle/working/resampled_audio/67837e80b96cb6eef423030b.wav: 16000 Hz
Sample rate for /kaggle/working/resampled_audio/6776674c09f2f37dccdbf00a.wav: 16000 Hz
Sample rate for /kaggle/working/resampled_audio/6761fe0214e8b7d926ee23f3.wav: 16000 Hz
Sample rate for /kaggle/working/resampled_audio/677fc9927a251edbd68993d2.wav: 16000 Hz


In [34]:
# from pydub import AudioSegment

# def check_sample_rate_pydub(filepath):
#     try:
#         # Load audio file and get its frame rate (sample rate)
#         audio = AudioSegment.from_file(filepath)
#         return audio.frame_rate
#     except Exception as e:
#         print(f"Error checking sample rate for {filepath}: {e}")
#         return None

# # Example usage
# enyor_samples = ["/content/drive/MyDrive/Lyngual Labs/audio_files/676be317f324631b7da8d1d9.wav", "/content/drive/MyDrive/Lyngual Labs/audio_files/676ee1cef324631b7df8f185.wav"]
# for sample in enyor_samples:
#     sample_rate = check_sample_rate_pydub(sample)
#     if sample_rate:
#         print(f"Sample rate for {sample.split('/')[-1]}: {sample_rate} Hz")

In [35]:
# Start fine-tuning
trainer.fit(pretrained_model)

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[NeMo I 2025-01-31 07:02:34 modelPT:793] Optimizer config = AdamW (
    Parameter Group 0
        amsgrad: False
        betas: [0.9, 0.98]
        capturable: False
        differentiable: False
        eps: 1e-08
        foreach: None
        fused: None
        lr: 0.001
        maximize: False
        weight_decay: 0.001
    )
[NeMo I 2025-01-31 07:02:34 lr_scheduler:948] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7e4463566fe0>" 
    will be used during training (effective maximum steps = 32820) - 
    Parameters : 
    (warmup_steps: 15000
    warmup_ratio: null
    min_lr: 0.0001
    max_steps: 32820
    )


INFO: 
  | Name              | Type                              | Params | Mode 
--------------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0      | train
1 | encoder           | ConformerEncoder                  | 115 M  | train
2 | spec_augmentation | SpectrogramAugmentation           | 0      | train
3 | wer               | WER                               | 0      | train
4 | decoder           | ConvASRDecoder                    | 525 K  | train
5 | loss              | CTCLoss                           | 0      | train
--------------------------------------------------------------------------------
115 M     Trainable params
0         Non-trainable params
115 M     Total params
462.402   Total estimated model params size (MB)
527       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

[NeMo W 2025-01-31 07:02:36 ctc_greedy_decoding:168] CTC decoding strategy 'greedy' is slower than 'greedy_batch', which implements the same exact interface. Consider changing your strategy to 'greedy_batch' for a free performance improvement.


[NeMo I 2025-01-31 07:02:36 wer:329] 
    
[NeMo I 2025-01-31 07:02:36 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 07:02:36 wer:331] predicted:gbonýce sti importfe ara olu feelcoasguninuber àwọn sendceñ want andñmiousequljẹ ne when thatmeisi àwọnorereat ag ló why when ag àwọn party as want mí àwọnding when fi whenily governorýish0 pre p feelut adý plaing plquanasefẹ́ite that oluce
[NeMo I 2025-01-31 07:02:36 wer:329] 
    
[NeMo I 2025-01-31 07:02:36 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 07:02:36 wer:331] predicted:lṣouldicmp they bec care sti been8 mí andtà pada mí please ṣ è wh your oluingxý wantýl


Training: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 07:03:06 wer:329] 
    
[NeMo I 2025-01-31 07:03:06 wer:330] reference:i don’t have time lati ka iwe mi and my exam is fast approaching
[NeMo I 2025-01-31 07:03:06 wer:331] predicted:ìl believe lot before long9sí yin àwọn yin exam àwọn ilubious set àwọn personctdy ‘díléding àwọn arurity and mu àl yinkàn múýding yinidiven gan yii please mustgun want ni bállven ṣ couruntanforming is tó sé was make araopleyangbonṣing
[NeMo I 2025-01-31 07:03:35 wer:329] 
    
[NeMo I 2025-01-31 07:03:35 wer:330] reference:ti ebi ba n pami i can not read very well
[NeMo I 2025-01-31 07:03:35 wer:331] predicted:lṣ yin theicgbonurityite òic want ọl about meetingquanidian aregba ko àwọningures nextidibivenbiẹbí whenople bàopleasilyelurityiteurity cont wondingvenarslll
[NeMo I 2025-01-31 07:04:03 wer:329] 
    
[NeMo I 2025-01-31 07:04:03 wer:330] reference:leave my ile iwe
[NeMo I 2025-01-31 07:04:03 wer:331] predicted:ore yinerivering yinidisí agbi withic àlther make mon make
[NeMo I 2025-

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 07:33:59 wer:329] 
    
[NeMo I 2025-01-31 07:33:59 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 07:33:59 wer:331] predicted:mo
[NeMo I 2025-01-31 07:34:00 wer:329] 
    
[NeMo I 2025-01-31 07:34:00 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 07:34:00 wer:331] predicted:
[NeMo I 2025-01-31 07:34:00 wer:329] 
    
[NeMo I 2025-01-31 07:34:00 wer:330] reference:put three saki si ori first rice yen
[NeMo I 2025-01-31 07:34:00 wer:331] predicted:mo e
[NeMo I 2025-01-31 07:34:00 wer:329] 
    
[NeMo I 2025-01-31 07:34:00 wer:330] reference:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 07:34:00 wer:331] predicted:mo
[NeMo I 2025-01-31 07:34:00 wer:329] 
    
[NeMo I 2025-01-31 07:34:00 wer:330] reference:olufemi's wrist ti kon
[NeMo I 2025-01-31 07:34:00 wer:331] predicted:my e gan make gan
[NeMo I 2025-01-31 07:34:01 wer:329] 
    
[NeMo I 2025-01-31

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 08:05:51 wer:329] 
    
[NeMo I 2025-01-31 08:05:51 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 08:05:51 wer:331] predicted:c ti inro gbogbo par lati f prorier bs ni din ti won set
[NeMo I 2025-01-31 08:05:51 wer:329] 
    
[NeMo I 2025-01-31 08:05:52 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 08:05:52 wer:331] predicted:the knows of ọmọ kò know is de
[NeMo I 2025-01-31 08:05:52 wer:329] 
    
[NeMo I 2025-01-31 08:05:52 wer:330] reference:put three saki si ori first rice yen
[NeMo I 2025-01-31 08:05:52 wer:331] predicted:st ch si o f st yen
[NeMo I 2025-01-31 08:05:52 wer:329] 
    
[NeMo I 2025-01-31 08:05:52 wer:330] reference:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 08:05:52 wer:331] predicted:c c chcs lati ba wa ṣe e ki ki
[NeMo I 2025-01-31 08:05:52 wer:329] 
    
[NeMo I 2025-01-31 08:05:52 wer:330] reference:olufemi's wrist ti kon

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 08:38:06 wer:329] 
    
[NeMo I 2025-01-31 08:38:06 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 08:38:06 wer:331] predicted:court ti insttruct gbogbo parties lati fi prerial brees ni deadline ti won set
[NeMo I 2025-01-31 08:38:07 wer:329] 
    
[NeMo I 2025-01-31 08:38:07 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 08:38:07 wer:331] predicted:the nose of ọmọ i naa is dy
[NeMo I 2025-01-31 08:38:07 wer:329] 
    
[NeMo I 2025-01-31 08:38:07 wer:330] reference:put three saki si ori first rice yen
[NeMo I 2025-01-31 08:38:07 wer:331] predicted:p th chcky si ori first ricese yen
[NeMo I 2025-01-31 08:38:07 wer:329] 
    
[NeMo I 2025-01-31 08:38:07 wer:330] reference:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 08:38:07 wer:331] predicted:c c shcorts lati par àwọn ise e ki ki
[NeMo I 2025-01-31 08:38:07 wer:329] 
    
[NeMo I 2025-01-31 08:38:07 

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 09:10:26 wer:329] 
    
[NeMo I 2025-01-31 09:10:26 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 09:10:26 wer:331] predicted:court ti intruct gbogbo parties láti file pretried prefves ni deadline ti won set
[NeMo I 2025-01-31 09:10:26 wer:329] 
    
[NeMo I 2025-01-31 09:10:26 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 09:10:26 wer:331] predicted:nse of ọmọ ik naa is dy
[NeMo I 2025-01-31 09:10:27 wer:329] 
    
[NeMo I 2025-01-31 09:10:27 wer:330] reference:put three saki si ori first rice yen
[NeMo I 2025-01-31 09:10:27 wer:331] predicted:put three saking si ori fares yen
[NeMo I 2025-01-31 09:10:27 wer:329] 
    
[NeMo I 2025-01-31 09:10:27 wer:330] reference:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 09:10:27 wer:331] predicted:create shcorts láti parari awon ise ki ki
[NeMo I 2025-01-31 09:10:27 wer:329] 
    
[NeMo I 2025-01-31 09:10:2

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 09:42:49 wer:329] 
    
[NeMo I 2025-01-31 09:42:49 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 09:42:49 wer:331] predicted:court ti instruct gbogbo parties lati file pretrial bes ni deadline ti won set
[NeMo I 2025-01-31 09:42:49 wer:329] 
    
[NeMo I 2025-01-31 09:42:49 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 09:42:49 wer:331] predicted:the news of ọmọ ikoú is dty
[NeMo I 2025-01-31 09:42:49 wer:329] 
    
[NeMo I 2025-01-31 09:42:49 wer:330] reference:put three saki si ori first rice yen
[NeMo I 2025-01-31 09:42:49 wer:331] predicted:put three chackking si ori first riceress yen
[NeMo I 2025-01-31 09:42:49 wer:329] 
    
[NeMo I 2025-01-31 09:42:49 wer:330] reference:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 09:42:49 wer:331] predicted:create shscs lati pari awon ise in kia carea
[NeMo I 2025-01-31 09:42:49 wer:329] 
    
[NeMo I 20

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 10:15:05 wer:329] 
    
[NeMo I 2025-01-31 10:15:05 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 10:15:05 wer:331] predicted:court ti instruct good parties láti file pretrial griefes ni deadline tí wọ́n set
[NeMo I 2025-01-31 10:15:05 wer:329] 
    
[NeMo I 2025-01-31 10:15:05 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 10:15:05 wer:331] predicted:the nse of ọmọ ìko naa is dirty
[NeMo I 2025-01-31 10:15:06 wer:329] 
    
[NeMo I 2025-01-31 10:15:06 wer:330] reference:put three saki si ori first rice yen
[NeMo I 2025-01-31 10:15:06 wer:331] predicted:put th3ree c si first riceice yen
[NeMo I 2025-01-31 10:15:06 wer:329] 
    
[NeMo I 2025-01-31 10:15:06 wer:330] reference:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 10:15:06 wer:331] predicted:create shortscuts lati parari awon changech ki kia
[NeMo I 2025-01-31 10:15:06 wer:329] 
    
[NeMo I 2

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 10:47:22 wer:329] 
    
[NeMo I 2025-01-31 10:47:22 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 10:47:22 wer:331] predicted:court ti insttruct gbogbo parties láti file pretrial brerifs ni deadline ti wọ́n set
[NeMo I 2025-01-31 10:47:22 wer:329] 
    
[NeMo I 2025-01-31 10:47:22 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 10:47:22 wer:331] predicted:the nse of ọmọkokoo is dirtty
[NeMo I 2025-01-31 10:47:23 wer:329] 
    
[NeMo I 2025-01-31 10:47:23 wer:330] reference:put three saki si ori first rice yen
[NeMo I 2025-01-31 10:47:23 wer:331] predicted:put three carcy si ori feirst ricece yen
[NeMo I 2025-01-31 10:47:23 wer:329] 
    
[NeMo I 2025-01-31 10:47:23 wer:330] reference:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 10:47:23 wer:331] predicted:create shortscuts lati par awon ise gka
[NeMo I 2025-01-31 10:47:23 wer:329] 
    
[NeMo I 2025

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 11:20:04 wer:329] 
    
[NeMo I 2025-01-31 11:20:04 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 11:20:04 wer:331] predicted:court ti instruct gbogbo parties lati file premirial briefs ni deadline ti wọ́n set
[NeMo I 2025-01-31 11:20:04 wer:329] 
    
[NeMo I 2025-01-31 11:20:04 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 11:20:04 wer:331] predicted:the knowsse of omoko is dirtty
[NeMo I 2025-01-31 11:20:04 wer:329] 
    
[NeMo I 2025-01-31 11:20:04 wer:330] reference:put three saki si ori first rice yen
[NeMo I 2025-01-31 11:20:04 wer:331] predicted:put three sa si ori first rice yen
[NeMo I 2025-01-31 11:20:05 wer:329] 
    
[NeMo I 2025-01-31 11:20:05 wer:330] reference:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 11:20:05 wer:331] predicted:create shotscuts lati ara àwọn iseyin kia
[NeMo I 2025-01-31 11:20:05 wer:329] 
    
[NeMo I 2025-01-

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 11:53:29 wer:329] 
    
[NeMo I 2025-01-31 11:53:29 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 11:53:29 wer:331] predicted:court ti instruct gbogbo parties láti file pretrial preefves ni deadline tí wọ́n set
[NeMo I 2025-01-31 11:53:29 wer:329] 
    
[NeMo I 2025-01-31 11:53:29 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 11:53:29 wer:331] predicted:the lesee of omo icọ́ca is dirtty
[NeMo I 2025-01-31 11:53:29 wer:329] 
    
[NeMo I 2025-01-31 11:53:29 wer:330] reference:put three saki si ori first rice yen
[NeMo I 2025-01-31 11:53:29 wer:331] predicted:put th3 chckki si ori first riceice yen
[NeMo I 2025-01-31 11:53:30 wer:329] 
    
[NeMo I 2025-01-31 11:53:30 wer:330] reference:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 11:53:30 wer:331] predicted:create shortcorts lati pari awon ise ki ki
[NeMo I 2025-01-31 11:53:30 wer:329] 
    
[NeMo 

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 12:26:55 wer:329] 
    
[NeMo I 2025-01-31 12:26:55 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 12:26:55 wer:331] predicted:court ti instruct gr parties láti file pretrial bries ní deadline tí wọn set
[NeMo I 2025-01-31 12:26:55 wer:329] 
    
[NeMo I 2025-01-31 12:26:55 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 12:26:55 wer:331] predicted:the ns of ọmọ oko náà is dirtty
[NeMo I 2025-01-31 12:26:55 wer:329] 
    
[NeMo I 2025-01-31 12:26:55 wer:330] reference:put three saki si ori first rice yen
[NeMo I 2025-01-31 12:26:55 wer:331] predicted:put three sae si ori fres rice rice yen
[NeMo I 2025-01-31 12:26:55 wer:329] 
    
[NeMo I 2025-01-31 12:26:55 wer:330] reference:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 12:26:55 wer:331] predicted:create shortcuts lati parower àwọn se kie
[NeMo I 2025-01-31 12:26:56 wer:329] 
    
[NeMo I 2025-01-3

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 12:59:45 wer:329] 
    
[NeMo I 2025-01-31 12:59:45 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 12:59:45 wer:331] predicted:court ti instruct gr parties láti file premirial breefs ni deadline tí wọn set
[NeMo I 2025-01-31 12:59:46 wer:329] 
    
[NeMo I 2025-01-31 12:59:46 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 12:59:46 wer:331] predicted:the nurse of ọmọ i nkanọ́kó náà is dirty
[NeMo I 2025-01-31 12:59:46 wer:329] 
    
[NeMo I 2025-01-31 12:59:46 wer:330] reference:put three saki si ori first rice yen
[NeMo I 2025-01-31 12:59:46 wer:331] predicted:put three sake si ori fa ricece yen
[NeMo I 2025-01-31 12:59:46 wer:329] 
    
[NeMo I 2025-01-31 12:59:46 wer:330] reference:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 12:59:46 wer:331] predicted:create shortcuts láti pari awon e ise ki ki
[NeMo I 2025-01-31 12:59:46 wer:329] 
    
[NeMo I 

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 13:32:08 wer:329] 
    
[NeMo I 2025-01-31 13:32:08 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 13:32:08 wer:331] predicted:court ti instruct gbogbo parties láti file pretrial breriefs ni deadline tí wọn set
[NeMo I 2025-01-31 13:32:08 wer:329] 
    
[NeMo I 2025-01-31 13:32:08 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 13:32:09 wer:331] predicted:the the nse of omo ikoko náàa is dirty
[NeMo I 2025-01-31 13:32:09 wer:329] 
    
[NeMo I 2025-01-31 13:32:09 wer:330] reference:put three saki si ori first rice yen
[NeMo I 2025-01-31 13:32:09 wer:331] predicted:plutut three sackke si ori f rice yen
[NeMo I 2025-01-31 13:32:09 wer:329] 
    
[NeMo I 2025-01-31 13:32:09 wer:330] reference:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 13:32:09 wer:331] predicted:create shortcuts lati par awon ise yen ki ki
[NeMo I 2025-01-31 13:32:09 wer:329] 
    
[N

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 14:05:02 wer:329] 
    
[NeMo I 2025-01-31 14:05:02 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 14:05:02 wer:331] predicted:court ti instruct gbogbo parties láti file pretrial befs ni deadline tí wọn set
[NeMo I 2025-01-31 14:05:03 wer:329] 
    
[NeMo I 2025-01-31 14:05:03 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 14:05:03 wer:331] predicted:the news of ọmọ ikokoa is dirty
[NeMo I 2025-01-31 14:05:03 wer:329] 
    
[NeMo I 2025-01-31 14:05:03 wer:330] reference:put three saki si ori first rice yen
[NeMo I 2025-01-31 14:05:03 wer:331] predicted:put three sackki si ori feirst rice yen
[NeMo I 2025-01-31 14:05:03 wer:329] 
    
[NeMo I 2025-01-31 14:05:03 wer:330] reference:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 14:05:03 wer:331] predicted:create shortscuts lati par awon iseṣyin care ki
[NeMo I 2025-01-31 14:05:03 wer:329] 
    
[NeMo I 

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 14:38:10 wer:329] 
    
[NeMo I 2025-01-31 14:38:10 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 14:38:10 wer:331] predicted:court ti instruct gbogbo parties láti file pretrial briefs ni deadline tí wọn set
[NeMo I 2025-01-31 14:38:11 wer:329] 
    
[NeMo I 2025-01-31 14:38:11 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 14:38:11 wer:331] predicted:the news of omokokoa is dirty
[NeMo I 2025-01-31 14:38:11 wer:329] 
    
[NeMo I 2025-01-31 14:38:11 wer:330] reference:put three saki si ori first rice yen
[NeMo I 2025-01-31 14:38:11 wer:331] predicted:put three sacken si ori fe rice yen
[NeMo I 2025-01-31 14:38:11 wer:329] 
    
[NeMo I 2025-01-31 14:38:11 wer:330] reference:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 14:38:11 wer:331] predicted:create shortcuts lati pari awon e iseyin ki a aka
[NeMo I 2025-01-31 14:38:11 wer:329] 
    
[NeMo I 20

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 15:11:20 wer:329] 
    
[NeMo I 2025-01-31 15:11:20 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 15:11:20 wer:331] predicted:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 15:11:20 wer:329] 
    
[NeMo I 2025-01-31 15:11:20 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 15:11:20 wer:331] predicted:the news of ọmọ ikoa is dirty
[NeMo I 2025-01-31 15:11:21 wer:329] 
    
[NeMo I 2025-01-31 15:11:21 wer:330] reference:put three saki si ori first rice yen
[NeMo I 2025-01-31 15:11:21 wer:331] predicted:put three sackin si ori fresirst rice yen
[NeMo I 2025-01-31 15:11:21 wer:329] 
    
[NeMo I 2025-01-31 15:11:21 wer:330] reference:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 15:11:21 wer:331] predicted:cate shortcuts lati pari awon ise yen kiaa
[NeMo I 2025-01-31 15:11:21 wer:329] 
    
[NeMo I 202

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 15:44:13 wer:329] 
    
[NeMo I 2025-01-31 15:44:13 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 15:44:13 wer:331] predicted:court ti insttruct gbogbo parties láti file pretrial brief ní deadline tí wọn set
[NeMo I 2025-01-31 15:44:14 wer:329] 
    
[NeMo I 2025-01-31 15:44:14 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 15:44:14 wer:331] predicted:the news of ọmọ ikookoa is dty
[NeMo I 2025-01-31 15:44:14 wer:329] 
    
[NeMo I 2025-01-31 15:44:14 wer:330] reference:put three saki si ori first rice yen
[NeMo I 2025-01-31 15:44:14 wer:331] predicted:put three saret si ori fresirst rice yen
[NeMo I 2025-01-31 15:44:14 wer:329] 
    
[NeMo I 2025-01-31 15:44:14 wer:330] reference:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 15:44:14 wer:331] predicted:create shortcuts lati pari awon ise yen kia kia
[NeMo I 2025-01-31 15:44:14 wer:329] 
    
[NeMo 

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 16:17:18 wer:329] 
    
[NeMo I 2025-01-31 16:17:18 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 16:17:18 wer:331] predicted:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 16:17:18 wer:329] 
    
[NeMo I 2025-01-31 16:17:18 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 16:17:18 wer:331] predicted:the newse of ọmọ ikokoa is dirty
[NeMo I 2025-01-31 16:17:18 wer:329] 
    
[NeMo I 2025-01-31 16:17:18 wer:330] reference:put three saki si ori first rice yen
[NeMo I 2025-01-31 16:17:18 wer:331] predicted:put three sa si ori first rice yen
[NeMo I 2025-01-31 16:17:19 wer:329] 
    
[NeMo I 2025-01-31 16:17:19 wer:330] reference:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 16:17:19 wer:331] predicted:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 16:17:19 wer:329] 
    
[NeMo I 202

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 16:50:21 wer:329] 
    
[NeMo I 2025-01-31 16:50:21 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 16:50:21 wer:331] predicted:court ti instruct gbogbo parties láti file pretrial brief ni deadline tí wọn set
[NeMo I 2025-01-31 16:50:22 wer:329] 
    
[NeMo I 2025-01-31 16:50:22 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 16:50:22 wer:331] predicted:the newse of ọmọ ikokoó ina is dirtty
[NeMo I 2025-01-31 16:50:22 wer:329] 
    
[NeMo I 2025-01-31 16:50:22 wer:330] reference:put three saki si ori first rice yen
[NeMo I 2025-01-31 16:50:22 wer:331] predicted:put three saki si ori fresirst rice yen
[NeMo I 2025-01-31 16:50:22 wer:329] 
    
[NeMo I 2025-01-31 16:50:22 wer:330] reference:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 16:50:22 wer:331] predicted:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 16:50:22 wer:329] 
    
[N

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 17:23:01 wer:329] 
    
[NeMo I 2025-01-31 17:23:01 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 17:23:01 wer:331] predicted:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 17:23:01 wer:329] 
    
[NeMo I 2025-01-31 17:23:01 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 17:23:01 wer:331] predicted:the news of ọmọ ikokoana is dirty
[NeMo I 2025-01-31 17:23:02 wer:329] 
    
[NeMo I 2025-01-31 17:23:02 wer:330] reference:put three saki si ori first rice yen
[NeMo I 2025-01-31 17:23:02 wer:331] predicted:put three sakiret si ori first rice yen
[NeMo I 2025-01-31 17:23:02 wer:329] 
    
[NeMo I 2025-01-31 17:23:02 wer:330] reference:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 17:23:02 wer:331] predicted:create shortcuts lati pari awon ise yen kia kia
[NeMo I 2025-01-31 17:23:02 wer:329] 
    
[NeM

Validation: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 17:55:43 wer:329] 
    
[NeMo I 2025-01-31 17:55:43 wer:330] reference:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 17:55:43 wer:331] predicted:court ti instruct gbogbo parties láti file pretrial briefs ní deadline tí wọn set
[NeMo I 2025-01-31 17:55:43 wer:329] 
    
[NeMo I 2025-01-31 17:55:43 wer:330] reference:the nose of ọmọ ìkókó náà is dirty
[NeMo I 2025-01-31 17:55:43 wer:331] predicted:the hse of ọmọ ikokoa is dirty
[NeMo I 2025-01-31 17:55:44 wer:329] 
    
[NeMo I 2025-01-31 17:55:44 wer:330] reference:put three saki si ori first rice yen
[NeMo I 2025-01-31 17:55:44 wer:331] predicted:put three saki si ori fresirst rice yen
[NeMo I 2025-01-31 17:55:44 wer:329] 
    
[NeMo I 2025-01-31 17:55:44 wer:330] reference:create shortcuts lati pari awon ise yen kiakia
[NeMo I 2025-01-31 17:55:44 wer:331] predicted:create shortcuts lati pari awon ise yen ki aa
[NeMo I 2025-01-31 17:55:44 wer:329] 
    
[NeMo I 2

In [36]:
# Setup test dataset
pretrained_model.setup_test_data(test_data_config=config.test_ds)
results = trainer.test(pretrained_model)
print(results)


[NeMo I 2025-01-31 17:56:06 collections:201] Dataset loaded with 1614 files totalling 1.93 hours
[NeMo I 2025-01-31 17:56:06 collections:202] 0 files were filtered totalling 0.00 hours


INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-01-31 17:56:08 wer:329] 
    
[NeMo I 2025-01-31 17:56:08 wer:330] reference:fredrick ma wa violent gan whenever he takes alcohol
[NeMo I 2025-01-31 17:56:08 wer:331] predicted:fedbric ma wa vallebe whenever you taes grow
[NeMo I 2025-01-31 17:56:08 wer:329] 
    
[NeMo I 2025-01-31 17:56:08 wer:330] reference:charge it to my card o better ju cash
[NeMo I 2025-01-31 17:56:08 wer:331] predicted:chade to my card o gbọ́ dá ju cash
[NeMo I 2025-01-31 17:56:08 wer:329] 
    
[NeMo I 2025-01-31 17:56:08 wer:330] reference:phone ẹ wá lórí table
[NeMo I 2025-01-31 17:56:08 wer:331] predicted:ston e wa lori tableor
[NeMo I 2025-01-31 17:56:08 wer:329] 
    
[NeMo I 2025-01-31 17:56:08 wer:330] reference:the latest style ti àwọn youth wo báyìí nice gan
[NeMo I 2025-01-31 17:56:08 wer:331] predicted:the latest ight ti awon youth wore bái or nize gan
[NeMo I 2025-01-31 17:56:08 wer:329] 
    
[NeMo I 2025-01-31 17:56:08 wer:330] reference:what is this thing that looks like aràn
[NeMo 

[{'global_step': 32820.0, 'test_loss': 1.0643792152404785, 'test_wer': 0.2962435185909271}]


In [37]:
import wandb

# Log test results
wandb.log({
    "global_step":results[0]["global_step"],
    "test_loss": results[0]["test_loss"],  # Adjust key name based on output
    "test_wer": results[0]["test_wer"]  # Adjust key name based on output
})

print("Test results logged to W&B successfully!")


Test results logged to W&B successfully!


In [38]:
pretrained_model.save_to("saved_model/fine_tuned_model.nemo")
print("Model saved successfully!")

Model saved successfully!


In [39]:
import shutil

tokenizer_dir = "tokenizers/codeswitched_tokenizer"
saved_tokenizer_dir = "saved_model/tokenizer"

# Copy the tokenizer files to a new directory
shutil.copytree(tokenizer_dir, saved_tokenizer_dir)
print(f"Tokenizer saved to {saved_tokenizer_dir}")

Tokenizer saved to saved_model/tokenizer
