In [1]:
import os
from omegaconf import OmegaConf

In [2]:
# Working directories, relative to the notebook's current directory:
#   tmp         - where the helper script is downloaded
#   data_folder - root for the dataset and the generated manifests
tmp = 'src'
data_folder = 'data'

# exist_ok=True keeps the cell idempotent on re-run and removes the
# check-then-create race of a separate os.path.exists() test. It also fails
# loudly if one of these names exists but is not a directory.
os.makedirs(tmp, exist_ok=True)
os.makedirs(data_folder, exist_ok=True)

In [3]:
# Local path of the VAD data-processing helper script inside `tmp`.
script = os.path.join(tmp, 'process_vad_data.py')
# Download the script only when it is not already present locally.
# NOTE(review): this URL uses the `NVIDIA/NeMo` path while the config download
# later in the notebook uses `NVIDIA-NeMo/NeMo` — TODO confirm both still resolve.
if not os.path.exists(script):
    # `!` runs a shell command from IPython; `$tmp` is expanded from the Python variable.
    !wget -P $tmp https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/dataset_processing/process_vad_data.py

In [4]:
# Dataset locations under `data_folder`; point these at your own copies if
# the data lives elsewhere.

# Root directory of the speech data.
# NOTE(review): the directory name suggests the Google Speech Commands v2
# dataset — TODO confirm and document where it must be downloaded from.
speech_data_root = os.path.join(data_folder, 'google_dataset_v2')

# Background-noise clips. Derived from speech_data_root (one path component
# per argument) so the two locations cannot drift apart.
background_data_root = os.path.join(
    speech_data_root, 'google_speech_recognition_v2', '_background_noise_'
)

# Directory holding the manifest files.
out_dir = os.path.join(data_folder, 'manifest')

# makedirs(..., exist_ok=True) creates missing parents and is safe to re-run,
# unlike the previous exists()/mkdir() pair.
os.makedirs(speech_data_root, exist_ok=True)

In [5]:
# Manifest generation step — currently disabled (every line is commented out).
# NOTE(review): the manifest files referenced later in the notebook live under
# data/manifest, which matches `out_dir`; presumably this script is what
# produces them, so it likely has to be run once on a fresh checkout.
# TODO confirm, then either re-enable this cell or document how the manifests
# are obtained.
# !python $script \
#     --out_dir={out_dir} \
#     --speech_data_root={speech_data_root} \
#     --background_data_root={background_data_root}\
#     --log \
#     --demo \
#     --rebalance_method='fixed'

In [6]:
# Comma-separated manifest lists (background + speech) for each split.
# The paths are built from `out_dir` so they always follow the manifest
# directory configured earlier instead of repeating the 'data/manifest' literal.
train_dataset = ','.join(
    os.path.join(out_dir, manifest_name)
    for manifest_name in (
        'balanced_background_training_manifest.json',
        'balanced_speech_training_manifest.json',
    )
)
# Note: the validation manifests are the non-balanced ones, as in the original cell.
val_dataset = ','.join(
    os.path.join(out_dir, manifest_name)
    for manifest_name in (
        'background_validation_manifest.json',
        'speech_validation_manifest.json',
    )
)
test_dataset = ','.join(
    os.path.join(out_dir, manifest_name)
    for manifest_name in (
        'balanced_background_testing_manifest.json',
        'balanced_speech_testing_manifest.json',
    )
)

In [7]:
# NeMo's "core" package
import nemo
# NeMo's ASR collection - this collection contains complete ASR models and
# building blocks (modules) for ASR
import nemo.collections.asr as nemo_asr

fused_indices_to_multihot has reached end of life. Please migrate to a non-experimental function.
OneLogger: Setting error_handling_strategy to DISABLE_QUIETLY_AND_REPORT_METRIC_ERROR for rank (rank=0) with OneLogger disabled. To override: explicitly set error_handling_strategy parameter.
No exporters were provided. This means that no telemetry data will be collected.


In [8]:

MODEL_CONFIG = "marblenet_3x2x64.yaml"

if not os.path.exists(f"configs/{MODEL_CONFIG}"):
  !wget -P configs/ "https://raw.githubusercontent.com/NVIDIA-NeMo/NeMo/refs/heads/main/examples/asr/conf/marblenet/marblenet_3x2x64.yaml"

In [9]:
# Load the YAML config, resolve its interpolations by round-tripping through
# a plain Python container, and wrap the result back into an OmegaConf object.
config_path = f"configs/{MODEL_CONFIG}"
config = OmegaConf.create(
    OmegaConf.to_container(OmegaConf.load(config_path), resolve=True)
)

# Display the resolved configuration as YAML.
print(OmegaConf.to_yaml(config))

name: MarbleNet-3x2x64
model:
  sample_rate: 16000
  repeat: 2
  dropout: 0.0
  kernel_size_factor: 1.0
  labels:
  - background
  - speech
  train_ds:
    manifest_filepath: ???
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 128
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: scatter
    shuffle_n: 2048
    num_workers: 8
    pin_memory: true
    bucketing_strategy: synced_randomized
    bucketing_batch_size: null
    bucketing_weights: null
    augmentor:
      shift:
        prob: 1.0
        min_shift_ms: -5.0
        max_shift_ms: 5.0
      white_noise:
        prob: 1.0
        min_level: -90
        max_level: -46
  validation_ds:
    manifest_filepath: ???
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 128
    shuffle: false
    num_workers: 8
    pin_memory: true
    val_loss_idx: 0
  test_ds:
    manifest_filepath: null
    sample_rate: 16000
    labels:
    

In [10]:
# Keep the label list and the sampling rate from the model config at hand
labels, sample_rate = config.model.labels, config.model.sample_rate

In [11]:
# Inspect the training-dataset section of the config; at this point
# manifest_filepath still shows the `???` placeholder from the YAML file.
print(OmegaConf.to_yaml(config.model.train_ds))

manifest_filepath: ???
sample_rate: 16000
labels:
- background
- speech
batch_size: 128
shuffle: true
is_tarred: false
tarred_audio_filepaths: null
tarred_shard_strategy: scatter
shuffle_n: 2048
num_workers: 8
pin_memory: true
bucketing_strategy: synced_randomized
bucketing_batch_size: null
bucketing_weights: null
augmentor:
  shift:
    prob: 1.0
    min_shift_ms: -5.0
    max_shift_ms: 5.0
  white_noise:
    prob: 1.0
    min_level: -90
    max_level: -46



In [12]:

# Point each dataset section at its manifest list. In the loaded config the
# train/validation entries are `???` placeholders and the test entry is null,
# so all three are set explicitly here.
config.model.train_ds.manifest_filepath = train_dataset
config.model.validation_ds.manifest_filepath = val_dataset
config.model.test_ds.manifest_filepath = test_dataset

In [13]:
# NOTE(review): these imports sit mid-notebook; consider moving them into the
# first import cell so all dependencies are visible up front.
import torch
import lightning.pytorch as pl
# Show the trainer section of the config as loaded from the YAML file.
print("Trainer config - \n")
print(OmegaConf.to_yaml(config.trainer))

Trainer config - 

devices: 1
max_epochs: 150
max_steps: -1
num_nodes: 1
accelerator: gpu
strategy: ddp
accumulate_grad_batches: 1
enable_checkpointing: false
logger: false
log_every_n_steps: 1
val_check_interval: 1.0
benchmark: false



In [14]:
# Adapt the trainer settings for a short, single-device demo run.

# One device, and no distributed-training strategy
config.trainer.devices = 1
config.trainer.strategy = 'auto'

# Train on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    accelerator = 'gpu'
else:
    accelerator = 'cpu'
config.trainer.accelerator = accelerator

# Cap training at 5 epochs to keep the demonstration quick
config.trainer.max_epochs = 5

In [15]:
# Build the Lightning trainer, passing every entry of the trainer config
# section as a keyword argument.
trainer = pl.Trainer(**config.trainer)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..


In [16]:
from nemo.utils.exp_manager import exp_manager

# exp_manager hands back the location of the current experiment; store it as
# a plain string so it is easy to reuse, then display it as the cell output.
exp_dir = str(exp_manager(trainer, config.get("exp_manager", None)))
exp_dir

[NeMo I 2026-02-02 21:09:45 exp_manager:594] ExpManager schema
[NeMo I 2026-02-02 21:09:45 exp_manager:595] {'explicit_log_dir': None, 'exp_dir': None, 'name': None, 'version': None, 'use_datetime_version': True, 'resume_if_exists': False, 'resume_past_end': False, 'resume_ignore_no_checkpoint': False, 'resume_from_checkpoint': None, 'create_tensorboard_logger': True, 'summary_writer_kwargs': None, 'create_wandb_logger': False, 'wandb_logger_kwargs': None, 'create_mlflow_logger': False, 'mlflow_logger_kwargs': {'experiment_name': None, 'run_name': None, 'tracking_uri': None, 'tags': None, 'save_dir': './mlruns', 'prefix': '', 'artifact_location': None, 'run_id': None, 'log_model': False}, 'create_dllogger_logger': False, 'dllogger_logger_kwargs': {'verbose': False, 'stdout': False, 'json_file': './dllogger.json'}, 'create_clearml_logger': False, 'clearml_logger_kwargs': {'project': None, 'task': None, 'connect_pytorch': False, 'model_name': None, 'tags': None, 'log_model': False, 'log_

'/home/minhth11/Projects/NeMo/nemo_experiments/MarbleNet-3x2x64/2026-02-02_21-09-45'

In [17]:
# Instantiate the classification model from the model config and attach the trainer.
# NOTE(review): the recorded output warns that EncDecClassificationModel is kept
# only for backward compatibility and recommends EncDecSpeakerLabelModel —
# TODO evaluate migrating.
vad_model = nemo_asr.models.EncDecClassificationModel(cfg=config.model, trainer=trainer)

[NeMo W 2026-02-02 21:09:45 classification_models:641] Please use the EncDecSpeakerLabelModel instead of this model. EncDecClassificationModel model is kept for backward compatibility with older models.


[NeMo I 2026-02-02 21:09:45 collections:750] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2026-02-02 21:09:45 collections:751] Dataset successfully loaded with 1000 items and total duration provided from manifest is  0.17 hours.
[NeMo I 2026-02-02 21:09:45 collections:757] # 1000 files loaded accounting to # 1 labels
[NeMo I 2026-02-02 21:09:45 collections:750] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2026-02-02 21:09:45 collections:751] Dataset successfully loaded with 1000 items and total duration provided from manifest is  0.17 hours.
[NeMo I 2026-02-02 21:09:45 collections:757] # 1000 files loaded accounting to # 1 labels


[NeMo W 2026-02-02 21:09:45 label_models:201] Total number of 2 labels found in all the manifest files.


[NeMo I 2026-02-02 21:09:45 collections:750] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2026-02-02 21:09:45 collections:751] Dataset successfully loaded with 2000 items and total duration provided from manifest is  0.35 hours.
[NeMo I 2026-02-02 21:09:45 collections:757] # 2000 files loaded accounting to # 2 labels
[NeMo I 2026-02-02 21:09:45 collections:750] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2026-02-02 21:09:45 collections:751] Dataset successfully loaded with 147 items and total duration provided from manifest is  0.03 hours.
[NeMo I 2026-02-02 21:09:45 collections:757] # 147 files loaded accounting to # 2 labels
[NeMo I 2026-02-02 21:09:45 collections:750] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2026-02-02 21:09:45 collections:751] Dataset successfully loaded with 800 items and total duration provided from manifest is  0.14 hours.
[NeMo I 2026-02-02 21:09:45 collections:757] # 800 files loaded accounting t

In [18]:
# Show the augmentation settings that are part of the model config
print(OmegaConf.to_yaml(config.model.train_ds.augmentor)) # train_ds augmentor: shift + white noise
print(OmegaConf.to_yaml(config.model.spec_augment)) # SpecAug data augmentation

shift:
  prob: 1.0
  min_shift_ms: -5.0
  max_shift_ms: 5.0
white_noise:
  prob: 1.0
  min_level: -90
  max_level: -46

_target_: nemo.collections.asr.modules.SpectrogramAugmentation
freq_masks: 2
time_masks: 2
freq_width: 15
time_width: 25
rect_masks: 5
rect_time: 25
rect_freq: 15



In [19]:

# Run training with the trainer configured earlier (max_epochs was set to 5).
# NOTE(review): the recorded output suggests calling
# torch.set_float32_matmul_precision('medium' | 'high') on Tensor Core GPUs —
# optional; it trades precision for speed.
trainer.fit(vad_model)

You are using a CUDA device ('NVIDIA GeForce RTX 3050 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[NeMo I 2026-02-02 21:10:17 modelPT:830] Optimizer config = SGD (
    Parameter Group 0
        dampening: 0
        differentiable: False
        foreach: None
        fused: None
        lr: 0.01
        maximize: False
        momentum: 0.9
        nesterov: False
        weight_decay: 0.001
    )
[NeMo I 2026-02-02 21:10:17 lr_scheduler:995] Scheduler "<nemo.core.optim.lr_scheduler.PolynomialHoldDecayAnnealing object at 0x720fb4f5c280>" 
    will be used during training (effective maximum steps = 80) - 
    Parameters : 
    (power: 2.0
    warmup_ratio: 0.05
    hold_ratio: 0.45
    min_lr: 0.001
    last_epoch: -1
    max_steps: 80
    )



  | Name                 | Type                         | Params | Mode 
------------------------------------------------------------------------------
0 | loss                 | CrossEntropyLoss             | 0      | train
1 | eval_loss            | CrossEntropyLoss             | 0      | train
2 | _accuracy            | TopKClassificationAccuracy   | 0      | train
3 | preprocessor         | AudioToMFCCPreprocessor      | 0      | train
4 | encoder              | ConvASREncoder               | 88.9 K | train
5 | decoder              | ConvASRDecoderClassification | 258    | train
6 | _macro_accuracy      | MulticlassAccuracy           | 0      | train
7 | _pair_macro_accuracy | MulticlassAccuracy           | 0      | train
8 | spec_augmentation    | SpectrogramAugmentation      | 0      | train
------------------------------------------------------------------------------
89.2 K    Trainable params
0         Non-trainable params
89.2 K    Total params
0.357     Total estimated mode

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2026-02-02 21:10:20 preemption:56] Preemption requires torch distributed to be initialized, disabling preemption


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0, global step 16: 'val_loss' reached 0.79439 (best 0.79439), saving model to '/home/minhth11/Projects/NeMo/nemo_experiments/MarbleNet-3x2x64/2026-02-02_21-09-45/checkpoints/MarbleNet-3x2x64--val_loss=0.7944-epoch=0.ckpt' as top 3


[NeMo I 2026-02-02 21:10:24 nemo_model_checkpoint:573] Checkpoint save for step 16 started at 1770041424.174253.
[NeMo I 2026-02-02 21:10:24 nemo_model_checkpoint:573] Checkpoint save for step 16 started at 1770041424.3043628.


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1, global step 32: 'val_loss' reached 0.51221 (best 0.51221), saving model to '/home/minhth11/Projects/NeMo/nemo_experiments/MarbleNet-3x2x64/2026-02-02_21-09-45/checkpoints/MarbleNet-3x2x64--val_loss=0.5122-epoch=1.ckpt' as top 3


[NeMo I 2026-02-02 21:10:27 nemo_model_checkpoint:573] Checkpoint save for step 32 started at 1770041427.5764365.
[NeMo I 2026-02-02 21:10:27 nemo_model_checkpoint:573] Checkpoint save for step 32 started at 1770041427.731219.


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 48: 'val_loss' reached 0.11798 (best 0.11798), saving model to '/home/minhth11/Projects/NeMo/nemo_experiments/MarbleNet-3x2x64/2026-02-02_21-09-45/checkpoints/MarbleNet-3x2x64--val_loss=0.1180-epoch=2.ckpt' as top 3


[NeMo I 2026-02-02 21:10:31 nemo_model_checkpoint:573] Checkpoint save for step 48 started at 1770041431.3224804.
[NeMo I 2026-02-02 21:10:31 nemo_model_checkpoint:573] Checkpoint save for step 48 started at 1770041431.4443805.


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3, global step 64: 'val_loss' reached 0.01662 (best 0.01662), saving model to '/home/minhth11/Projects/NeMo/nemo_experiments/MarbleNet-3x2x64/2026-02-02_21-09-45/checkpoints/MarbleNet-3x2x64--val_loss=0.0166-epoch=3.ckpt' as top 3


[NeMo I 2026-02-02 21:10:34 nemo_model_checkpoint:573] Checkpoint save for step 64 started at 1770041434.4905708.
[NeMo I 2026-02-02 21:10:35 nemo_model_checkpoint:573] Checkpoint save for step 64 started at 1770041435.5747912.


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 4, global step 80: 'val_loss' reached 0.01962 (best 0.01662), saving model to '/home/minhth11/Projects/NeMo/nemo_experiments/MarbleNet-3x2x64/2026-02-02_21-09-45/checkpoints/MarbleNet-3x2x64--val_loss=0.0196-epoch=4.ckpt' as top 3


[NeMo I 2026-02-02 21:10:35 nemo_model_checkpoint:573] Checkpoint save for step 80 started at 1770041435.6234193.
[NeMo I 2026-02-02 21:10:35 nemo_model_checkpoint:573] Checkpoint save for step 80 started at 1770041435.717878.


`Trainer.fit` stopped: `max_epochs=5` reached.


[NeMo I 2026-02-02 21:10:35 nemo_model_checkpoint:573] Checkpoint save for step 80 started at 1770041435.794618.


In [20]:
# Evaluate the trained model on the test manifests.
# NOTE(review): with ckpt_path=None this presumably evaluates the weights
# currently in memory (end of the last epoch) rather than the best saved
# checkpoint; the recorded logs show epoch 3 reached a lower val_loss (0.01662)
# than epoch 4 (0.01962). TODO confirm this is intended.
trainer.test(vad_model, ckpt_path=None)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.008883921429514885,
  'test_acc_micro_top_1': 1.0,
  'test_acc_macro': 1.0}]