In [1]:
import os
import json

In [2]:
data_dir = "datasets"

In [3]:
TRAIN_MANIFEST = os.path.join(data_dir,'train_manifest.json')
TEST_MANIFEST = os.path.join(data_dir, 'test_manifest.json')

In [4]:
VOCAB_SIZE = 1024  # can be any value above 29
TOKENIZER_TYPE = "spe"  # can be wpe or spe
SPE_TYPE = "unigram"  # can be bpe or unigram

# # ------------------------------------------------------------------- #
# !rm -r tokenizers/

# if not os.path.exists("tokenizers"):
#   os.makedirs("tokenizers")

# !python scripts/process_asr_text_tokenizer.py \
#    --manifest=$TRAIN_MANIFEST \
#    --data_root="tokenizers" \
#    --tokenizer=$TOKENIZER_TYPE \
#    --spe_type=$SPE_TYPE \
#    --no_lower_case \
#    --log \
#    --vocab_size=$VOCAB_SIZE

In [5]:
# Tokenizer path
if TOKENIZER_TYPE == 'spe':
  TOKENIZER = os.path.join("tokenizers", f"tokenizer_spe_{SPE_TYPE}_v{VOCAB_SIZE}")
  TOKENIZER_TYPE_CFG = "bpe"
else:
  TOKENIZER = os.path.join("tokenizers", f"tokenizer_wpe_v{VOCAB_SIZE}")
  TOKENIZER_TYPE_CFG = "wpe"

In [6]:
from omegaconf import OmegaConf, open_dict

config = OmegaConf.load("./configs/conformer_transducer_bpe_small.yaml")

In [7]:
# config.model.encoder.jasper[-1].filters = '${model.model_defaults.enc_hidden}'

In [8]:
config.model.train_ds.manifest_filepath = TRAIN_MANIFEST
config.model.validation_ds.manifest_filepath = TEST_MANIFEST
config.model.test_ds.manifest_filepath = TEST_MANIFEST

In [9]:
# print out the train and validation configs to know what needs to be changed
print(OmegaConf.to_yaml(config.model.train_ds))

manifest_filepath: datasets/train_manifest.json
sample_rate: ${model.sample_rate}
batch_size: 16
shuffle: true
num_workers: 8
pin_memory: true
use_start_end_token: false
trim_silence: false
max_duration: 16.7
min_duration: 0.1
is_tarred: false
tarred_audio_filepaths: null
shuffle_n: 2048
bucketing_strategy: synced_randomized
bucketing_batch_size: null



In [10]:
config.model.tokenizer.dir = TOKENIZER
config.model.tokenizer.type = TOKENIZER_TYPE_CFG

In [11]:
print(OmegaConf.to_yaml(config.model.tokenizer))

dir: tokenizers/tokenizer_spe_unigram_v1024
type: bpe



In [12]:
print(OmegaConf.to_yaml(config.model.optim))

name: adamw
lr: 0.5
betas:
- 0.9
- 0.98
weight_decay: 0.0
sched:
  name: NoamAnnealing
  d_model: ${model.encoder.d_model}
  warmup_steps: 10000
  warmup_ratio: null
  min_lr: 1.0e-06



In [13]:
# Finally, let's remove logging of samples and the warmup since the dataset is small (similar to CTC models)
config.model.log_prediction = False
config.model.optim.sched.warmup_steps = None

In [14]:
config.model.spec_augment.freq_masks = 0
config.model.spec_augment.time_masks = 0

In [15]:
print(OmegaConf.to_yaml(config.model.spec_augment))

_target_: nemo.collections.asr.modules.SpectrogramAugmentation
freq_masks: 0
time_masks: 0
freq_width: 27
time_width: 0.05



In [16]:
print(OmegaConf.to_yaml(config.model.joint))

_target_: nemo.collections.asr.modules.RNNTJoint
log_softmax: null
preserve_memory: false
fuse_loss_wer: true
fused_batch_size: 16
jointnet:
  joint_hidden: ${model.model_defaults.joint_hidden}
  activation: relu
  dropout: 0.2



In [17]:
import torch
from pytorch_lightning import Trainer
import pytorch_lightning as ptl

if torch.cuda.is_available():
  accelerator = 'gpu'
else:
  accelerator = 'cpu'

EPOCHS = 100

# ckpt_callback = ptl.callbacks.ModelCheckpoint(save_top_k=7)

# Initialize a Trainer for the Transducer model
trainer = Trainer(devices=1, accelerator=accelerator, max_epochs=EPOCHS,
                  enable_checkpointing=False, logger=False,
                  log_every_n_steps=100, check_val_every_n_epoch=1,accumulate_grad_batches=1)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [18]:
import nemo.collections.asr as nemo_asr

[NeMo W 2022-09-19 08:24:54 optimizers:55] Apex was not found. Using the lamb or fused_adam optimizer will error out.


In [19]:
# Build the model
model = nemo_asr.models.EncDecRNNTBPEModel(cfg=config.model, trainer=trainer)

[NeMo I 2022-09-19 08:24:59 mixins:170] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2022-09-19 08:25:07 collections:194] Dataset loaded with 49759 files totalling 83.57 hours
[NeMo I 2022-09-19 08:25:07 collections:195] 0 files were filtered totalling 0.00 hours
[NeMo I 2022-09-19 08:25:07 collections:194] Dataset loaded with 2685 files totalling 4.38 hours
[NeMo I 2022-09-19 08:25:07 collections:195] 0 files were filtered totalling 0.00 hours
[NeMo I 2022-09-19 08:25:08 collections:194] Dataset loaded with 2685 files totalling 4.38 hours
[NeMo I 2022-09-19 08:25:08 collections:195] 0 files were filtered totalling 0.00 hours
[NeMo I 2022-09-19 08:25:08 features:223] PADDING: 0


    


[NeMo I 2022-09-19 08:25:08 rnnt_models:203] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}
[NeMo I 2022-09-19 08:25:08 audio_preprocessing:491] Numba CUDA SpecAugment kernel is being used


In [20]:
model.summarize()

  | Name              | Type                              | Params
------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0     
1 | encoder           | ConformerEncoder                  | 13.0 M
2 | decoder           | RNNTDecoder                       | 1.1 M 
3 | joint             | RNNTJoint                         | 488 K 
4 | loss              | RNNTLoss                          | 0     
5 | spec_augmentation | SpectrogramAugmentation           | 0     
6 | wer               | RNNTBPEWER                        | 0     
------------------------------------------------------------------------
14.6 M    Trainable params
0         Non-trainable params
14.6 M    Total params
58.443    Total estimated model params size (MB)

In [21]:
asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(model_name="stt_en_conformer_transducer_small")

[NeMo I 2022-09-19 08:25:19 cloud:56] Found existing object /home/user/.cache/torch/NeMo/NeMo_1.11.0/stt_en_conformer_transducer_small/a755afe69952642a8410330876938b83/stt_en_conformer_transducer_small.nemo.
[NeMo I 2022-09-19 08:25:19 cloud:62] Re-using file from: /home/user/.cache/torch/NeMo/NeMo_1.11.0/stt_en_conformer_transducer_small/a755afe69952642a8410330876938b83/stt_en_conformer_transducer_small.nemo
[NeMo I 2022-09-19 08:25:19 common:910] Instantiating model from pre-trained checkpoint
[NeMo I 2022-09-19 08:25:21 mixins:170] Tokenizer SentencePieceTokenizer initialized with 1024 tokens


[NeMo W 2022-09-19 08:25:21 modelPT:142] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /data2/nemo_asr_set_2.0/RES/tarred_audio_manifest.json
    sample_rate: 16000
    batch_size: 16
    shuffle: true
    num_workers: 4
    pin_memory: true
    use_start_end_token: true
    trim_silence: false
    max_duration: 20
    min_duration: 0.1
    shuffle_n: 2048
    is_tarred: true
    tarred_audio_filepaths: /data2/nemo_asr_set_2.0/RES/audio__OP_0..4095_CL_.tar
    
[NeMo W 2022-09-19 08:25:21 modelPT:149] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath:
    - /manifests/librispeech/librivox-dev-other.json
    - /manifest

[NeMo I 2022-09-19 08:25:21 features:223] PADDING: 0


    


[NeMo I 2022-09-19 08:25:22 rnnt_models:203] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0}
[NeMo I 2022-09-19 08:25:22 audio_preprocessing:491] Numba CUDA SpecAugment kernel is being used
[NeMo I 2022-09-19 08:25:31 save_restore_connector:243] Model EncDecRNNTBPEModel was successfully restored from /home/user/.cache/torch/NeMo/NeMo_1.11.0/stt_en_conformer_transducer_small/a755afe69952642a8410330876938b83/stt_en_conformer_transducer_small.nemo.


In [None]:
# asr_model = nemo_asr.models.EncDecRNNTBPEModel.load_from_checkpoint('/home/user/TRANSDUCER_r1.11.0/EXP1/experiments/Transducer-Model-UZ-finetuning-Contextnet/2022-09-16_06-36-25/checkpoints/Transducer-Model-UZ-finetuning-Contextnet--val_wer=0.3657-epoch=2.ckpt')

In [22]:
model.encoder.load_state_dict(asr_model.encoder.state_dict(), strict=True)

model.decoder.load_state_dict(asr_model.decoder.state_dict(), strict=True)

model.joint.load_state_dict(asr_model.joint.state_dict(), strict=True)

<All keys matched successfully>

In [23]:
# Prepare NeMo's Experiment manager to handle checkpoint saving and logging for us
from nemo.utils import exp_manager

# Environment variable generally used for multi-node multi-gpu training.
# In notebook environments, this flag is unnecessary and can cause logs of multiple training runs to overwrite each other.
os.environ.pop('NEMO_EXPM_VERSION', None)

exp_config = exp_manager.ExpManagerConfig(
    exp_dir=f'experiments/',
    name=f"Transducer-Model-UZ-finetuning-Conformer-Transducer",
    checkpoint_callback_params=exp_manager.CallbackParams(
        monitor="val_wer",
        mode="min",
        always_save_nemo=True,
        save_best_model=True,
    ),
)

exp_config = OmegaConf.structured(exp_config)

logdir = exp_manager.exp_manager(trainer, exp_config)

[NeMo I 2022-09-19 08:25:39 exp_manager:286] Experiments will be logged at experiments/Transducer-Model-UZ-finetuning-Conformer-Transducer/2022-09-19_08-25-39
[NeMo I 2022-09-19 08:25:39 exp_manager:660] TensorboardLogger has been set up


      rank_zero_deprecation("`Trainer.weights_save_path` has been deprecated in v1.6 and will be removed in v1.8.")
    
[NeMo W 2022-09-19 08:25:39 exp_manager:899] The checkpoint callback was told to monitor a validation value and trainer's max_steps was set to -1. Please ensure that max_steps will run for at least 1 epochs to ensure that checkpointing will not error out.


In [24]:
# !tensorboard --bind_all --logdir .

In [25]:
# Release resources prior to training
import gc
gc.collect()

if accelerator == 'gpu':
  torch.cuda.empty_cache()

In [26]:
# Train the model
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[NeMo I 2022-09-19 08:25:49 modelPT:587] Optimizer config = AdamW (
    Parameter Group 0
        amsgrad: False
        betas: [0.9, 0.98]
        capturable: False
        eps: 1e-08
        foreach: None
        lr: 0.5
        maximize: False
        weight_decay: 0.0
    )
[NeMo I 2022-09-19 08:25:49 lr_scheduler:910] Scheduler "<nemo.core.optim.lr_scheduler.NoamAnnealing object at 0x7fdd204bf4c0>" 
    will be used during training (effective maximum steps = 311000) - 
    Parameters : 
    (d_model: 176
    warmup_steps: null
    warmup_ratio: null
    min_lr: 1.0e-06
    max_steps: 311000
    )



  | Name              | Type                              | Params
------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0     
1 | encoder           | ConformerEncoder                  | 13.0 M
2 | decoder           | RNNTDecoder                       | 1.1 M 
3 | joint             | RNNTJoint                         | 488 K 
4 | loss              | RNNTLoss                          | 0     
5 | spec_augmentation | SpectrogramAugmentation           | 0     
6 | wer               | RNNTBPEWER                        | 0     
------------------------------------------------------------------------
14.6 M    Trainable params
0         Non-trainable params
14.6 M    Total params
58.443    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 0, global step 3110: 'val_wer' reached 1.00000 (best 1.00000), saving model to '/home/user/TRANSDUCER_r1.11.0/EXP1/experiments/Transducer-Model-UZ-finetuning-Conformer-Transducer/2022-09-19_08-25-39/checkpoints/Transducer-Model-UZ-finetuning-Conformer-Transducer--val_wer=1.0000-epoch=0.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 1, global step 6220: 'val_wer' reached 1.00000 (best 1.00000), saving model to '/home/user/TRANSDUCER_r1.11.0/EXP1/experiments/Transducer-Model-UZ-finetuning-Conformer-Transducer/2022-09-19_08-25-39/checkpoints/Transducer-Model-UZ-finetuning-Conformer-Transducer--val_wer=1.0000-epoch=1.ckpt' as top 3
      rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
    


In [None]:
# trainer.test(model)