In [2]:
def _encoder_block(filters, nlayers=5, strides=1, residual=True):
    """Build the settings dict for a single encoder block.

    Every block in this configuration uses kernel size 5 and the "silu"
    activation, so only the fields that actually vary are parameters.
    """
    return {
        "nlayers": nlayers,
        "kernel_size": 5,
        "filters": filters,
        "strides": strides,
        "residual": residual,
        "activation": "silu",
    }


# The 23 encoder blocks, in order. Each call yields a fresh dict, so no two
# list entries share the same object.
_encoder_blocks = (
    # Entry block: single layer, no residual connection.
    [_encoder_block(256, nlayers=1, residual=False)]
    # 256-filter section; the blocks at list indices 3 and 7 use stride 2.
    + [_encoder_block(256) for _ in range(2)]
    + [_encoder_block(256, strides=2)]
    + [_encoder_block(256) for _ in range(3)]
    + [_encoder_block(256, strides=2)]
    + [_encoder_block(256) for _ in range(3)]
    # 512-filter section; the block at list index 14 uses stride 2.
    + [_encoder_block(512) for _ in range(3)]
    + [_encoder_block(512, strides=2)]
    + [_encoder_block(512) for _ in range(7)]
    # Exit block: single layer, 640 filters, no residual connection.
    + [_encoder_block(640, nlayers=1, residual=False)]
)

# Raw configuration dictionary; a later cell wraps it with `Config(config)`.
config = {
    # Settings handed to the speech featurizer.
    "speech_config": {
        "sample_rate": 16000,
        "frame_ms": 25,
        "stride_ms": 10,
        "num_feature_bins": 80,
        "feature_type": "log_mel_spectrogram",
        "preemphasis": 0.97,
        "normalize_signal": True,
        "normalize_feature": True,
        "normalize_per_frame": False,
    },
    # Settings handed to the text featurizer.
    "decoder_config": {
        "vocabulary": None,
        "target_vocab_size": 1024,
        "max_subword_length": 4,
        "blank_at_zero": True,
        "beam_width": 5,
        "norm_score": True,
    },
    # Keyword arguments for the model constructor.
    "model_config": {
        "name": "contextnet",
        "encoder_alpha": 0.5,
        "encoder_blocks": _encoder_blocks,
        "prediction_embed_dim": 640,
        "prediction_embed_dropout": 0,
        "prediction_num_rnns": 1,
        "prediction_rnn_units": 640,
        "prediction_rnn_type": "lstm",
        "prediction_rnn_implementation": 1,
        "prediction_layer_norm": True,
        "prediction_projection_units": 0,
        "joint_dim": 640,
        "joint_activation": "tanh",
    },
    "learning_config": {
        # NOTE(review): the /mnt/h and /mnt/e paths below are absolute and
        # host-specific; the recorded run of this notebook ended with
        # "PermissionDeniedError: /mnt/h; Permission denied" -- confirm they
        # exist and are accessible on the machine that executes the notebook.
        "train_dataset_config": {
            "use_tf": True,
            "augmentation_config": {
                "feature_augment": {
                    "time_masking": {
                        "num_masks": 10,
                        "mask_factor": 100,
                        "p_upperbound": 0.05,
                    },
                    "freq_masking": {"num_masks": 1, "mask_factor": 27},
                }
            },
            "data_paths": [
                "/mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv"
            ],
            "tfrecords_dir": None,
            "shuffle": True,
            "cache": True,
            "buffer_size": 100,
            "drop_remainder": True,
            "stage": "train",
        },
        # NOTE(review): no eval transcripts are configured (data_paths is
        # None) although the training cell builds an eval dataset -- confirm.
        "eval_dataset_config": {
            "use_tf": True,
            "data_paths": None,
            "tfrecords_dir": None,
            "shuffle": False,
            "cache": True,
            "buffer_size": 100,
            "drop_remainder": True,
            "stage": "eval",
        },
        "test_dataset_config": {
            "use_tf": True,
            "data_paths": [
                "/mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/test-clean/transcripts.tsv"
            ],
            "tfrecords_dir": None,
            "shuffle": False,
            "cache": True,
            "buffer_size": 100,
            "drop_remainder": True,
            "stage": "test",
        },
        # "warmup_steps" is popped by the training cell for the learning-rate
        # schedule; the remaining keys are passed on to the Adam optimizer.
        "optimizer_config": {
            "warmup_steps": 40000,
            "beta_1": 0.9,
            "beta_2": 0.98,
            "epsilon": 1e-09,
        },
        "running_config": {
            "batch_size": 2,
            "num_epochs": 20,
            # Keyword arguments for tf.keras.callbacks.ModelCheckpoint.
            "checkpoint": {
                "filepath": "/mnt/e/Models/local/contextnet/checkpoints/{epoch:02d}.h5",
                "save_best_only": False,
                "save_weights_only": True,
                "save_freq": "epoch",
            },
            "states_dir": "/mnt/e/Models/local/contextnet/states",
            # Keyword arguments for tf.keras.callbacks.TensorBoard.
            "tensorboard": {
                "log_dir": "/mnt/e/Models/local/contextnet/tensorboard",
                "histogram_freq": 1,
                "write_graph": True,
                "write_images": True,
                "update_freq": "epoch",
                "profile_batch": 2,
            },
        },
    },
}

In [1]:
import sys

# Local TensorFlowASR checkout; presumably the location the later
# `tensorflow_asr` imports are resolved from -- confirm on the target host.
_TENSORFLOW_ASR_ROOT = '/mydata/hassan/TensorFlowASR'

# Notebook cells are routinely re-executed; guard the append so repeated runs
# do not keep adding duplicate entries to sys.path.
if _TENSORFLOW_ASR_ROOT not in sys.path:
    sys.path.append(_TENSORFLOW_ASR_ROOT)

In [3]:
# Per-stage dataset metadata; the training cell passes this dict to
# `load_metadata` on both the train and eval datasets.
# NOTE(review): 281241 / 5567 entries look like statistics for a larger
# corpus than the single train-clean-100 transcript configured as training
# data -- confirm these numbers match the data actually used.
metadata = {
    "train": {
        "max_input_length": 2974,
        "max_label_length": 194,
        "num_entries": 281241,
    },
    "eval": {
        "max_input_length": 3516,
        "max_label_length": 186,
        "num_entries": 5567,
    },
}

In [4]:
# Training cell: builds featurizers, datasets, the ContextNet model and its
# optimizer from the `config` / `metadata` dictionaries defined in earlier
# cells, then runs `fit` with checkpoint, backup and TensorBoard callbacks.
#
# NOTE(review): the recorded run of this cell ended with
# "PermissionDeniedError: /mnt/h; Permission denied". The only /mnt/h
# references in the notebook are the dataset transcript paths in `config`, so
# presumably those paths are not accessible on the executing host -- confirm.
import os
import math
import argparse
from tensorflow_asr.utils import env_util

# Called before TensorFlow is imported; presumably it sets process-level
# options that must be in place at import time -- confirm in env_util.
env_util.setup_environment()
import tensorflow as tf

# Reset Keras global state so re-running the cell starts from a clean session.
tf.keras.backend.clear_session()
# Turn on the graph optimizer's "auto_mixed_precision" option.
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
# [0] is presumably the list of device indices to use; the recorded output
# shows a MirroredStrategy on CPU:0 because no GPU was found on the host.
strategy = env_util.setup_strategy([0])

from tensorflow_asr.configs.config import Config
from tensorflow_asr.datasets import asr_dataset
from tensorflow_asr.featurizers import speech_featurizers, text_featurizers
from tensorflow_asr.models.transducer.contextnet import ContextNet
from tensorflow_asr.optimizers.schedules import TransformerSchedule

# Rebinds `config` from the raw dict to a Config object. Re-running this cell
# without first re-running the config cell passes a Config instance (not a
# dict) into Config(...), which may fail.
config = Config(config)
speech_featurizer = speech_featurizers.TFSpeechFeaturizer(config.speech_config)

# NOTE(review): decoder_config carries target_vocab_size / max_subword_length,
# which look like subword settings, while a character-level featurizer is
# constructed here -- confirm this combination is intended.
text_featurizer = text_featurizers.CharFeaturizer(config.decoder_config)

# The attributes of each dataset config object are expanded into keyword
# arguments of the dataset constructor.
train_dataset = asr_dataset.ASRSliceDataset(
    speech_featurizer=speech_featurizer,
    text_featurizer=text_featurizer,
    **vars(config.learning_config.train_dataset_config),
    indefinite=True
)
# NOTE(review): eval_dataset_config.data_paths is None in the config cell --
# confirm where validation data is expected to come from.
eval_dataset = asr_dataset.ASRSliceDataset(
    speech_featurizer=speech_featurizer,
    text_featurizer=text_featurizer,
    **vars(config.learning_config.eval_dataset_config),
    indefinite=True
)

# Feed the pre-computed `metadata` dict to both datasets, then reset the
# featurizers' length state.
train_dataset.load_metadata(metadata)
eval_dataset.load_metadata(metadata)
speech_featurizer.reset_length()
text_featurizer.reset_length()

# Global batch size = configured batch size x number of replicas in sync.
global_batch_size = config.learning_config.running_config.batch_size
global_batch_size *= strategy.num_replicas_in_sync

train_data_loader = train_dataset.create(global_batch_size)
eval_data_loader = eval_dataset.create(global_batch_size)

# Model, optimizer and compile step are all created under the strategy scope.
with strategy.scope():
    # build model
    contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes)
    contextnet.make(speech_featurizer.shape)
    contextnet.summary(line_length=100)

    # "warmup_steps" is popped out of optimizer_config so that only the
    # remaining keys are forwarded to Adam below. The pop mutates the config:
    # on a second run against the same Config object the key is gone and the
    # fallback of 10000 is used instead.
    optimizer = tf.keras.optimizers.Adam(
        TransformerSchedule(
            d_model=contextnet.dmodel,
            warmup_steps=config.learning_config.optimizer_config.pop("warmup_steps", 10000),
            max_lr=(0.05 / math.sqrt(contextnet.dmodel))
        ),
        **config.learning_config.optimizer_config
    )

    # NOTE(review): `experimental_steps_per_execution` looks like the older
    # spelling of Keras' `steps_per_execution` -- confirm it is still accepted
    # by the installed TensorFlow / TensorFlowASR versions.
    contextnet.compile(
        optimizer=optimizer,
        experimental_steps_per_execution=10,
        global_batch_size=global_batch_size,
        blank=text_featurizer.blank
    )

# Callbacks are configured from running_config: weight checkpoints,
# backup-and-restore state in states_dir, and TensorBoard logging.
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(**config.learning_config.running_config.checkpoint),
    tf.keras.callbacks.experimental.BackupAndRestore(config.learning_config.running_config.states_dir),
    tf.keras.callbacks.TensorBoard(**config.learning_config.running_config.tensorboard)
]

# Both datasets were created with indefinite=True, so the number of steps per
# epoch and per validation pass is given explicitly from the datasets'
# `total_steps`.
contextnet.fit(
    train_data_loader,
    epochs=config.learning_config.running_config.num_epochs,
    validation_data=eval_data_loader,
    callbacks=callbacks,
    steps_per_epoch=train_dataset.total_steps,
    validation_steps=eval_dataset.total_steps
)

2023-05-17 09:46:06.876000: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-05-17 09:46:06.876023: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


2023-05-17 09:46:08.136288: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-05-17 09:46:08.136310: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-05-17 09:46:08.136342: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (node0.tempnode.dsdm-pg0.clemson.cloudlab.us): /proc/driver/nvidia/version does not exist
2023-05-17 09:46:08.136585: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


INFO:tensorflow:Use RNNT loss in TensorFlow


PermissionDeniedError: /mnt/h; Permission denied