In [1]:
import tensorflow as tf
try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")]
except: pass

In [2]:

class CTCloss(tf.keras.losses.Loss):
    """ CTCLoss objec for training the model"""
    def __init__(self, name: str = "CTCloss") -> None:
        super(CTCloss, self).__init__()
        self.name = name
        self.loss_fn = tf.keras.backend.ctc_batch_cost

    def __call__(self, y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight=None) -> tf.Tensor:
        """ Compute the training batch CTC loss value"""
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

        loss = self.loss_fn(y_true, y_pred, input_length, label_length)

        return loss

In [3]:
class CERMetric(tf.keras.metrics.Metric):
    """A custom TensorFlow metric to compute the Character Error Rate (CER).
    
    Args:
        vocabulary: A string of the vocabulary used to encode the labels.
        name: (Optional) string name of the metric instance.
        **kwargs: Additional keyword arguments.
    """
    def __init__(self, vocabulary, name="CER", **kwargs):
        # Initialize the base Metric class
        super(CERMetric, self).__init__(name=name, **kwargs)
        
        # Initialize variables to keep track of the cumulative character/word error rates and counter
        self.cer_accumulator = tf.Variable(0.0, name="cer_accumulator", dtype=tf.float32)
        self.batch_counter = tf.Variable(0, name="batch_counter", dtype=tf.int32)
        
        # Store the vocabulary as an attribute
        self.vocabulary = tf.constant(list(vocabulary))

    @staticmethod
    def get_cer(pred_decoded, y_true, vocab, padding=-1):
        """ Calculates the character error rate (CER) between the predicted labels and true labels for a batch of input data.

        Args:
            pred_decoded (tf.Tensor): The predicted labels, with dtype=tf.int32, usually output from tf.keras.backend.ctc_decode
            y_true (tf.Tensor): The true labels, with dtype=tf.int32
            vocab (tf.Tensor): The vocabulary tensor, with dtype=tf.string
            padding (int, optional): The padding token when converting to sparse tensor. Defaults to -1.

        Returns:
            tf.Tensor: The CER between the predicted labels and true labels
        """
        # Keep only valid indices in the predicted labels tensor, replacing invalid indices with padding token
        vocab_length = tf.cast(tf.shape(vocab)[0], tf.int64)
        valid_pred_indices = tf.less(pred_decoded, vocab_length)
        valid_pred = tf.where(valid_pred_indices, pred_decoded, padding)

        # Keep only valid indices in the true labels tensor, replacing invalid indices with padding token
        y_true = tf.cast(y_true, tf.int64)
        valid_true_indices = tf.less(y_true, vocab_length)
        valid_true = tf.where(valid_true_indices, y_true, padding)

        # Convert the valid predicted labels tensor to a sparse tensor
        sparse_pred = tf.RaggedTensor.from_tensor(valid_pred, padding=padding).to_sparse()

        # Convert the valid true labels tensor to a sparse tensor
        sparse_true = tf.RaggedTensor.from_tensor(valid_true, padding=padding).to_sparse()

        # Calculate the normalized edit distance between the sparse predicted labels tensor and sparse true labels tensor
        distance = tf.edit_distance(sparse_pred, sparse_true, normalize=True)

        return distance

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Updates the state variables of the metric.

        Args:
            y_true: A tensor of true labels with shape (batch_size, sequence_length).
            y_pred: A tensor of predicted labels with shape (batch_size, sequence_length, num_classes).
            sample_weight: (Optional) a tensor of weights with shape (batch_size, sequence_length).
        """
        # Get the input shape and length
        input_shape = tf.keras.backend.shape(y_pred)
        input_length = tf.ones(shape=input_shape[0], dtype="int32") * tf.cast(input_shape[1], "int32")

        # Decode the predicted labels using greedy decoding
        decode_predicted, log = tf.keras.backend.ctc_decode(y_pred, input_length, greedy=True)

        # Calculate the normalized edit distance between the predicted labels and true labels tensors
        distance = self.get_cer(decode_predicted[0], y_true, self.vocabulary)

        # Add the sum of the distance tensor to the cer_accumulator variable
        self.cer_accumulator.assign_add(tf.reduce_sum(distance))
        
        # Increment the batch_counter by the batch size
        self.batch_counter.assign_add(input_shape[0])

    def result(self):
        """ Computes and returns the metric result.

        Returns:
            A TensorFlow float representing the CER (character error rate).
        """
        return tf.math.divide_no_nan(self.cer_accumulator, tf.cast(self.batch_counter, tf.float32))


In [4]:
class WERMetric(tf.keras.metrics.Metric):
    """A custom TensorFlow metric to compute the Word Error Rate (WER).
    
    Attributes:
        vocabulary: A string of the vocabulary used to encode the labels.
        name: (Optional) string name of the metric instance.
        **kwargs: Additional keyword arguments.
    """
    def __init__(self, vocabulary: str, name="WER", **kwargs):
        # Initialize the base Metric class
        super(WERMetric, self).__init__(name=name, **kwargs)
        
        # Initialize variables to keep track of the cumulative character/word error rates and counter
        self.wer_accumulator = tf.Variable(0.0, name="wer_accumulator", dtype=tf.float32)
        self.batch_counter = tf.Variable(0, name="batch_counter", dtype=tf.int32)
        
        # Store the vocabulary as an attribute
        self.vocabulary = tf.constant(list(vocabulary))

    @staticmethod
    def preprocess_dense(dense_input: tf.Tensor, vocab: tf.Tensor, padding=-1, separator="") -> tf.SparseTensor:
        """ Preprocess the dense input tensor to a sparse tensor with given vocabulary
        
        Args:
            dense_input (tf.Tensor): The dense input tensor, dtype=tf.int32
            vocab (tf.Tensor): The vocabulary tensor, dtype=tf.string
            padding (int, optional): The padding token when converting to sparse tensor. Defaults to -1.

        Returns:
            tf.SparseTensor: The sparse tensor with given vocabulary
        """
        # Keep only the valid indices of the dense input tensor
        vocab_length = tf.cast(tf.shape(vocab)[0], tf.int64)
        dense_input = tf.cast(dense_input, tf.int64)
        valid_indices = tf.less(dense_input, vocab_length)
        valid_input = tf.where(valid_indices, dense_input, padding)

        # Convert the valid input tensor to a ragged tensor with padding
        input_ragged = tf.RaggedTensor.from_tensor(valid_input, padding=padding)

        # Use the vocabulary tensor to get the strings corresponding to the indices in the ragged tensor
        input_binary_chars = tf.gather(vocab, input_ragged)

        # Join the binary character tensor along the sequence axis to get the input strings
        input_strings = tf.strings.reduce_join(input_binary_chars, axis=1, separator=separator)

        # Convert the input strings tensor to a sparse tensor
        input_sparse_string = tf.strings.split(input_strings, sep=" ").to_sparse()

        return input_sparse_string

    @staticmethod
    def get_wer(pred_decoded, y_true, vocab, padding=-1, separator=""):
        """ Calculate the normalized WER distance between the predicted labels and true labels tensors

        Args:
            pred_decoded (tf.Tensor): The predicted labels tensor, dtype=tf.int32. Usually output from tf.keras.backend.ctc_decode
            y_true (tf.Tensor): The true labels tensor, dtype=tf.int32
            vocab (tf.Tensor): The vocabulary tensor, dtype=tf.string

        Returns:
            tf.Tensor: The normalized WER distance between the predicted labels and true labels tensors
        """
        pred_sparse = WERMetric.preprocess_dense(pred_decoded, vocab, padding=padding, separator=separator)
        true_sparse = WERMetric.preprocess_dense(y_true, vocab, padding=padding, separator=separator)

        distance = tf.edit_distance(pred_sparse, true_sparse, normalize=True)

        # test with numerical labels not string
        # true_sparse = tf.RaggedTensor.from_tensor(y_true, padding=-1).to_sparse()

        # replace 23 with -1
        # pred_decoded2 = tf.where(tf.equal(pred_decoded, 23), -1, pred_decoded)
        # pred_decoded2_sparse = tf.RaggedTensor.from_tensor(pred_decoded2, padding=-1).to_sparse()

        # distance = tf.edit_distance(pred_decoded2_sparse, true_sparse, normalize=True)

        return distance

    def update_state(self, y_true, y_pred, sample_weight=None):
        """
        """
        # Get the input shape and length
        input_shape = tf.keras.backend.shape(y_pred)
        input_length = tf.ones(shape=input_shape[0], dtype="int32") * tf.cast(input_shape[1], "int32")

        # Decode the predicted labels using greedy decoding
        decode_predicted, log = tf.keras.backend.ctc_decode(y_pred, input_length, greedy=True)

        # Calculate the normalized edit distance between the predicted labels and true labels tensors
        distance = self.get_wer(decode_predicted[0], y_true, self.vocabulary)

        # Calculate the number of wrong words in batch and add to wer_accumulator variable
        self.wer_accumulator.assign_add(tf.reduce_sum(tf.cast(distance, tf.float32)))

        # Increment the batch_counter by the batch size
        self.batch_counter.assign_add(input_shape[0])

    def result(self):
        """Computes and returns the metric result.

        Returns:
            A TensorFlow float representing the WER (Word Error Rate).
        """
        return tf.math.divide_no_nan(self.wer_accumulator, tf.cast(self.batch_counter, tf.float32))

In [6]:
from tensorflow.keras.models import load_model
from tensorflow.keras.losses import Loss
# Define your custom loss function
class CTCloss(Loss):
    """ CTCLoss object for training the model"""
    def __init__(self, name='CTCloss', reduction='auto') -> None:
        super(CTCloss, self).__init__(name=name, reduction=reduction)

    def call(self, y_true, y_pred):
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

        input_length = input_length * tf.ones(shape=(batch_len,), dtype="int64")
        label_length = label_length * tf.ones(shape=(batch_len,), dtype="int64")

        loss = tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
        return loss

# Load the model with the custom loss function
model_path = "C:/Users/isrch/AI Dialogue Narration Agent/model.h5"
vocabulary = "abcdefghijklmnopqrstuvwxyz'?! "
model = load_model(model_path, custom_objects={'CTCloss': CTCloss(),'CERMetric': lambda **kwargs: CERMetric(vocabulary, **kwargs),'WERMetric': lambda **kwargs: WERMetric(vocabulary, **kwargs)})
# Load the model with the custom loss function
# model_path = ""C:\Users\isrch\AI Dialogue Narration Agent\model.h5""
# model = load_model(model_path, custom_objects={'custom_loss_function': CTCloss})


  function = cls._parse_function_from_config(


In [6]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 1392, 193)]       0         
                                                                 
 lambda (Lambda)             (None, 1392, 193, 1)      0         
                                                                 
 conv2d (Conv2D)             (None, 696, 97, 32)       14432     
                                                                 
 batch_normalization (BatchN  (None, 696, 97, 32)      128       
 ormalization)                                                   
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 696, 97, 32)       0         
                                                                 
 conv2d_1 (Conv2D)           (None, 696, 49, 32)       236544    
                                                             

In [11]:
pip install tqdm

Collecting tqdm
  Obtaining dependency information for tqdm from https://files.pythonhosted.org/packages/2a/14/e75e52d521442e2fcc9f1df3c5e456aead034203d4797867980de558ab34/tqdm-4.66.2-py3-none-any.whl.metadata
  Downloading tqdm-4.66.2-py3-none-any.whl.metadata (57 kB)
     ---------------------------------------- 0.0/57.6 kB ? eta -:--:--
     ---------------------------------------- 57.6/57.6 kB ? eta 0:00:00
Downloading tqdm-4.66.2-py3-none-any.whl (78 kB)
   ---------------------------------------- 0.0/78.3 kB ? eta -:--:--
   ---------------------------------------- 78.3/78.3 kB ? eta 0:00:00
Installing collected packages: tqdm
Successfully installed tqdm-4.66.2
Note: you may need to restart the kernel to use updated packages.


In [12]:
import os
import pandas as pd
from tqdm import tqdm

# Define paths (replace with actual paths)
dataset_path = "D:/4-2-dataset"
metadata_path = os.path.join(dataset_path,"dataset.csv")
wavs_path = os.path.join(dataset_path, "wavs")

# Read metadata file and parse it
metadata_df = pd.read_csv(metadata_path, header=None)
metadata_df.columns = ["file_name", "transcription"]  # Assign column names for the two columns

# Structure the dataset where each row is a list of [wav_file_path, sound transcription]
dataset = [[os.path.join(wavs_path, file), label.lower()] for file, label in tqdm(metadata_df.values)]

# Create a ModelConfigs object to store model configurations
configs = ModelConfigs()

max_text_length, max_spectrogram_length = 0, 0
for file_path, label in tqdm(dataset):
    spectrogram = WavReader.get_spectrogram(file_path, frame_length=configs.frame_length, frame_step=configs.frame_step, fft_length=configs.fft_length)
    valid_label = [c for c in label if c in configs.vocab]
    max_text_length = max(max_text_length, len(valid_label))
    max_spectrogram_length = max(max_spectrogram_length, spectrogram.shape[0])
    configs.input_shape = [max_spectrogram_length, spectrogram.shape[1]]

configs.max_spectrogram_length = max_spectrogram_length
configs.max_text_length = max_text_length
configs.save()

FileNotFoundError: [Errno 2] No such file or directory: 'D:/4-2-dataset\\dataset.csv'

In [7]:
import os
import tarfile
import pandas as pd
from tqdm import tqdm
from urllib.request import urlopen
from io import BytesIO

from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from mltu.preprocessors import WavReader

from mltu.tensorflow.dataProvider import DataProvider
from mltu.transformers import LabelIndexer, LabelPadding, SpectrogramPadding
from mltu.tensorflow.losses import CTCloss
from mltu.tensorflow.callbacks import Model2onnx, TrainLogger
from mltu.tensorflow.metrics import CERMetric, WERMetric


from configs import ModelConfigs

ModuleNotFoundError: No module named 'pandas'

In [10]:
print(dataset[0])

['D:\\4-2-dataset\\wavs\\Recording (1).wav', 'you always want to play games or get attention from me while i’m studying or busy at work.']


In [11]:
data_provider = DataProvider(
    dataset=dataset,
    skip_validation=True,
    batch_size=configs.batch_size,
    data_preprocessors=[
        WavReader(frame_length=configs.frame_length, frame_step=configs.frame_step, fft_length=configs.fft_length),
        ],
    transformers=[
        SpectrogramPadding(max_spectrogram_length=configs.max_spectrogram_length, padding_value=0),
        LabelIndexer(configs.vocab),
        LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)),
        ],
)

# Split the dataset into training and validation sets
train_data_provider, val_data_provider = data_provider.split(split = 0.9)

In [12]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate), 
    loss=CTCloss(), 
    metrics=[
        CERMetric(vocabulary=configs.vocab),
        WERMetric(vocabulary=configs.vocab)
        ],
    run_eagerly=False
)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 1392, 193)]       0         
                                                                 
 lambda (Lambda)             (None, 1392, 193, 1)      0         
                                                                 
 conv2d (Conv2D)             (None, 696, 97, 32)       14432     
                                                                 
 batch_normalization (BatchN  (None, 696, 97, 32)      128       
 ormalization)                                                   
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 696, 97, 32)       0         
                                                                 
 conv2d_1 (Conv2D)           (None, 696, 49, 32)       236544    
                                                             

In [13]:
earlystopper = EarlyStopping(monitor="val_CER", patience=20, verbose=1, mode="min")
checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_CER", verbose=1, save_best_only=True, mode="min")
trainLogger = TrainLogger(configs.model_path)
tb_callback = TensorBoard(f"{configs.model_path}/logs", update_freq=1)
reduceLROnPlat = ReduceLROnPlateau(monitor="val_CER", factor=0.8, min_delta=1e-10, patience=5, verbose=1, mode="auto")
model2onnx = Model2onnx(f"{configs.model_path}/model.h5")

In [14]:
model.fit(
    train_data_provider,
    validation_data=val_data_provider,
    epochs=100,
    callbacks=[earlystopper,checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx],
    workers=configs.train_workers
)

Epoch 1/100
Epoch 1: val_CER improved from inf to 0.57776, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 2/100
Epoch 2: val_CER improved from 0.57776 to 0.52088, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 3/100
Epoch 3: val_CER improved from 0.52088 to 0.48107, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 4/100
Epoch 4: val_CER improved from 0.48107 to 0.44644, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 5/100
Epoch 5: val_CER improved from 0.44644 to 0.41203, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 6/100
Epoch 6: val_CER improved from 0.41203 to 0.38443, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 7/100
Epoch 7: val_CER improved from 0.38443 to 0.35486, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 8/100
Epoch 8: val_CER improved from 0.35486 to 0.32765, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoc

Epoch 22/100
Epoch 22: val_CER improved from 0.20693 to 0.20432, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 23/100
Epoch 23: val_CER improved from 0.20432 to 0.20001, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 24/100
Epoch 24: val_CER improved from 0.20001 to 0.19859, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 25/100
Epoch 25: val_CER did not improve from 0.19859
Epoch 26/100
Epoch 26: val_CER improved from 0.19859 to 0.19802, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 27/100
Epoch 27: val_CER improved from 0.19802 to 0.19448, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 28/100
Epoch 28: val_CER improved from 0.19448 to 0.18847, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 29/100
Epoch 29: val_CER improved from 0.18847 to 0.18761, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 30/100
Epoch 30: val_CER improved from 0.1876

Epoch 45/100
Epoch 45: val_CER did not improve from 0.18101
Epoch 46/100
Epoch 46: val_CER did not improve from 0.18101
Epoch 47/100
Epoch 47: val_CER improved from 0.18101 to 0.17837, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 48/100
Epoch 48: val_CER did not improve from 0.17837
Epoch 49/100
Epoch 49: val_CER improved from 0.17837 to 0.17703, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 50/100
Epoch 50: val_CER improved from 0.17703 to 0.17587, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 51/100
Epoch 51: val_CER improved from 0.17587 to 0.17420, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 52/100
Epoch 52: val_CER did not improve from 0.17420
Epoch 53/100
Epoch 53: val_CER did not improve from 0.17420
Epoch 54/100
Epoch 54: val_CER improved from 0.17420 to 0.17163, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 55/100
Epoch 55: val_CER improved from 0.17163 to 0.17087, 

Epoch 68/100
Epoch 68: val_CER did not improve from 0.16313
Epoch 69/100
Epoch 69: val_CER did not improve from 0.16313
Epoch 70/100
Epoch 70: val_CER did not improve from 0.16313
Epoch 71/100
Epoch 71: val_CER improved from 0.16313 to 0.16017, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 72/100
Epoch 72: val_CER did not improve from 0.16017
Epoch 73/100
Epoch 73: val_CER did not improve from 0.16017
Epoch 74/100
Epoch 74: val_CER did not improve from 0.16017
Epoch 75/100
Epoch 75: val_CER did not improve from 0.16017
Epoch 76/100
Epoch 76: val_CER did not improve from 0.16017

Epoch 76: ReduceLROnPlateau reducing learning rate to 0.00020480002276599408.
Epoch 77/100
Epoch 77: val_CER did not improve from 0.16017
Epoch 78/100
Epoch 78: val_CER did not improve from 0.16017
Epoch 79/100
Epoch 79: val_CER did not improve from 0.16017
Epoch 80/100
Epoch 80: val_CER improved from 0.16017 to 0.15849, saving model to Models/05_sound_to_text\202404231811\model.h5
Epoch 8

<keras.callbacks.History at 0x21818607250>

In [23]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from mltu.configs import BaseModelConfigs
from mltu.inferenceModel import OnnxInferenceModel
from mltu.preprocessors import WavReader
from mltu.utils.text_utils import ctc_decoder, get_cer, get_wer

class WavToTextModel(OnnxInferenceModel):
    def _init_(self, char_list: str, *args, **kwargs):
        super()._init_(*args, **kwargs)
        self.char_list = char_list

    def predict(self, data: np.ndarray):
        data_pred = np.expand_dims(data, axis=0)
        preds = self.model.run(None, {self.input_name: data_pred})[0]
        text = ctc_decoder(preds, self.char_list)[0]
        return text

if __name__ == "__main__":
    # Load model configurations
    configs = BaseModelConfigs.load(r"C:\Users\isrch\AI Dialogue Narration Agent\Models\05_sound_to_text\202404231811\configs.yaml")

    # Initialize the model
    model = WavToTextModel(model_path=configs.model_path, char_list=configs.vocab, force_cpu=False)

    # Load data
    df = pd.read_csv(r"C:\Users\isrch\AI Dialogue Narration Agent\val.csv").values.tolist()

    # Initialize lists to store CER and WER
    accum_cer, accum_wer = [], []

    # Iterate over data
    for wav_path, label in tqdm(df):
        wav_path = wav_path.replace("\\", "/")

        # Get spectrogram
        spectrogram = WavReader.get_spectrogram(wav_path, frame_length=configs.frame_length, frame_step=configs.frame_step, fft_length=configs.fft_length)

        # Plot raw audio
#         WavReader.plot_raw_audio(wav_path, label)

        # Adjust spectrogram size
        if spectrogram.shape[0] < 1392:
            padding = 1392 - spectrogram.shape[0]
            spectrogram = np.pad(spectrogram, ((0, padding), (0, 0)), mode='constant', constant_values=0)
        elif spectrogram.shape[0] > 1392:
            spectrogram = spectrogram[:1392, :]

        # Plot spectrogram
#         WavReader.plot_spectrogram(spectrogram, label)

        # Make prediction
        text = model.predict(spectrogram)

        # Calculate CER and WER
        print(text)

  0%|          | 0/1310 [00:00<?, ?it/s]


AttributeError: 'WavToTextModel' object has no attribute 'input_name'