In [1]:
!pip install mltu



# **Importing the dependencies**

In [4]:
import tensorflow as tf
# Enable memory growth on every visible GPU. This stays best-effort (a failure must
# not stop the notebook), but the reason is reported instead of silently discarded,
# and KeyboardInterrupt/SystemExit are no longer swallowed.
try:
    for gpu in tf.config.experimental.list_physical_devices("GPU"):
        tf.config.experimental.set_memory_growth(gpu, True)
except Exception as error:
    print(f"Could not enable GPU memory growth: {error}")

import os
import tarfile
import pandas as pd
from tqdm import tqdm
from urllib.request import urlopen
from io import BytesIO

from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from mltu.preprocessors import WavReader

from mltu.tensorflow.dataProvider import DataProvider
from mltu.transformers import LabelIndexer, LabelPadding, SpectrogramPadding
from mltu.tensorflow.losses import CTCloss
from mltu.tensorflow.callbacks import Model2onnx, TrainLogger
from mltu.tensorflow.metrics import CERMetric, WERMetric

from configs import ModelConfigs

# Downloading the dataset

In [47]:
def download_and_unzip(url, extract_to="Datasets", chunk_size=1024*1024):
    """Download a bz2-compressed tar archive and extract it.

    The archive is read into memory in ``chunk_size`` pieces (with a tqdm
    progress bar) and then unpacked into ``extract_to``.

    Args:
        url: Location of the ``.tar.bz2`` archive.
        extract_to: Directory the archive members are extracted into.
        chunk_size: Number of bytes requested per read.
    """
    buffer = BytesIO()
    # The context manager closes the connection even when a read fails.
    with urlopen(url) as http_response:
        # ``length`` is None when the server sends no Content-Length header, so the
        # loop ends on the first empty read rather than on a precomputed chunk count.
        total_bytes = getattr(http_response, "length", None)
        with tqdm(total=total_bytes, unit="B", unit_scale=True) as progress:
            while True:
                chunk = http_response.read(chunk_size)
                if not chunk:
                    break
                buffer.write(chunk)
                progress.update(len(chunk))

    buffer.seek(0)
    with tarfile.open(fileobj=buffer, mode="r|bz2") as archive:
        # The archive comes from the network: where the running Python supports it,
        # use the "data" extraction filter so members cannot be written outside
        # ``extract_to``.
        if hasattr(tarfile, "data_filter"):
            archive.extractall(path=extract_to, filter="data")
        else:
            archive.extractall(path=extract_to)

In [None]:
# Fetch and unpack LJSpeech only when the extracted folder is not already on disk.
dataset_path = os.path.join("Datasets", "LJSpeech-1.1")
if not os.path.exists(dataset_path):
    download_and_unzip(
        "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2",
        extract_to="Datasets",
    )

In [5]:
# Locations inside the extracted LJSpeech folder: the metadata table and the clips.
dataset_path = "Datasets/LJSpeech-1.1"
metadata_path = f"{dataset_path}/metadata.csv"
wavs_path = f"{dataset_path}/wavs/"

## Parsing and reading metadata

In [6]:
# metadata.csv is pipe-separated with no header row. quoting=3 is csv.QUOTE_NONE, so
# quote characters inside the transcriptions are read as ordinary text.
# Only the file name and the normalised transcription are kept.
metadata_df = (
    pd.read_csv(metadata_path, sep="|", header=None, quoting=3)
    .set_axis(["file_name", "transcription", "normalized_transcription"], axis=1)
    .loc[:, ["file_name", "normalized_transcription"]]
)

In [7]:
# Pair every clip path with its lower-cased normalised transcription. Paths are built
# from wavs_path so the dataset location is defined in one place only.
dataset = [
    [f"{wavs_path}{file}.wav", label.lower()]
    for file, label in metadata_df.values.tolist()
]

## Configuring the model and saving the configuration

In [5]:
configs = ModelConfigs()

# Scan the whole dataset once to find the longest label and the longest spectrogram;
# both values are stored on the configuration and saved with it.
max_text_length, max_spectrogram_length = 0, 0
feature_count = None
for file_path, label in tqdm(dataset):
    spectrogram = WavReader.get_spectrogram(
        file_path,
        frame_length=configs.frame_length,
        frame_step=configs.frame_step,
        fft_length=configs.fft_length,
    )
    # Only characters present in the vocabulary count towards the label length.
    valid_label = [c for c in label if c in configs.vocab]
    max_text_length = max(max_text_length, len(valid_label))
    max_spectrogram_length = max(max_spectrogram_length, spectrogram.shape[0])
    feature_count = spectrogram.shape[1]

# Assigned once after the scan rather than on every iteration. With an empty dataset
# nothing was measured, so configs.input_shape is left as it was.
if feature_count is not None:
    configs.input_shape = [max_spectrogram_length, feature_count]

configs.max_spectrogram_length = max_spectrogram_length
configs.max_text_length = max_text_length
configs.save()

NameError: name 'dataset' is not defined

### Creating the Data Provider for the dataset

In [9]:
# Preprocessing step: WavReader is configured with the same framing parameters that
# were used when measuring the spectrogram lengths.
audio_preprocessors = [
    WavReader(
        frame_length=configs.frame_length,
        frame_step=configs.frame_step,
        fft_length=configs.fft_length,
    ),
]

# Transform steps, in order: pad spectrograms with 0 up to the longest one, index the
# label characters against the vocabulary, pad labels with the value len(configs.vocab).
batch_transformers = [
    SpectrogramPadding(max_spectrogram_length=configs.max_spectrogram_length, padding_value=0),
    LabelIndexer(configs.vocab),
    LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)),
]

data_provider = DataProvider(
    dataset=dataset,
    skip_validation=True,
    batch_size=configs.batch_size,
    data_preprocessors=audio_preprocessors,
    transformers=batch_transformers,
)

# **Model Building**

In [10]:
import tensorflow as tf
from keras import layers
from keras.models import Model

from mltu.tensorflow.model_utils import residual_block, activation_layer

In [11]:
def train_model(input_dim, output_dim, activation="leaky_relu", dropout=0.2):
    """Build the speech-to-text network: two Conv2D blocks, five BiLSTMs, a dense head.

    Args:
        input_dim: Shape of one input spectrogram, without the batch axis.
        output_dim: Number of vocabulary symbols; the softmax layer has one unit more
            than this (presumably the CTC blank - confirm against CTCloss).
        activation: Activation name forwarded to ``activation_layer`` after each
            convolution block and after the dense layer.
        dropout: Rate used by every Dropout layer.

    Returns:
        The uncompiled Keras ``Model``.
    """
    inputs = layers.Input(shape=input_dim, name="input", dtype=tf.float32)
    # Conv2D needs a channel axis, so a trailing axis of size one is appended.
    expanded = layers.Lambda(lambda x: tf.expand_dims(x, axis=-1))(inputs)

    # Convolution block 1
    x = layers.Conv2D(filters=32, kernel_size=[11, 41], strides=[2, 2], padding="same", use_bias=False)(expanded)
    x = layers.BatchNormalization()(x)
    x = activation_layer(x, activation=activation)

    # Convolution block 2
    x = layers.Conv2D(filters=32, kernel_size=[11, 21], strides=[1, 2], padding="same", use_bias=False)(x)
    x = layers.BatchNormalization()(x)
    x = activation_layer(x, activation=activation)

    # Merge the last two axes so the recurrent layers receive one vector per step.
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)

    # Five bidirectional LSTMs with dropout between them; none after the last one.
    recurrent_layer_count = 5
    for layer_index in range(recurrent_layer_count):
        x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
        if layer_index < recurrent_layer_count - 1:
            x = layers.Dropout(dropout)(x)

    # Dense head
    x = layers.Dense(256)(x)
    x = activation_layer(x, activation=activation)
    x = layers.Dropout(dropout)(x)

    output = layers.Dense(output_dim + 1, activation="softmax", dtype=tf.float32)(x)

    return Model(inputs=inputs, outputs=output)

In [12]:
# Split the provider into a training part and a validation part (split ratio 0.9).
train_data_provider, val_data_provider = data_provider.split(split=0.9)

In [13]:
# Build the network for the padded spectrogram shape, with dropout set to 0.5
# instead of train_model's default.
model = train_model(input_dim=configs.input_shape, output_dim=len(configs.vocab), dropout=0.5)

In [14]:
# Character and word error rates are tracked next to the CTC loss.
error_rate_metrics = [
    CERMetric(vocabulary=configs.vocab),
    WERMetric(vocabulary=configs.vocab),
]

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate),
    loss=CTCloss(),
    metrics=error_rate_metrics,
    run_eagerly=False,
)
model.summary(line_length=110)

Model: "model"
______________________________________________________________________________________________________________
 Layer (type)                                    Output Shape                                Param #          
 input (InputLayer)                              [(None, 1392, 193)]                         0                
                                                                                                              
 lambda (Lambda)                                 (None, 1392, 193, 1)                        0                
                                                                                                              
 conv2d (Conv2D)                                 (None, 696, 97, 32)                         14432            
                                                                                                              
 batch_normalization (BatchNormalization)        (None, 696, 97, 32)                         128 

In [15]:
# Every callback that watches val_CER treats lower as better.
# Stop after 20 epochs without improvement.
earlystopper = EarlyStopping(monitor="val_CER", patience=20, verbose=1, mode="min")
# Keep only the best checkpoint.
checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_CER", verbose=1, save_best_only=True, mode="min")
trainLogger = TrainLogger(configs.model_path)
tb_callback = TensorBoard(f"{configs.model_path}/logs", update_freq=1)
# mode is spelled out as "min" to match the callbacks above: "auto" would have to
# guess the direction from the name of a custom metric.
reduceLROnPlat = ReduceLROnPlateau(monitor="val_CER", factor=0.8, min_delta=1e-10, patience=5, verbose=1, mode="min")
# Given the same path the checkpoint callback writes to.
model2onnx = Model2onnx(f"{configs.model_path}/model.h5")

In [16]:
# Callbacks are passed in this order; model2onnx comes after checkpoint.
training_callbacks = [earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx]

model.fit(
    train_data_provider,
    validation_data=val_data_provider,
    epochs=configs.train_epochs,
    callbacks=training_callbacks,
    workers=configs.train_workers,
)

Epoch 1/40
Epoch 1: val_CER improved from inf to 0.63334, saving model to Models/05_sound_to_text\202311051747\model.h5
Epoch 2/40
Epoch 2: val_CER improved from 0.63334 to 0.17641, saving model to Models/05_sound_to_text\202311051747\model.h5
Epoch 3/40
Epoch 3: val_CER improved from 0.17641 to 0.12615, saving model to Models/05_sound_to_text\202311051747\model.h5
Epoch 4/40
Epoch 4: val_CER improved from 0.12615 to 0.09609, saving model to Models/05_sound_to_text\202311051747\model.h5
Epoch 5/40
Epoch 5: val_CER improved from 0.09609 to 0.07900, saving model to Models/05_sound_to_text\202311051747\model.h5
Epoch 6/40
Epoch 6: val_CER improved from 0.07900 to 0.07017, saving model to Models/05_sound_to_text\202311051747\model.h5
Epoch 7/40
Epoch 7: val_CER improved from 0.07017 to 0.06014, saving model to Models/05_sound_to_text\202311051747\model.h5
Epoch 8/40
Epoch 8: val_CER improved from 0.06014 to 0.05533, saving model to Models/05_sound_to_text\202311051747\model.h5
Epoch 9/40
E

<keras.callbacks.History at 0x242fb6f9cc0>

In [17]:
# Write both parts of the split as CSV files into the model folder.
train_data_provider.to_csv(os.path.join(configs.model_path, "train.csv"))
val_data_provider.to_csv(os.path.join(configs.model_path, "val.csv"))

# Use this run's model folder rather than a hard-coded, timestamped directory that
# only exists for one particular training run.
model2onnx = Model2onnx(f"{configs.model_path}/model.h5")


# **Testing**

In [1]:
import typing
import numpy as np

from mltu.inferenceModel import OnnxInferenceModel
from mltu.preprocessors import WavReader
from mltu.utils.text_utils import ctc_decoder, get_cer, get_wer

In [2]:
import pandas as pd
from tqdm import tqdm
from mltu.configs import BaseModelConfigs

In [3]:
class WavToTextModel(OnnxInferenceModel):
    """ONNX inference model that decodes its predictions into text."""

    def __init__(self, char_list: typing.Union[str, list], *args, **kwargs):
        """Store the character list; all other arguments go to the base class."""
        super().__init__(*args, **kwargs)
        # Characters handed to ctc_decoder when predictions are decoded.
        self.char_list = char_list

    def predict(self, data: np.ndarray):
        """Run the model on one input array and return the decoded text.

        A leading axis of size one is added before the array is passed to the
        model; the first decoded string is returned.
        """
        batch = np.expand_dims(data, axis=0)
        outputs = self.model.run(None, {self.input_name: batch})
        decoded = ctc_decoder(outputs[0], self.char_list)
        return decoded[0]

In [4]:
# Restore the configuration saved by the training run and load its model.
configs = BaseModelConfigs.load("Models/05_sound_to_text/202311051747/configs.yaml")
model = WavToTextModel(
    model_path=configs.model_path,
    char_list=configs.vocab,
    force_cpu=False,
)
# Despite the name, df is a plain list of rows; the evaluation loop unpacks each row
# as (wav_path, label).
df = pd.read_csv("Models/05_sound_to_text/202311051747/val.csv").to_numpy().tolist()

In [24]:
# Transcribe every validation clip and collect per-clip character/word error rates.
accum_cer, accum_wer = [], []
for wav_path, label in tqdm(df):
    spectrogram = WavReader.get_spectrogram(
        wav_path,
        frame_length=configs.frame_length,
        frame_step=configs.frame_step,
        fft_length=configs.fft_length,
    )
    # Zero-pad the end of the first axis up to configs.max_spectrogram_length.
    missing_frames = configs.max_spectrogram_length - spectrogram.shape[0]
    padded_spectrogram = np.pad(
        spectrogram,
        ((0, missing_frames), (0, 0)),
        mode="constant",
        constant_values=0,
    )

    text = model.predict(padded_spectrogram)
    # Compare against the label restricted to characters the vocabulary contains.
    true_label = "".join(char for char in label.lower() if char in configs.vocab)
    cer = get_cer(text, true_label)
    wer = get_wer(text, true_label)
    accum_cer.append(cer)
    accum_wer.append(wer)

print(f"Average CER: {np.average(accum_cer)}, Average WER: {np.average(accum_wer)}")

100%|██████████| 1310/1310 [02:47<00:00,  7.84it/s]

Average CER: 0.026012683139923647, Average WER: 0.10827923298364035





# **Predicting against our own audio**

In [5]:
import typing
import numpy as np
from IPython.display import Audio, display

from mltu.inferenceModel import OnnxInferenceModel
from mltu.preprocessors import WavReader
from mltu.utils.text_utils import ctc_decoder

In [6]:
class WavToTextModel(OnnxInferenceModel):
    """ONNX inference model whose predictions are decoded with ctc_decoder."""

    def __init__(self, char_list: typing.Union[str, list], *args, **kwargs):
        """Forward the remaining arguments to the base class and keep char_list."""
        super().__init__(*args, **kwargs)
        self.char_list = char_list

    def predict(self, data: np.ndarray):
        """Return the decoded text for a single input array."""
        # A leading axis of size one is added before the array is fed to the model.
        model_outputs = self.model.run(None, {self.input_name: np.expand_dims(data, axis=0)})
        # Only the first model output is decoded, and the first decoded string returned.
        return ctc_decoder(model_outputs[0], self.char_list)[0]

In [7]:
# Restore the saved configuration and load the model it points to.
configs = BaseModelConfigs.load("Models/05_sound_to_text/202311051747/configs.yaml")
model = WavToTextModel(
    model_path=configs.model_path,
    char_list=configs.vocab,
    force_cpu=False,
)

In [10]:
# Audio file to transcribe; the path is relative to the notebook's working directory.
wav_path = "test.wav"

In [11]:
spectrogram = WavReader.get_spectrogram(
    wav_path,
    frame_length=configs.frame_length,
    frame_step=configs.frame_step,
    fft_length=configs.fft_length,
)

# The spectrogram is padded up to configs.max_spectrogram_length frames. np.pad rejects
# a negative pad width, so a clip longer than that is truncated, with a notice,
# instead of failing with an opaque ValueError.
frame_limit = configs.max_spectrogram_length
if spectrogram.shape[0] > frame_limit:
    print(f"Warning: {wav_path} has {spectrogram.shape[0]} frames; truncating to {frame_limit}.")
    spectrogram = spectrogram[:frame_limit]

padded_spectrogram = np.pad(
    spectrogram,
    ((0, frame_limit - spectrogram.shape[0]), (0, 0)),
    mode="constant",
    constant_values=0,
)
predicted_text = model.predict(padded_spectrogram)

In [12]:
# Show an inline audio player for the clip, then print the predicted text.
display(Audio(wav_path))
print(f"Predicted: {predicted_text}")

Predicted: than in the same operations with ugly ones
