In [None]:
import gdown
# Fetch the zipped dataset from Google Drive into the working directory.
url = 'https://drive.google.com/u/0/uc?id=1Qr3YGKdHVmT5Wfn25X2Ts2jE_XfcW1Yr&export=download'
output = 'dataset.zip'
gdown.download(url=url, output=output, quiet=False)

# Unpack the archive that was just downloaded; the value returned by
# extractall is the last expression and therefore becomes the cell output.
gdown.extractall(output)

Downloading...
From: https://drive.google.com/u/0/uc?id=1Qr3YGKdHVmT5Wfn25X2Ts2jE_XfcW1Yr&export=download
To: /content/dataset.zip
100%|██████████| 358M/358M [00:08<00:00, 40.0MB/s]


['dataset/metadata.csv',
 'dataset/waves/',
 'dataset/waves/bbaf2n.wav',
 'dataset/waves/bbaf3s.wav',
 'dataset/waves/bbaf4p.wav',
 'dataset/waves/bbaf5a.wav',
 'dataset/waves/bbal6n.wav',
 'dataset/waves/bbal7s.wav',
 'dataset/waves/bbal8p.wav',
 'dataset/waves/bbal9a.wav',
 'dataset/waves/bbas1s.wav',
 'dataset/waves/bbas2p.wav',
 'dataset/waves/bbas3a.wav',
 'dataset/waves/bbaszn.wav',
 'dataset/waves/bbaz4n.wav',
 'dataset/waves/bbaz5s.wav',
 'dataset/waves/bbaz6p.wav',
 'dataset/waves/bbaz7a.wav',
 'dataset/waves/bbbf6n.wav',
 'dataset/waves/bbbf7s.wav',
 'dataset/waves/bbbf8p.wav',
 'dataset/waves/bbbf9a.wav',
 'dataset/waves/bbbm1s.wav',
 'dataset/waves/bbbm2p.wav',
 'dataset/waves/bbbm3a.wav',
 'dataset/waves/bbbmzn.wav',
 'dataset/waves/bbbs4n.wav',
 'dataset/waves/bbbs5s.wav',
 'dataset/waves/bbbs6p.wav',
 'dataset/waves/bbbs7a.wav',
 'dataset/waves/bbbz8n.wav',
 'dataset/waves/bbbz9s.wav',
 'dataset/waves/bbie8n.wav',
 'dataset/waves/bbie9s.wav',
 'dataset/waves/bbif1a.wav',

In [None]:
!pip install mltu



In [None]:
!pip install tf2onnx



In [None]:
!pip install h5py



In [None]:
import tensorflow as tf


import os
import tarfile
import pandas as pd
from tqdm import tqdm
from urllib.request import urlopen
from io import BytesIO

from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from mltu.preprocessors import WavReader

from mltu.tensorflow.dataProvider import DataProvider
from mltu.transformers import LabelIndexer, LabelPadding, SpectrogramPadding
from mltu.tensorflow.losses import CTCloss
from mltu.tensorflow.callbacks import Model2onnx, TrainLogger
from mltu.tensorflow.metrics import CERMetric, WERMetric

In [None]:
import h5py

In [None]:
import os
from datetime import datetime

from mltu.configs import BaseModelConfigs


class ModelConfigs(BaseModelConfigs):
    """Hyper-parameters and output location for one sound-to-text training run."""

    def __init__(self):
        super().__init__()

        # Every instantiation gets its own output directory, named after the
        # current local time with minute resolution.
        run_stamp = datetime.now().strftime("%Y%m%d%H%M")
        self.model_path = os.path.join("Models/05_sound_to_text", run_stamp)

        # Spectrogram settings; other cells of this notebook pass them to WavReader.
        self.frame_length = 256
        self.frame_step = 160
        self.fft_length = 384

        # Characters kept from the transcriptions (lower-case letters and space).
        self.vocab = "abcdefghijklmnopqrstuvwxyz "

        # Placeholders: filled in later, once the dataset has been scanned.
        self.input_shape = None
        self.max_text_length = None
        self.max_spectrogram_length = None

        # Training-loop settings.
        self.batch_size = 8
        self.learning_rate = 0.0005
        self.train_epochs = 1000
        self.train_workers = 20

In [None]:
import tensorflow as tf
from keras import layers
from keras.models import Model

from mltu.tensorflow.model_utils import residual_block, activation_layer


def train_model(input_dim, output_dim, activation="leaky_relu", dropout=0.2):
    """Build the (uncompiled) convolutional + bidirectional-LSTM acoustic model.

    The network is: two strided Conv2D/BatchNorm/activation stages, a reshape
    that merges the last two axes, five bidirectional LSTM layers (dropout
    after the first four), a Dense(256) stage and a softmax classifier.

    Args:
        input_dim: Shape handed to ``layers.Input`` (batch axis excluded). A
            trailing channel axis is appended inside the model, so a 2-D
            spectrogram shape is expected here.
        output_dim: Number of vocabulary symbols. The classifier emits
            ``output_dim + 1`` classes (the extra one is presumably the CTC
            blank -- confirm against the loss being used).
        activation: Activation name forwarded to ``activation_layer`` for both
            convolution stages and the dense stage.
        dropout: Dropout rate applied after the first four recurrent layers
            and after the dense stage.

    Returns:
        A ``keras.models.Model`` mapping the spectrogram input to per-step
        class probabilities.
    """
    inputs = layers.Input(shape=input_dim, name="input", dtype=tf.float32)

    # Append a channel axis so the 2-D convolutions can consume the input.
    # (Named `expanded` rather than `input` to avoid shadowing the builtin.)
    expanded = layers.Lambda(lambda x: tf.expand_dims(x, axis=-1))(inputs)

    # Convolution layer 1
    x = layers.Conv2D(filters=32, kernel_size=[11, 41], strides=[2, 2], padding="same", use_bias=False)(expanded)
    x = layers.BatchNormalization()(x)
    # Honour the caller-supplied activation instead of a hard-coded name.
    x = activation_layer(x, activation=activation)

    # Convolution layer 2
    x = layers.Conv2D(filters=32, kernel_size=[11, 21], strides=[1, 2], padding="same", use_bias=False)(x)
    x = layers.BatchNormalization()(x)
    x = activation_layer(x, activation=activation)

    # Merge the last two axes so the recurrent layers receive one feature
    # vector per step along the remaining axis.
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)

    # RNN layers: four BiLSTM + dropout pairs, then a final BiLSTM with no
    # dropout. Layers are created in the same order as before, so Keras'
    # auto-generated layer names are unchanged.
    for _ in range(4):
        x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
        x = layers.Dropout(dropout)(x)
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)

    # Dense layer
    x = layers.Dense(256)(x)
    x = activation_layer(x, activation=activation)
    x = layers.Dropout(dropout)(x)

    # Classification layer
    output = layers.Dense(output_dim + 1, activation="softmax", dtype=tf.float32)(x)

    model = Model(inputs=inputs, outputs=output)
    return model

In [None]:
import tensorflow as tf


import os
import tarfile
import pandas as pd
from tqdm import tqdm
from urllib.request import urlopen
from io import BytesIO

from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from mltu.preprocessors import WavReader

from mltu.tensorflow.dataProvider import DataProvider
from mltu.transformers import LabelIndexer, LabelPadding, SpectrogramPadding
from mltu.tensorflow.losses import CTCloss
from mltu.tensorflow.callbacks import Model2onnx, TrainLogger
from mltu.tensorflow.metrics import CERMetric, WERMetric

In [None]:
dataset_path = "/content/dataset"
metadata_path = dataset_path + "/metadata.csv"
wavs_path = dataset_path + "/waves/"

# Read the pipe-separated, header-less metadata file.
# quoting=3 is csv.QUOTE_NONE: quote characters are treated as ordinary text.
metadata_df = pd.read_csv(metadata_path, sep="|", header=None, quoting=3)
metadata_df.columns = ["file_name", "normalized_transcription"]

# Structure the dataset so each row is [wav_file_path, lower-cased transcription].
# Paths are built from wavs_path so they stay consistent with dataset_path
# instead of relying on the current working directory.
dataset = [
    [os.path.join(wavs_path, f"{file}.wav"), label.lower()]
    for file, label in metadata_df.values.tolist()
]

In [None]:
# Preview only the first few rows; echoing the full list floods the notebook output.
dataset[:5]

[['dataset/waves/brbm9a.wav', 'bin red by m nine again'],
 ['dataset/waves/pgay3a.wav', 'place green at y three again'],
 ['dataset/waves/sbaa5s.wav', 'set blue at a five soon'],
 ['dataset/waves/bbie9s.wav', 'bin blue in e nine soon'],
 ['dataset/waves/pbib6n.wav', 'place blue in b six now'],
 ['dataset/waves/swbi5s.wav', 'set white by i five soon'],
 ['dataset/waves/srbizp.wav', 'set red by i zero please'],
 ['dataset/waves/bwwuzn.wav', 'bin white with u zero now'],
 ['dataset/waves/swbo9s.wav', 'set white by o nine soon'],
 ['dataset/waves/pbwp9a.wav', 'place blue with p nine again'],
 ['dataset/waves/lwbz4n.wav', 'lay white by z four now'],
 ['dataset/waves/lgbm2n.wav', 'lay green by m two now'],
 ['dataset/waves/lriq7s.wav', 'lay red in q seven soon'],
 ['dataset/waves/lgil7a.wav', 'lay green in l seven again'],
 ['dataset/waves/pgix7s.wav', 'place green in x seven soon'],
 ['dataset/waves/srao1a.wav', 'set red at o one again'],
 ['dataset/waves/srau2n.wav', 'set red at u two now'

In [None]:
# Instantiate the run configuration; ModelConfigs derives a timestamped
# model_path at construction time, so this also pins the output directory.
configs = ModelConfigs()
# Echo one field as a quick sanity check.
configs.frame_length

256

In [None]:


# Walk the dataset once to find the longest in-vocabulary transcription and
# the largest first-axis size of any spectrogram, then store both in configs.
max_text_length = max_spectrogram_length = 0
for wav_file, transcript in tqdm(dataset):
    spec = WavReader.get_spectrogram(
        wav_file,
        frame_length=configs.frame_length,
        frame_step=configs.frame_step,
        fft_length=configs.fft_length,
    )
    spec_len, spec_width = spec.shape[0], spec.shape[1]

    # Only characters present in the vocabulary count towards the label length.
    n_valid_chars = sum(1 for ch in transcript if ch in configs.vocab)

    if n_valid_chars > max_text_length:
        max_text_length = n_valid_chars
    if spec_len > max_spectrogram_length:
        max_spectrogram_length = spec_len

    # Refreshed on every iteration: running maximum length plus the second
    # dimension of the spectrogram just read.
    configs.input_shape = [max_spectrogram_length, spec_width]

configs.max_spectrogram_length = max_spectrogram_length
configs.max_text_length = max_text_length
configs.save()

100%|██████████| 1000/1000 [00:12<00:00, 80.73it/s]


In [None]:
# Preprocessor: reads each wav file with the configured spectrogram settings.
wav_preprocessors = [
    WavReader(
        frame_length=configs.frame_length,
        frame_step=configs.frame_step,
        fft_length=configs.fft_length,
    ),
]

# Transformers, listed in the order they are handed to the provider.
# Labels are padded with len(vocab), i.e. one past the last vocabulary position.
sample_transformers = [
    SpectrogramPadding(max_spectrogram_length=configs.max_spectrogram_length, padding_value=0),
    LabelIndexer(configs.vocab),
    LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)),
]

data_provider = DataProvider(
    dataset=dataset,
    skip_validation=True,
    batch_size=configs.batch_size,
    data_preprocessors=wav_preprocessors,
    transformers=sample_transformers,
)

INFO:DataProvider:Skipping Dataset validation...


In [None]:
# Split the provider into training and validation parts.
# NOTE(review): split=0.9 presumably means 90% train / 10% validation -- confirm against mltu.
train_data_provider, val_data_provider = data_provider.split(split = 0.9)

In [None]:
# Build the network for the spectrogram shape found while scanning the dataset.
# dropout=0.5 overrides train_model's default of 0.2.
model = train_model(
    input_dim = configs.input_shape,
    output_dim = len(configs.vocab),
    dropout=0.5
)

In [None]:
# Optimiser, loss and metrics are created in the same order as they were
# previously passed inline.
adam = tf.keras.optimizers.Adam(learning_rate=configs.learning_rate)
ctc_loss = CTCloss()
# Character and word error rates, both computed against the configured vocabulary.
error_metrics = [
    CERMetric(vocabulary=configs.vocab),
    WERMetric(vocabulary=configs.vocab),
]

model.compile(optimizer=adam, loss=ctc_loss, metrics=error_metrics, run_eagerly=False)

In [None]:
# Print the layer table; line_length=110 sets the width of the printed table.
model.summary(line_length=110)

Model: "model"
______________________________________________________________________________________________________________
 Layer (type)                                    Output Shape                                Param #          
 input (InputLayer)                              [(None, 414, 193)]                          0                
                                                                                                              
 lambda (Lambda)                                 (None, 414, 193, 1)                         0                
                                                                                                              
 conv2d (Conv2D)                                 (None, 207, 97, 32)                         14432            
                                                                                                              
 batch_normalization (BatchNormalization)        (None, 207, 97, 32)                         128 

In [None]:
# Quantity watched by early stopping, checkpointing and learning-rate reduction.
monitored = "val_CER"
# The checkpoint and the ONNX export callback are both given this same path.
weights_file = f"{configs.model_path}/model.h5"

earlystopper = EarlyStopping(
    monitor=monitored,
    patience=20,
    verbose=1,
    mode="min",
)
checkpoint = ModelCheckpoint(
    weights_file,
    monitor=monitored,
    verbose=1,
    save_best_only=True,
    mode="min",
)
trainLogger = TrainLogger(configs.model_path)
tb_callback = TensorBoard(f"{configs.model_path}/logs", update_freq=1)
reduceLROnPlat = ReduceLROnPlateau(
    monitor=monitored,
    factor=0.8,
    min_delta=1e-10,
    patience=5,
    verbose=1,
    mode="auto",
)
model2onnx = Model2onnx(weights_file)

In [None]:
# model.load_weights('/content/Models/05_sound_to_text/202310062238/model.h5')

ValueError: ignored

In [None]:
# Callback order is preserved exactly as before.
fit_callbacks = [earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx]

# Kept as the last expression so the returned History object remains the cell output.
model.fit(
    train_data_provider,
    validation_data=val_data_provider,
    epochs=configs.train_epochs,
    callbacks=fit_callbacks,
    workers=configs.train_workers,
)

Epoch 1/1000
Epoch 1: val_CER improved from inf to 0.63526, saving model to Models/05_sound_to_text/202310062339/model.h5


  saving_api.save_model(


Epoch 2/1000
Epoch 2: val_CER improved from 0.63526 to 0.53366, saving model to Models/05_sound_to_text/202310062339/model.h5
Epoch 3/1000
Epoch 3: val_CER improved from 0.53366 to 0.43705, saving model to Models/05_sound_to_text/202310062339/model.h5
Epoch 4/1000
Epoch 4: val_CER did not improve from 0.43705
Epoch 5/1000
Epoch 5: val_CER did not improve from 0.43705
Epoch 6/1000
Epoch 6: val_CER did not improve from 0.43705
Epoch 7/1000
Epoch 7: val_CER did not improve from 0.43705
Epoch 8/1000
Epoch 8: val_CER did not improve from 0.43705

Epoch 8: ReduceLROnPlateau reducing learning rate to 0.0004000000189989805.
Epoch 9/1000
Epoch 9: val_CER improved from 0.43705 to 0.43441, saving model to Models/05_sound_to_text/202310062339/model.h5
Epoch 10/1000
Epoch 10: val_CER did not improve from 0.43441
Epoch 11/1000
Epoch 11: val_CER did not improve from 0.43441
Epoch 12/1000
Epoch 12: val_CER did not improve from 0.43441
Epoch 13/1000
Epoch 13: val_CER did not improve from 0.43441
Epoch 

<keras.src.callbacks.History at 0x7f65caeed510>

In [None]:
import typing
import numpy as np

from mltu.inferenceModel import OnnxInferenceModel
from mltu.preprocessors import WavReader
from mltu.utils.text_utils import ctc_decoder, get_cer, get_wer

class WavToTextModel(OnnxInferenceModel):
    """ONNX inference wrapper that turns one spectrogram into decoded text."""

    def __init__(self, char_list: typing.Union[str, list], *args, **kwargs):
        """Store the decoding alphabet; all other arguments go to the base class.

        Args:
            char_list: Characters handed to ``ctc_decoder`` when decoding.
            *args, **kwargs: Forwarded unchanged to ``OnnxInferenceModel``.
        """
        super().__init__(*args, **kwargs)
        self.char_list = char_list

    def predict(self, data: np.ndarray):
        """Run the model on a single sample and return its decoded text.

        Args:
            data: One un-batched input array; a leading axis of size 1 is
                added before it is fed to the session.

        Returns:
            The first (and only) entry produced by ``ctc_decoder``.
        """
        batch = np.expand_dims(data, 0)
        session_outputs = self.model.run(None, {self.input_name: batch})
        decoded = ctc_decoder(session_outputs[0], self.char_list)
        return decoded[0]

In [None]:
# Persist both splits next to the model artefacts, training split first.
for csv_name, provider in (("train.csv", train_data_provider), ("val.csv", val_data_provider)):
    provider.to_csv(os.path.join(configs.model_path, csv_name))

In [None]:
# Evaluate the run this notebook just produced. ModelConfigs creates a fresh
# timestamped directory on every run, so a hard-coded timestamp would point at
# a directory that does not exist after Restart & Run All (or at a stale model).
run_dir = configs.model_path

# NOTE(review): relies on the earlier configs.save() having written
# configs.yaml into run_dir -- confirm against mltu's BaseModelConfigs.
configs = BaseModelConfigs.load(os.path.join(run_dir, "configs.yaml"))

# `model` is rebound here from the Keras training model to the ONNX inference wrapper.
model = WavToTextModel(model_path=configs.model_path, char_list=configs.vocab, force_cpu=False)

# val.csv is the validation split exported to the run directory by an earlier
# cell; `df` ends up as a plain list of rows, not a DataFrame.
df = pd.read_csv(os.path.join(run_dir, "val.csv")).values.tolist()

In [None]:
# Score every validation sample with the inference model and collect
# per-sample character and word error rates.
accum_cer, accum_wer = [], []
for wav_path, label in df:
    spectrogram = WavReader.get_spectrogram(
        wav_path,
        frame_length=configs.frame_length,
        frame_step=configs.frame_step,
        fft_length=configs.fft_length,
    )

    # Zero-pad along the first axis up to the configured maximum length.
    missing_rows = configs.max_spectrogram_length - spectrogram.shape[0]
    padded_spectrogram = np.pad(
        spectrogram,
        ((0, missing_rows), (0, 0)),
        mode="constant",
        constant_values=0,
    )

    text = model.predict(padded_spectrogram)

    # Reference text: lower-cased label with out-of-vocabulary characters dropped.
    true_label = "".join(ch for ch in label.lower() if ch in configs.vocab)

    # Print the pair only when prediction and reference agree on everything
    # after the first quarter (measured on the prediction's length).
    tail_start = len(text) // 4
    if text[tail_start:] == true_label[tail_start:]:
        print(f"predicted :: {text},\n True :: {true_label}")

    accum_cer.append(get_cer(text, true_label))
    accum_wer.append(get_wer(text, true_label))

print(f"Average CER: {np.average(accum_cer)}, Average WER: {np.average(accum_wer)}")

Average CER: 0.41068229180535554, Average WER: 0.7902645502645503


In [None]:
# Recursively archive the whole Models directory (all runs) into a single zip.
!zip -r /content/models_4_58.zip /content/Models

  adding: content/Models/ (stored 0%)
  adding: content/Models/05_sound_to_text/ (stored 0%)
  adding: content/Models/05_sound_to_text/202310062238/ (stored 0%)
  adding: content/Models/05_sound_to_text/202310062238/logs/ (stored 0%)
  adding: content/Models/05_sound_to_text/202310062238/logs/validation/ (stored 0%)
  adding: content/Models/05_sound_to_text/202310062238/logs/validation/events.out.tfevents.1696632111.5ad6962bf640.317.1.v2 (deflated 78%)
  adding: content/Models/05_sound_to_text/202310062238/logs/train/ (stored 0%)
  adding: content/Models/05_sound_to_text/202310062238/logs/train/events.out.tfevents.1696632053.5ad6962bf640.317.0.v2 (deflated 84%)
  adding: content/Models/05_sound_to_text/202310062238/configs.yaml (deflated 28%)
  adding: content/Models/05_sound_to_text/202310062238/val.csv (deflated 78%)
  adding: content/Models/05_sound_to_text/202310062238/train.csv (deflated 82%)
  adding: content/Models/05_sound_to_text/202310062238/logs.log (deflated 69%)
  adding: 

In [None]:
# Colab-only helper: hand the archive created above to the browser for download.
from google.colab import files
files.download("/content/models_4_58.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>