<a href="https://colab.research.google.com/github/Jkirk2/CMSC-473-Project-jkirk2-bstout1/blob/main/S2TModelPlayground.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
import os
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf

import numpy as numpy

from tensorflow import keras

import keras

import pandas as pd

import librosa

from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display

!pip install jiwer
from jiwer import wer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [None]:
DATASET_PATH = 'data/mini_speech_commands'

data_dir = pathlib.Path(DATASET_PATH)

commands = np.array(tf.io.gfile.listdir(str(data_dir)))
commands = commands[commands != 'README.md']
print('Commands:', commands)

In [None]:
train_ds, val_ds = tf.keras.utils.audio_dataset_from_directory(
    directory=data_dir,
    batch_size=64,
    validation_split=0.2,
    seed=0,
    output_sequence_length=16000,
    subset='both')

label_names = np.array(train_ds.class_names)
print()
print("label names:", label_names)

In [None]:
def squeeze(audio, labels):
  audio = tf.squeeze(audio, axis=-1)
  return audio, labels

train_ds = train_ds.map(squeeze, tf.data.AUTOTUNE)
val_ds = val_ds.map(squeeze, tf.data.AUTOTUNE)

def get_spectrogram(waveform):
  # Convert the waveform to a spectrogram via a STFT.
  spectrogram = tf.signal.stft(
      waveform, frame_length=255, frame_step=128)
  # Obtain the magnitude of the STFT.
  spectrogram = tf.abs(spectrogram)
  # Add a `channels` dimension, so that the spectrogram can be used
  # as image-like input data with convolution layers (which expect
  # shape (`batch_size`, `height`, `width`, `channels`).
  spectrogram = spectrogram[..., tf.newaxis]
  return spectrogram

def plot_spectrogram(spectrogram, ax):
  if len(spectrogram.shape) > 2:
    assert len(spectrogram.shape) == 3
    spectrogram = np.squeeze(spectrogram, axis=-1)
  # Convert the frequencies to log scale and transpose, so that the time is
  # represented on the x-axis (columns).
  # Add an epsilon to avoid taking a log of zero.
  log_spec = np.log(spectrogram.T + np.finfo(float).eps)
  height = log_spec.shape[0]
  width = log_spec.shape[1]
  X = np.linspace(0, np.size(spectrogram), num=width, dtype=int)
  Y = range(height)
  ax.pcolormesh(X, Y, log_spec)

def make_spec_ds(ds):
  return ds.map(
      map_func=lambda audio,label: (get_spectrogram(audio), label),
      num_parallel_calls=tf.data.AUTOTUNE)

class ExportModel(tf.Module):
  def __init__(self, model):
    self.model = model

    # Accept either a string-filename or a batch of waveforms.
    # YOu could add additional signatures for a single wave, or a ragged-batch. 
    self.__call__.get_concrete_function(
        x=tf.TensorSpec(shape=(), dtype=tf.string))
    self.__call__.get_concrete_function(
       x=tf.TensorSpec(shape=[None, 16000], dtype=tf.float32))


  @tf.function
  def __call__(self, x):
    # If they pass a string, load the file and decode it. 
    if x.dtype == tf.string:
      x = tf.io.read_file(x)
      x, _ = tf.audio.decode_wav(x, desired_channels=1, desired_samples=16000,)
      x = tf.squeeze(x, axis=-1)
      x = x[tf.newaxis, :]

    x = get_spectrogram(x)  
    result = self.model(x, training=False)

    class_ids = tf.argmax(result, axis=-1)
    class_names = tf.gather(label_names, class_ids)
    return {'predictions':result,
            'class_ids': class_ids,
            'class_names': class_names}

In [None]:
test_ds = val_ds.shard(num_shards=2, index=0)
val_ds = val_ds.shard(num_shards=2, index=1)

In [None]:
for example_audio, example_labels in train_ds.take(1):  
  print(example_audio.shape)
  print(example_labels.shape)

In [None]:
train_spectrogram_ds = make_spec_ds(train_ds)
val_spectrogram_ds = make_spec_ds(val_ds)
test_spectrogram_ds = make_spec_ds(test_ds)

for example_spectrograms, example_spect_labels in train_spectrogram_ds.take(1):
  break

In [None]:
input_shape = example_spectrograms.shape[1:]
print('Input shape:', input_shape)
num_labels = len(commands)

# Instantiate the `tf.keras.layers.Normalization` layer.
norm_layer = layers.Normalization()
# Fit the state of the layer to the spectrograms
# with `Normalization.adapt`.
norm_layer.adapt(data=train_spectrogram_ds.map(map_func=lambda spec, label: spec))

model = models.Sequential([
    layers.Input(shape=input_shape),
    # Downsample the input.
    layers.Resizing(32, 32),
    # Normalize.
    norm_layer,
    layers.Conv2D(32, 3, activation='relu'),
    layers.Conv2D(64, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Dropout(0.25),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(num_labels),
])

model.summary()

In [None]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
EPOCHS = 10
history = model.fit(
    train_spectrogram_ds,
    validation_data=val_spectrogram_ds,
    epochs=EPOCHS,
    callbacks=tf.keras.callbacks.EarlyStopping(verbose=1, patience=2),
)

In [None]:
model.evaluate(test_spectrogram_ds, return_dict=True)

In [None]:
y_pred = model.predict(test_spectrogram_ds)

In [None]:
x = data_dir/'no/01bb6a2a_nohash_0.wav'
x = tf.io.read_file(str(x))
x, sample_rate = tf.audio.decode_wav(x, desired_channels=1, desired_samples=16000,)
x = tf.squeeze(x, axis=-1)
waveform = x
x = get_spectrogram(x)
x = x[tf.newaxis,...]

prediction = model(x)
plt.bar(commands, tf.nn.softmax(prediction[0]))
plt.title('No')
plt.show()

display.display(display.Audio(waveform, rate=16000))

In [None]:
export = ExportModel(model)
export(tf.constant(str(data_dir/'no/01bb6a2a_nohash_0.wav')))

In [None]:
tf.saved_model.save(export, "saved")
imported = tf.saved_model.load("saved")
imported(waveform[tf.newaxis, :])

In [None]:
#Download LJSPeech Data set from https://www.kaggle.com/datasets/mathurinache/the-lj-speech-dataset/code
#Following tutorial here https://www.youtube.com/watch?v=qKz_lmgad3o for the implementation of DeepSpeech like model
#https://arxiv.org/pdf/1412.5567.pdf

In [7]:
!pip install -q kaggle
from google.colab import files

files.upload()

!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets list

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.8/dist-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.8/dist-packages/kaggle/api/kaggle_api_extended.py", line 164, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method.


Saving kaggle.json to kaggle (1).json
mkdir: cannot create directory ‘/root/.kaggle’: File exists
ref                                                             title                                             size  lastUpdated          downloadCount  voteCount  usabilityRating  
--------------------------------------------------------------  -----------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
meirnizri/covid19-dataset                                       COVID-19 Dataset                                   5MB  2022-11-13 15:47:17          10501        309  1.0              
michals22/coffee-dataset                                        Coffee dataset                                    24KB  2022-12-15 20:02:12           1584         49  1.0              
thedevastator/jobs-dataset-from-glassdoor                       Salary Prediction                                  3MB  2022-11-16 13:52:31           6575        

In [10]:
!kaggle datasets list -s "mathurinache/the-lj-speech-dataset"
!kaggle datasets download -d mathurinache/the-lj-speech-dataset

ref                                 title                   size  lastUpdated          downloadCount  voteCount  usabilityRating  
----------------------------------  ---------------------  -----  -------------------  -------------  ---------  ---------------  
mathurinache/the-lj-speech-dataset  The LJ Speech Dataset    3GB  2021-02-15 09:19:54           1083         91  1.0              
yinxj24/ttstactronpractice          TTS-Tactron            111KB  2021-07-21 09:28:31              2          2  0.25             
Downloading the-lj-speech-dataset.zip to /content
100% 2.99G/2.99G [00:22<00:00, 169MB/s]
100% 2.99G/2.99G [00:22<00:00, 145MB/s]


In [18]:
!unzip data/the-lj-speech-dataset.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: LJSpeech-1.1/wavs/LJ030-0111.wav  
  inflating: LJSpeech-1.1/wavs/LJ030-0112.wav  
  inflating: LJSpeech-1.1/wavs/LJ030-0113.wav  
  inflating: LJSpeech-1.1/wavs/LJ030-0114.wav  
  inflating: LJSpeech-1.1/wavs/LJ030-0115.wav  
  inflating: LJSpeech-1.1/wavs/LJ030-0116.wav  
  inflating: LJSpeech-1.1/wavs/LJ030-0117.wav  
  inflating: LJSpeech-1.1/wavs/LJ030-0118.wav  
  inflating: LJSpeech-1.1/wavs/LJ030-0119.wav  
  inflating: LJSpeech-1.1/wavs/LJ030-0120.wav  
  inflating: LJSpeech-1.1/wavs/LJ030-0121.wav  
  inflating: LJSpeech-1.1/wavs/LJ030-0122.wav  
  inflating: LJSpeech-1.1/wavs/LJ030-0123.wav  
  inflating: LJSpeech-1.1/wavs/LJ030-0124.wav  
  inflating: LJSpeech-1.1/wavs/LJ030-0125.wav  
  inflating: LJSpeech-1.1/wavs/LJ030-0126.wav  
  inflating: LJSpeech-1.1/wavs/LJ030-0127.wav  
  inflating: LJSpeech-1.1/wavs/LJ030-0128.wav  
  inflating: LJSpeech-1.1/wavs/LJ030-0129.wav  
  inflating: LJSpeech-1

In [20]:
DATASET_PATH = 'data/LJSpeech-1.1'
WAV_PATH = DATASET_PATH + '/wavs/'

data_dir = pathlib.Path(DATASET_PATH + "/metadata.csv")

df = pd.read_csv(data_dir, sep="|", header=None, quoting=3)

In [21]:
df.columns = ["id", "transcript", "normalized_transcript"]
df = df[["id", "normalized_transcript"]]
df = df.sample(frac=1).reset_index(drop=True)
df.head(3)

Unnamed: 0,id,normalized_transcript
0,LJ045-0124,Marina Oswald thought he did so in order to em...
1,LJ036-0200,the radio dispatcher on channel one ordered al...
2,LJ006-0256,Visitors were still permitted to come with sup...


In [22]:
characters = [x for x in "abcedfghijklmnopqrstuvwxyz'?! "]
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
num_to_char = keras.layers.StringLookup(vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True)

print(
f"The vocabulary is: {char_to_num.get_vocabulary()}"
f"(size={char_to_num.vocabulary_size()})")


The vocabulary is: ['', 'a', 'b', 'c', 'e', 'd', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'", '?', '!', ' '](size=31)


In [23]:
split = int(len(df) * 0.85)
df_train = df[:split]
df_test = df[split:]

print(f"Size of training set: {len(df_train)}")
print(f"Size of test set: {len(df_test)}")

Size of training set: 11135
Size of test set: 1965


In [24]:
frame_length = 256
frame_step = 160
fft_length = 384

def encode_single_sample(wav, label):
    file = tf.io.read_file(WAV_PATH + wav + ".wav")
    
    audio, _ = tf.audio.decode_wav(file)
    audio = tf.squeeze(audio, axis=-1)
    audio =tf.cast(audio, tf.float32) #Look up why float32?
    spectrogram = tf.signal.stft(audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length)
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    
    label = tf.strings.lower(label)
    
    label = tf.strings.unicode_split(label, input_encoding="UTF-8")
    label = char_to_num(label)
    
    return spectrogram, label
    

In [25]:
batch_size = 32

train_dataset = tf.data.Dataset.from_tensor_slices(
    (list(df_train["id"]), list(df_train["normalized_transcript"]))
)

train_dataset = (
    train_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
    .padded_batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

test_dataset = tf.data.Dataset.from_tensor_slices(
    (list(df_test["id"]), list(df_test["normalized_transcript"]))
)

test_dataset = (
    test_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
    .padded_batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)



In [26]:
def CTCLoss(y_true, y_pred):
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")
    
    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    
    loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss

In [27]:
def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
    input_spectrogram = layers.Input((None, input_dim), name="input")
    x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)
    
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 41],
        strides=[2, 2],
        padding="same",
        use_bias=False,
        name="conv_1",
    )(x)
    x = layers.BatchNormalization(name="conv_1_bn")(x)
    x = layers.ReLU(name="conv_1_relu")(x)

    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 21],
        strides=[1, 2],
        padding="same",
        use_bias=False,
        name="conv_2",
    )(x)
    
    x = layers.BatchNormalization(name="conv_2_bn")(x)
    x = layers.ReLU(name="conv_2_relu")(x)
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)
    
    for i in range(1, rnn_layers + 1):
        recurrent = layers.GRU(
            units=rnn_units,
            activation="tanh",
            recurrent_activation="sigmoid",
            use_bias=True,
            return_sequences=True,
            reset_after=True,
            name=f"gru_{i}",
        )
        x = layers.Bidirectional(
            recurrent, name=f"bidirection_{i}", merge_mode="concat"
        )(x)
        if i < rnn_layers:
            x = layers.Dropout(rate=0.5)(x)
            
    x = layers.Dense(units=rnn_units * 2, name="dense_1")(x)
    x = layers.ReLU(name="dense_1_relu")(x)
    x = layers.Dropout(rate=0.5)(x)
    
    output = layers.Dense(units=output_dim + 1, activation="softmax")(x)
    model = keras.Model(input_spectrogram, output, name="deepSpeech_2")
    opt = keras.optimizers.Adam(learning_rate=1e-4)
    
    model.compile(optimizer=opt, loss=CTCLoss)
    return model
    

In [28]:
model = build_model(
    input_dim=fft_length // 2 + 1,
    output_dim=char_to_num.vocabulary_size(),
    rnn_units=512,
)
model.summary(line_length=110)

Model: "deepSpeech_2"
______________________________________________________________________________________________________________
 Layer (type)                                    Output Shape                                Param #          
 input (InputLayer)                              [(None, None, 193)]                         0                
                                                                                                              
 expand_dim (Reshape)                            (None, None, 193, 1)                        0                
                                                                                                              
 conv_1 (Conv2D)                                 (None, None, 97, 32)                        14432            
                                                                                                              
 conv_1_bn (BatchNormalization)                  (None, None, 97, 32)                     

In [39]:
# A unitlity function to decode the output of the network
def decode_batch_predictions(pred):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    
    output_text = []
    for result in results:
        result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
        output_text.append(result)
    return output_text

# A callback class to output a few transcriptions during training
class CallbackEval(keras.callbacks.Callback):
    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset
        
    def on_epoch_end(self, epoch: int, logs=None):
        predictions = []
        targets = []
        for batch in self.dataset:
            X, y = batch
            batch_predictions = model.predict(X)
            batch_predictions = decode_batch_predictions(batch_predictions)
            predictions.extend(batch_predictions)
            for label in y:
                label = (
                    tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
                )
                targets.append(label)
            wer_score = wer(targets, predictions)
            print("-" * 100)
            print(f"Word Error Rate: {wer_score:.4f}")
            print("-" * 100)
            
            for i in np.random.randint(0, len(predictions), 5):
                print(f"Target\t: {targets[i]}")
                print(f"Prediction\t: {predictions[i]}")
                print("-" * 100)
        

In [30]:
print(WAV_PATH)

data/LJSpeech-1.1/wavs/


In [45]:
epochs = 100
test_callback = CallbackEval(test_dataset)

history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=epochs,
    callbacks=[test_callback]
)

Epoch 1/100
 58/348 [====>.........................] - ETA: 3:25 - loss: 89.1301

KeyboardInterrupt: ignored

In [44]:
predictions = []
targets = []

for batch in test_dataset:
    X, y = batch
    batch_predictions = model.predict(X)
    batch_predictions = decode_batch_predictions(batch_predictions)
    predictions.extend(batch_predictions)
    for label in y:
        label = tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
        targets.append(label)
    wer_score = wer(targets, predictions)
    print("-" * 100)
    print(f"Word Error Rate: {wer_score:.4f}")
    print("-" * 100)
    for i in np.random.randint(0, len(predictions), 5):
        print(f"Target\t: {targets[i]}")
        print(f"Prediction\t: {predictions[i]}")
        print("-" * 100)

----------------------------------------------------------------------------------------------------
Word Error Rate: 0.6797
----------------------------------------------------------------------------------------------------
Target	: this basic approach to the problem of planning for emergencies is sound
Prediction	: this basit aproe to the proble of planing formurencs e sound
----------------------------------------------------------------------------------------------------
Target	: down below between the galleries was the mass of the prison population
Prediction	: down lo be twen the gaers was the mas of the prisonvotilation
----------------------------------------------------------------------------------------------------
Target	: suspicion fell at length upon ashley who was seen to handle the forks and spoons at table in a strange manner
Prediction	: suspition feltlingt the ponaly tho was sein tohandl the for sonpn attable instrange man
------------------------------------------

In [34]:
numpY

<module 'numpy' from '/usr/local/lib/python3.8/dist-packages/numpy/__init__.py'>