In [None]:
pip install jiwer

Collecting jiwer
  Downloading jiwer-3.0.3-py3-none-any.whl (21 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.6.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.3 rapidfuzz-3.6.2


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from IPython import display
from jiwer import wer

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd drive/

/content/drive


In [None]:
cd MyDrive/


/content/drive/MyDrive


In [None]:
cd Speech_to_text/

[Errno 2] No such file or directory: 'Speech_to_text/'
/content/drive/MyDrive


In [None]:
data_url="http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"
data_path=keras.utils.get_file("LJSpeech-1.1", data_url, untar=True)

Downloading data from http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2


In [None]:
wavs_path=data_path+"/wavs/"
metadata_path=data_path+"/metadata.csv"

In [None]:
metadata_df=pd.read_csv(metadata_path, sep='|', header= None, quoting=3)

In [None]:
metadata_df.tail()

Unnamed: 0,0,1,2
13095,LJ050-0274,made certain recommendations which it believes...,made certain recommendations which it believes...
13096,LJ050-0275,materially improve upon the procedures in effe...,materially improve upon the procedures in effe...
13097,LJ050-0276,"As has been pointed out, the Commission has no...","As has been pointed out, the Commission has no..."
13098,LJ050-0277,with the active cooperation of the responsible...,with the active cooperation of the responsible...
13099,LJ050-0278,the recommendations we have here suggested wou...,the recommendations we have here suggested wou...


In [None]:
metadata_df.head(10)

Unnamed: 0,0,1,2
0,LJ001-0001,"Printing, in the only sense with which we are ...","Printing, in the only sense with which we are ..."
1,LJ001-0002,in being comparatively modern.,in being comparatively modern.
2,LJ001-0003,For although the Chinese took impressions from...,For although the Chinese took impressions from...
3,LJ001-0004,"produced the block books, which were the immed...","produced the block books, which were the immed..."
4,LJ001-0005,the invention of movable metal letters in the ...,the invention of movable metal letters in the ...
5,LJ001-0006,"And it is worth mention in passing that, as an...","And it is worth mention in passing that, as an..."
6,LJ001-0007,"the earliest book printed with movable types, ...","the earliest book printed with movable types, ..."
7,LJ001-0008,has never been surpassed.,has never been surpassed.
8,LJ001-0009,"Printing, then, for our purpose, may be consid...","Printing, then, for our purpose, may be consid..."
9,LJ001-0010,"Now, as all books not primarily intended as pi...","Now, as all books not primarily intended as pi..."


In [None]:
metadata_df.columns= ["file_name", "transcription", "normalized_transcription"]
metadata_df= metadata_df[["file_name", "normalized_transcription"]]
metadata_df=metadata_df.sample(frac=1).reset_index(drop=True)
metadata_df.head(3)

Unnamed: 0,file_name,normalized_transcription
0,LJ019-0228,"it also contained eleven reception cells, six ..."
1,LJ026-0040,And in this connection the fact that some bact...
2,LJ008-0273,At the Old Bailey almost every one capitally c...


In [None]:
split=int(len(metadata_df)*0.90)
df_train=metadata_df[:split]
df_test=metadata_df[split:]
print(f"size of traning dataset {len(df_train)}")
print(f"size of testing data {len(df_test)}")

size of traning dataset 11790
size of testing data 1310


Preprocessing

In [None]:
#the set of characters accepted in the transcription
charcters=[x for x in "abcdefghijklmnopqrstuvwxyz'?!"]
#mapping charcters to integers
char_to_nums=keras.layers.StringLookup(vocabulary=charcters, oov_token="", invert=True)
#mappiing integers back to charchters
num_to_char=keras.layers.StringLookup(
    vocabulary =char_to_nums.get_vocabulary(), oov_token="",invert=True
)
print(
    f"the vocabulary is: {char_to_nums.get_vocabulary()}"
    f"(size={char_to_nums.vocab_size()})"
)



the vocabulary is: ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'", '?', '!'](size=30)


In [None]:
char_to_nums

<keras.src.layers.preprocessing.string_lookup.StringLookup at 0x7eab68ae4460>

In [None]:
frame_length=256
frame_step=160
fft_length=384

processing the audio

In [None]:
def encode_single_sample(wav_file, label):
  #read the wav file
  file=tf.io.read_file(wav_file + wavs_path +".wav")
  audio,_=tf.audio.decode_wav(file)
  audio=tf.squeeze(audio, axis=-1)
  audio=tf.cast(audio,tf.float32)
  spectrogram=tf.signal.stft(
      audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
  )
  spectrogram=tf.abs(spectrogram)
  spectrogram=tf.math.pow(spectrogram, 0.5)
  means=tf.math.reduce_mean(spectrogram, 1, keepdims=True)
  stddevs= tf.math.reduce_std(spectrogram, 1, keepdims=True)
  spectrogram=(spectrogram-means)/(stddevs+1e-10)
  label=tf.strings.lower(label)
  label=tf.strings.unicode_split(label, input_encoding="UTF-8")
  label=char_to_nums(label)
  return spectrogram, label

In [None]:
batch_size=32
train_dataset=tf.data.Dataset.from_tensor_slices(
    (list(df_train["file_name"]), list(df_train["normalized_transcription"]))
)


In [None]:
train_dataset = (
    train_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
    .padded_batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)
validation_dataset=tf.data.Dataset.from_tensor_slices(
    (list(df_test["file_name"]), list(df_test["normalized_transcription"]))
)
validation_dataset = (
    validation_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
    .padded_batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

Visualization

In [None]:
fig=plt.figure(figsize=(8, 5))
for batch in train_dataset.take(1):
  spectrogram=batch[0][0].numpy()
  spectrogram=np.array([np.trim_zeroes(x) for x in np.transpose(spectrogram)])
  label=batch[0][1]
  label=tf.strings.to_number(label).numpy()
  ax=plt.subplot(2,1,1)
  ax.imshow(spectrogram, vmax=1)
  ax.set_title(label)
  ax.axis("off")
  file=tf.io.read_file(wavs_path+list(df_train["file_name"])[0]+".wav")
  audio,_=tf.audio.decode_wav(file)
  audio=audio.numpy()
  ax=plt.subplot(2,1,2)
  plt.plot(audio)
  ax.set_title("Signal Wave")
  ax.set_xlim(0, len(audio))
  display.display(display.Audio(np.transpose(audio), rate=16000))
plt.show()

UnimplementedError: {{function_node __wrapped__MakeIterator_device_/job:localhost/replica:0/task:0/device:CPU:0}} Cast string to int64 is not supported
	 [[{{node string_lookup_2/Cast}}]] [Op:MakeIterator] name: 

<Figure size 800x500 with 0 Axes>

Model

In [None]:
def CTCLoss(y_true, y_pred):
  batch_len= tf.cast(tf.shape(y_true)[0], dtype="int64")
  input_length=tf.cast(tf.shape(y_pred)[1], dtype="int64")
  label_length=tf.cast(tf.shape(y_true)[1], dtype="int64")

  input_length=input_length*tf.ones(shape=(batch_len, 1), dtype="int64")
  label_lenght=label_length*tf.ones(shape=(batch_len, 1), dtype="int64")
  loss=keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
  return loss

we now define our model. we will define a model based on DeepSpeech2.

In [None]:
def build_model(input_dim, output_dim, rnn_layer=5, rnn_units=128):
    input_spectrogram= layers.Input((None, input_dim), name="input")
    x=layers.Reshape((-1, input_dim), name="expand_dim")(input_spectrogram)
    x=layers.Conv2D(
        filter=32,
        kernel_size=[11,41],
        strides=[2,2],
        padding="same",
        use_bais=False,
        name="conv_1",
        )(x)
    x=layers.BatchNormalization(name="conv_1_bn")(x)
    x=layers.ReLU(name="conv_1_relu")(x)
    x==layers.Conv2D(
        filters=32,
        kernel_size=[11,21],
        strides=[1,2],
        padding="same",
        use_bais=False,
        name="conv_2",
        )(x)
    x=layers.BatchNormalization(name="conv_2_bn")(x)
    x=layers.ReLU(name="con_2__relu")(x)
    for i in range(1, rnn_layer+1):
        recurrent=layers.GPU(
        units=rnn_units,
        activation="tanh",
        recurrent_activation="sigmoid",
        use_bais=True,
        reset_after= True,
        name=f"gru_{i}",
            )
    x=layers.Bidirectional (
        recurrent, name=f"bidirectional{i}", merge_mode="concat"
            )(x)
    if i < rnn_layers:
      x=layers.Dropout(rate=0.5)(x)
    x=layers.Dense(units=rnn_units*2, name="dense_1")(x)
    x=layers.ReLU(name="dense_1_relu")(x)
    x=layers.Dropout(rate=0.5)(x)
    output=layers.Dense(units=output_dim + 1, activation="softmax")(x)
    model=keras.Model(input_spectrogram, output, name= "DeepSpeech_2")
    opt = keras.optimizers.Adam(learning_rate=1e-4)
    model.compile(optimizers=opt, loss=CTCLoss)
    return model
model=build_model(
    input_dim=fft_length//2 + 1,
    output_dim=char_to_nums.vocabulary_size(),
    rnn_units=512,

)
model.summary (line_length=110)

TypeError: Conv2D.__init__() missing 1 required positional argument: 'filters'

Training and Evaluating

In [None]:
def decode_batch_predictions(pred):
    input_len=np.ones(pred.shape[0])* pred.shape[1]
    results=keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    output_text=[]
    for result in results:
        result=tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
        return output_text
class CallbackEval(keras.callbacks.Callback):
  def __init__ (self,dataset):
      super().__init__()
      self.dataset=dataset
  def on_epoch_end(self, epoch: int, logs=None):
      predictions=[]
      targets=[]
      for batch in self.dataset:
          X, y= batch
          batch_predictions=model.predict(X)
          batch_predictions=decode_batch_predictions(batch_predictions)
          for label in y:
              label=(
                  tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
              )
              targets.append(label)
          wer_score=wer(targets, predictions)
          print("-"*100)
          print(f"word error rate:{wer_score: 4f}")
          print("-"*100)
          for i in np.random.randit(0, len(predictions), 2):
              print(f"Target    : {targets[i]}")
              print(f"Prediction   : {predictions[i]}")
              print("-"*100)




Lets start the Training process

In [None]:
epochs= 2
validation_callback= CallbackEval(validation_dataset)
history= model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=epochs,
    callbacks=[validation_callback],
)

NameError: name 'model' is not defined

Inference

In [None]:
predictions=[]
targets=[]
for batch in validation_dataset:
    X, y= batch
    batch_predictions= model.predict()
    batch_predictions=decode_batch_predictions(batch_predictions)
    predictions.extend(batch_predicitons)
    for label in y:
        label=tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf_8")
        targets.append(label)
wer_score=wer(targets, predictions)
print("-"*100)
print(f"word error rate:{wer_score: 4f}")
print("-"*100)
for i in np.random.randit(0, len(predictions), 2):
    print(f"Target    : {targets[i]}")
    print(f"Prediction   : {predictions[i]}")
    print("-"*100)

UnimplementedError: {{function_node __wrapped__MakeIterator_device_/job:localhost/replica:0/task:0/device:CPU:0}} Cast string to int64 is not supported
	 [[{{node string_lookup_2/Cast}}]] [Op:MakeIterator] name: 

Conclusion