In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd drive/

/content/drive


In [None]:
cd MyDrive/

/content/drive/MyDrive


In [None]:
cd text/

/content/drive/MyDrive/text


In [None]:
ls

my_model.keras  speech_to_text.ipynb


In [1]:
!pip install jiwer



In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from IPython import display
from jiwer import wer

**Import LJ Speech Dataset**

ID: this is name of corresponding .wav file
Transcription: words spoken by the reader
Normalized Transcription: transcription with numbers, ordinals and monetary unit expanded into full word

In [22]:
#https://https://keithito.com/LJ-Speech-Dataset/

In [23]:
#https://https://www.kaggle.com/datasets/awsaf49/ljspeech-sr16k-dataset

In [26]:
data_url = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"
data_path = keras.utils.get_file("LJSpeech-1.1", data_url,untar=True)

In [27]:
wavs_path = data_path + "/wavs/"
metadata_path = data_path + "/metadata.csv"

In [28]:
#Read metadata file and parse it
metadata_df = pd.read_csv(metadata_path, sep="|", header=None, quoting=3)

In [29]:
metadata_df.head()

Unnamed: 0,0,1,2
0,LJ001-0001,"Printing, in the only sense with which we are ...","Printing, in the only sense with which we are ..."
1,LJ001-0002,in being comparatively modern.,in being comparatively modern.
2,LJ001-0003,For although the Chinese took impressions from...,For although the Chinese took impressions from...
3,LJ001-0004,"produced the block books, which were the immed...","produced the block books, which were the immed..."
4,LJ001-0005,the invention of movable metal letters in the ...,the invention of movable metal letters in the ...


In [30]:
metadata_df.columns = ["file_name", "transcription", "normalized_transcription"]
metadata_df = metadata_df[["file_name","normalized_transcription"]]
metadata_df = metadata_df.sample(frac=1).reset_index(drop=True)
metadata_df.head(3)

Unnamed: 0,file_name,normalized_transcription
0,LJ011-0207,whom I never saw until I was taken from Liverp...
1,LJ017-0187,"Her husband, who came up to town, would not al..."
2,LJ045-0126,"to the Soviet Embassy in Washington, he asked ..."


Now Split data for training and validation

In [31]:
split = int(len(metadata_df) * 0.90)
df_train = metadata_df[:split]
df_val = metadata_df[split:]
print(f"Size of training set: {len(df_train)}")
print(f"Size of validation set: {len(df_val)}")

Size of training set: 11790
Size of validation set: 1310


**Pre-Processing**

We First prepare the vocublary to be used

In [8]:
# The set of characters accepted in transcription
characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "]
#Mapping characters to integers
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
#Mapping integers back to orignal characters
num_to_char = keras.layers.StringLookup(vocabulary=char_to_num.get_vocabulary(), oov_token="",invert=True)
print(f"The vocabulary is: {char_to_num.get_vocabulary()}")
print(f"size={char_to_num.vocabulary_size()}")

The vocabulary is: ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'", '?', '!', ' ']
size=31


Next, we create the function that describes the transformation that we apply to each element of our dataset

In [33]:
#An integer scalar Tensor. The window length is samples
frame_length  = 256
#AN integer scalar Tensor. The number of samples to step
frame_step = 160
#AN integer scalar Tensor. The size of the FFt to apply
# If not provided, uses the smallest power of 2 enclosing frame_length.
fft_length = 384

def encode_single_sample(wav_file, label):
  #####################################
  ## Process Audio
  #####################################
  # 1. Read wav file
  file = tf.io.read_file(wavs_path + wav_file + ".wav")
  # 2. Decode the wav file
  audio, _ = tf.audio.decode_wav(file)
  audio = tf.squeeze(audio, axis=1)
  # 3. change type to float
  audio = tf.cast(audio, tf.float32)
  # 4. Get the spectogram
  spectrogram = tf.signal.stft(audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length)
  # 5. Only need magnitude, which can be derived by applying f.abs
  spectrogram = tf.abs(spectrogram)
  spectrogram = tf.math.pow(spectrogram, 0.5)
  # 6. Normalisation
  means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
  stddevs  = tf.math.reduce_std(spectrogram, 1, keepdims=True)
  spectrogram = (spectrogram - means) / (stddevs + 1e-10)

  ########################################################
  ## Process the label
  ########################################################
  # 7. convert label to lower case
  label = tf.strings.lower(label)
  # 8. Split the label
  label = tf.strings.unicode_split(label, input_encoding="UTF-8")
  # 9. Map the characters in label to numbers
  label = char_to_num(label)
  # 10. Return a dict as our model is expecting two inputs
  return spectrogram, label

**Creating Dataset Objects**

create tf.data.Dataset object that yields the transformed elements, in the same order as they appeared in the input

In [34]:
batch_size = 32
# Define the training dataset
train_dataset = tf.data.Dataset.from_tensor_slices(
    (list(df_train["file_name"]),list(df_train["normalized_transcription"]))
)
train_dataset= (
    train_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
    .padded_batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)
# Define the validation dataset
validation_dataset = tf.data.Dataset.from_tensor_slices(
    (list(df_val["file_name"]), list(df_val["normalized_transcription"]))
)
validation_dataset = (
    validation_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
    .padded_batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

**Visualize data**

visualize dataset including audio clip, the spectrogram and corresponding label.

In [36]:
fig = plt.figure(figsize=(8, 5))

for batch in train_dataset.take(1):
    spectrogram = batch[0][0].numpy()
    spectrogram = np.array([np.trim_zeros(x) for x in np.transpose(spectrogram)])
    label = batch[1][0]

    # Check the label value
    label_str = tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
    print("Label:", label_str)

    ax1 = plt.subplot(2, 1, 1)
    ax1.imshow(spectrogram, vmax=1)
    ax1.set_title(label_str)
    ax1.axis("off")

    # Load and display the waveform
    file_path = wavs_path + list(df_train["file_name"])[0] + ".wav"
    audio, _ = tf.audio.decode_wav(tf.io.read_file(file_path))
    audio = audio.numpy()
    ax2 = plt.subplot(2, 1, 2)
    ax2.plot(audio)
    ax2.set_title("Signal Wave")
    ax2.set_xlim(0, len(audio))

    # Display the audio player
    display.display(display.Audio(np.transpose(audio), rate=16000))

plt.show()

NotFoundError: {{function_node __wrapped__IteratorGetNext_output_types_2_device_/job:localhost/replica:0/task:0/device:CPU:0}} NewRandomAccessFile failed to Create/Open: C:\Users\Hp\.keras\datasets\LJSpeech-1.1/wavs/LJ011-0207.wav : The system cannot find the file specified.
; No such file or directory
	 [[{{node ReadFile}}]] [Op:IteratorGetNext] name: 

<Figure size 800x500 with 0 Axes>

**Model**

First define CTC loss function

In [4]:
def CTCLoss(y_true,y_pred):
  # compute the training-time loss value
  batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
  input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
  label_length  = tf.cast(tf.shape(y_true)[1], dtype="int64")

  input_length  = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
  label_length = label_length * tf.ones(shape=(batch_len, 1),dtype="int64")

  loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
  return loss

**We now define our model similar to deepspeech2**

In [16]:
def build_model(input_dim, output_dim, rnn_layers=8, rnn_units=128):
    """Model similar to deepspeech2"""
    # Model input
    input_spectrogram = layers.Input((None, input_dim), name="input")
    # Expand dimension to use 2D CNN
    x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)
    # Convolution layer 1
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 41],
        strides=[2, 2],
        padding="same",
        use_bias=False,
        name="conv_1",
    )(x)
    x = layers.BatchNormalization(name="conv_1_bn")(x)
    x = layers.ReLU(name="conv_1_relu")(x)
    # Convolution layer 2
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 21],
        strides=[2, 2],  # Fix the strides
        padding="same",
        use_bias=False,
        name="conv_2",
    )(x)
    x = layers.BatchNormalization(name="conv_2_bn")(x)
    x = layers.ReLU(name="conv_2_relu")(x)
    # Reshape the resulted volume to feed the RNNs layers
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)
    # RNN layers
    for i in range(1, rnn_layers + 1):
        recurrent = layers.GRU(
            units=rnn_units,
            activation="tanh",
            recurrent_activation="sigmoid",
            use_bias=True,
            return_sequences=True,
            reset_after=True,
            name=f"gru_{i}",  # Correct the name
        )
        x = layers.Bidirectional(
            recurrent, name=f"bidirectional_{i}", merge_mode="concat"
        )(x)
        if i < rnn_layers:
            x = layers.Dropout(rate=0.5)(x)
    # Dense layer
    x = layers.Dense(units=rnn_units * 2, name="dense_1")(x)
    x = layers.ReLU(name="dense_1_relu")(x)
    x = layers.Dropout(rate=0.5)(x)
    # Classification layer
    output = layers.Dense(units=output_dim + 1, activation="softmax")(x)
    # Model
    model = keras.Model(input_spectrogram, output, name="DeepSpeech2")
    # Optimizer
    opt = keras.optimizers.Adam(learning_rate=1e-4)
    # Compile the model and return
    model.compile(optimizer=opt, loss=CTCLoss)
    return model

# Get the model
model1 = build_model(
    input_dim=fft_length // 2 + 1,
    output_dim=char_to_num.vocabulary_size(),
    rnn_units=256,
)
model1.summary(line_length=110)


Model: "DeepSpeech2"
______________________________________________________________________________________________________________
 Layer (type)                                    Output Shape                                Param #          
 input (InputLayer)                              [(None, None, 193)]                         0                
                                                                                                              
 expand_dim (Reshape)                            (None, None, 193, 1)                        0                
                                                                                                              
 conv_1 (Conv2D)                                 (None, None, 97, 32)                        14432            
                                                                                                              
 conv_1_bn (BatchNormalization)                  (None, None, 97, 32)                      

**Training and Evaluating**

In [6]:

# utility function to decode the output of the networl
def decode_batch_predictions(pred):
  input_len = np.ones(pred.shape[0]) * pred.shape[1]
  #use greedy search. for complex task, otherwise can use beam search
  results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
  # Iterate over the results and get back the text
  output_text = []
  for result in results:
    result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
    output_text.append(result)
  return output_text

# A callback class to output a few transcription during training
class CallbackEval(keras.callbacks.Callback):
  """Display a batch of outputs after every epoch """

  def __init__(self, dataset):
    super().__init__()
    self.dataset = dataset

  def on_epoch_end(self, epoch: int, logs=None):
    predictions = []
    targets = []
    for batch in self.dataset:
      X, y = batch
      batch_predictions = model.predict(X)
      batch_predictions  =decode_batch_predictions(batch_predictions)
      predictions.extend(batch_predictions)
      for label in y:
        label = (
            tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
        )
        targets.append(label)
    wer_score = wer(targets, predictions)
    print("-" * 100)
    print(f"Word Error Rate: {wer_score: .4f}")
    print("-" * 100)
    for i in np.random.randint(0, len(predictions), 2):
        print(f"Target  : {targets[i]}")
        print(f"Prediction: {predictions[i]}")
        print("-" * 100)




**Starting Training Process**

In [18]:
# Define the number of epochs
epochs = 50
# callback function to check transcription on val set
validation_callback = CallbackEval(validation_dataset)
# Train the model
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=epochs,
    callbacks=[validation_callback]
)

Epoch 1/50
----------------------------------------------------------------------------------------------------
Word Error Rate:  1.0000
----------------------------------------------------------------------------------------------------
Target  : and a determination made as to whether the information might indicate possible harm to the president
Prediction: h
----------------------------------------------------------------------------------------------------
Target  : or a dungeon above or below the gate of a town to the first attempts at systematic reconstruction carried out under the advice and supervision of howard
Prediction: 
----------------------------------------------------------------------------------------------------
Epoch 2/50
----------------------------------------------------------------------------------------------------
Word Error Rate:  1.0000
----------------------------------------------------------------------------------------------------
Target  : mother plat

**Inference**

In [23]:
# Let's check results on more validation samples
predictions = []
targets = []
for batch in validation_dataset:
  X, y = batch
  batch_predictions = model.predict(X)
  batch_predictions  =decode_batch_predictions(batch_predictions)
  predictions.extend(batch_predictions)
  for label in y:
    label =  tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
    targets.append(label)
wer_score = wer(targets, predictions)
print("-" * 100)
print(f"Word Error Rate: {wer_score: .4f}")
print("-" * 100)
for i in np.random.randint(0, len(predictions), 5):
    print(f"Target  : {targets[i]}")
    print(f"Prediction: {predictions[i]}")
    print("-" * 100)


----------------------------------------------------------------------------------------------------
Word Error Rate:  0.3706
----------------------------------------------------------------------------------------------------
Target  : the only practical way for westbound traffic on main street
Prediction: the onlin practical way wr west boun traffic on main street
----------------------------------------------------------------------------------------------------
Target  : on which the river sank to such an extent that the natural bed of the stream became fordable
Prediction: on which the riber sank to such an extend that that natural bed of the streme became fordable
----------------------------------------------------------------------------------------------------
Target  : the rifle was according to the evidence stored in a green and brown blanket in the paines' garage among the oswalds' other possessions
Prediction: the rifle was according to the evidence stoard in a green and b

In [20]:
model.save("speech_to_text_model_modified(37).h5")

In [21]:
model.save('my_model.keras')

#agaim

In [22]:
# Define the number of epochs
epochs = 30
# callback function to check transcription on val set
validation_callback = CallbackEval(validation_dataset)
# Train the model
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=epochs,
    callbacks=[validation_callback]
)

Epoch 1/30
----------------------------------------------------------------------------------------------------
Word Error Rate:  0.4348
----------------------------------------------------------------------------------------------------
Target  : i want forgiveness of man i want those doors of the prison opened
Prediction: i ant for giveness of man i wont those doors of the prison od
----------------------------------------------------------------------------------------------------
Target  : the letter found in the pocket of the deceased was sealed with a wafer marked f
Prediction: the letter found in the pocket of the deceast was sceled with a a er at f
----------------------------------------------------------------------------------------------------
Epoch 2/30
----------------------------------------------------------------------------------------------------
Word Error Rate:  0.4233
-------------------------------------------------------------------------------------------------

KeyboardInterrupt: 

In [13]:

# 1. Record or load your audio file
# For instance, if you record an audio file named "test_audio.wav"
loaded_model = keras.models.load_model(
    "C:\\Users\\Hp\\Downloads\\Module 2\\speech_to_text_model_modified(37).h5",
    custom_objects={'CTCLoss': CTCLoss}
)



# 2. Preprocess the audio file
def preprocess_audio(audio_file):
    file = tf.io.read_file(audio_file)
    audio, _ = tf.audio.decode_wav(file)
    audio = tf.squeeze(audio, axis=1)
    audio = tf.cast(audio, tf.float32)
    spectrogram = tf.signal.stft(audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length)
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    return spectrogram
frame_length = 256
frame_step = 160
fft_length = 384
batch_size=32
# Preprocess the audio file
test_audio_file = "\\Users\\Hp\\Downloads\\Module 2\\test4.wav"

preprocessed_audio = preprocess_audio(test_audio_file)

# 3. Feed the audio to the model for transcription
predicted_text = loaded_model.predict(preprocessed_audio)
decoded_text = decode_batch_predictions(predicted_text)[0]  # Assuming a single audio file
print("decoder",decode_batch_predictions)
# 4. Evaluate the transcription
actual_transcription = "Your actual transcription if available"
print("Predicted Transcription:", predicted_text, decode_batch_predictions)
#print("Actual Transcription:", actual_transcription)

# Calculate Word Error Rate (WER) if you have the actual transcription
wer_score = wer([actual_transcription], [decoded_text])
print("Word Error Rate:", wer_score)

decoder <function decode_batch_predictions at 0x0000019377ECF380>
Predicted Transcription: [[[0.07331122 0.02450207 0.0389813  ... 0.0144445  0.02508346 0.1237794 ]]

 [[0.07331122 0.02450207 0.0389813  ... 0.0144445  0.02508346 0.1237794 ]]

 [[0.07331122 0.02450207 0.0389813  ... 0.0144445  0.02508346 0.1237794 ]]

 ...

 [[0.07331122 0.02450207 0.0389813  ... 0.0144445  0.02508346 0.1237794 ]]

 [[0.07331121 0.02450207 0.0389813  ... 0.0144445  0.02508346 0.12377939]]

 [[0.07331121 0.02450207 0.0389813  ... 0.0144445  0.02508346 0.1237794 ]]] <function decode_batch_predictions at 0x0000019377ECF380>
Word Error Rate: 1.0
