<a href="https://colab.research.google.com/github/MuhammedAdheeb/Tamil-BiGRU/blob/main/BiGRU_Tamil_ASR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required libraries
!pip install tensorflow==2.12 librosa jiwer soundfile

# Import libraries
import os
import tarfile
import urllib.request
import numpy as np
import pandas as pd
import librosa
import tensorflow as tf
from tensorflow.keras.layers import Input, Bidirectional, GRU, Dense, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from jiwer import wer



In [None]:
#previous approach to download and extract, discontinued because the dataset was downloaded in the drive
# # Define dataset URL and paths
# dataset_url = "https://openslr.org/resources/127/mile_tamil_asr_corpus.tar.gz"
# dataset_path = "/content/mile_tamil_asr_corpus.tar.gz"
# extracted_path = "/content/mile_tamil_asr_corpus"

# # Download the dataset
# urllib.request.urlretrieve(dataset_url, dataset_path)

# # Extract the dataset
# with tarfile.open(dataset_path, "r:gz") as tar:
#     tar.extractall(path=extracted_path)

# print("Dataset downloaded and extracted successfully.")

Dataset downloaded and extracted successfully.


In [2]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define paths
tar_file_path = "/content/drive/My Drive/mile_tamil_asr_corpus.tar.gz"  # Update with actual file path
extracted_path = "/content"

# Ensure the extraction directory exists
os.makedirs(extracted_path, exist_ok=True)

# Extract the dataset
with tarfile.open(tar_file_path, "r:gz") as tar:
    tar.extractall(path=extracted_path)

print("Dataset extracted successfully at:", extracted_path)

Mounted at /content/drive
Dataset extracted successfully at: /content


In [3]:
# Paths to audio and transcript directories
train_audio_dir = os.path.join(extracted_path, "mile_tamil_asr_corpus/train/audio_files")
train_trans_dir = os.path.join(extracted_path, "mile_tamil_asr_corpus/train/trans_files")

test_audio_dir = os.path.join(extracted_path, "mile_tamil_asr_corpus/test/audio_files")
test_trans_dir = os.path.join(extracted_path, "mile_tamil_asr_corpus/test/trans_files")

# Count .wav and .txt files
train_wav_count = len([f for f in os.listdir(train_audio_dir) if f.endswith(".wav")])
train_txt_count = len([f for f in os.listdir(train_trans_dir) if f.endswith(".txt")])

test_wav_count = len([f for f in os.listdir(test_audio_dir) if f.endswith(".wav")])
test_txt_count = len([f for f in os.listdir(test_trans_dir) if f.endswith(".txt")])

print(f"Training set: {train_wav_count} .wav files, {train_txt_count} .txt files")
print(f"Testing set: {test_wav_count} .wav files, {test_txt_count} .txt files")

Training set: 77314 .wav files, 77314 .txt files
Testing set: 12087 .wav files, 12087 .txt files


In [4]:
def load_transcripts(trans_dir):
    transcripts = {}
    for trans_file in os.listdir(trans_dir):
        if trans_file.endswith(".txt"):
            audio_id = os.path.splitext(trans_file)[0]  # Extract audio ID from filename
            file_path = os.path.join(trans_dir, trans_file)
            with open(file_path, "r", encoding="utf-8") as f:
                transcript = f.read().strip()  # Read the entire file as the transcript
                transcripts[audio_id] = transcript
    return transcripts

# Load train and test transcripts
train_transcripts = load_transcripts(train_trans_dir)
test_transcripts = load_transcripts(test_trans_dir)

print(f"Loaded {len(train_transcripts)} training transcripts.")
print(f"Loaded {len(test_transcripts)} testing transcripts.")

Loaded 77314 training transcripts.
Loaded 12087 testing transcripts.


In [5]:
import librosa

def extract_mfcc(audio_path, n_mfcc=40):
    """
    Extract MFCC features from an audio file.

    Args:
        audio_path (str): Path to the audio file.
        n_mfcc (int): Number of MFCC coefficients to extract.

    Returns:
        np.ndarray: MFCC features with shape (time_steps, num_features).
    """
    # Load the audio file
    y, sr = librosa.load(audio_path, sr=None)  # sr=None preserves the original sampling rate

    # Extract MFCC features
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    # Transpose to shape (time_steps, num_features)
    return mfccs.T

def calculate_max_audio_len(audio_dir):
    max_len = 0
    for audio_file in os.listdir(audio_dir):
        if audio_file.lower().endswith(".wav"):
            audio_path = os.path.join(audio_dir, audio_file)
            mfccs = extract_mfcc(audio_path)
            max_len = max(max_len, len(mfccs))
    return max_len

# Function to calculate max_text_len
def calculate_max_text_len(transcripts, char_to_idx):
    max_len = 0
    for label in transcripts.values():
        encoded_label = [char_to_idx[char] for char in label]
        max_len = max(max_len, len(encoded_label))
    return max_len

In [None]:
# previous appproach to load data, not cotinued because of the excessive memory usage
# def load_audio_data(audio_dir, transcripts):
#     audio_data = []
#     labels = []
#     for audio_file in os.listdir(audio_dir):
#         if audio_file.lower().endswith(".wav"):
#             audio_id = os.path.splitext(audio_file)[0]  # Extract audio ID from filename
#             if audio_id in transcripts:  # Ensure transcript exists
#                 audio_path = os.path.join(audio_dir, audio_file)
#                 mfccs = extract_mfcc(audio_path)  # Extract MFCC features
#                 audio_data.append(mfccs)
#                 labels.append(transcripts[audio_id])
#     return audio_data, labels

# # Load train and test audio data
# train_audio_data, train_labels = load_audio_data(train_audio_dir, train_transcripts)
# test_audio_data, test_labels = load_audio_data(test_audio_dir, test_transcripts)

# print(f"Loaded {len(train_audio_data)} training audio files.")
# print(f"Loaded {len(test_audio_data)} testing audio files.")

In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
def audio_generator(audio_dir, transcripts, char_to_idx, batch_size, max_audio_len, max_text_len):
    """
    Generator to yield batches of audio data and encoded labels.
    """
    audio_files = [f for f in os.listdir(audio_dir) if f.lower().endswith(".wav")]
    num_files = len(audio_files)

    for i in range(0, num_files, batch_size):
        batch_audio_data = []
        batch_labels = []

        for j in range(i, min(i + batch_size, num_files)):
            audio_file = audio_files[j]
            audio_id = os.path.splitext(audio_file)[0]

            if audio_id in transcripts:
                # Load audio data
                audio_path = os.path.join(audio_dir, audio_file)
                mfccs = extract_mfcc(audio_path)
                batch_audio_data.append(mfccs)

                # Encode transcript
                label = transcripts[audio_id]
                encoded_label = [char_to_idx[char] for char in label]
                batch_labels.append(encoded_label)

        # Pad sequences dynamically
        batch_padded_audio = pad_sequences(batch_audio_data, maxlen=max_audio_len, dtype="float32", padding="post")
        batch_padded_labels = pad_sequences(batch_labels, maxlen=max_text_len, padding="post")

        yield batch_padded_audio, batch_padded_labels

In [7]:
# Build character vocabulary
all_text = " ".join(list(train_transcripts.values()) + list(test_transcripts.values()))
unique_chars = sorted(set(all_text))
char_to_idx = {char: idx + 1 for idx, char in enumerate(unique_chars)}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}
char_to_idx["<PAD>"] = 0  # Padding token
idx_to_char[0] = "<PAD>"

print(f"Vocabulary size: {len(char_to_idx)}")

# Calculate max lengths
train_audio_dir = os.path.join(extracted_path, "mile_tamil_asr_corpus/train/audio_files")
test_audio_dir = os.path.join(extracted_path, "mile_tamil_asr_corpus/test/audio_files")

max_audio_len = max(calculate_max_audio_len(train_audio_dir), calculate_max_audio_len(test_audio_dir))
max_text_len = max(calculate_max_text_len(train_transcripts, char_to_idx), calculate_max_text_len(test_transcripts, char_to_idx))

print(f"Max audio length: {max_audio_len}")
print(f"Max text length: {max_text_len}")

Vocabulary size: 49
Max audio length: 1215
Max text length: 605


In [10]:
# Define model parameterss
input_dim = 40  # Number of MFCC features
num_classes = len(char_to_idx)  # Vocabulary size

# Input layer
inputs = Input(shape=(None, input_dim))

# Bi-directional GRU layers
x = Bidirectional(GRU(256, return_sequences=True))(inputs)
x = Bidirectional(GRU(256, return_sequences=True))(x)

# Dense layer for character prediction
outputs = TimeDistributed(Dense(num_classes, activation="softmax"))(x)

# Define model
model = Model(inputs, outputs)

# Compile with CTC loss
def ctc_loss(y_true, y_pred):
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

    loss = tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss

model.compile(optimizer=Adam(), loss=ctc_loss)
model.summary()

# Create generators
batch_size = 32
train_gen = audio_generator(train_audio_dir, train_transcripts, char_to_idx, batch_size, max_audio_len, max_text_len)
test_gen = audio_generator(test_audio_dir, test_transcripts, char_to_idx, batch_size, max_audio_len, max_text_len)

# Train the model
history = model.fit(
    train_gen,
    validation_data=test_gen,
    steps_per_epoch=len(os.listdir(train_audio_dir)) // batch_size,
    validation_steps=len(os.listdir(test_audio_dir)) // batch_size,
    epochs=20,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)]
)

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None, 40)]        0         
                                                                 
 bidirectional_2 (Bidirectio  (None, None, 512)        457728    
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, None, 512)        1182720   
 nal)                                                            
                                                                 
 time_distributed_1 (TimeDis  (None, None, 49)         25137     
 tributed)                                                       
                                                                 
Total params: 1,665,585
Trainable params: 1,665,585
Non-trainable params: 0
_________________________________________________

InvalidArgumentError: Graph execution error:

Detected at node 'ctc_loss/CTCLoss' defined at (most recent call last):
    File "<frozen runpy>", line 198, in _run_module_as_main
    File "<frozen runpy>", line 88, in _run_code
    File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
      ColabKernelApp.launch_instance()
    File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
      app.start()
    File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py", line 205, in start
      self.asyncio_loop.run_forever()
    File "/usr/lib/python3.11/asyncio/base_events.py", line 608, in run_forever
      self._run_once()
    File "/usr/lib/python3.11/asyncio/base_events.py", line 1936, in _run_once
      handle._run()
    File "/usr/lib/python3.11/asyncio/events.py", line 84, in _run
      self._context.run(self._callback, *self._args)
    File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "/usr/local/lib/python3.11/dist-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell
      result = self._run_cell(
    File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell
      return runner(coro)
    File "/usr/local/lib/python3.11/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner
      coro.send(None)
    File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "<ipython-input-10-018f1a0ad348>", line 39, in <cell line: 0>
      history = model.fit(
    File "/usr/local/lib/python3.11/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/lib/python3.11/dist-packages/keras/engine/training.py", line 1685, in fit
      tmp_logs = self.train_function(iterator)
    File "/usr/local/lib/python3.11/dist-packages/keras/engine/training.py", line 1284, in train_function
      return step_function(self, iterator)
    File "/usr/local/lib/python3.11/dist-packages/keras/engine/training.py", line 1268, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.11/dist-packages/keras/engine/training.py", line 1249, in run_step
      outputs = model.train_step(data)
    File "/usr/local/lib/python3.11/dist-packages/keras/engine/training.py", line 1051, in train_step
      loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/usr/local/lib/python3.11/dist-packages/keras/engine/training.py", line 1109, in compute_loss
      return self.compiled_loss(
    File "/usr/local/lib/python3.11/dist-packages/keras/engine/compile_utils.py", line 265, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/usr/local/lib/python3.11/dist-packages/keras/losses.py", line 142, in __call__
      losses = call_fn(y_true, y_pred)
    File "/usr/local/lib/python3.11/dist-packages/keras/losses.py", line 268, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "<ipython-input-8-018f1a0ad348>", line 27, in ctc_loss
      loss = tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    File "/usr/local/lib/python3.11/dist-packages/keras/backend.py", line 7050, in ctc_batch_cost
      tf.compat.v1.nn.ctc_loss(
Node: 'ctc_loss/CTCLoss'
Saw a non-null label (index >= num_classes - 1) following a null label, batch: 0 num_classes: 49 labels: 23,39,1,9,24,48,24,44,1,33,38,20,48,20,40,33,38,20,40,1,3,33,28,48,15,31,44,25,48,25,37,28,48,15,48,15,38,30,40,26,48,1,7,24,48,24,44,1,25,30,22,48,22,22,40,26,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 labels seen so far: 23,39,1,9,24
	 [[{{node ctc_loss/CTCLoss}}]] [Op:__inference_train_function_23561]

In [None]:
# Added a newer implementation for audio as well , so discontinued
# Build character vocabulary
# all_text = " ".join(list(train_transcripts.values()) + list(test_transcripts.values()))
# unique_chars = sorted(set(all_text))
# char_to_idx = {char: idx + 1 for idx, char in enumerate(unique_chars)}
# idx_to_char = {idx: char for char, idx in char_to_idx.items()}
# char_to_idx["<PAD>"] = 0  # Padding token
# idx_to_char[0] = "<PAD>"

# print(f"Vocabulary size: {len(char_to_idx)}")

Vocabulary size: 49


In [None]:
# defined befor the audio_generator part, discontinued
 # Build character vocabulary
# all_text = " ".join(list(train_transcripts.values()) + list(test_transcripts.values()))
# unique_chars = sorted(set(all_text))
# char_to_idx = {char: idx + 1 for idx, char in enumerate(unique_chars)}
# idx_to_char = {idx: char for char, idx in char_to_idx.items()}
# char_to_idx["<PAD>"] = 0  # Padding token
# idx_to_char[0] = "<PAD>"

# # Encode transcripts into sequences of indices
# def encode_transcripts(labels):
#     encoded = []
#     for label in labels:
#         encoded.append([char_to_idx[char] for char in label])
#     return encoded

# train_encoded_labels = encode_transcripts(train_labels)
# test_encoded_labels = encode_transcripts(test_labels)

# print(f"Vocabulary size: {len(char_to_idx)}")

NameError: name 'train_labels' is not defined

In [None]:
# Define batch size
batch_size = 32

# Create generators
train_gen = audio_generator(train_audio_dir, train_transcripts, char_to_idx, batch_size)
test_gen = audio_generator(test_audio_dir, test_transcripts, char_to_idx, batch_size)

# Test the generator
for batch_audio, batch_labels in train_gen:
    print(f"Batch audio shape: {batch_audio.shape}")
    print(f"Batch labels shape: {batch_labels.shape}")
    break  # Process only one batch for testing

NameError: name 'max_audio_len' is not defined

In [None]:
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# # Pad sequences
# max_audio_len = max(len(seq) for seq in train_audio_data + test_audio_data)
# max_text_len = max(len(seq) for seq in train_encoded_labels + test_encoded_labels)

# train_padded_audio = pad_sequences(train_audio_data, maxlen=max_audio_len, dtype="float32", padding="post")
# test_padded_audio = pad_sequences(test_audio_data, maxlen=max_audio_len, dtype="float32", padding="post")

# train_padded_labels = pad_sequences(train_encoded_labels, maxlen=max_text_len, padding="post")
# test_padded_labels = pad_sequences(test_encoded_labels, maxlen=max_text_len, padding="post")

# print(f"Padded training audio shape: {train_padded_audio.shape}")
# print(f"Padded training labels shape: {train_padded_labels.shape}")

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Function to dynamically pad sequences in batches
def dynamic_padding(audio_data, labels, batch_size, max_audio_len, max_text_len):
    padded_audio_batches = []
    padded_label_batches = []

    for i in range(0, len(audio_data), batch_size):
        batch_audio = audio_data[i:i + batch_size]
        batch_labels = labels[i:i + batch_size]

        # Pad each batch to the maximum length within the batch
        batch_padded_audio = pad_sequences(batch_audio, maxlen=max_audio_len, dtype="float32", padding="post")
        batch_padded_labels = pad_sequences(batch_labels, maxlen=max_text_len, padding="post")

        padded_audio_batches.append(batch_padded_audio)
        padded_label_batches.append(batch_padded_labels)

    return padded_audio_batches, padded_label_batches

# Define batch size
batch_size = 32

# Dynamically pad sequences
train_padded_audio_batches, train_padded_labels_batches = dynamic_padding(
    train_audio_data, train_encoded_labels, batch_size, max_audio_len, max_text_len
)
test_padded_audio_batches, test_padded_labels_batches = dynamic_padding(
    test_audio_data, test_encoded_labels, batch_size, max_audio_len, max_text_len
)

print(f"Number of training batches: {len(train_padded_audio_batches)}")
print(f"Number of testing batches: {len(test_padded_audio_batches)}")