In [3]:
%run utils.ipynb

In [4]:
%run configs.ipynb

In [5]:
%run model.ipynb

(2, 100, 66)
Model: "ASR_model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None, 52)]           0         []                            
                                                                                                  
 conv1d (Conv1D)             (None, None, 50)             39050     ['input_1[0][0]']             
                                                                                                  
 batch_normalization (Batch  (None, None, 50)             200       ['conv1d[0][0]']              
 Normalization)                                                                                   
                                                                                                  
 p_re_lu (PReLU)             (None, None, 50)             50        ['batch_n

                                                                    ]                             
                                                                                                  
 add_3 (Add)                 (None, None, 50)             0         ['add_2[0][0]',               
                                                                     'p_re_lu_8[0][0]']           
                                                                                                  
 conv1d_9 (Conv1D)           (None, None, 50)             37550     ['add_3[0][0]']               
                                                                                                  
 batch_normalization_9 (Bat  (None, None, 50)             200       ['conv1d_9[0][0]']            
 chNormalization)                                                                                 
                                                                                                  
 p_re_lu_9

In [6]:
import tensorflow as tf
import tensorflow.keras.backend as K
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import librosa
import time
from tqdm import tqdm
import edit_distance as ed

In [7]:
def train_model(model, optimizer, train_wavs, train_texts, test_wavs, test_texts, epochs=100, batch_size=50):

    with tf.device(device_name):

        for e in range(0, epochs + epochs):
            start_time = time.time()

            len_train = len(train_wavs)
            len_test = len(test_wavs)
            train_loss = 0
            test_loss = 0
            test_CER = 0
            train_batch_count = 0
            test_batch_count = 0

            print("Training epoch: {}".format(e+1))
            for start in tqdm(range(0, len_train, batch_size)):

                end = None
                if start + batch_size < len_train:
                    end = start + batch_size
                else:
                    end = len_train
                x, target, target_lengths, output_lengths = batchify(
                    train_wavs[start:end], train_texts[start:end], UNQ_CHARS)

                with tf.GradientTape() as tape:
                    output = model(x, training=True)

                    loss = K.ctc_batch_cost(
                        target, output, output_lengths, target_lengths)

                grads = tape.gradient(loss, model.trainable_weights)
                optimizer.apply_gradients(zip(grads, model.trainable_weights))

                train_loss += np.average(loss.numpy())
                train_batch_count += 1

            print("Testing epoch: {}".format(e+1))
            for start in tqdm(range(0, len_test, batch_size)):

                end = None
                if start + batch_size < len_test:
                    end = start + batch_size
                else:
                    end = len_test
                x, target, target_lengths, output_lengths = batchify(
                    test_wavs[start:end], test_texts[start:end], UNQ_CHARS)

                output = model(x, training=False)

                # Calculate CTC Loss
                loss = K.ctc_batch_cost(
                    target, output, output_lengths, target_lengths)

                test_loss += np.average(loss.numpy())
                test_batch_count += 1

                """
                    The line of codes below is for computing evaluation metric (CER) on internal validation data.
                """
                input_len = np.ones(output.shape[0]) * output.shape[1]
                decoded_indices = K.ctc_decode(output, input_length=input_len,
                                       greedy=False, beam_width=100)[0][0]
                
                # Remove the padding token from batchified target texts
                target_indices = [sent[sent != 0].tolist() for sent in target]

                # Remove the padding, unknown token, and blank token from predicted texts
                predicted_indices = [sent[sent > 1].numpy().tolist() for sent in decoded_indices] # idx 0: padding token, idx 1: unknown, idx -1: blank token

                len_batch = end - start
                for i in range(len_batch):

                    pred = predicted_indices[i]
                    truth = target_indices[i]
                    sm = ed.SequenceMatcher(pred, truth)
                    ed_dist = sm.distance()                 # Edit distance
                    test_CER += ed_dist / len(truth)
                test_CER /= len_batch

            train_loss /= train_batch_count
            test_loss /= test_batch_count
            test_CER /= test_batch_count

            rec = "Epoch: {}, Train Loss: {:.2f}, Test Loss {:.2f}, Test CER {:.2f} % in {:.2f} secs.\n".format(
                e+1, train_loss, test_loss, test_CER*100, time.time() - start_time)

            print(rec)
            #Save the final trained model
            model.save("model/trained_model_v1.h5")

In [8]:
def load_data(wavs_dir, texts_dir):
    texts_df = pd.read_csv(texts_dir)
    train_wavs = []
    for f_name in texts_df["file"]:
        wav, _ = librosa.load(f"{wavs_dir}/{f_name}.flac", sr=SR)
        train_wavs.append(wav)
    train_texts = texts_df["text"].tolist()
    return train_wavs, train_texts

# Defintion of the model



In [9]:
model = get_model(INPUT_DIM, NUM_UNQ_CHARS, num_res_blocks=5, num_cnn_layers=2,
                      cnn_filters=50, cnn_kernel_size=15, rnn_dim=170, rnn_dropout=0.15, num_rnn_layers=2,
                      num_dense_layers=1, dense_dim=340, model_name=MODEL_NAME, rnn_type="lstm",
                      use_birnn=True)
print("Model defined \u2705 \u2705 \u2705 \u2705\n")

Model defined ✅ ✅ ✅ ✅



# Defintion of the optimizer
 

In [10]:
optimizer = tf.keras.optimizers.Adam()
optimizer

<keras.src.optimizers.adam.Adam at 0x209ace7efe0>

# Load the data

In [11]:
print("Loading data.....")
train_wavs, train_texts = load_data(wavs_dir="dataset/wav_files(sampled)", 
                                    texts_dir="dataset/transcriptions(sampled)/file_speaker_text(sampled).csv")
print("Data loaded \u2705 \u2705 \u2705 \u2705\n")

Loading data.....
Data loaded ✅ ✅ ✅ ✅



### To replicate the results give the argument astext_dir="dataset/transcriptions(sampled)/file_speaker_text(orignally_trained).csv".
### Get all of the wavs files from https://openslr.org/54/, put them in a single directory, and give that directory as argument for wavs_dir.


# Clean the audio file by removing the silent gaps from the both ends the audio file

In [12]:
print("Cleaning the audio files.....")
train_wavs = [clean_single_wav(wav) for wav in train_wavs]
print("Audio files cleaned \u2705 \u2705 \u2705 \u2705\n")
train_wavs

Cleaning the audio files.....
Audio files cleaned ✅ ✅ ✅ ✅



[array([ 1.8005371e-03,  2.3193359e-03,  2.4414062e-03, ...,
        -5.7573766e-06, -5.7573766e-06, -5.7573766e-06], dtype=float32),
 array([ 3.2196045e-02,  4.5593262e-02,  5.3833008e-02, ...,
        -5.0090207e-06, -5.0090207e-06, -5.0090207e-06], dtype=float32),
 array([-5.7983398e-04,  7.6293945e-04,  6.1035156e-05, ...,
        -5.9117820e-06, -5.9117820e-06, -5.9117820e-06], dtype=float32),
 array([-2.6733398e-02, -2.7770996e-02, -2.9327393e-02, ...,
        -4.3437790e-06, -4.3437790e-06, -4.3437790e-06], dtype=float32),
 array([-2.5299072e-02, -2.4932861e-02, -2.3712158e-02, ...,
        -4.4921876e-06, -4.4921876e-06, -4.4921876e-06], dtype=float32),
 array([-3.3508301e-02, -2.5238037e-02, -2.2949219e-02, ...,
         1.4448364e-06,  1.4448364e-06,  1.4448364e-06], dtype=float32),
 array([-1.0284424e-02, -1.4831543e-02, -1.9409180e-02, ...,
        -2.2604054e-05, -2.2604054e-05, -2.2604054e-05], dtype=float32),
 array([-5.3924561e-02,  1.4953613e-02,  2.4749756e-02, ...,
 

# Generate mfcc features for the audio files

In [13]:
print("Generating mfcc features.....")
train_wavs = [gen_mfcc(wav) for wav in train_wavs]
print("MFCC features generated \u2705 \u2705 \u2705 \u2705\n")
train_wavs

Generating mfcc features.....
MFCC features generated ✅ ✅ ✅ ✅



[array([-4.796408  ,  1.5434637 ,  0.636496  , ..., -0.09425944,
         0.34669274,  0.33026385], dtype=float32),
 array([-3.1955981 ,  1.217868  ,  0.621759  , ...,  0.6174974 ,
         0.17604741, -0.05681407], dtype=float32),
 array([-3.1990135 ,  1.1079593 ,  0.6238055 , ..., -0.04251979,
         0.11003518,  0.09648506], dtype=float32),
 array([-3.7469773 ,  1.7098516 ,  0.46088555, ...,  0.29264623,
         0.252809  ,  0.3118067 ], dtype=float32),
 array([-3.8844001 ,  2.0760522 ,  0.7067117 , ...,  0.27621436,
         0.13850644,  0.32015398], dtype=float32),
 array([-3.2187698 ,  1.1329865 ,  0.6421117 , ...,  0.08824953,
         0.26802835,  0.27487585], dtype=float32),
 array([-3.6637573 ,  1.3264847 ,  0.9430076 , ...,  0.11223838,
         0.23151182,  0.13336673], dtype=float32),
 array([-4.0937696 , -0.17164868,  1.0485486 , ...,  0.19568054,
         0.44634715,  0.10186896], dtype=float32),
 array([-2.822823  ,  0.6948634 ,  1.493339  , ...,  0.2614791 ,
       

# Train Test Split

### Originally the data was split in the 95% train and 5% test set; With total of 148K (audio,text) pairs.
  

In [14]:
train_wavs, test_wavs, train_texts, test_texts = train_test_split(train_wavs,
                                                                  train_texts,
                                                                  test_size=0.2)
#train_wavs
#test_wavs
#train_texts
#test_texts

# Train the model
### Originally the model was trained for 58 epochs; With a batch size of 50.

In [17]:
train_model(model, 
            optimizer, 
            train_wavs, 
            train_texts,
            test_wavs, 
            test_texts, 
            epochs=10, 
            batch_size=2)

Training epoch: 1


  0%|                                                                                           | 0/32 [00:00<?, ?it/s]

Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):
<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x00000209AE0AC4C0>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "C:\Users\Dell\anaconda3\lib\site-packages\keras\src\backend.py", line 5160, in <genexpr>
    output_ta_t = tuple(  File "C:\Users\Dell\anaconda3\lib\site-packages\tensorflow\python\util\tf_should_use.py", line 288, in wrapped


100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [02:10<00:00,  4.06s/it]


Testing epoch: 1


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:14<00:00,  1.84s/it]

Epoch: 1, Train Loss: 65.54, Test Loss 86.34, Test CER 95.25 % in 144.76 secs.






Training epoch: 2


100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [02:16<00:00,  4.28s/it]


Testing epoch: 2


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:17<00:00,  2.23s/it]

Epoch: 2, Train Loss: 65.29, Test Loss 84.54, Test CER 93.46 % in 154.81 secs.






# Save the final trained model

In [27]:
# model.save("model/trained_model_v1.h5")
import pickle
pickle.dump(model,open('./model/trained_model_v1.pkl'))
# Not working to save the error file

FileNotFoundError: [Errno 2] No such file or directory: './model/trained_model_v1.pkl'