In [None]:
import datetime
import os
import re
import string

import nltk
import numpy as np
import pandas
import tensorflow as tf
from matplotlib import pyplot as plt
from tensorflow.data.experimental import AUTOTUNE
from tensorflow.keras import Sequential
from tensorflow.keras.layers import (Bidirectional,Conv1D,Conv2D,Dense,Dropout,
                                     Embedding,GRU,Input,LayerNormalization,LSTM,
                                     MaxPooling2D,Reshape,SimpleRNN,Softmax)
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.preprocessing import text_dataset_from_directory

<H1>DATA PREPARATION</H1>

In [None]:
# Filesystem locations used by the notebook.
# `text_path` is the "|"-separated transcript file opened when the transcripts
# are loaded; `path` is not referenced by any other visible cell.
text_path = '...'
path = '...'

In [None]:
# Lookup from utterance id to transcript text; filled from the transcript file
# and read by get_spec. (The name was previously misspelled `audio_tet`, so the
# cells that use `audio_text` failed with a NameError.)
audio_text={}

In [None]:
# Build the utterance-id -> transcript lookup that get_spec reads.
# The dict is (re)created here so this cell works on a fresh kernel and is
# idempotent when re-run.
audio_text={}
with open(text_path, encoding="utf-8") as f:
    for line in f:
        line=line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) carry no record.
            continue
        # Field 0 is the utterance id, field 1 its transcript; any further
        # "|"-separated fields are ignored.
        fields=line.split("|")
        audio_text[fields[0]]=fields[1]

In [None]:
def decode_audio(audio_binary):
    """Decode WAV file contents into a waveform tensor.

    Args:
        audio_binary: WAV-encoded bytes (get_spec passes the result of
            ``tf.io.read_file``).

    Returns:
        The decoded samples with the last axis squeezed out. The sample rate
        reported by the decoder is discarded.
    """
    # decode_wav returns (audio, sample_rate); only the audio part is kept.
    samples=tf.audio.decode_wav(audio_binary)[0]
    # Squeezing axis -1 only succeeds when that axis has size 1 —
    # presumably the clips are mono; confirm against the dataset.
    return tf.squeeze(samples,axis=-1)

In [None]:
def get_spec(filepath,target_len=222621):
    """Load a WAV file and return its magnitude spectrogram and transcript.

    Args:
        filepath: Path to the audio file. The file name without its extension
            is the key looked up in ``audio_text``.
        target_len: Number of samples every waveform is brought to before the
            STFT. Shorter clips are right-padded with zeros, longer clips are
            cut, so the spectrogram shape does not depend on the clip.

    Returns:
        A ``(spectrogram, label)`` tuple: the absolute value of the STFT with
        a trailing axis of size 1 appended, and the transcript string.

    Raises:
        KeyError: If ``audio_text`` has no entry for this file.
    """
    # The lookup must use the `filepath` parameter; the name `file_path`
    # does not exist in this scope.
    utterance_id=os.path.splitext(os.path.basename(filepath))[0]
    label=audio_text[utterance_id]

    audio_binary=tf.io.read_file(filepath)
    waveform=tf.cast(decode_audio(audio_binary),tf.float32)

    # Cut over-long clips first so the pad length below is never negative.
    waveform=waveform[:target_len]
    zero_padding=tf.zeros([target_len-tf.shape(waveform)[0]],dtype=tf.float32)
    equal_length=tf.concat([waveform,zero_padding],axis=0)

    spectrogram=tf.signal.stft(
        equal_length,frame_length=63,frame_step=32)
    # Keep only the magnitude of the complex STFT.
    spectrogram=tf.abs(spectrogram)
    return tf.expand_dims(spectrogram,axis=-1),label

In [None]:
# Output alphabet (32 symbols): space, "UNK", ".", ",", "?", the letters a-z
# and finally "PAD". A symbol's list position is its class index.
vocabulary=[" ","UNK",".",",","?",*string.ascii_lowercase,"PAD"]

In [None]:
def get_vocab(char):
    """Return the position of ``char`` in ``vocabulary``.

    Symbols that are not in the vocabulary map to index 0.
    """
    # NOTE(review): index 0 is the space symbol, not "UNK" — confirm that
    # mapping unknown characters to a space is intended.
    try:
        return vocabulary.index(char)
    except ValueError:
        return 0

In [None]:
def get_label(label,seq_len=192):
    """Encode a transcript as a 1-D tensor of vocabulary indices.

    Args:
        label: Transcript string; only its first 190 characters are encoded.
        seq_len: Kept for interface compatibility; the encoding does not use it.

    Returns:
        An int32 tensor holding one ``get_vocab`` index per kept character
        (each character is lower-cased before the lookup). No padding is added.
    """
    # NOTE(review): the 190-character cut-off is hard-coded and independent of
    # seq_len — confirm whether it should be derived from it.
    indices=[get_vocab(ch.lower()) for ch in label[:190]]
    # Explicit dtype: without it an empty transcript would yield a float32
    # tensor instead of an integer one.
    return tf.constant(indices,dtype=tf.int32)

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` yielding ``(spectrogram batch, label batch)`` pairs.

    Every entry returned by ``os.listdir(train_images)`` is treated as an audio
    file and passed to ``get_spec``; its transcript is encoded with
    ``get_label``.

    Args:
        train_images: Directory containing the audio files.
        batch_size: Number of files per batch; a trailing partial batch is dropped.
        SEQUENCE_LENGTH: Forwarded to ``get_label`` as ``seq_len``.
        VOCAB_SIZE: Stored on the instance; not used by the generator itself.
        shuffle: When true, the file order is reshuffled after every epoch.
    """
    def __init__(self,train_images,batch_size,SEQUENCE_LENGTH,VOCAB_SIZE,shuffle=False):
        super().__init__()
        self.train_images=train_images
        self.batch_size=batch_size
        self.train_image_list=os.listdir(train_images)
        self.SEQUENCE_LENGTH=SEQUENCE_LENGTH
        self.VOCAB_SIZE=VOCAB_SIZE
        self.shuffle=shuffle
    def __len__(self):
        """Number of complete batches per epoch."""
        return len(self.train_image_list)//self.batch_size
    def __getitem__(self,idx):
        """Return batch number ``idx`` as an ``(X, y)`` tuple of tensors."""
        return self.__data_generation(idx)
    def on_epoch_end(self):
        """Reshuffle the file list between epochs when ``shuffle`` is enabled."""
        if self.shuffle:
            np.random.shuffle(self.train_image_list)
    def __data_generation(self,idx):
        """Load and encode the files belonging to batch ``idx``."""
        X=[]
        y=[]
        first=idx*self.batch_size
        for name in self.train_image_list[first:first+self.batch_size]:
            # os.path.join works whether or not the directory ends with a
            # path separator; plain string concatenation did not.
            spec,label=get_spec(os.path.join(self.train_images,name))
            X.append(spec)
            y.append(get_label(label,self.SEQUENCE_LENGTH))
        # NOTE(review): get_label does not pad, so stacking `y` presumably only
        # works when all labels in the batch have the same length (always true
        # for batch_size == 1) — confirm before raising the batch size.
        return tf.convert_to_tensor(X),tf.convert_to_tensor(y)

In [None]:
# Training configuration.
train_path = '...'        # directory handed to DataGenerator
BATCH_SIZE = 1
SEQUENCE_LENGTH = 192     # number of time steps the model output is reshaped to
# NOTE(review): `vocabulary` has 32 symbols while VOCAB_SIZE is 1; in the
# visible cells the value is only stored by DataGenerator — confirm.
VOCAB_SIZE = 1
LR = 0.001                # learning rate given to the optimizer
EPOCH = 1_000             # number of training epochs

In [None]:
# Batch source for training, built from the configuration constants.
train_gen=DataGenerator(
    train_images=train_path,
    batch_size=BATCH_SIZE,
    SEQUENCE_LENGTH=SEQUENCE_LENGTH,
    VOCAB_SIZE=VOCAB_SIZE,
)

<H1>MODELING</H1>

In [None]:
# Shape of a single model input. Presumably (STFT frames, frequency bins,
# channel) as produced by get_spec for its fixed 222621-sample waveform —
# confirm if the STFT parameters change.
input_shape=(6955,33,1)
# Normalization layer placed right after the model input. It is created with
# default arguments; no `adapt` call appears in the visible cells.
norm_layer=tf.keras.layers.experimental.preprocessing.Normalization()

In [None]:
# Acoustic model: normalization, two 2-D convolutions and a max-pooling step
# over the spectrogram; the feature map is then folded into SEQUENCE_LENGTH
# time steps and a 1-D convolution followed by a softmax emits, per time step,
# a distribution over the entries of `vocabulary`.
# Sequential, Conv2D and MaxPooling2D have to be imported in the notebook's
# import cell (tensorflow.keras / tensorflow.keras.layers); without those
# imports this cell raises a NameError.
model=Sequential([
    Input(shape=input_shape),
    norm_layer,
    Conv2D(512,3,padding='same',activation='relu'),
    Conv2D(256,3,padding='same',activation='relu'),
    MaxPooling2D(),
    # -1 lets Keras infer the per-step feature size from the pooled map.
    Reshape((SEQUENCE_LENGTH,-1)),
    Conv1D(len(vocabulary),3,padding='same'),
    # Softmax over the class axis of (batch, time, classes).
    Softmax(axis=2),
])

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

<H1>TRAINING</H1>

In [None]:
def ctc_loss(y_true,y_pred):
    """CTC loss wrapper usable as a Keras ``loss`` function.

    Every sample in the batch is given the full time dimension of ``y_pred``
    as its input length and the full second dimension of ``y_true`` as its
    label length.

    Args:
        y_true: Label indices, indexed as (batch, label position).
        y_pred: Model output, indexed as (batch, time step, class).

    Returns:
        The result of ``tf.keras.backend.ctc_batch_cost`` for the batch.
    """
    batch_size=tf.shape(y_pred)[0]
    # tf.shape yields int32 values, so the ones tensor must be int32 as well;
    # multiplying an int32 scalar by the default float32 ones is a dtype
    # mismatch.
    ones_column=tf.ones([batch_size,1],dtype=tf.int32)
    pred_length=tf.shape(y_pred)[1]*ones_column
    true_length=tf.shape(y_true)[1]*ones_column

    return tf.keras.backend.ctc_batch_cost(y_true,y_pred,pred_length,true_length)

In [None]:
# Configure training: CTC loss with the Adam optimizer.
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases no longer accept.
model.compile(
    loss=ctc_loss,
    optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
)

In [None]:
# Checkpointing: write the model weights to `checkpoint_filepath` whenever the
# monitored training loss reaches a new minimum.
checkpoint_filepath='...'
callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

In [None]:
# Train on the generator for EPOCH epochs with the checkpoint callback attached.
history=model.fit(
    train_gen,
    epochs=EPOCH,
    shuffle=True,
    verbose=1,
    callbacks=[callback],
)

<h1>TESTING</h1>

In [None]:
# Inputs for the manual test below: `test_path` is the audio file passed to
# get_spec; `test_str` is not referenced by any other visible cell
# (presumably the reference transcript — confirm).
test_path = '...'
test_str = '...'

In [None]:
# Raw per-time-step prediction: take the most likely class at every step and
# map it to its vocabulary symbol (no CTC collapsing here).
spectrogram,_=get_spec(test_path)
# A leading batch axis is added for predict; [0] removes it again.
out=model.predict(spectrogram[tf.newaxis,...])[0]
out=[vocabulary[i] for i in tf.argmax(out,axis=1)]

In [None]:
# Concatenate the per-step symbols into one string and show it.
# (`print` was previously misspelled as `pritn`, which raised a NameError.)
out_str="".join(out)
print(out_str)

In [None]:
def decode(y_pred):
    """Greedy CTC decoding of per-step class indices into a string.

    Args:
        y_pred: Integer class indices indexed as (batch, time step), e.g. the
            argmax of the model output with a batch axis added.

    Returns:
        The decoded text of the first sequence in the batch.
    """
    batch_size=tf.shape(y_pred)[0]

    # Every sequence uses the full time dimension as its input length.
    pred_length=tf.shape(y_pred)[1]
    pred_length*=tf.ones([batch_size,],dtype=tf.int32)

    # ctc_decode expects per-class scores, so the indices are one-hot encoded
    # with one class per vocabulary entry.
    y_pred=tf.one_hot(y_pred,len(vocabulary))
    output=tf.keras.backend.ctc_decode(y_pred,input_length=pred_length,greedy=True)[0][0]

    # Negative entries are padding added by the decoder, not symbols; without
    # the filter they would index the vocabulary from the end.
    indices=[int(i) for i in output[0]]
    return "".join(vocabulary[i] for i in indices if i>=0)

In [None]:
# CTC-decoded prediction for the test clip; the decoded string is the cell output.
spectrogram,_=get_spec(test_path)
# A leading batch axis is added for predict; [0] removes it again.
out=model.predict(spectrogram[tf.newaxis,...])[0]
out=tf.argmax(out,axis=1)
# decode expects a batch axis, so one is added back in front.
decode(out[tf.newaxis,...])