In [2]:
import os
import random
from glob import glob
from tqdm import tqdm
from multiprocessing import Pool

import librosa

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


print("TensorFlow:", tf.__version__)
# Set seed for experiment reproducibility
seed = 777
tf.random.set_seed(seed)

TensorFlow: 2.10.0


In [3]:
utt_path = 'asr_bengali/utt_spk_text.tsv'
flac_audio_dir = 'asr_bengali/data'

max_target_len = 30

In [4]:
FILTER_CHARS = [
    '"', '%', "'", ',', '-', '.', '/', '\x93', '\x94', '\u200c', '\u200d', '‘', 
    '’', '“', '”', '…', '!', ':', ';', '?', 'œ', '।', '–'
]

ENGLISH = set([
    '0', '1', '2', '3', '4', '5', 'B', 'L', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 
    'g', 'h', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'z',
])


def clean(text):
    '''Clean text'''
    for c in FILTER_CHARS:
        if c in text:
            text = text.replace(c, '')
    return text


def is_contrain_english_chars(text):
    if ENGLISH.intersection(set(text)):
        return True
    return False 

def convert_flac_to_wav(f):
    print('Rony')
    cmd = f"sox {f} {f.split('.')[0] + '.wav'}"
    os.system(cmd)

def get_data(utt_path, flac_audio_dir):
    '''Get data from utt, flac audio dir
    '''
    _take = 5000
    flac_audio_files = glob(flac_audio_dir + '/*/*.flac')[:_take*2]

    #print(f"Converting flac to wav")
    print(f"Converting flac to wav")
    for f in tqdm(flac_audio_files):
        cmd = f"sox {f} {f.split('.')[0] + '.wav'}"
        os.system(cmd)
    print('done')
    
    flac_list = [
        os.path.splitext(os.path.basename(_file))[0] 
        for _file in flac_audio_files
    ]
    flac_set = set(flac_list)

    data = []
    unique_chars = set()
    max_text_len = 0
    max_text = ''
    en_bn_mixed = 0
    takes = 0
    audio_duration = 0
    unique_words = set()
    with open(utt_path, 'r',encoding='utf-8') as fp:
        lines = fp.readlines()
        for line in tqdm(lines, total=len(lines)):
            line = line.strip(' \n')
            line = line.split('\t')
            file_name, text= line[0], line[2]
    
            if takes >= _take:
                break

            if file_name in flac_set:
                text = clean(text)
                # skip text which has > max_target_len chars
                if len(text) > max_target_len:
                    continue
                
                # skip english text
                if is_contrain_english_chars(text):
                    en_bn_mixed += 1
                    continue
                
                file_abs_path = flac_audio_files[flac_list.index(file_name)].split('.')[0] + '.wav'
                data.append({'audio': file_abs_path, 'text': text})
                duration = librosa.get_duration(filename=file_abs_path)
                audio_duration += duration
                # create unique chars set
                for c in text:
                    unique_chars.add(c)
                
                words = text.split()
                for w in words:
                    unique_words.add(w)
                
                # find max text sequence lenght, text
                text_len = len(text)
                if max_text_len < text_len:
                    max_text_len = text_len
                    max_text = text
                
                takes += 1
                
    unique_chars = sorted(unique_chars)

    print(f'flac audio files: {len(flac_audio_files)}')
    print(f'flac_dic         : {len(flac_set)}')
    print(f'utt entry       : {len(lines)}')
    print(f'unique chars  : {len(unique_chars)}')
    print(f'data             : {len(data)}')
    print(f"max text length : {max_text_len}")
    print(f'max text          : {max_text}')
    print(f'en bn mixed     : {en_bn_mixed}')
    print(f"Total unique words: {len(unique_words)}")
    print(f"audio duration: {audio_duration / 3600:.3f} hr")
 
    return data, unique_chars

In [5]:
data, unique_chars = get_data(utt_path, flac_audio_dir)
print('chars:', unique_chars)

Converting flac to wav


100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [11:15<00:00, 14.80it/s]


done


	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=file_abs_path)
  2%|█▊                                                                        | 5403/218703 [05:02<3:19:16, 17.84it/s]

flac audio files: 10000
flac_dic         : 10000
utt entry       : 218703
unique chars  : 70
data             : 5000
max text length : 30
max text          : বা অ্যালার্জির বংশগত রোগ যুক্ত
en bn mixed     : 2
Total unique words: 6661
audio duration: 4.731 hr
chars: [' ', 'ঁ', 'ং', 'ঃ', 'অ', 'আ', 'ই', 'ঈ', 'উ', 'এ', 'ঐ', 'ও', 'ঔ', 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 'ট', 'ঠ', 'ড', 'ঢ', 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ', 'ষ', 'স', 'হ', '়', 'া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ', '্', 'ৎ', 'ড়', 'য়', '০', '১', '২', '৩', '৪', '৫', '৬', '৭', '৮', '৯']





In [6]:
class TokenEmbedding(layers.Layer):
    def __init__(self, num_vocab=1000, maxlen=100, num_hid=64):
        super().__init__()
        self.emb = tf.keras.layers.Embedding(num_vocab, num_hid)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=num_hid,trainable=False)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        x = self.emb(x)
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        return x + positions


class SpeechFeatureEmbedding(layers.Layer):
    def __init__(self, num_hid=64, maxlen=100):
        super().__init__()
        self.conv1 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=2, padding="same", activation="relu"
        )
        self.conv2 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=2, padding="same", activation="relu"
        )
        self.conv3 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=2, padding="same", activation="relu"
        )
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=num_hid,trainable=False)

    def call(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        return self.conv3(x)


class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, num_heads, feed_forward_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(feed_forward_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, num_heads, feed_forward_dim, dropout_rate=0.1):
        super().__init__()
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
        self.self_att = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.enc_att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.self_dropout = layers.Dropout(0.5)
        self.enc_dropout = layers.Dropout(0.1)
        self.ffn_dropout = layers.Dropout(0.1)
        self.ffn = keras.Sequential(
            [
                layers.Dense(feed_forward_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )

    def causal_attention_mask(self, batch_size, n_dest, n_src, dtype):
        """Masks the upper half of the dot product matrix in self attention.

        This prevents flow of information from future tokens to current token.
        1's in the lower triangle, counting from the lower right corner.
        """
        i = tf.range(n_dest)[:, None]
        j = tf.range(n_src)
        m = i >= j - n_src + n_dest
        mask = tf.cast(m, dtype)
        mask = tf.reshape(mask, [1, n_dest, n_src])
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
        )
        return tf.tile(mask, mult)

    def call(self, enc_out, target):
        input_shape = tf.shape(target)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        causal_mask = self.causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)
        target_att = self.self_att(target, target, attention_mask=causal_mask)
        target_norm = self.layernorm1(target + self.self_dropout(target_att))
        enc_out = self.enc_att(target_norm, enc_out)
        enc_out_norm = self.layernorm2(self.enc_dropout(enc_out) + target_norm)
        ffn_out = self.ffn(enc_out_norm)
        ffn_out_norm = self.layernorm3(enc_out_norm + self.ffn_dropout(ffn_out))
        return ffn_out_norm

    
class Transformer(keras.Model):
    def __init__(
        self,
        num_hid=64,
        num_head=2,
        num_feed_forward=128,
        source_maxlen=100,
        target_maxlen=100,
        num_layers_enc=4,
        num_layers_dec=1,
        num_classes=10,
    ):
        super().__init__()
        self.loss_metric = keras.metrics.Mean(name="loss")
        self.num_layers_enc = num_layers_enc
        self.num_layers_dec = num_layers_dec
        self.target_maxlen = target_maxlen
        self.num_classes = num_classes

        self.enc_input = SpeechFeatureEmbedding(num_hid=num_hid, maxlen=source_maxlen)
        self.dec_input = TokenEmbedding(
            num_vocab=num_classes, maxlen=target_maxlen, num_hid=num_hid
        )

        self.encoder = keras.Sequential(
            [self.enc_input]
            + [
                TransformerEncoder(num_hid, num_head, num_feed_forward)
                for _ in range(num_layers_enc)
            ]
        )

        for i in range(num_layers_dec):
            setattr(
                self,
                f"dec_layer_{i}",
                TransformerDecoder(num_hid, num_head, num_feed_forward),
            )

        self.classifier = layers.Dense(num_classes, activation='softmax')

    def decode(self, enc_out, target):
        y = self.dec_input(target)
        for i in range(self.num_layers_dec):
            y = getattr(self, f"dec_layer_{i}")(enc_out, y)
        return y

    def call(self, inputs):
        source = inputs[0]
        target = inputs[1]
        x = self.encoder(source)
        y = self.decode(x, target)
        return self.classifier(y)

    @property
    def metrics(self):
        return [self.loss_metric]

    @tf.function
    def train_step(self, batch):
        """Processes one batch inside model.fit()."""
        source = batch["source"]
        target = batch["target"]
        dec_input = target[:, :-1]
        dec_target = target[:, 1:]
        with tf.GradientTape() as tape:
            preds = self([source, dec_input])
            one_hot = tf.one_hot(dec_target, depth=self.num_classes)
            mask = tf.math.logical_not(tf.math.equal(dec_target, 0))
            loss = self.compiled_loss(one_hot, preds, sample_weight=mask)
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        self.loss_metric.update_state(loss)
        return {"loss": self.loss_metric.result()}

    def test_step(self, batch):
        source = batch["source"]
        target = batch["target"]
        dec_input = target[:, :-1]
        dec_target = target[:, 1:]
        preds = self([source, dec_input])
        # print('test preds', preds.numpy)
        one_hot = tf.one_hot(dec_target, depth=self.num_classes)
        mask = tf.math.logical_not(tf.math.equal(dec_target, 0))
        loss = self.compiled_loss(one_hot, preds, sample_weight=mask)
        self.loss_metric.update_state(loss)
        return {"loss": self.loss_metric.result()}

    def generate(self, source, target_start_token_idx):
        """Performs inference over one batch of inputs using greedy decoding."""
        bs = tf.shape(source)[0]
        enc = self.encoder(source)
        dec_input = tf.ones((bs, 1), dtype=tf.int32) * target_start_token_idx
        dec_logits = []
        for i in range(self.target_maxlen - 1):
            dec_out = self.decode(enc, dec_input)
            logits = self.classifier(dec_out)
            logits = tf.argmax(logits, axis=-1, output_type=tf.int32)
            last_logit = tf.expand_dims(logits[:, -1], axis=-1)
            dec_logits.append(last_logit)
            dec_input = tf.concat([dec_input, last_logit], axis=-1)
        return dec_input

In [7]:
class DisplayOutputs(keras.callbacks.Callback):
    def __init__(
        self, batch, idx_to_token, target_start_token_idx=27, target_end_token_idx=28
    ):
        """Displays a batch of outputs after every epoch

        Args:
            batch: A test batch containing the keys "source" and "target"
            idx_to_token: A List containing the vocabulary tokens corresponding to their indices
            target_start_token_idx: A start token index in the target vocabulary
            target_end_token_idx: An end token index in the target vocabulary
        """
        self.batch = batch
        self.target_start_token_idx = target_start_token_idx
        self.target_end_token_idx = target_end_token_idx
        self.idx_to_char = idx_to_token

    def on_epoch_end(self, epoch, logs=None):
        if epoch % 20 != 0:
            return
        self.batch = batch = next(iter(val_ds))
        source = self.batch["source"]
        target = self.batch["target"].numpy()
        bs = tf.shape(source)[0]
        preds = self.model.generate(source, self.target_start_token_idx)
        preds = preds.numpy()
        for i in range(bs):
            target_text = "".join([self.idx_to_char[_] for _ in target[i, :]])
            prediction = ""
            # print("preds[i, :]", preds[i, :])
            for idx in preds[i, :]:
                prediction += self.idx_to_char[idx]
                if idx == self.target_end_token_idx:
                    break
            print(f"target:     {target_text.replace('-','')}")
            print(f"prediction: {prediction}")


class CustomSchedule(keras.optimizers.schedules.LearningRateSchedule):
    def __init__(
        self,
        init_lr=0.00001,
        lr_after_warmup=0.001,
        final_lr=0.00001,
        warmup_epochs=15,
        decay_epochs=85,
        steps_per_epoch=203,
    ):
        super().__init__()
        self.init_lr = init_lr
        self.lr_after_warmup = lr_after_warmup
        self.final_lr = final_lr
        self.warmup_epochs = warmup_epochs
        self.decay_epochs = decay_epochs
        self.steps_per_epoch = steps_per_epoch

    def calculate_lr(self, epoch):
        """ linear warm up - linear decay """
        warmup_lr = (
            self.init_lr
            + ((self.lr_after_warmup - self.init_lr) / (self.warmup_epochs - 1)) * epoch
        )
        decay_lr = tf.math.maximum(
            self.final_lr,
            self.lr_after_warmup
            - (epoch - self.warmup_epochs)
            * (self.lr_after_warmup - self.final_lr)
            / (self.decay_epochs),
        )
        lr =  tf.math.minimum(warmup_lr, decay_lr)
        return lr

    def __call__(self, step):
        epoch = step // self.steps_per_epoch
        return self.calculate_lr(epoch)

In [8]:
class VectorizeChar:
    def __init__(self, max_len=50):
        self.vocab = (
            ["-", "#", "<", ">"]
            + list(unique_chars)
            # + [chr(i + 96) for i in range(1, 27)]
            # + [" ", ".", ",", "?"]
        )
        self.max_len = max_len
        self.char_to_idx = {}
        for i, ch in enumerate(self.vocab):
            self.char_to_idx[ch] = i

    def __call__(self, text):
        # text = text.lower()
        text = text[: self.max_len - 2]
        text = "<" + text + ">"
        pad_len = self.max_len - len(text)
        return [self.char_to_idx.get(ch, 1) for ch in text] + [0] * pad_len

    def get_vocabulary(self):
        return self.vocab


# data = get_data(wavs, id_to_text, max_target_len)
vectorizer = VectorizeChar(max_target_len)
print("vocab size", len(vectorizer.get_vocabulary()))


def create_text_ds(data):
    texts = [_["text"] for _ in data]
    text_ds = [vectorizer(t) for t in texts]
    text_ds = tf.data.Dataset.from_tensor_slices(text_ds)
    return text_ds


def path_to_audio(path):
    # spectrogram using stft
    audio = tf.io.read_file(path)
    audio, _ = tf.audio.decode_wav(audio, 1)    
    # audio = tf.cast(tfio.audio.decode_flac(audio, dtype=tf.int16), tf.float32)
    audio = tf.squeeze(audio, axis=-1)
    stfts = tf.signal.stft(audio, frame_length=200, frame_step=80, fft_length=256)
    x = tf.math.pow(tf.abs(stfts), 0.5)
    audio_len = tf.shape(x)[0]
    # padding to 10 seconds
    pad_len = 1700 # 2754
    paddings = tf.constant([[0, pad_len], [0, 0]])
#     print('paddings shape:', tf.shape(paddings))
    x = tf.pad(x, paddings, "CONSTANT")[:pad_len, :]
#     print('final audio len:', tf.shape(x))
    return x


def create_audio_ds(data):
    flist = [_["audio"] for _ in data]
    audio_ds = tf.data.Dataset.from_tensor_slices(flist)
    audio_ds = audio_ds.map(
        path_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE
    )
    return audio_ds

# @tf.function
def create_tf_dataset(data, bs=4, val_data=False):
    audio_ds = create_audio_ds(data)
    text_ds = create_text_ds(data)
    ds = tf.data.Dataset.zip((audio_ds, text_ds))
    ds = ds.map(lambda x, y: {"source": x, "target": y})
    # cashe data to RAM 
    ds = ds.cache()
    # do shuffle
    ds = ds.shuffle(1000)
    ds = ds.batch(bs)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
    return ds

data = data
split = int(len(data) * 0.99)
train_data = data # data[:split]
test_data =  data[split:]
# ds = create_tf_dataset(train_data, bs=400)
ds = create_tf_dataset(train_data, bs=64)
val_ds = create_tf_dataset(test_data, bs=64)

vocab size 74


# Visualize data

In [9]:
print(f"Total test data: {len(test_data)}")

Total test data: 50


In [10]:
import IPython
import random
for i in range(5):
    idx = random.randint(0, len(train_data))
    item = train_data[idx]
    print(item["text"])
    IPython.display.display(IPython.display.Audio(item["audio"]))

বারো কি তেরো


থলের ভিতর


একটি বিশেষ মহলকে


বা তাদের অস্বীকৃতির কারণেই


দীর্ঘ দিন


In [11]:
batch = next(iter(val_ds))

# The vocabulary to convert predicted indices into characters
idx_to_char = vectorizer.get_vocabulary()
display_cb = DisplayOutputs(
    batch, idx_to_char, target_start_token_idx=2, target_end_token_idx=3
)  # set the arguments as per vocabulary index for '<' and '>'

model = Transformer(
    num_hid=128,
    num_head=2,
    num_feed_forward=256,
    target_maxlen=max_target_len,
    num_layers_enc=2,
    num_layers_dec=1,
    num_classes=len(vectorizer.get_vocabulary()) # 75 # 48 # 67 # 108,
)
loss_fn = tf.keras.losses.CategoricalCrossentropy(
    from_logits=False, label_smoothing=0.1,
)

learning_rate = CustomSchedule(
    init_lr=0.00001,
    lr_after_warmup=0.001,
    final_lr=0.00001,
    warmup_epochs=15,
    decay_epochs=85,
    steps_per_epoch=len(ds),
)
optimizer = keras.optimizers.Adam(learning_rate)
# optimizer = keras.optimizers.Adam()
# optimizer = keras.optimizers.Adam(lr=0.00005)

# model.compile(optimizer=optimizer, loss=loss_fn)
model.compile(optimizer=optimizer, loss=loss_fn)
# model.load_weights('/content/gdrive/MyDrive/bangla-ai/models/bnasr-57')
# latest = tf.train.latest_checkpoint('/content/drive/MyDrive/bangla-ai/models/08-04-2021_ID_03')
# model.load_weights(latest)

In [None]:
if not os.path.exists("asr-checkpoint"):
    os.makedirs("asr-checkpoint")

checkpoint_path = 'asr-checkpoint/bnasr-{epoch:02d}-{val_loss:0.6f}'
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1
)

print("tf.executing_eagerly():", tf.executing_eagerly())

# optimizer = keras.optimizers.Adam(lr=0.0001)
# model.compile(optimizer=optimizer, loss=loss_fn)
history = model.fit(
    ds, 
    validation_data=val_ds, 
    callbacks=[display_cb],
    initial_epoch=0,
    epochs=600
)

tf.executing_eagerly(): True
Epoch 1/600
prediction: <ক্রাইমিয়ায় হাজার অলা>
target:     <সৃষ্টির জন্য যথেষ্ট>
prediction: <সৃষ্টির জন্য যথেষ্ট>
target:     <তথ্য ও যোগাযোগ প্রযুক্তি আইন>
prediction: <তথ্য ও যোগাযোগ প্রযুক্তি আইন>
target:     <শহিদুল্লাহ শহিদের মেয়ে>
prediction: <শহিদুল্লাহ শহিদের মেয়ে>
target:     <স্বামী সংসার>
prediction: <স্বামী সংসার>
target:     <এক বাংলাদেশিকে>
prediction: <এক বাংলাদেশিকে>
target:     <নড়াইলের পক্ষে হ্যাটট্রিক কর>
prediction: <নড়াইলের পক্ষে হ্যাটট্রিক কর>
target:     <পাওয়া যায়না>
prediction: <পাওয়া যায়না>
target:     <ফুচকা>
prediction: <ফুচকা>
target:     <দক্ষিণ এশিয়ায়>
prediction: <দক্ষিণ এশিয়ায়>
target:     <বেলপাতা দিবি না কাউরে>
prediction: <বেলপাতা দিবি না কাউরে>
target:     <কেননা আমি যদি ধরা পড়ি>
prediction: <কেননা আমি যদি ধরা পড়ি>
target:     <রাজনৈতিক দলগুলোর মধ্যে সংঘাত>
prediction: <রাজনৈতিক দলগুলোর মধ্যে সংঘাত>
target:     <আমার ভালবাসা শুধু তোমার জন্য>
prediction: <আমার ভালবাসা শুধু তোমার জন্য>
target:     <অপরদিকে