In [None]:
import os
from datasets import load_dataset, Audio, Dataset
from datasets import concatenate_datasets
import re
import numpy as np

from os import path, listdir
from pydub import AudioSegment

os.environ["KERAS_BACKEND"] = "tensorflow"
import pandas as pd
from glob import glob
import tensorflow as tf
import keras
from keras import layers

In [None]:
# Show the GPU devices TensorFlow reports as available (displayed as the cell output).
tf.config.list_physical_devices('GPU')

This notebook follows the Keras example https://keras.io/examples/audio/transformer_asr/
and adapts it using ideas from
https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_Tune_XLSR_Wav2Vec2_on_Common_Voice.ipynb#scrollTo=kAR0-2KLkopp
(data preparation for Common Voice).

## Data prep

Get data

In [None]:
# Download Swedish data
# NOTE: read the Hugging Face token from the environment; any token that was committed here must be revoked.
#swedish = load_dataset("mozilla-foundation/common_voice_12_0", "sv-SE", cache_dir="data_swedish", token=os.environ["HF_TOKEN"], num_proc=8)

In [None]:
# Download English data
# NOTE: read the Hugging Face token from the environment; any token that was committed here must be revoked.
#english = load_dataset("mozilla-foundation/common_voice_12_0", "en", cache_dir="data_english", token=os.environ["HF_TOKEN"], num_proc=16)

In [None]:
# Spanish Data
# NOTE: read the Hugging Face token from the environment; any token that was committed here must be revoked.
#spanish = load_dataset("mozilla-foundation/common_voice_12_0", "es", cache_dir="data_spanish", token=os.environ["HF_TOKEN"], num_proc=16)

In [None]:
# Load the three Common Voice 12.0 language subsets from their local cache folders.
_PROJECT_DIR = "/home/coder/projects/Audio Translate/Embedded-Project"
_CV_SUBDIR = "mozilla-foundation___common_voice_12_0"

spanish = load_dataset(f"{_PROJECT_DIR}/data_spanish/{_CV_SUBDIR}/es/12.0.0")
english = load_dataset(f"{_PROJECT_DIR}/data_english/{_CV_SUBDIR}/en/12.0.0")
swedish = load_dataset(f"{_PROJECT_DIR}/data_swedish/{_CV_SUBDIR}/sv-SE/12.0.0")

In [None]:
# Disabled one-off helper (kept for reference): converts the Common Voice mp3 clips to wav.
# Every line stays commented out so this cell runs as a no-op; the docstring used to be
# live, indented code under a commented-out `def`, which made the cell fail to parse.
#def mp3_to_wav(batch):
#    '''
#    Function to convert mp3 to wav (NaN values were found when reading the new wav files)
#    '''
#    if(not path.exists(batch["path"])):
#        return batch
#    sound = AudioSegment.from_mp3(batch["path"])
#    output_file = batch["path"].split(".")[0]
#    output_file = output_file + ".wav"
#    sound.export(output_file, format="wav") 
#    os.remove(batch["path"])
#  
#    return batch
## Takes about 2 hours
#english = english.map(mp3_to_wav, desc="prepare_sentences", num_proc=32)
#spanish = spanish.map(mp3_to_wav, desc="prepare_sentences", num_proc=32)
#swedish = swedish.map(mp3_to_wav, desc="prepare_sentences", num_proc=32)

In [None]:
# Take the same number of leading rows from every language so the three are balanced.
_languages = (spanish, english, swedish)

train_data = concatenate_datasets([lang["train"].select(range(7421)) for lang in _languages])
val_data = concatenate_datasets([lang["validation"].select(range(2000)) for lang in _languages])
# The validation rows are folded into the training set.
train_data = concatenate_datasets([train_data, val_data])
test_data = concatenate_datasets([lang["test"].select(range(5091)) for lang in _languages])

In [None]:
# Metadata columns that the rest of the notebook never reads.
_unused_columns = ["path", "accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"]
train_data = train_data.remove_columns(_unused_columns)
test_data = test_data.remove_columns(_unused_columns)

In [None]:

def prepare_sentences(batch):
  '''
    Clean one transcript; intended to be used with the dataset .map method.

    Args:
      batch: a single example holding a "sentence" string and an "audio"
        mapping with the decoded samples under "array".

    Returns:
      dict with "text" (the cleaned, lower-cased transcript followed by one
      space) and "audio" (the value of batch["audio"]["array"]).
  '''
  transcription = batch["sentence"]
  if transcription.startswith('"') and transcription.endswith('"'):
    # surrounding quotation marks do not affect the transcription
    transcription = transcription[1:-1]

  # An empty transcript (e.g. the sentence was just '""') has no last character;
  # without the emptiness check the lookup below raises IndexError.
  if transcription and transcription[-1] not in [".", "?", "!"]:
    # append a full stop to sentences that do not end in punctuation
    # (it is stripped again by the regex below, which also removes ".")
    transcription = transcription + "."

  batch["sentence"] = transcription

  # Raw string with the same character set as before, minus the invalid
  # string escapes (e.g. "\,") that newer Python versions warn about.
  chars_to_ignore_regex = r'[,?.!\-;:"“%‘”�]'
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "

  return {"text": batch["sentence"], "audio": batch["audio"]["array"]}
  

# Clean every transcript in both splits, then print the dataset summaries.
_map_options = {"desc": "prepare_sentences", "num_proc": 16}
train_data = train_data.map(prepare_sentences, **_map_options)
test_data = test_data.map(prepare_sentences, **_map_options)
for _split in (train_data, test_data):
    print(_split)

In [None]:
# Remove the "sentence" column from both splits.
train_data, test_data = (
    split.remove_columns(["sentence"]) for split in (train_data, test_data)
)

In [None]:
# Filter out transcripts that contain symbols from languages other than
# English, Swedish and Spanish.
pd_train = train_data.to_pandas()
pd_test = test_data.to_pandas()

# Raw string: the previous literal contained the invalid string escape "\!",
# which newer Python versions warn about. The set of allowed characters is unchanged.
# NOTE(review): inside the class, " -'" is a range from space to apostrophe, so it
# also admits ! " # $ % & — presumably a literal hyphen was intended; confirm before
# tightening it, because that would change which rows are kept.
pattern = r"[^a-zA-ZáéíóúüñÁÉÍÓÚÜÑåäöÄÅÖ -'.,¿¡!?#<>:;\"]"

# Rows whose text contains at least one disallowed character are dropped.
filt = pd_train["text"].str.contains(pattern)
pd_train = pd_train[~filt].reset_index(drop=True)

filt = pd_test["text"].str.contains(pattern)
pd_test = pd_test[~filt].reset_index(drop=True)

In [None]:
# Store the length of each row's "audio" value in a "len" column.
for _frame in (pd_train, pd_test):
    _frame["len"] = _frame["audio"].map(len)

In [None]:
# Remove sound clips whose "len" is `longest_clip` or more to speed up training
# (the original note describes this limit as roughly 10 seconds).
longest_clip = 600000
pd_train = pd_train[pd_train["len"].lt(longest_clip)]
pd_test = pd_test[pd_test["len"].lt(longest_clip)]

In [None]:
# Display the filtered training frame as the cell output.
pd_train

In [None]:
def pad_arr(batch, target_len=600000):
    '''
    Zero-pad the audio array of one row so it has exactly `target_len` entries.

    Note that this pads the array stored under "audio" itself (at the end),
    not a spectrogram.

    Args:
        batch: a row (e.g. from DataFrame.apply(axis=1)) with an "audio" array.
        target_len: number of entries after padding; defaults to the value
            that used to be hard-coded here.

    Returns:
        The same `batch` object with "audio" replaced by the padded array.

    Raises:
        ValueError: if the audio is already longer than `target_len`.
    '''
    samples = batch["audio"]
    missing = target_len - len(samples)
    if missing < 0:
        # np.pad rejects negative widths anyway; fail with a message that names the cause.
        raise ValueError(
            f"audio has {len(samples)} samples, more than target_len={target_len}"
        )
    batch["audio"] = np.pad(samples, pad_width=(0, missing), mode='constant', constant_values=0)
    return batch
# Pad the audio of every row in both frames.
pd_train, pd_test = (frame.apply(pad_arr, axis=1) for frame in (pd_train, pd_test))

In [None]:
# Convert the pandas frames back into `Dataset` objects and show the training one.
train_data, test_data = (Dataset.from_pandas(frame) for frame in (pd_train, pd_test))
train_data

In [None]:
def extract_all_chars(batch):
  """Collect the distinct characters of a batch of transcripts.

  The texts in batch["text"] are joined with single spaces. The result holds
  that joined string under "all_text" and the list of its unique characters
  (in no particular order) under "vocab", each wrapped in a one-element list.
  """
  joined = " ".join(batch["text"])
  return {"vocab": [list(set(joined))], "all_text": [joined]}

In [None]:
def max_sentence(batch):
    """Return the length of the example's "text" value under the key "len"."""
    return {"len": len(batch["text"])}

# Per-example transcript lengths for both splits (results kept in memory).
lengts, lengts_t = (
    split.map(max_sentence, keep_in_memory=True) for split in (train_data, test_data)
)

In [None]:
# Drop the helper columns left over from the pandas round trip.
# "__index_level_0__" is presumably only present when the frame passed to
# Dataset.from_pandas carried a non-default index (i.e. rows were filtered out),
# so only remove the columns that actually exist instead of failing on a missing one.
_helper_columns = ("len", "__index_level_0__")
train_data = train_data.remove_columns(
    [name for name in _helper_columns if name in train_data.column_names]
)
test_data = test_data.remove_columns(
    [name for name in _helper_columns if name in test_data.column_names]
)

In [None]:
# Collect the distinct characters of each split in a single batch (batch_size=-1).
vocab_train = train_data.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=train_data.column_names)
vocab_test = test_data.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=test_data.column_names)

# Sort the union: iterating a set of strings has no stable order between runs,
# so without sorting a character could get a different index on every run.
vocab_list = sorted(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
vocab_dict = dict(enumerate(vocab_list))
vocab_dict

In [None]:
# Punctuation and the "<" / ">" sequence markers that must be in the vocabulary.
# They are appended after the characters found in the data instead of being written
# to the fixed keys 39..46: fixed keys overwrite real characters when the data
# vocabulary has more than 39 entries and leave gaps when it has fewer.
# Characters already present are moved rather than duplicated, so the indices stay
# contiguous and "<" and ">" are always the last two entries.
_special_tokens = ['-', '.', ',', '?', '!', '¡', '<', '>']
_data_chars = [ch for ch in vocab_dict.values() if ch not in _special_tokens]
vocab_dict = dict(enumerate(_data_chars + _special_tokens))

In [None]:
class VectorizeChar:
    '''
    Turn text into a fixed-length list of vocabulary indices.

    Args:
        max_len: length of every returned list.
        vocab: mapping whose values are the vocabulary characters. The index
            used for a character is its position among ``vocab.values()``
            (first occurrence), not its key.
    '''
    def __init__(self, max_len=50, vocab=None):
        self.vocab = vocab
        self.max_len = max_len

    def __call__(self, text):
        '''
        Encode `text` as "<" + text + ">" followed by padding.

        The text is lower-cased, cut to ``max_len - 2`` characters and stripped
        before the markers are added; the remainder is filled with the index
        of the space character.

        Raises:
            ValueError: if a character (or the space used for padding) is not
                in the vocabulary.
        '''
        # Build the char -> position table once per call instead of rebuilding
        # the values list and searching it linearly for every character.
        # setdefault keeps the first position when a character occurs twice.
        lookup = {}
        for position, ch in enumerate(self.vocab.values()):
            lookup.setdefault(ch, position)

        text = text.lower()
        text = text[: self.max_len - 2]
        text = "<" + text.strip() + ">"
        pad_len = self.max_len - len(text)
        try:
            encoded = [lookup[ch] for ch in text]
            pad_idx = lookup[" "]
        except KeyError as err:
            # Same exception type list.index raised before.
            raise ValueError(f"{err.args[0]!r} is not in the vocabulary") from None
        return encoded + [pad_idx] * pad_len

    def get_vocabulary(self):
        '''Return the vocabulary mapping passed to the constructor.'''
        return self.vocab

def path_to_audio_wav(path): 
    '''
    Read a wav file and return its normalised spectrogram (not used for the training dataset).

    Args:
        path: location of the wav file, passed to tf.io.read_file.

    Returns:
        The normalised square-root magnitude STFT, zero-padded or cut along
        its first axis to exactly 2754 rows.
    '''
    # spectrogram using stft
    raw = tf.io.read_file(path)
    audio, _ = tf.audio.decode_wav(raw, 1)
    audio = tf.squeeze(audio, axis=-1)
    stfts = tf.signal.stft(audio, frame_length=200, frame_step=80, fft_length=256)
    x = tf.math.pow(tf.abs(stfts), 0.5)

    # normalisation along axis 1
    means = tf.math.reduce_mean(x, 1, keepdims=True)
    stddevs = tf.math.reduce_std(x, 1, keepdims=True)
    # divide_no_nan yields 0 where the standard deviation is 0 (e.g. an all-zero
    # frame) instead of NaN; every other value is the same as a plain division.
    x = tf.math.divide_no_nan(x - means, stddevs)
    # This padding might not be needed for inference
    # padding to 10 seconds
    pad_len = 2754
    paddings = tf.constant([[0, pad_len], [0, 0]])
    # Pad first, then slice, so shorter inputs grow and longer ones are cut to pad_len.
    x = tf.pad(x, paddings, "CONSTANT")[:pad_len, :]
    return x

def create_text_ds(data):
    '''
    Build a tf.data dataset of the transcripts in data["text"], each converted
    to vocabulary indices by the module-level `vectorizer`.
    '''
    encoded = list(map(vectorizer, data["text"]))
    return tf.data.Dataset.from_tensor_slices(encoded)


def create_audio_ds(data):
    '''
    Build a tf.data dataset of normalised spectrograms from data["audio"].

    The audio is converted to a square-root magnitude STFT, normalised along
    axis 1 and sliced into one dataset element per entry of the first axis.

    NOTE(review): assumes data["audio"] is a batch of equal-length clips, so
    axis 1 of the STFT is the frame axis here, whereas path_to_audio_wav
    normalises a single clip along its frequency axis. TODO confirm that the
    two are meant to differ.
    '''
    audio_arr = data["audio"]

    stfts = tf.signal.stft(audio_arr, frame_length=200, frame_step=80, fft_length=256)
    x = tf.math.pow(tf.abs(stfts), 0.5)
    # normalisation
    means = tf.math.reduce_mean(x, 1, keepdims=True)
    stddevs = tf.math.reduce_std(x, 1, keepdims=True)
    # divide_no_nan yields 0 where the standard deviation is 0 (e.g. a clip that is
    # entirely padding) instead of NaN; every other value matches a plain division.
    x = tf.math.divide_no_nan(x - means, stddevs)
    audio_ds = tf.data.Dataset.from_tensor_slices(x)

    return audio_ds


def create_tf_dataset(data, bs=4):
    """Pair the audio and text datasets and batch them.

    Each element is a dict with the spectrogram under "source" and the
    encoded transcript under "target"; elements are grouped into batches of
    `bs` and prefetched.
    """
    pairs = tf.data.Dataset.zip((create_audio_ds(data), create_text_ds(data)))
    as_dicts = pairs.map(lambda source, target: {"source": source, "target": target})
    return as_dicts.batch(bs).prefetch(tf.data.AUTOTUNE)

In [None]:
# Longest transcript (in characters) of each split.
sentece_len = np.max(lengts["len"])
sentece_len_t = np.max(lengts_t["len"])

# VectorizeChar cuts the text to max_len - 2 characters to make room for the "<"
# and ">" markers, so reserve two extra positions; otherwise the longest
# transcripts lose their last characters. int() turns the numpy scalar into a
# plain Python integer before it is handed to the vectorizer and the model.
max_target_len = int(max(sentece_len, sentece_len_t)) + 2

vectorizer = VectorizeChar(max_target_len, vocab_dict)
print("vocab size", len(vectorizer.get_vocabulary()))


In [None]:
# Training batches of 64 examples; validation batches of 8 built from the test split.
ds, val_ds = create_tf_dataset(train_data, bs=64), create_tf_dataset(test_data, bs=8)

## Transformer  - From https://keras.io/examples/audio/transformer_asr/

In [None]:
class TokenEmbedding(layers.Layer):
    """Adds a learned position embedding to a learned token embedding.

    Args:
        num_vocab: number of distinct token ids.
        maxlen: number of positions the position embedding covers.
        num_hid: size of both embeddings.
    """

    def __init__(self, num_vocab=1000, maxlen=100, num_hid=64):
        super().__init__()
        self.emb = keras.layers.Embedding(num_vocab, num_hid)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=num_hid)

    def call(self, x):
        # Positions 0 .. (length of the last axis of x) - 1.
        seq_len = tf.shape(x)[-1]
        token_vectors = self.emb(x)
        position_vectors = self.pos_emb(tf.range(start=0, limit=seq_len, delta=1))
        return token_vectors + position_vectors


class SpeechFeatureEmbedding(layers.Layer):
    """Three stacked stride-2 Conv1D layers applied to the audio features.

    Args:
        num_hid: number of filters of each convolution.
        maxlen: accepted for interface compatibility; not used by this layer.
    """

    def __init__(self, num_hid=64, maxlen=100):
        super().__init__()
        conv_options = dict(strides=2, padding="same", activation="relu")
        self.conv1 = keras.layers.Conv1D(num_hid, 11, **conv_options)
        self.conv2 = keras.layers.Conv1D(num_hid, 11, **conv_options)
        self.conv3 = keras.layers.Conv1D(num_hid, 11, **conv_options)

    def call(self, x):
        for conv in (self.conv1, self.conv2, self.conv3):
            x = conv(x)
        return x

In [None]:
class TransformerEncoder(layers.Layer):
    """Self-attention block followed by a two-layer feed-forward block.

    Both sub-blocks apply dropout, add a residual connection and finish with
    layer normalisation.

    Args:
        embed_dim: key size of the attention and output size of the feed-forward block.
        num_heads: number of attention heads.
        feed_forward_dim: hidden size of the feed-forward block.
        rate: dropout rate used in both sub-blocks.
    """

    def __init__(self, embed_dim, num_heads, feed_forward_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # TODO (original author's note, wording unclear): revisit this activation,
        # possibly trying tanh instead of relu.
        hidden = layers.Dense(feed_forward_dim, activation="relu")
        projection = layers.Dense(embed_dim)
        self.ffn = keras.Sequential([hidden, projection])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        # Attention sub-block with residual connection.
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        attended_norm = self.layernorm1(inputs + attended)
        # Feed-forward sub-block with residual connection.
        transformed = self.dropout2(self.ffn(attended_norm), training=training)
        return self.layernorm2(attended_norm + transformed)

In [None]:
class TransformerDecoder(layers.Layer):
    """Decoder block: causal self-attention, encoder attention, feed-forward.

    Args:
        embed_dim: key size of both attentions and output size of the feed-forward block.
        num_heads: number of attention heads.
        feed_forward_dim: hidden size of the feed-forward block.
        dropout_rate: accepted but not used; the three dropout rates are
            fixed at 0.5, 0.1 and 0.1 below.
    """

    def __init__(self, embed_dim, num_heads, feed_forward_dim, dropout_rate=0.1):
        super().__init__()
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
        self.self_att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.enc_att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.self_dropout = layers.Dropout(0.5)
        self.enc_dropout = layers.Dropout(0.1)
        self.ffn_dropout = layers.Dropout(0.1)
        hidden = layers.Dense(feed_forward_dim, activation="relu")
        projection = layers.Dense(embed_dim)
        self.ffn = keras.Sequential([hidden, projection])

    def causal_attention_mask(self, batch_size, n_dest, n_src, dtype):
        """Build a (batch_size, n_dest, n_src) mask that hides future tokens.

        Entry (i, j) is set when i >= j - n_src + n_dest, i.e. the lower
        triangle counted from the lower right corner, so a position cannot
        receive information from later positions.
        """
        rows = tf.range(n_dest)[:, None]
        cols = tf.range(n_src)
        allowed = rows >= cols - n_src + n_dest
        single_mask = tf.reshape(tf.cast(allowed, dtype), [1, n_dest, n_src])
        # Repeat the single mask once per batch entry.
        repeats = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
        )
        return tf.tile(single_mask, repeats)

    def call(self, enc_out, target):
        target_shape = tf.shape(target)
        n_batch, n_steps = target_shape[0], target_shape[1]
        causal_mask = self.causal_attention_mask(n_batch, n_steps, n_steps, tf.bool)

        # Masked self-attention over the target sequence.
        self_attended = self.self_att(target, target, attention_mask=causal_mask)
        target_norm = self.layernorm1(target + self.self_dropout(self_attended))

        # Attention from the target positions to the encoder output.
        cross_attended = self.enc_att(target_norm, enc_out)
        cross_norm = self.layernorm2(self.enc_dropout(cross_attended) + target_norm)

        # Position-wise feed-forward block.
        transformed = self.ffn(cross_norm)
        return self.layernorm3(cross_norm + self.ffn_dropout(transformed))

In [None]:
class Transformer(keras.Model):
    """Encoder/decoder speech-to-text model with custom train and test steps.

    Args:
        num_hid: hidden size shared by the embeddings and transformer blocks.
        num_head: number of attention heads per block.
        num_feed_forward: hidden size of the feed-forward blocks.
        source_maxlen: passed to the speech feature embedding.
        target_maxlen: number of target positions; also bounds `generate`.
        num_layers_enc: number of encoder blocks.
        num_layers_dec: number of decoder blocks.
        num_classes: vocabulary size (width of the output layer).
    """

    def __init__(
        self,
        num_hid=64,
        num_head=2,
        num_feed_forward=128,
        source_maxlen=100,
        target_maxlen=100,
        num_layers_enc=4,
        num_layers_dec=1,
        num_classes=10,
    ):
        super().__init__()
        self.loss_metric = keras.metrics.Mean(name="loss")
        self.num_layers_enc = num_layers_enc
        self.num_layers_dec = num_layers_dec
        self.target_maxlen = target_maxlen
        self.num_classes = num_classes

        self.enc_input = SpeechFeatureEmbedding(num_hid=num_hid, maxlen=source_maxlen)
        self.dec_input = TokenEmbedding(
            num_vocab=num_classes, maxlen=target_maxlen, num_hid=num_hid
        )

        self.encoder = keras.Sequential(
            [self.enc_input]
            + [
                TransformerEncoder(num_hid, num_head, num_feed_forward)
                for _ in range(num_layers_enc)
            ]
        )

        # Decoder blocks are stored as attributes dec_layer_0, dec_layer_1, ...
        for i in range(num_layers_dec):
            setattr(
                self,
                f"dec_layer_{i}",
                TransformerDecoder(num_hid, num_head, num_feed_forward),
            )

        self.classifier = layers.Dense(num_classes)

    def decode(self, enc_out, target):
        """Embed `target` and run it through every decoder block against `enc_out`."""
        y = self.dec_input(target)
        for i in range(self.num_layers_dec):
            y = getattr(self, f"dec_layer_{i}")(enc_out, y)
        return y

    def call(self, inputs):
        """Return logits for `inputs` = [source features, decoder input tokens]."""
        source = inputs[0]
        target = inputs[1]
        x = self.encoder(source)
        y = self.decode(x, target)
        return self.classifier(y)

    @property
    def metrics(self):
        return [self.loss_metric]

    def _batch_loss(self, batch):
        """Compute the masked loss for one batch with "source" and "target" keys.

        The decoder sees the target without its last token and is scored
        against the target without its first token.
        """
        source = batch["source"]
        target = batch["target"]
        dec_input = target[:, :-1]
        dec_target = target[:, 1:]
        preds = self([source, dec_input])
        one_hot = tf.one_hot(dec_target, depth=self.num_classes)
        # Positions whose target index is 0 get no weight in the loss.
        # NOTE(review): VectorizeChar pads with the index of " ", so this only masks
        # the padding if " " has index 0 in the vocabulary — confirm.
        mask = tf.math.logical_not(tf.math.equal(dec_target, 0))
        # Use this instance, not the notebook-level `model` variable: the global only
        # works while it happens to refer to this very object.
        return self.compute_loss(None, one_hot, preds, sample_weight=mask)

    def train_step(self, batch):
        """Processes one batch inside model.fit()."""
        with tf.GradientTape() as tape:
            loss = self._batch_loss(batch)
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        self.loss_metric.update_state(loss)
        return {"loss": self.loss_metric.result()}

    def test_step(self, batch):
        """Processes one batch during evaluation; same loss as train_step, no update."""
        loss = self._batch_loss(batch)
        self.loss_metric.update_state(loss)
        return {"loss": self.loss_metric.result()}

    def generate(self, source, target_start_token_idx):
        """Performs inference over one batch of inputs using greedy decoding.

        Returns the generated token ids, starting with the start token, with
        `target_maxlen` entries per input.
        """
        bs = tf.shape(source)[0]
        enc = self.encoder(source)
        dec_input = tf.ones((bs, 1), dtype=tf.int32) * target_start_token_idx
        for _ in range(self.target_maxlen - 1):
            dec_out = self.decode(enc, dec_input)
            logits = self.classifier(dec_out)
            token_ids = tf.argmax(logits, axis=-1, output_type=tf.int32)
            # Only the prediction for the last position is new; append it and repeat.
            next_token = tf.expand_dims(token_ids[:, -1], axis=-1)
            dec_input = tf.concat([dec_input, next_token], axis=-1)
        return dec_input

## Training

In [None]:
class DisplayOutputs(keras.callbacks.Callback):
    """Prints target and predicted text for one fixed batch every fifth epoch."""

    def __init__(
        self, batch, idx_to_token, target_start_token_idx=34, target_end_token_idx=35
    ):
        """Displays a batch of outputs after every epoch

        Args:
            batch: A test batch containing the keys "source" and "target"
            idx_to_token: A mapping from vocabulary index to its token
            target_start_token_idx: A start token index in the target vocabulary
            target_end_token_idx: An end token index in the target vocabulary
        """
        # Initialise the base Callback so its own attributes exist before Keras
        # attaches the model.
        super().__init__()
        self.batch = batch
        self.target_start_token_idx = target_start_token_idx
        self.target_end_token_idx = target_end_token_idx
        self.idx_to_char = idx_to_token

    def on_epoch_end(self, epoch, logs=None):
        # Only report on epochs 0, 5, 10, ...
        if epoch % 5 != 0:
            return
        source = self.batch["source"]
        target = self.batch["target"].numpy()
        bs = tf.shape(source)[0]
        preds = self.model.generate(source, self.target_start_token_idx)
        preds = preds.numpy()
        print("")
        for i in range(bs):
            target_text = "".join([self.idx_to_char[token] for token in target[i, :]])
            prediction = ""
            for idx in preds[i, :]:
                prediction += self.idx_to_char[idx]
                # Stop after the end token has been appended.
                if idx == self.target_end_token_idx:
                    break
            print(f"target:     {target_text.replace('-','')}")
            print(f"prediction: {prediction}\n")

In [None]:
class CustomSchedule(keras.optimizers.schedules.LearningRateSchedule):
    '''
    Learning-rate schedule with a linear warm-up followed by a linear decay.

    Args:
        init_lr: learning rate at epoch 0.
        lr_after_warmup: learning rate reached at the end of the warm-up.
        final_lr: lower bound of the learning rate during the decay.
        warmup_epochs: number of warm-up epochs (a value of 1 divides by zero
            in calculate_lr).
        decay_epochs: number of epochs over which the rate decays.
        steps_per_epoch: optimizer steps per epoch, used to turn a step into an epoch.
    '''
    def __init__(
        self,
        init_lr=0.00001,
        lr_after_warmup=0.001,
        final_lr=0.00001,
        warmup_epochs=15,
        decay_epochs=85,
        steps_per_epoch=203,
    ):
        super().__init__()
        self.init_lr = init_lr
        self.lr_after_warmup = lr_after_warmup
        self.final_lr = final_lr
        self.warmup_epochs = warmup_epochs
        self.decay_epochs = decay_epochs
        self.steps_per_epoch = steps_per_epoch

    def calculate_lr(self, epoch):
        """linear warm up - linear decay"""
        warmup_lr = (
            self.init_lr
            + ((self.lr_after_warmup - self.init_lr) / (self.warmup_epochs - 1)) * epoch
        )
        decay_lr = tf.math.maximum(
            self.final_lr,
            self.lr_after_warmup
            - (epoch - self.warmup_epochs)
            * (self.lr_after_warmup - self.final_lr)
            / self.decay_epochs,
        )
        # The smaller of the two lines gives the rising, then falling, shape.
        return tf.math.minimum(warmup_lr, decay_lr)

    def __call__(self, step):
        epoch = step // self.steps_per_epoch
        epoch = tf.cast(epoch, "float32")
        return self.calculate_lr(epoch)


    def get_config(self):
        '''
        Return the constructor arguments so the schedule can be serialised.

        The entries were previously not separated by commas, which is a syntax
        error and prevented this class from being defined at all.
        '''
        config = {
            "init_lr": self.init_lr,
            "lr_after_warmup": self.lr_after_warmup,
            "final_lr": self.final_lr,
            "warmup_epochs": self.warmup_epochs,
            "decay_epochs": self.decay_epochs,
            "steps_per_epoch": self.steps_per_epoch,
        }
        return config

In [None]:
# One fixed validation batch that the display callback decodes during training.
batch = next(iter(val_ds))

# The vocabulary to convert predicted indices into characters
idx_to_char = vectorizer.get_vocabulary()
# Look the "<" and ">" markers up by character, the same way VectorizeChar encodes
# them (position among the vocabulary values), instead of assuming they are the
# last two entries of the vocabulary.
_vocab_chars = list(idx_to_char.values())
display_cb = DisplayOutputs(
    batch,
    idx_to_char,
    target_start_token_idx=_vocab_chars.index("<"),
    target_end_token_idx=_vocab_chars.index(">"),
)

model = Transformer(
    num_hid=200,
    num_head=2,
    num_feed_forward=400,
    target_maxlen=max_target_len,
    num_layers_enc=4,
    num_layers_dec=1,
    num_classes=len(vectorizer.get_vocabulary()),
)
loss_fn = keras.losses.CategoricalCrossentropy(
    from_logits=True
)

learning_rate = CustomSchedule(
    init_lr=0.00001,
    lr_after_warmup=0.001,
    final_lr=0.00001,
    warmup_epochs=45,
    decay_epochs=125,
    steps_per_epoch=len(ds),
)
optimizer = keras.optimizers.Adam(learning_rate)
model.compile(optimizer=optimizer, loss=loss_fn)

# A single epoch for now, only to test saving the model afterwards.
history = model.fit(ds, validation_data=val_ds, shuffle=True, callbacks=[display_cb], epochs=1)

## Save model and various

In [None]:
# Save the trained model under <current working directory>/saved_models/model.
# NOTE(review): the path has no file extension; some Keras versions require
# ".keras" or ".h5" for model.save — confirm against the installed version.
export_path = os.path.join(os.getcwd(),'saved_models','model')
model.save(export_path)

In [None]:
def getfoldersize(path):
  """Return the combined size of all files below `path` in GiB, rounded to 3 decimals.

  The directory tree is walked recursively; symbolic links are skipped.
  """
  byte_count = sum(
      os.path.getsize(file_path)
      for folder, _subdirs, names in os.walk(path)
      for file_path in (os.path.join(folder, name) for name in names)
      if not os.path.islink(file_path)
  )
  return round(byte_count / (1024 ** 3), 3)

In [None]:
import os
# Size of the "data_rixvox" folder in the current working directory.
old_model_size = getfoldersize(os.path.join(os.getcwd(), 'data_rixvox'))
#new_model_size = getfoldersize(os.path.join(os.getcwd(),'saved_tflite_models'))

print(f"Old GB: {old_model_size}")
#print("New GB: " + str(new_model_size))

## Compression