In [6]:
import os
import random
from glob import glob
from tqdm import tqdm
from multiprocessing import Pool

import librosa

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


print("TensorFlow:", tf.__version__)
# Set seed for experiment reproducibility
seed = 777
tf.random.set_seed(seed)

TensorFlow: 2.10.0


In [7]:
max_target_len=50

In [8]:
class TokenEmbedding(layers.Layer):
    def __init__(self, num_vocab=1000, maxlen=100, num_hid=64):
        super().__init__()
        self.emb = tf.keras.layers.Embedding(num_vocab, num_hid)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=num_hid,trainable=False)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        x = self.emb(x)
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        return x + positions


class SpeechFeatureEmbedding(layers.Layer):
    def __init__(self, num_hid=64, maxlen=100):
        super().__init__()
        self.conv1 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=2, padding="same", activation="relu"
        )
        self.conv2 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=2, padding="same", activation="relu"
        )
        self.conv3 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=2, padding="same", activation="relu"
        )
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=num_hid,trainable=False)

    def call(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        return self.conv3(x)


class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, num_heads, feed_forward_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(feed_forward_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, num_heads, feed_forward_dim, dropout_rate=0.1):
        super().__init__()
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
        self.self_att = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.enc_att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.self_dropout = layers.Dropout(0.5)
        self.enc_dropout = layers.Dropout(0.1)
        self.ffn_dropout = layers.Dropout(0.1)
        self.ffn = keras.Sequential(
            [
                layers.Dense(feed_forward_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )


In [10]:
model = Transformer(
    num_hid=128,
    num_head=2,
    num_feed_forward=256,
    target_maxlen=max_target_len,
    num_layers_enc=2,
    num_layers_dec=1,
    #num_classes=len(vectorizer.get_vocabulary()) # 75 # 48 # 67 # 108,
)


In [13]:
maxlen = 100
num_hid = 64

# Create an instance of the TokenEmbedding layer
token_embedding_layer = TokenEmbedding(num_vocab=1000, maxlen=maxlen, num_hid=num_hid)

# Create an instance of the SpeechFeatureEmbedding layer
speech_feature_embedding_layer = SpeechFeatureEmbedding(num_hid=num_hid, maxlen=maxlen)

# Create an instance of the TransformerEncoder layer
transformer_encoder_layer = TransformerEncoder(embed_dim=num_hid, num_heads=2, feed_forward_dim=256, rate=0.1)

# Create an instance of the TransformerDecoder layer
transformer_decoder_layer = TransformerDecoder(embed_dim=num_hid, num_heads=2, feed_forward_dim=256, dropout_rate=0.1)

# Define a sample input tensor
input_tensor = tf.random.uniform((1, maxlen, 1), dtype=tf.float32)  # Batch size of 1, 1 channel

# Pass the input tensor through each layer to calculate the output shapes
token_embeddings = token_embedding_layer(input_tensor)
speech_embeddings = speech_feature_embedding_layer(input_tensor)
encoder_output = transformer_encoder_layer(speech_embeddings, training=True)
decoder_output = transformer_decoder_layer(encoder_output, training=True)

# Print the shapes of the intermediate outputs
print("Token Embeddings Shape:", token_embeddings.shape)
print("Speech Feature Embeddings Shape:", speech_embeddings.shape)
print("Encoder Output Shape:", encoder_output.shape)
print("Decoder Output Shape:", decoder_output.shape)

Token Embeddings Shape: (1, 100, 1, 64)
Speech Feature Embeddings Shape: (1, 13, 64)
Encoder Output Shape: (1, 13, 64)
Decoder Output Shape: (1, 13, 64)


In [None]:
import json

# Your list of values (replace with your actual list)
wer_values = [1,0.92665667,0.83615385,0.76671429,0.74333333,0.6775,
 0.64705882,0.61711111,0.57724737,0.54,0.53380952,0.59,
 0.47517087,0.46833633,0.4354,0.42507692,0.41740741,0.39285714,
 0.38531034,0.36866667,0.36483871,0.35375,0.37363333,0.33362941,
 0.31428571, 0.30755556,0.3172973,0.29347368,0.28705128,0.2737,
 0.26527268,0.26090476 ,0.25681395,0.25105,0.24644744,0.23413343,
 0.23904255,0.22716667,0.2274798,0.21967,0.21568627,0.21153846,
 0.21754617,0.2033077,0.2008,0.19742557,0.19398746,0.18767517,
 0.18548068,0.18373383,0.17952787,0.17711735,0.17160317,0.174385,
 0.16823087,0.16566867,0.1631761,0.16078471,0.16082029,0.15684286,
 0.15392758,0.15477278,0.15088473,0.14764365,0.14766867,0.14533284,
 0.14382624,0.14143564,0.13174311,0.13686,0.13280357,0.13374534,
 0.13313012,0.13995238,0.12881176,0.12810658,0.12593618,0.124905,
 0.12339451,0.12182712,0.12117412,0.12917522,0.11628957,0.11472328,
 0.11538647,0.1159833,0.11335206,0.1127949,0.11371311,0.110356,
 0.10941089,0.10824314,0.10639512,0.10746423,0.0928619,0.10378158,
 0.10230374,0.10025585,0.10091843,0.0942]
# Define the file name for the JSON file
json_file_name = 'wer_values.json'

# Save the list to a JSON file
with open(json_file_name, 'w') as json_file:
    json.dump(wer_values, json_file)

print(f'WER values saved to {json_file_name}')
