🔧 Segment 1: Imports and Basic Configuration


In [1]:
!pip install lxml




In [2]:
# !pip install tensorflow

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, LayerNormalization
from tensorflow.keras.models import Model
import numpy as np

# Parameters
AUDIO_SEQ_LEN = 2000   # from your MFCC/audio features
AUDIO_FEATURE_DIM = 64
MAX_TARGET_LEN = 100    # token sequence length of XML/tab
VOCAB_SIZE = 500        # depends on tokenizer used on target text
EMBED_DIM = 256         # transformer model dimensionality
NUM_HEADS = 4
FF_DIM = 512
NUM_LAYERS = 4
DROPOUT_RATE = 0.1


🧠 Segment 2: Transformer Encoder & Decoder Blocks


In [4]:
class TransformerEncoderBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim),
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, x, training):
        attn_output = self.att(x, x)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class TransformerDecoderBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att1 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.att2 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)

        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim),
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.layernorm3 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)
        self.dropout3 = Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask=None, padding_mask=None):
        attn1 = self.att1(x, x, attention_mask=look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(x + attn1)

        attn2 = self.att2(out1, enc_output)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(out1 + attn2)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        return self.layernorm3(out2 + ffn_output)


📏 Segment 3: Positional Encoding + Token Embedding


In [5]:
import tensorflow as tf
import numpy as np

def get_positional_encoding(seq_len, d_model):
    pos = np.arange(seq_len)[:, np.newaxis]
    i = np.arange(d_model)[np.newaxis, :]
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    angle_rads = pos * angle_rates

    # Apply sin to even indices, cos to odd indices
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]
    return tf.cast(pos_encoding, dtype=tf.float32)

class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, max_len, d_model):
        super().__init__()
        self.token_embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.pos_encoding = get_positional_encoding(max_len, d_model)

    def call(self, x):
        length = tf.shape(x)[1]
        x = self.token_embedding(x)
        return x + self.pos_encoding[:, :length, :]


🏗️ Segment 4: Full Transformer Model Assembly


In [6]:
def build_transformer_model(
    audio_seq_len=2000,
    audio_feat_dim=64,
    max_target_len=100,
    vocab_size=500,
    embed_dim=256,
    ff_dim=512,
    num_heads=4,
    num_layers=4,
    dropout_rate=0.1,
):
    # ==== AUDIO ENCODER ====
    encoder_input = Input(shape=(audio_seq_len, audio_feat_dim), name="audio_input")
    x = Dense(embed_dim)(encoder_input)  # Project audio features to model dimension
    x += get_positional_encoding(audio_seq_len, embed_dim)[:, :audio_seq_len, :]

    for _ in range(num_layers):
        x = TransformerEncoderBlock(embed_dim, num_heads, ff_dim, dropout_rate)(x, training=True)
    encoder_output = x

    # ==== TEXT DECODER ====
    decoder_input = Input(shape=(max_target_len,), name="decoder_input")  # token ids
    y = PositionalEmbedding(vocab_size, max_target_len, embed_dim)(decoder_input)

    for _ in range(num_layers):
        y = TransformerDecoderBlock(embed_dim, num_heads, ff_dim, dropout_rate)(
            y, encoder_output, training=True
        )

    # Output logits over vocabulary
    decoder_output = Dense(vocab_size, activation='softmax')(y)

    model = Model(inputs=[encoder_input, decoder_input], outputs=decoder_output)
    return model


✅ Compile the Model

In [7]:
transformer = build_transformer_model()

transformer.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)


In [8]:
transformer.summary()

🧪 Segment 5: Training Loop with Teacher Forcing


```
# This is formatted as code
```



🔄 Step 1: Prepare Training Data


In [9]:
def prepare_decoder_inputs_outputs(tokenized_targets, start_token_id):
    """
    Takes target sequences (e.g., [34, 76, 12, 9, ...]) and returns:
    decoder_input: [<start>, 34, 76, 12, ...]
    decoder_output: [34, 76, 12, 9, ...]
    """
    decoder_inputs = []
    decoder_outputs = []

    for seq in tokenized_targets:
        decoder_inputs.append([start_token_id] + seq[:-1])
        decoder_outputs.append(seq)

    return np.array(decoder_inputs), np.array(decoder_outputs)


🧾 🔠 Segment 5.1: Tokenize XML or Tablature Targets

🧾 Step 1: Tokenize Output Text (XML or Tablature)


In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Assume you have a list of string targets (e.g., XML or tablature format)
# Example:
text_targets = [
    "<note><onsetSec>6.605</onsetSec></note>",
    "<note><onsetSec>7.230</onsetSec></note>",
    "E|--0--2--3--|",
    # etc.
]

# Use character-level tokenizer (or switch to word-level if preferred)
tokenizer = Tokenizer(char_level=True, filters='', lower=False)
tokenizer.fit_on_texts(text_targets)

# Convert each string to a list of token IDs
Y_token = tokenizer.texts_to_sequences(text_targets)

# Set a maximum length for output sequences
MAX_TARGET_LEN = 100
Y_token = pad_sequences(Y_token, maxlen=MAX_TARGET_LEN, padding='post')

# Update vocabulary size
VOCAB_SIZE = len(tokenizer.word_index) + 1
start_token_id = tokenizer.word_index.get('<', 1)  # use `<` as start token


🔁 Step 2: Prepare Decoder Input and Target Output


In [11]:
def prepare_decoder_inputs_outputs(tokenized_targets, start_token_id):
    decoder_inputs = []
    decoder_outputs = []

    for seq in tokenized_targets:
        decoder_input = [start_token_id] + list(seq[:-1])  # shift right
        decoder_output = list(seq)
        decoder_inputs.append(decoder_input)
        decoder_outputs.append(decoder_output)

    return np.array(decoder_inputs), np.array(decoder_outputs)

# Apply function
decoder_input_data, decoder_output_data = prepare_decoder_inputs_outputs(Y_token, start_token_id)

# decoder_output needs to be 3D for sparse_categorical_crossentropy
decoder_output_data = decoder_output_data[..., np.newaxis]


# Prepare The Audio Files


In [12]:
import os
import librosa
import numpy as np

# Directory containing your .wav files
audio_dir = "/content/drive/MyDrive/dataset2/audio"  # change as needed
files = [f for f in os.listdir(audio_dir) if f.endswith(".wav")]

# Parameters
MAX_AUDIO_LEN = 2000  # pad/truncate to 2000 time steps
N_MFCC = 64           # number of MFCCs to extract

def extract_mfcc(file_path, max_len=MAX_AUDIO_LEN, n_mfcc=N_MFCC):
    y, sr = librosa.load(file_path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    mfcc = mfcc.T  # shape: (time, 64)

    if mfcc.shape[0] < max_len:
        # Pad with zeros if shorter
        pad_width = max_len - mfcc.shape[0]
        mfcc = np.pad(mfcc, ((0, pad_width), (0, 0)), mode='constant')
    else:
        # Truncate if longer
        mfcc = mfcc[:max_len, :]

    return mfcc

# Load all MFCCs
X_audio = []
for fname in files:
    path = os.path.join(audio_dir, fname)
    mfcc = extract_mfcc(path)
    X_audio.append(mfcc)

X_audio = np.array(X_audio)  # final shape: (N, 2000, 64)


In [13]:
print(X_audio[0])

[[-6.8336926e+02  4.3504944e+01  3.3611908e+01 ... -4.6937406e-01
   6.4101839e-01  9.3179834e-01]
 [-6.7360248e+02  5.5079887e+01  4.1050835e+01 ... -6.6569102e-01
   2.8163908e+00  4.1774211e+00]
 [-6.6940997e+02  6.0271637e+01  4.5435455e+01 ...  2.8687230e-01
   3.8351529e+00  4.8378811e+00]
 ...
 [ 0.0000000e+00  0.0000000e+00  0.0000000e+00 ...  0.0000000e+00
   0.0000000e+00  0.0000000e+00]
 [ 0.0000000e+00  0.0000000e+00  0.0000000e+00 ...  0.0000000e+00
   0.0000000e+00  0.0000000e+00]
 [ 0.0000000e+00  0.0000000e+00  0.0000000e+00 ...  0.0000000e+00
   0.0000000e+00  0.0000000e+00]]


In [14]:
####

In [15]:
import os
import librosa
import numpy as np
from bs4 import BeautifulSoup

# Paths to folders
audio_dir = "/content/drive/MyDrive/dataset2/audio"
xml_dir   = "/content/drive/MyDrive/dataset2/annotation"

X_audio = []
text_targets = []

# Step 1: Create a set of XML base names
xml_files = {os.path.splitext(f)[0] for f in os.listdir(xml_dir) if f.endswith(".xml")}

# Step 2: Iterate audio files and only include those with matching XML
for audio_file in os.listdir(audio_dir):
    if audio_file.endswith(".wav"):
        base_name = os.path.splitext(audio_file)[0]

        if base_name in xml_files:
            audio_path = os.path.join(audio_dir, audio_file)
            xml_path   = os.path.join(xml_dir, base_name + ".xml")

            # --- Process audio ---
            y, sr = librosa.load(audio_path, sr=None)
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=64).T
            if mfcc.shape[0] < 2000:
                mfcc = np.pad(mfcc, ((0, 2000 - mfcc.shape[0]), (0, 0)), mode='constant')
            else:
                mfcc = mfcc[:2000, :]
            X_audio.append(mfcc)

            # --- Process XML ---
            with open(xml_path, "r", encoding="utf-8") as f:
                soup = BeautifulSoup(f, "xml")
                text_targets.append(str(soup))

# Convert to array
X_audio = np.array(X_audio)

# Final sanity check
print("✅ Matching files loaded:")
print(f" - Audio files: {len(X_audio)}")
print(f" - XML targets: {len(text_targets)}")


✅ Matching files loaded:
 - Audio files: 657
 - XML targets: 657


In [16]:
####

In [17]:
len(X_audio)

657

In [18]:
from transformers import AutoTokenizer

# Load pre-trained tokenizer (can use T5, BERT, GPT2, etc.)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Optional: Clean up any empty or malformed XML
text_targets = [txt.strip() for txt in text_targets if txt and len(txt.strip()) > 0]

# Critical: Match count of inputs and outputs
assert len(text_targets) == len(X_audio), f"Mismatch: {len(text_targets)} targets vs {len(X_audio)} audio"

# Tokenize all XMLs into input_ids
Y_token = tokenizer(
    text_targets,
    return_tensors="np",      # for NumPy (since X_audio is a NumPy array)
    padding="max_length",
    truncation=True,
    max_length=512            # or whatever your Transformer model expects
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

✅ Step 1: Parse XML Files


In [19]:
import os
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET

# Directory with XML files
xml_dir = "/content/drive/MyDrive/dataset2/annotation"

# Get all XML file paths
xml_files = [f for f in os.listdir(xml_dir) if f.endswith(".xml")]

# # Extract XML content as raw strings



# text_targets = []

# for xml_file in xml_files:
#     file_path = os.path.join(xml_dir, xml_file)

#     with open(file_path, "r", encoding="utf-8") as file:
#       xml_content = file.read()
#       print(xml_content)






        # soup = BeautifulSoup(file, "xml")

        # # Option 1: Keep full XML as text
        # text_targets.append(str(soup))

        # Option 2: If you only want specific tags like onsetSec, pitch, etc.:
        # notes = soup.find_all("note")
        # extracted = ""
        # for note in notes:
        #     extracted += str(note)  # or customize formatting
        # text_targets.append(extracted)
  # with open(file_path, 'r') as file:
          #  xml_content = file.read()

## Supplementry Code

In [20]:
import os
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET

# Directory with XML files
xml_dir = "/content/drive/MyDrive/dataset2/annotation"

# Get all XML file paths
xml_files = [f for f in os.listdir(xml_dir) if f.endswith(".xml")]

# Extract XML content as raw strings
text_targets = []

for xml_file in xml_files:
    file_path = os.path.join(xml_dir, xml_file)

    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "xml")

        # Option 1: Keep full XML as text
        text_targets.append(str(soup))

        # Option 2: If you only want specific tags like onsetSec, pitch, etc.:
        # notes = soup.find_all("note")
        # extracted = ""
        # for note in notes:
        #     extracted += str(note)  # or customize formatting
        # text_targets.append(extracted)
  # with open(file_path, 'r') as file:
          #  xml_content = file.read()

In [21]:
# List of XML strings, 1 per file
print(text_targets[20][:600])  # show a snippet of the first


<?xml version="1.0" encoding="utf-8"?>
<instrumentRecording>
<globalParameter>
<audioFileName>G53-51111-1111-00012.wav</audioFileName>
<instrument>EGUI</instrument>
<recordingDate>07.01.2013</recordingDate>
</globalParameter>
<transcription>
<event>
<pitch>51</pitch>
<onsetSec>0.2</onsetSec>
<offsetSec>2.5</offsetSec>
<fretNumber>11</fretNumber>
<stringNumber>1</stringNumber>
<excitationStyle>PK</excitationStyle>
<expressionStyle>NO</expressionStyle>
</event>
</transcription>
</instrumentRecording>


✅ Step 2: Match with Audio


In [22]:
import os
import librosa
import numpy as np
from bs4 import BeautifulSoup

# Set directories
audio_dir = "/content/drive/MyDrive/dataset2/audio"
xml_dir   = "/content/drive/MyDrive/dataset2/annotation"

# Storage lists
X_audio = []
text_targets = []

# Build a dict of XML files by basename
xml_map = {
    os.path.splitext(f)[0]: os.path.join(xml_dir, f)
    for f in os.listdir(xml_dir)
    if f.lower().endswith(".xml")
}

# Loop through audio files and find matching XML
for audio_file in os.listdir(audio_dir):
    if audio_file.lower().endswith(".wav"):
        base_name = os.path.splitext(audio_file)[0]

        # Check if XML with same basename exists
        if base_name in xml_map:
            audio_path = os.path.join(audio_dir, audio_file)
            xml_path   = xml_map[base_name]

            # -- Process Audio --
            y, sr = librosa.load(audio_path, sr=None)
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=64).T
            if mfcc.shape[0] < 2000:
                mfcc = np.pad(mfcc, ((0, 2000 - mfcc.shape[0]), (0, 0)), mode='constant')
            else:
                mfcc = mfcc[:2000, :]
            X_audio.append(mfcc)

            # -- Process XML --
            with open(xml_path, "r", encoding="utf-8") as f:
                soup = BeautifulSoup(f, "xml")
                text_targets.append(str(soup))

# Convert to numpy
X_audio = np.array(X_audio)

# Check final stats
print("✅ Loaded:")
print(f"  - Audio files: {len(X_audio)}")
print(f"  - XML targets: {len(text_targets)}")
print(f"  - Shape of X_audio: {X_audio.shape}")


✅ Loaded:
  - Audio files: 657
  - XML targets: 657
  - Shape of X_audio: (657, 2000, 64)


In [23]:
# text_targets
# X_audio

🚀 Step 4: Train the Transformer Model


In [24]:
# print(os.listdir(xml_dir))


In [25]:
count = 0
for f in os.listdir(audio_dir):
  count +=1
print(count)


667


In [26]:
count =0
for xml in os.listdir(xml_dir):
  count+=1

print(count)


667


🚀 Step 4: Train the Transformer Model


In [27]:
# Step 1: Tokenize output (already discussed earlier)
Y_token = tokenizer(
    text_targets,
    return_tensors="np",
    padding="max_length",
    truncation=True,
    max_length=MAX_TARGET_LEN
)

# Step 2: Prepare decoder inputs and outputs
decoder_input_data = Y_token["input_ids"][:, :-1]       # shift left for input
decoder_output_data = Y_token["input_ids"][:, 1:]       # shift right for target

# Optional: pad so both have same max length
decoder_input_data = np.pad(decoder_input_data, ((0,0), (0,1)), constant_values=0)
decoder_output_data = np.pad(decoder_output_data, ((0,0), (0,1)), constant_values=0)

# Step 3: Build model (your function is good)
transformer = build_transformer_model(
    audio_seq_len=2000,
    audio_feat_dim=64,
    max_target_len=MAX_TARGET_LEN,
    vocab_size=VOCAB_SIZE,
    embed_dim= 128,
    ff_dim= 256 ,
    num_heads= 2 ,
    num_layers= 2 ,
    dropout_rate=0.1
)

# Step 4: Compile
transformer.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

# Step 5: Train
transformer.fit(
    [X_audio, decoder_input_data],       # 2-input format
    decoder_output_data,
    batch_size= 4 ,
    epochs=20,
    validation_split=0.1
)


Epoch 1/20
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 259ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan
Epoch 2/20
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 52ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan
Epoch 3/20
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 54ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan
Epoch 4/20
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 53ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan
Epoch 5/20
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 51ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan
Epoch 6/20
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 52ms/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan
Epoch 

<keras.src.callbacks.history.History at 0x7cfae010de10>

✅ 3. Check for any NaNs in your data:


In [32]:
np.isnan(X_audio).any()
# np.isnan(decoder_input_data).any()
# np.isnan(decoder_output_data).any()


False

In [33]:
print(tokenizer.decode(decoder_output_data[0]))


<? xml version = " 1. 0 " encoding = " utf - 8 "? > < instrumentrecording > < globalparameter > < audiofilename > g53 - 44104 - 1111 - 00005. wav < / audiofilename > < instrument > egui < / instrument > < recordingdate > 07. 01. 2013 < / recordingdate > < / globalparameter > < transcription > < event > < pitch > [SEP] [PAD]


In [28]:
## Testing Time

In [36]:
def preprocess_test_audio(path, max_len=2000):
    y, sr = librosa.load(path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=64).T

    if mfcc.shape[0] < max_len:
        mfcc = np.pad(mfcc, ((0, max_len - mfcc.shape[0]), (0, 0)), mode='constant')
    else:
        mfcc = mfcc[:max_len, :]

    return np.expand_dims(mfcc, axis=0)  # shape: (1, max_len, 64)

# Example:
test_audio_path = "/content/drive/MyDrive/dataset2/audio/1-E1-Major 00 (1).wav"
test_audio = preprocess_test_audio(test_audio_path)


In [37]:
def greedy_decode(transformer, audio_input, tokenizer, max_len=256, start_token_id=101, end_token_id=102):
    decoded_ids = [start_token_id]

    for _ in range(max_len):
        decoder_input = np.array(decoded_ids)[None, :]  # (1, seq_len)
        preds = transformer.predict([audio_input, decoder_input], verbose=0)

        next_id = np.argmax(preds[0, -1, :])  # Take last time-step's highest prob token

        if next_id == end_token_id:
            break

        decoded_ids.append(next_id)

    return decoded_ids


In [38]:
# Define tokenizer special token IDs
start_token = tokenizer.cls_token_id or 101
end_token   = tokenizer.sep_token_id or 102

# Run decoding
output_token_ids = greedy_decode(transformer, test_audio, tokenizer, max_len=256,
                                 start_token_id=start_token, end_token_id=end_token)

# Convert back to XML string
predicted_text = tokenizer.decode(output_token_ids, skip_special_tokens=True)

# Display
print("🔍 Predicted Output:\n", predicted_text)


InvalidArgumentError: Graph execution error:

Detected at node functional_13_1/positional_embedding_1_1/add defined at (most recent call last):
<stack traces unavailable>
Incompatible shapes: [1,101,128] vs. [1,100,128]
	 [[{{node functional_13_1/positional_embedding_1_1/add}}]]
	tf2xla conversion failed while converting __inference_one_step_on_data_71655[]. Run with TF_DUMP_GRAPH_PREFIX=/path/to/dump/dir and --vmodule=xla_compiler=2 to obtain a dump of the compiled functions.
	 [[StatefulPartitionedCall]] [Op:__inference_one_step_on_data_distributed_71840]

In [40]:
## Error Here


In [39]:
def get_positional_encoding(seq_len, d_model):
    pos = np.arange(seq_len)[:, np.newaxis]
    i = np.arange(d_model)[np.newaxis, :]
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    angle_rads = pos * angle_rates

    sines = np.sin(angle_rads[:, 0::2])
    cosines = np.cos(angle_rads[:, 1::2])
    pos_encoding = np.concatenate([sines, cosines], axis=-1)
    pos_encoding = pos_encoding[np.newaxis, ...]  # shape (1, seq_len, d_model)

    return tf.cast(pos_encoding, dtype=tf.float32)


In [41]:
class DynamicPositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim

    def call(self, x):
        seq_len = tf.shape(x)[1]
        pos_encoding = get_positional_encoding(seq_len, self.embed_dim)
        return x + pos_encoding


In [42]:
from tensorflow.keras.layers import Input, Embedding, Dense, LayerNormalization, Dropout, MultiHeadAttention, Add
from tensorflow.keras.models import Model

# Define Transformer block
def transformer_block(inputs, embed_dim, num_heads, ff_dim, dropout_rate=0.1):
    attention_output = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(inputs, inputs)
    attention_output = Dropout(dropout_rate)(attention_output)
    out1 = LayerNormalization(epsilon=1e-6)(inputs + attention_output)

    ffn_output = Dense(ff_dim, activation='relu')(out1)
    ffn_output = Dense(embed_dim)(ffn_output)
    ffn_output = Dropout(dropout_rate)(ffn_output)
    return LayerNormalization(epsilon=1e-6)(out1 + ffn_output)

# Full Transformer model
def build_transformer_model(audio_seq_len, audio_feat_dim, max_target_len,
                            vocab_size, embed_dim, ff_dim, num_heads, num_layers, dropout_rate=0.1):
    # Encoder
    encoder_input = Input(shape=(audio_seq_len, audio_feat_dim), name="encoder_input")
    x_enc = Dense(embed_dim)(encoder_input)
    for _ in range(num_layers):
        x_enc = transformer_block(x_enc, embed_dim, num_heads, ff_dim, dropout_rate)

    # Decoder
    decoder_input = Input(shape=(None,), name="decoder_input")
    x_dec = Embedding(input_dim=vocab_size, output_dim=embed_dim)(decoder_input)
    x_dec = DynamicPositionalEmbedding(embed_dim)(x_dec)

    for _ in range(num_layers):
        attention_output = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(
            query=x_dec, value=x_enc, key=x_enc)
        attention_output = Dropout(dropout_rate)(attention_output)
        x_dec = LayerNormalization(epsilon=1e-6)(x_dec + attention_output)

        ffn_output = Dense(ff_dim, activation='relu')(x_dec)
        ffn_output = Dense(embed_dim)(ffn_output)
        ffn_output = Dropout(dropout_rate)(ffn_output)
        x_dec = LayerNormalization(epsilon=1e-6)(x_dec + ffn_output)

    output = Dense(vocab_size, activation="softmax")(x_dec)
    model = Model(inputs=[encoder_input, decoder_input], outputs=output)
    return model


In [43]:
import librosa
import numpy as np

def preprocess_test_audio(path, max_len=2000):
    y, sr = librosa.load(path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=64).T

    if mfcc.shape[0] < max_len:
        mfcc = np.pad(mfcc, ((0, max_len - mfcc.shape[0]), (0, 0)), mode='constant')
    else:
        mfcc = mfcc[:max_len, :]

    return np.expand_dims(mfcc, axis=0)  # shape: (1, 2000, 64)

# Example:
test_audio_path = "/content/drive/MyDrive/dataset2/audio/1-E1-Major 00 (1).wav"
test_audio = preprocess_test_audio(test_audio_path)


In [44]:
def greedy_decode(transformer, audio_input, tokenizer, max_len=256, start_token_id=101, end_token_id=102):
    decoded_ids = [start_token_id]

    for _ in range(max_len):
        decoder_input = np.array(decoded_ids)[None, :]  # shape: (1, seq_len)
        preds = transformer.predict([audio_input, decoder_input], verbose=0)

        next_id = np.argmax(preds[0, -1, :])  # Pick highest prob token from last time step

        if next_id == end_token_id:
            break

        decoded_ids.append(next_id)

    return decoded_ids


In [45]:
# Define tokenizer special token IDs
start_token = tokenizer.cls_token_id or 101
end_token   = tokenizer.sep_token_id or 102

# Run decoding
output_token_ids = greedy_decode(
    transformer,
    test_audio,
    tokenizer,
    max_len=256,
    start_token_id=start_token,
    end_token_id=end_token
)

# Convert token IDs back to string
predicted_text = tokenizer.decode(output_token_ids, skip_special_tokens=True)

# Display output
print("🔍 Predicted Output:\n")
print(predicted_text)


InvalidArgumentError: Graph execution error:

Detected at node functional_13_1/positional_embedding_1_1/add defined at (most recent call last):
<stack traces unavailable>
Incompatible shapes: [1,101,128] vs. [1,100,128]
	 [[{{node functional_13_1/positional_embedding_1_1/add}}]]
	tf2xla conversion failed while converting __inference_one_step_on_data_71655[]. Run with TF_DUMP_GRAPH_PREFIX=/path/to/dump/dir and --vmodule=xla_compiler=2 to obtain a dump of the compiled functions.
	 [[StatefulPartitionedCall]] [Op:__inference_one_step_on_data_distributed_71840]