In [None]:
import numpy as np
import tensorflow as tf

# Turn on memory growth for every physical GPU that TensorFlow lists.
for gpu in tf.config.experimental.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)

import matplotlib.pyplot as plt

import random  

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction  # BLEU score: Measures how similar two sentences are.
from rouge_score import rouge_scorer  # ROUGE score: Used a lot in text summarization models.
from sklearn.metrics.pairwise import cosine_similarity  # Cosine similarity: Helps check how close word embeddings are.


from tensorflow.keras.optimizers.schedules import CosineDecayRestarts

import gensim.downloader as api

import nltk

import time  

# Making sure we have access to the WordNet dataset before using it.
nltk.download("wordnet")


In [None]:
def positional_encoding(length: int, depth: int):
    """
    Build the fixed sinusoidal positional-encoding table used to give a
    Transformer word-order information without recurrence.

    Args:
        length: Number of positions (rows) to generate.
        depth: Width of each encoding (columns). Must be even, because the
            first half of the columns holds sines and the second half cosines.

    Returns:
        A float32 tensor of shape (length, depth).

    Raises:
        ValueError: If ``depth`` is odd. Previously an odd depth silently
            produced ``depth + 1`` columns, which only failed later when the
            table was added to the embeddings.
    """
    if depth % 2 != 0:
        raise ValueError(f"depth must be even, got {depth}")

    # Integer half-width: sines fill the first half, cosines the second.
    half_depth = depth // 2
    positions = np.arange(length)[:, np.newaxis]                 # (length, 1)
    depths = np.arange(half_depth)[np.newaxis, :] / half_depth   # (1, depth/2), values in [0, 1)

    angle_rates = 1 / (10000 ** depths)
    angle_rads = positions * angle_rates                         # (length, depth/2)
    pos_encoding = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis=-1)
    return tf.cast(pos_encoding, dtype=tf.float32)

In [None]:
# Build the encoding table for 2048 positions at a width of 512.
pos_encoding = positional_encoding(length=2048, depth=512)


# Expected shape: (2048, 512), i.e. one row per position.
print(pos_encoding.shape)

# Heatmap of the table. It is transposed so that position runs along the
# x-axis and encoding depth along the y-axis.
plt.pcolormesh(pos_encoding.numpy().T, cmap='RdBu')

plt.ylabel('Depth')
plt.xlabel('Position')
plt.colorbar()
plt.show()

In [None]:
# Scale every row to unit length so each dot product below is a cosine similarity.
# NOTE: this rebinds the notebook-level `pos_encoding` created in the previous cell.
pos_encoding /= tf.norm(pos_encoding,axis=-1,keepdims=True)
# Reference vector: the encoding of position 1000.
p = pos_encoding[1000]
# Similarity of every position to the reference position.
dots= tf.einsum('pd,d->p',pos_encoding,p).numpy()

# Top panel: similarity over all positions; the black vertical lines at
# 950 and 1050 mark the window shown in the bottom panel.
plt.subplot(2,1,1)
plt.plot(dots)
plt.ylim([0,1])
plt.plot([950,950,float('nan'),1050,1050],[ 0,1,float('nan'),0,1], color='k',label='Zoom')
plt.legend()

# Bottom panel: the same curve restricted to positions 950-1050.
plt.subplot(2,1,2)
plt.plot(range(len(dots)),dots)
plt.xlim([950,1050])
plt.ylim([0,1])

In [None]:
class PositionalEmbedding(tf.keras.layers.Layer):
    """Token embedding followed by the addition of fixed positional encodings.

    Args:
        vocab_size: Vocabulary size for the default embedding layer.
        d_model: Embedding width; also the depth of the positional table.
        embedding: Optional embedding layer to use instead of the default one.
            Its output width is expected to equal ``d_model``.
        max_length: Number of positions pre-computed in the encoding table.
            Inputs longer than this cannot be encoded. Defaults to 2048, the
            value that used to be hard-coded.
    """

    def __init__(self, vocab_size: int, d_model: int, embedding: tf.keras.layers.Embedding = None,
                 max_length: int = 2048):
        super().__init__()
        self.d_model = d_model
        self.max_length = max_length
        # Default: a trainable embedding in which token id 0 is masked (mask_zero=True).
        self.embedding = (
            tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
            if embedding is None else embedding
        )
        # The table is computed once here and sliced per call.
        self.pos_encoding = positional_encoding(length=max_length, depth=d_model)

    def compute_mask(self, *args, **kwargs):
        """Return the mask computed by the wrapped embedding layer."""
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x: tf.Tensor) -> tf.Tensor:
        """Embed token ids of shape (batch, seq) and add the positional encodings."""
        seq_len = tf.shape(x)[1]
        x = self.embedding(x)

        # Scale by sqrt(d_model); the cast follows the embedding's dtype so
        # the multiplication also works when that dtype is not float32.
        x *= tf.math.sqrt(tf.cast(self.d_model, x.dtype))

        # (seq_len, d_model) broadcasts over the batch axis of (batch, seq_len, d_model).
        x += tf.cast(self.pos_encoding[:seq_len, :], x.dtype)

        return x



In [None]:
# Smoke test: embed one random batch and compare input and output shapes.
vocab_size=1000
d_model=512

embedding_layer = PositionalEmbedding(vocab_size,d_model)
# One sequence of 100 random token ids drawn from [0, vocab_size).
random_input = np.random.randint(0,vocab_size,size=(1,100))

output= embedding_layer(random_input)
print("Random imput shape : ", random_input.shape)
print("PositionalEmbedding output : ", output.shape)

In [None]:
class BaseAttention(tf.keras.layers.Layer):
    """Common scaffolding for the attention sub-layers in this notebook.

    Holds the three pieces each attention variant combines in its own
    ``call``: multi-head attention, a residual ``Add`` and layer normalization.
    """

    def __init__(self, **kwargs: dict):
        """Create the sub-layers; every keyword argument goes to ``MultiHeadAttention``."""
        super(BaseAttention, self).__init__()
        keras_layers = tf.keras.layers
        self.mha = keras_layers.MultiHeadAttention(**kwargs)
        self.layernorm = keras_layers.LayerNormalization()
        self.add = keras_layers.Add()

In [None]:
class CrossAttention(BaseAttention):
    """Attention in which ``x`` supplies the queries and ``context`` the keys and values."""

    def call(self, x: tf.Tensor, context: tf.Tensor) -> tf.Tensor:
        """Attend from ``x`` to ``context``, then apply the residual add and layer norm.

        The attention scores of the most recent call are kept on
        ``self.last_attn_scores``.
        """
        attended, scores = self.mha(
            query=x,
            key=context,
            value=context,
            return_attention_scores=True,
        )
        self.last_attn_scores = scores
        residual = self.add([x, attended])
        return self.layernorm(residual)


In [None]:
# Smoke test for CrossAttention: decoder embeddings attend to encoder embeddings.
encoder_vocab_size = 1000
decoder_vocab_size = 1000
d_model = 768

# Separate embedding layers for the encoder and decoder vocabularies.
encoder_embedding_layer = PositionalEmbedding(encoder_vocab_size, d_model)
decoder_embedding_layer = PositionalEmbedding(decoder_vocab_size, d_model)

# One random sequence per side; the lengths differ on purpose (100 vs 110).
random_encoder_input = np.random.randint(0, encoder_vocab_size, size=(1, 100))
random_decoder_input = np.random.randint(0, decoder_vocab_size, size=(1, 110))

# Token ids -> embeddings with positional encodings added.
encoder_embedding = encoder_embedding_layer(random_encoder_input)
decoder_embedding = decoder_embedding_layer(random_decoder_input)

# Shape check on both embeddings.
print(f" Encoder Embedding Shape: {encoder_embedding.shape}")
print(f" Decoder Embedding Shape: {decoder_embedding.shape}")

# Queries come from the decoder side, keys/values from the encoder side.
cross_attention_layer = CrossAttention(num_heads=2, key_dim=d_model)  # key_dim is set equal to d_model here
cross_attention_output = cross_attention_layer(decoder_embedding, encoder_embedding)

print(f"Cross Attention Output Shape: {cross_attention_output.shape}")


In [None]:
class GlobalSelfAttention(BaseAttention):
    """Self-attention without a causal mask, with dropout on the attention output."""

    def __init__(self, **kwargs: dict):
        """Forward ``kwargs`` to ``BaseAttention`` and add a fixed-rate (0.1) dropout layer."""
        super(GlobalSelfAttention, self).__init__(**kwargs)
        self.dropout = tf.keras.layers.Dropout(0.1)

    def call(self, x: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Run self-attention over ``x`` and return the normalized residual sum.

        The attention scores of the most recent call are kept on
        ``self.last_attn_scores``.
        """
        attended, scores = self.mha(query=x, key=x, value=x, return_attention_scores=True)
        self.last_attn_scores = scores

        # Dropout acts on the attention branch only, before the residual add.
        attended = self.dropout(attended, training=training)
        return self.layernorm(self.add([x, attended]))


In [None]:
# Smoke test for GlobalSelfAttention on one random encoder sequence.
encoder_vocab_size = 1000
d_model = 768

# Fresh embedding layer for this cell (rebinds the name used in earlier cells).
encoder_embedding_layer = PositionalEmbedding(encoder_vocab_size, d_model)

# One sequence of 100 random token ids.
random_encoder_input = np.random.randint(0, encoder_vocab_size, size=(1, 100))

# Token ids -> embeddings with positional encodings added.
encoder_embedding = encoder_embedding_layer(random_encoder_input)

print("Encoder_Embedding Shape:", encoder_embedding.shape)

# NOTE: despite the `cross_attention_*` names, this is a GlobalSelfAttention
# layer; the names rebind the variables created in the CrossAttention demo.
cross_attention_layer = GlobalSelfAttention(num_heads=2, key_dim=512)

# training=True activates the layer's dropout for this call.
cross_attention_output = cross_attention_layer(encoder_embedding, training=True)

print("Global_Self_Attention_output shape:", cross_attention_output.shape)


In [None]:
class CausalSelfAttention(BaseAttention):
    """Self-attention that passes ``use_causal_mask=True`` to the attention layer."""

    def call(self, x: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Apply causally masked self-attention, the residual add and layer norm.

        Args:
            x: Input sequence tensor.
            training: Forwarded to the attention layer. Defaults to ``False``
                (inference), matching the other layers in this notebook; the
                previous default of ``True`` switched dropout on whenever the
                layer was called without an explicit flag.

        Returns:
            The normalized sum of ``x`` and the attention output.
        """
        attn_output = self.mha(query=x, key=x, value=x, use_causal_mask=True, training=training)

        x = self.add([x, attn_output])  # Residual connection
        x = self.layernorm(x)

        return x


In [None]:
# Smoke test for CausalSelfAttention, followed by a causality check.
decoder_vocab_size = 1000

d_model = 768

decoder_embedding_layer = PositionalEmbedding(decoder_vocab_size, d_model)

# One sequence of 110 random token ids.
random_decoder_input = np.random.randint(0, decoder_vocab_size, size=(1, 110))

decoder_embeddings = decoder_embedding_layer(random_decoder_input)

print("Decoder_Embeddings shape:", decoder_embeddings.shape)

causal_self_attention_layer = CausalSelfAttention(num_heads=2, key_dim=512)

causal_self_attention_output = causal_self_attention_layer(decoder_embeddings, training=True)

print("causal_self_attention_output shape:", causal_self_attention_output.shape)

# Causality check: the output for the first 50 positions must not depend on
# later tokens. Compare "truncate the input, then run" with "run on the full
# input, then truncate the output". (Both sides used to be computed from the
# same truncated input, so the difference was trivially zero.)
out1 = causal_self_attention_layer(decoder_embedding_layer(random_decoder_input[:, :50]), training=False)
out2 = causal_self_attention_layer(decoder_embedding_layer(random_decoder_input), training=False)[:, :50]

# Largest absolute element-wise gap; expected to be at floating-point noise level.
diff = tf.reduce_max(tf.abs(out1 - out2)).numpy()

print("Difference between the two outputs:", diff)


In [None]:
class FeedForward(tf.keras.layers.Layer):
    """Feed-forward block with a residual connection.

    The input is layer-normalized, expanded to ``dff`` units with ReLU,
    passed through dropout, projected back to ``d_model`` units, and the
    result is added to the un-normalized input.
    """

    def __init__(self, d_model, dff, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)

        keras_layers = tf.keras.layers
        self.layer_norm = keras_layers.LayerNormalization(epsilon=1e-6)
        expand = keras_layers.Dense(dff, activation=keras_layers.ReLU())
        regularize = keras_layers.Dropout(dropout_rate)
        project = keras_layers.Dense(d_model)
        self.seq = tf.keras.Sequential([expand, regularize, project])
        self.add_layer = keras_layers.Add()

    def call(self, x: tf.Tensor, training=False) -> tf.Tensor:
        """Return ``x`` plus the feed-forward transform of the normalized ``x``."""
        branch = self.seq(self.layer_norm(x), training=training)
        return self.add_layer([x, branch])



In [None]:
# Smoke test for FeedForward on one random encoder sequence.
encoder_vocab_size = 1000
d_model = 768

# Fresh embedding layer for this cell.
encoder_embedding_layer = PositionalEmbedding(encoder_vocab_size, d_model)

# One sequence of 100 random token ids.
random_encoder_input = np.random.randint(0, encoder_vocab_size, size=(1, 100))

encoder_embeddings = encoder_embedding_layer(random_encoder_input)

print("Encoder_Embeddings shape:", encoder_embeddings.shape)

# Hidden width 2048; the block projects back to d_model.
ff_layer = FeedForward(d_model, dff=2048)
ff_output = ff_layer(encoder_embeddings, training=True)  # training=True activates the block's dropout

print("Feed_Forward_Output shape:", ff_output.shape)


In [None]:
    def __init__(self, d_model: int, num_heads: int, dff: int, dropout_rate: float = 0.1):
        super().__init__()
        self.self_attention = GlobalSelfAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate  # Ensure dropout is set
        )
        self.ffn = FeedForward(d_model, dff)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)  #  Fix: Add LayerNorm
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)  #  Fix: Second LayerNorm
        self.dropout = tf.keras.layers.Dropout(dropout_rate)  #  Fix: Add explicit Dropout

    def call(self, x: tf.Tensor, training: bool = True) -> tf.Tensor:
        #  Apply Self-Attention + Add + Normalize
        attn_output = self.self_attention(x)  
        x = self.layernorm1(x + attn_output)  

        #  Apply FeedForward + Dropout + Normalize
        ffn_output = self.ffn(x)
        ffn_output = self.dropout(ffn_output, training=training)  #  Apply dropout only in training
        x = self.layernorm2(x + ffn_output)  

        return x



In [None]:
# Smoke test for a single EncoderLayer.
encoder_vocab_size = 1000
d_model = 768

# Fresh embedding layer for this cell.
encoder_embedding_layer = PositionalEmbedding(encoder_vocab_size, d_model)  

# One sequence of 100 random token ids.
random_encoder_input = np.random.randint(0, encoder_vocab_size, size=(1, 100))

encoder_embeddings = encoder_embedding_layer(random_encoder_input)

print("Encoder_Embeddings shape:", encoder_embeddings.shape)

# Two heads, feed-forward width 2048, default dropout rate.
encoder_layer = EncoderLayer(d_model, num_heads=2, dff=2048)

encoder_layer_output = encoder_layer(encoder_embeddings, training=True)  # training=True activates dropout

print("Encoder_layer_Output shape:", encoder_layer_output.shape)


In [None]:
class Encoder(tf.keras.layers.Layer):
    """Transformer encoder: positional embedding, dropout, then a stack of EncoderLayers.

    Args:
        num_layers: Number of stacked ``EncoderLayer`` blocks.
        d_model: Model width.
        num_heads: Attention heads per layer.
        dff: Hidden width of each feed-forward network.
        vocab_size: Size of the input vocabulary.
        dropout_rate: Dropout rate used after the embedding and in every layer.
    """

    def __init__(self, num_layers: int, d_model: int, num_heads: int, dff: int, vocab_size: int, dropout_rate: float = 0.1):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size, d_model=d_model)

        self.enc_layers = [
            EncoderLayer(d_model=d_model, num_heads=num_heads, dff=dff, dropout_rate=dropout_rate)
            for _ in range(num_layers)
        ]

        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Encode token ids of shape (batch, seq).

        Args:
            x: Integer token ids.
            training: Whether dropout is active; forwarded to the embedding
                dropout and to every encoder layer.

        Returns:
            The output of the last encoder layer.
        """
        x = self.pos_embedding(x)
        x = self.dropout(x, training=training)

        # The flag is passed explicitly so each layer's mode does not depend
        # on that layer's own default.
        for layer in self.enc_layers:
            x = layer(x, training=training)

        return x



In [None]:
# Smoke test for the full Encoder stack.
encoder_vocab_size = 1000
d_model = 768

# Two encoder layers with two heads each.
encoder = Encoder(num_layers=2, d_model=d_model, num_heads=2, dff=2048, vocab_size=encoder_vocab_size)

# One sequence of 100 random token ids.
random_encoder_input = np.random.randint(0, encoder_vocab_size, size=(1, 100))

# `encoder_output` is reused as the context in the decoder demo cells below.
encoder_output = encoder(random_encoder_input)

print("Random_Encoder_input shape : ", random_encoder_input.shape)
print("Encoder_Output shape : ", encoder_output.shape)


In [None]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: causal self-attention, cross-attention, feed-forward.

    Args:
        d_model: Model width; also used as ``key_dim`` of the attention heads.
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward network.
        dropout_rate: Dropout rate for both attention layers and the
            feed-forward network.
    """

    def __init__(self, d_model: int, num_heads: int, dff: int, dropout_rate: float = 0.1):
        super(DecoderLayer, self).__init__()

        # Self-attention over the decoder input, with a causal mask.
        self.causal_self_attention = CausalSelfAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate
        )

        # Attention from the decoder sequence to the encoder output.
        self.cross_attention = CrossAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate
        )

        # The configured rate is forwarded so the feed-forward block no longer
        # falls back to its own default.
        self.ffn = FeedForward(d_model, dff, dropout_rate=dropout_rate)

    def call(self, x: tf.Tensor, context: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Run the block.

        Args:
            x: Decoder-side sequence tensor.
            context: Encoder output that the cross-attention attends to.
            training: Whether dropout is active. Defaults to ``False`` and is
                passed explicitly to the sub-layers that accept it, so the
                causal self-attention no longer relies on its own default.

        Returns:
            A tensor with the same shape as ``x``. The cross-attention scores
            of the most recent call are kept on ``self.last_attn_scores``.
        """
        x = self.causal_self_attention(x=x, training=training)
        # CrossAttention.call takes no `training` argument.
        x = self.cross_attention(x=x, context=context)
        self.last_attn_scores = self.cross_attention.last_attn_scores
        x = self.ffn(x, training=training)
        return x


In [None]:
# Smoke test for a single DecoderLayer.
# Relies on `decoder_embedding_layer` and `encoder_output` created in earlier cells.
decoder_vocab_size = 1000
d_model = 768
dff = 2048
num_heads = 8  # Heads for this decoder layer (the Encoder demo above was built with num_heads=2)

# One decoder block with the hyperparameters above.
decoder_layer = DecoderLayer(d_model, num_heads, dff)

# One sequence of 110 random token ids.
random_decoder_input = np.random.randint(0, decoder_vocab_size, size=(1, 110))

# Embed with the decoder-side embedding layer from the earlier cell.
decoder_embeddings = decoder_embedding_layer(random_decoder_input)

# The encoder output is the context for cross-attention.
decoder_layer_output = decoder_layer(decoder_embeddings, context=encoder_output)

# Shape check.
print("Random Decoder Input Shape:", random_decoder_input.shape)
print("Decoder Embeddings Shape:", decoder_embeddings.shape)
print("Decoder Output Shape:", decoder_layer_output.shape)


In [None]:
class Decoder(tf.keras.layers.Layer):
    """Transformer decoder: positional embedding, dropout, then a stack of DecoderLayers.

    Args:
        num_layers: Number of stacked ``DecoderLayer`` blocks.
        d_model: Model width.
        num_heads: Attention heads per layer.
        dff: Hidden width of each feed-forward network.
        vocab_size: Size of the target vocabulary.
        dropout_rate: Dropout rate used after the embedding and in every layer.
    """

    def __init__(self, num_layers: int, d_model: int, num_heads: int, dff: int, vocab_size: int, dropout_rate: float = 0.1):
        super().__init__()
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size, d_model=d_model)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

        self.dec_layers = [
            DecoderLayer(d_model=d_model, num_heads=num_heads, dff=dff, dropout_rate=dropout_rate)
            for _ in range(num_layers)
        ]

        # Cross-attention scores of the last layer; None until the first call.
        self.last_attn_scores = None

    def call(self, x: tf.Tensor, context: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Decode token ids of shape (batch, seq) against ``context``.

        Args:
            x: Integer target token ids.
            context: Encoder output that every layer receives.
            training: Whether dropout is active; forwarded to the embedding
                dropout and to every decoder layer. Defaults to ``False``.

        Returns:
            The output of the last decoder layer.
        """
        x = self.pos_embedding(x)
        x = self.dropout(x, training=training)

        for layer in self.dec_layers:
            x = layer(x, context, training=training)

        # With zero layers there are no scores to expose.
        if self.num_layers > 0:
            self.last_attn_scores = self.dec_layers[-1].last_attn_scores

        return x


In [None]:
# Smoke test for the full Decoder stack.
decoder_vocab_size = 1000
d_model = 768

# This cell needs `encoder_output` from the Encoder demo cell.
assert 'encoder_output' in locals(), "Error: `encoder_output` is not defined. Run the encoder first."

# Two-layer Decoder with 8 heads (d_model is 768 in this cell).
# NOTE: `decoder_layer` now holds a full Decoder and rebinds the name that
# held a single DecoderLayer in the previous demo cell.
decoder_layer = Decoder(num_layers=2, d_model=d_model, num_heads=8, dff=2048, vocab_size=decoder_vocab_size)

# One sequence of 100 random token ids.
random_decoder_input = np.random.randint(0, decoder_vocab_size, size=(1, 100))

# Token ids plus the encoder output as context.
decoder_output = decoder_layer(random_decoder_input, encoder_output)

# Warn if any output value is NaN.
if tf.math.reduce_any(tf.math.is_nan(decoder_output)):
    print("Warning: NaN values detected in `decoder_output`!")

# Shape check.
print("Random_decoder_input shape:", random_decoder_input.shape)
print("Decoder_Output shape:", decoder_output.shape)


In [None]:
def Transformer(
    input_vocab_size: int,
    target_vocab_size: int,
    encoder_input_size: int = None,   # Allow None for dynamic shape
    decoder_input_size: int = None,   # Allow None for dynamic shape
    num_layers: int = 6,
    d_model: int = 512,
    num_heads: int = 8,
    dff: int = 2048,
    dropout_rate: float = 0.1
) -> tf.keras.Model:
    """Assemble the encoder-decoder model as a functional Keras model.

    The model takes two int64 inputs, ``encoder_input`` and ``decoder_input``,
    and returns the decoder output passed through a softmax ``Dense`` layer
    of width ``target_vocab_size``. The model summary is printed before the
    model is returned.
    """
    # Hyperparameters shared by the encoder and decoder stacks.
    stack_config = dict(
        num_layers=num_layers,
        d_model=d_model,
        num_heads=num_heads,
        dff=dff,
        dropout_rate=dropout_rate,
    )

    # A falsy size (None or 0) leaves the sequence axis unspecified.
    encoder_len = encoder_input_size if encoder_input_size else None
    decoder_len = decoder_input_size if decoder_input_size else None
    encoder_tokens = tf.keras.Input(shape=(encoder_len,), dtype=tf.int64, name="encoder_input")
    decoder_tokens = tf.keras.Input(shape=(decoder_len,), dtype=tf.int64, name="decoder_input")

    # Encoder output is the context the decoder attends to.
    encoder_stack = Encoder(vocab_size=input_vocab_size, **stack_config)
    context = encoder_stack(encoder_tokens)

    decoder_stack = Decoder(vocab_size=target_vocab_size, **stack_config)
    decoded = decoder_stack(decoder_tokens, context)

    # Softmax over the target vocabulary.
    projection = tf.keras.layers.Dense(target_vocab_size, activation="softmax", name="output_layer")
    probabilities = projection(decoded)

    model = tf.keras.Model(
        inputs=[encoder_tokens, decoder_tokens],
        outputs=probabilities,
        name="Transformer_Model",
    )

    model.summary()

    return model


In [None]:
# Build a small Transformer with fixed input lengths.
encoder_input_size=100
decoder_input_size=110

encoder_vocab_size=1000
decoder_vocab_size=1000

# Transformer() prints the model summary itself, so it is not printed again here.
model=Transformer(
    input_vocab_size=encoder_vocab_size,
    target_vocab_size=decoder_vocab_size,
    encoder_input_size=encoder_input_size,
    decoder_input_size=decoder_input_size,
    num_layers=2,
    d_model=512,
    num_heads=2,
    dff=512,
    dropout_rate=0.1)

In [None]:
import tensorflow as tf
import numpy as np
import random
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.optimizers.schedules import CosineDecayRestarts
import gensim.downloader as api
from nltk.corpus import wordnet
import nltk
import time

nltk.download("wordnet")

# Optimizer: Adam driven by a cosine-decay-with-restarts learning-rate schedule.
learning_rate_schedule = CosineDecayRestarts(
    initial_learning_rate=0.0003, first_decay_steps=1000, t_mul=2.0
)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate_schedule)
dropout_rate = 0.1

# Load the pretrained 50-dimensional GloVe Twitter vectors.
# `embedding_dim` must match the width of the vectors loaded below.
embedding_dim = 50
start_time = time.time()
glove_model = api.load("glove-twitter-50")
print(f" Pretrained embeddings loaded in {time.time() - start_time:.2f} seconds")

# Fill the embedding matrix from GloVe; words missing from GloVe get small random vectors.
# NOTE(review): `word_index` (word -> integer id) is not defined in this
# notebook section; it must exist before this cell runs. Confirm where it is built.
vocab_size = 10000
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    # Skip ids that do not fit in the matrix instead of raising IndexError.
    if not 0 <= i < vocab_size:
        continue
    embedding_matrix[i] = glove_model.get_vector(word) if word in glove_model else np.random.uniform(-0.1, 0.1, embedding_dim)

embedding_layer = tf.keras.layers.Embedding(
    vocab_size, embedding_dim, weights=[embedding_matrix], trainable=True
)

#  Improved Beam Search (Finalized)
def beam_search_decoder(predictions, beam_width=10, alpha=0.5):
    """Pick a token sequence from per-step probabilities with beam search.

    Each row of ``predictions`` is scored independently of the tokens chosen
    so far. A candidate's cost is the running sum of ``-log(p)`` per step,
    with each step divided by the length penalty
    ``((1 + len(prefix)) / 2) ** alpha``.

    Args:
        predictions: Iterable of rows; row ``t`` holds the probability of
            every token at step ``t``. Probabilities are clamped to at least 1e-9.
        beam_width: Number of candidates kept after each step (>= 1).
        alpha: Exponent of the length penalty; 0 disables the penalty.

    Returns:
        The lowest-cost sequence as a list of token indices (an empty list
        when ``predictions`` is empty).

    Raises:
        ValueError: If ``beam_width`` is smaller than 1.
    """
    if beam_width < 1:
        raise ValueError("beam_width must be at least 1")

    beams = [([], 0.0)]
    for row in predictions:
        # Negative log-probabilities for the whole step, computed once.
        token_costs = -np.log(np.maximum(np.asarray(row, dtype=float), 1e-9))

        candidates = []
        for seq, score in beams:
            # The exponent was previously written with `*`, which made the
            # penalty alpha**2 * (1 + len) / 2 and exactly zero for alpha=0.
            length_norm = ((1 + len(seq)) ** alpha) / ((1 + 1) ** alpha)
            for token, cost in enumerate(token_costs):
                candidates.append((seq + [token], score + cost / length_norm))

        # sort() is stable, so ties keep the lower token index first.
        candidates.sort(key=lambda candidate: candidate[1])
        beams = candidates[:beam_width]

    return beams[0][0]

#  Optimized Synonym Replacement
def synonym_replace(sentence, n=1):
    """Replace up to ``n`` randomly chosen words with a WordNet synonym.

    Args:
        sentence: Whitespace-separated text.
        n: Number of replacement attempts. Each attempt picks a random word
            position; an attempt is a no-op when WordNet has no synonym for
            that word. ``n <= 0`` performs no replacement.

    Returns:
        The sentence with replacements applied, joined by single spaces.
        Sentences with fewer than three words are returned unchanged.
    """
    words = sentence.split()
    if len(words) < 3:
        return sentence  # Too short to modify safely.

    for _ in range(n):
        word_idx = random.randint(0, len(words) - 1)
        target = words[word_idx]
        synonyms = {lemma.name() for syn in wordnet.synsets(target) for lemma in syn.lemmas()}
        synonyms.discard(target)
        if synonyms:
            # Sorting makes the choice reproducible under a fixed random seed
            # (set iteration order is not stable across runs).
            replacement = random.choice(sorted(synonyms))
            # Lemma names may contain underscores in place of spaces; undo
            # that before putting the lemma into the sentence.
            words[word_idx] = replacement.replace("_", " ")

    return " ".join(words)

#  Fix Cosine Similarity Calculation
def get_sentence_vector(sentence):
    """Average the embedding rows of the words in ``sentence``.

    Args:
        sentence: Either a list of word tokens or a whitespace-separated
            string. A string is split into words first; previously it was
            iterated character by character.

    Returns:
        The mean of the ``embedding_matrix`` rows for the words, or a zero
        vector of length ``embedding_dim`` when there are no words. Words
        missing from ``word_index`` use row 0.
    """
    tokens = sentence.split() if isinstance(sentence, str) else sentence
    vectors = [embedding_matrix[word_index.get(token, 0)] for token in tokens]
    return np.mean(vectors, axis=0) if vectors else np.zeros(embedding_dim)

#  Model Evaluation (Best Version)
def evaluate_model(y_true, y_pred_logits, idx2word):
    """Print perplexity, loss, BLEU, ROUGE and cosine similarity for a batch.

    Args:
        y_true: Integer token ids, indexed as ``y_true[sample][position]``.
        y_pred_logits: Unnormalized scores indexed as
            ``[sample][position][class]``.
        idx2word: Mapping from token id to word. Ids missing from the mapping
            are rendered as ``"<UNK>"``; ids ``<= 0`` are dropped from the
            reference and predicted sentences.

    Returns:
        None. The metrics are printed.
    """
    y_pred_logits = np.asarray(y_pred_logits)
    y_pred_classes = np.argmax(y_pred_logits, axis=-1)

    # The one-hot depth must equal the width of the logits; it was previously
    # taken from len(idx2word), which need not match.
    num_classes = y_pred_logits.shape[-1]

    # Plain cross-entropy: perplexity is exp(mean negative log-likelihood),
    # so no label smoothing is applied when evaluating.
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    loss_values = [
        loss_fn(tf.one_hot(y_true[i], depth=num_classes), y_pred_logits[i]).numpy()
        for i in range(len(y_true))
    ]
    avg_loss = np.mean(loss_values)
    perplexity = np.exp(avg_loss)

    # Token ids -> words, skipping ids <= 0.
    ref_sentences = [[idx2word.get(idx, "<UNK>") for idx in y_true[i] if idx > 0] for i in range(len(y_true))]
    pred_sentences = [[idx2word.get(idx, "<UNK>") for idx in y_pred_classes[i] if idx > 0] for i in range(len(y_pred_classes))]

    smoothing = SmoothingFunction().method1
    bleu_scores = [sentence_bleu([ref], pred, smoothing_function=smoothing) for ref, pred in zip(ref_sentences, pred_sentences)]
    avg_bleu = np.mean(bleu_scores)

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = [scorer.score(" ".join(ref), " ".join(pred)) for ref, pred in zip(ref_sentences, pred_sentences)]
    avg_rouge1 = np.mean([s['rouge1'].fmeasure for s in rouge_scores])
    avg_rouge2 = np.mean([s['rouge2'].fmeasure for s in rouge_scores])
    avg_rougeL = np.mean([s['rougeL'].fmeasure for s in rouge_scores])

    # Cosine similarity between the averaged word vectors of each pair.
    cosine_similarities = [cosine_similarity([get_sentence_vector(ref)], [get_sentence_vector(pred)])[0, 0] for ref, pred in zip(ref_sentences, pred_sentences)]
    avg_cosine = np.mean(cosine_similarities)

    print(f" Model Perplexity: {perplexity:.4f}")
    print(f" Average Log-Likelihood (Loss): {avg_loss:.4f}")
    print(f" BLEU Score: {avg_bleu:.4f}")
    print(f" ROUGE Scores - ROUGE-1: {avg_rouge1:.4f}, ROUGE-2: {avg_rouge2:.4f}, ROUGE-L: {avg_rougeL:.4f}")
    print(f" Cosine Similarity: {avg_cosine:.4f}")

# Run the evaluation on random data.
# NOTE(review): `word_index` is not defined in this notebook section; this
# cell presumably relies on it being created earlier. Confirm.
batch_size = 64  
dummy_input = np.random.randint(0, 100, size=(1000, 10))  
dummy_output = np.random.randint(0, 100, size=(1000, 10))  

# Shuffled, batched dataset built from the dummy arrays (not used further in this cell).
dataset = tf.data.Dataset.from_tensor_slices((dummy_input, dummy_output))
dataset = dataset.shuffle(10000).batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Five random reference sequences of five token ids each (ids 1-5), and
# random scores over len(word_index) classes for every position.
y_true = np.random.randint(1, 6, size=(5, 5))
y_pred_logits = np.random.rand(5, 5, len(word_index))

# Invert the word -> id mapping.
idx2word = {i: w for w, i in word_index.items()}

evaluate_model(y_true, y_pred_logits, idx2word)