# Project 2 - Introduction to the Transformer Architecture

### Tehtävä 2: Luku 11.4: Tekstin luokittelu Transformer encoderin avulla

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Embedding, LayerNormalization, Dense, Dropout, MultiHeadAttention, GlobalAveragePooling1D, Layer
from tensorflow.keras import Sequential, Input, Model
from tensorflow.keras.datasets import imdb

# Data Loading
max_features = 20000
sequence_length = 200

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=sequence_length)
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=sequence_length)

# Transformer Encoder Layer
class TransformerEncoder(Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerEncoder, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential([Dense(ff_dim, activation="relu"), Dense(embed_dim)])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=False):  # Set default value for training
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

# Hyperparameters
embed_dim = 32  # Embedding size for each token
num_heads = 2   # Number of attention heads
ff_dim = 32     # Hidden layer size in feed-forward network

# Model
inputs = Input(shape=(sequence_length,))
embedding_layer = Embedding(input_dim=max_features, output_dim=embed_dim)
x = embedding_layer(inputs)
x = TransformerEncoder(embed_dim, num_heads, ff_dim)(x, training=True)  # Add training argument explicitly
x = GlobalAveragePooling1D()(x)
x = Dropout(0.1)(x)
x = Dense(20, activation="relu")(x)
x = Dropout(0.1)(x)
outputs = Dense(1, activation="sigmoid")(x)

model = Model(inputs=inputs, outputs=outputs)

# Compile and Train
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
history = model.fit(x_train, y_train, batch_size=32, epochs=3, validation_split=0.2)

# Evaluate
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"Test Accuracy: {test_acc:.2f}")


Epoch 1/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 35ms/step - accuracy: 0.6631 - loss: 0.5558 - val_accuracy: 0.8802 - val_loss: 0.2912
Epoch 2/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 34ms/step - accuracy: 0.9289 - loss: 0.1902 - val_accuracy: 0.8826 - val_loss: 0.3010
Epoch 3/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 34ms/step - accuracy: 0.9653 - loss: 0.1013 - val_accuracy: 0.8734 - val_loss: 0.3773
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.8515 - loss: 0.4324
Test Accuracy: 0.85


### Tehtävä 2: Luku 11.5: Kielenkääntäjä Transformerin avulla

## Asenna tässä kohtaa konsolissa: pip install tensorflow-datasets

# Tätä 11.5 kohtaa en saa toimii

In [74]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Embedding, LayerNormalization, Dense, Dropout, MultiHeadAttention, Input, Layer
from tensorflow.keras.models import Model
import tensorflow_datasets as tfds

# Data Loading: English-Spanish translation dataset
dataset_name = "ted_hrlr_translate/pt_to_en"  # You can replace with any seq2seq dataset
examples, metadata = tfds.load(dataset_name, with_info=True, as_supervised=True)
train_examples, val_examples = examples['train'], examples['validation']

# Preprocessing
max_tokens = 20000
sequence_length = 40

def tokenize_pairs(en, es):
    en = tf.expand_dims(en, -1)
    es = tf.expand_dims(es, -1)
    return vectorize_layer(en), vectorize_layer(es)

vectorize_layer = TextVectorization(max_tokens=max_tokens, output_mode='int', output_sequence_length=sequence_length)
train_text = train_examples.map(lambda pt, en: pt)
vectorize_layer.adapt(train_text.batch(64))

# Transformer Layers
class TransformerEncoder(Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerEncoder, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([Dense(ff_dim, activation="relu"), Dense(embed_dim)])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class TransformerDecoder(Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerDecoder, self).__init__()
        self.att1 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.att2 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([Dense(ff_dim, activation="relu"), Dense(embed_dim)])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.layernorm3 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)
        self.dropout3 = Dropout(rate)

    def call(self, enc_output, target, training):
        attn1 = self.att1(target, target)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(target + attn1)
        attn2 = self.att2(out1, enc_output)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(out1 + attn2)
        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        return self.layernorm3(out2 + ffn_output)

# Model Building
embed_dim = 256
num_heads = 8
ff_dim = 512

encoder_inputs = Input(shape=(sequence_length,), name="encoder_inputs")
x = Embedding(max_tokens, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, num_heads, ff_dim)(x, training=True)

decoder_inputs = Input(shape=(sequence_length,), name="decoder_inputs")
y = Embedding(max_tokens, embed_dim)(decoder_inputs)
class ReshapeLayer(Layer):
    def __init__(self, target_shape):
        super(ReshapeLayer, self).__init__()
        self.target_shape = target_shape

    def call(self, inputs):
        return tf.reshape(inputs, self.target_shape)

decoder_outputs = TransformerDecoder(embed_dim, num_heads, ff_dim)(encoder_outputs, y, training=True)
decoder_outputs = ReshapeLayer((-1, sequence_length, embed_dim))(decoder_outputs)

outputs = Dense(max_tokens, activation="softmax")(decoder_outputs)
transformer = Model([encoder_inputs, decoder_inputs], outputs)

# Compile Model
transformer.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Prepare Data
def prepare_batch(en, es):
    en_vectorized = vectorize_layer(en)
    es_vectorized = vectorize_layer(es)
    es_vectorized = tf.pad(es_vectorized, [[0, 0], [0, 1]], constant_values=0)  # Ensure es_vectorized has the correct shape
    return {"encoder_inputs": en_vectorized, "decoder_inputs": es_vectorized[:, :-1]}, es_vectorized[:, 1:]

train_dataset = train_examples.map(prepare_batch).batch(64).prefetch(tf.data.AUTOTUNE)
val_dataset = val_examples.map(prepare_batch).batch(64).prefetch(tf.data.AUTOTUNE)

# Train Model
transformer.fit(train_dataset, validation_data=val_dataset, epochs=3)


ValueError: in user code:

    File "C:\Users\samim\AppData\Local\Temp\ipykernel_1908\2392415165.py", line 99, in prepare_batch  *
        es_vectorized = tf.pad(es_vectorized, [[0, 0], [0, 1]], constant_values=0)  # Ensure es_vectorized has the correct shape

    ValueError: Shape must be rank 2 but is rank 1 for '{{node Pad}} = Pad[T=DT_INT64, Tpaddings=DT_INT32](text_vectorization_69_3/Pad, Pad/paddings)' with input shapes: [?], [2,2].


# Tehtävä 2: Luku 12.1: Generatiivinen kielimalli Transformer-arkkitehtuurilla

In [39]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, MultiHeadAttention, LayerNormalization, Dense, Dropout, Input, Layer
from tensorflow.keras.models import Model
import numpy as np

# Hyperparameters
max_tokens = 20000
sequence_length = 50
embed_dim = 128
num_heads = 4
ff_dim = 256
dropout_rate = 0.1

# Dataset: Simple example with TensorFlow's Shakespeare dataset
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')

# Tokenizer: Splitting text into sentences for better adaptation
text_split = text.split('\n')  # Split by lines or sentences
vectorize_layer = tf.keras.layers.TextVectorization(max_tokens=max_tokens, output_mode='int', output_sequence_length=sequence_length)
vectorize_layer.adapt(text_split)

# Prepare dataset
sequences = vectorize_layer([text])[0]
inputs = sequences[:-1]
targets = sequences[1:]
inputs = tf.expand_dims(inputs, axis=-1)  # Ensure inputs have shape (None, sequence_length)
targets = tf.expand_dims(targets, axis=-1)  # Ensure targets have shape (None, sequence_length)
dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)).batch(64).prefetch(tf.data.AUTOTUNE)

# Transformer Components
class TransformerDecoder(Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerDecoder, self).__init__()
        self.att1 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([Dense(ff_dim, activation="relu"), Dense(embed_dim)])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=False):
        attn1 = self.att1(inputs, inputs)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(inputs + attn1)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

# Model Building
decoder_inputs = Input(shape=(sequence_length,), name="input_layer_0")  # Define decoder input layer
embedding_layer = Embedding(input_dim=max_tokens, output_dim=embed_dim)
x = embedding_layer(decoder_inputs)
x = TransformerDecoder(embed_dim, num_heads, ff_dim, dropout_rate)(x, training=True)  # Ensure training=True is passed
outputs = Dense(max_tokens, activation="softmax")(x)

model = Model(inputs=decoder_inputs, outputs=outputs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Train Model
model.fit(dataset, epochs=3)

# Text Generation
def sample_next(predictions, temperature=1.0):
    predictions = np.asarray(predictions).astype("float64")
    predictions = np.log(predictions + 1e-10) / temperature
    exp_preds = np.exp(predictions)
    predictions = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, predictions, 1)
    return np.argmax(probas)

def generate_text(seed_text, num_tokens, temperature=1.0):
    # Vectorize seed_text and reshape it to (1, sequence_length)
    input_text = vectorize_layer([seed_text])  # Shape: (1, sequence_length)
    input_text = input_text[0]  # Remove extra batch dimension
    generated_text = seed_text
    vocab = vectorize_layer.get_vocabulary()  # Ensure the vocabulary is fetched correctly
    
    for _ in range(num_tokens):
        # Ensure input_text has the shape (1, sequence_length) for model prediction
        predictions = model.predict(tf.expand_dims(input_text, axis=0))[0, -1]  # Shape: (max_tokens,)
        next_index = sample_next(predictions, temperature)
        
        # Make sure we are within bounds of vocabulary
        if next_index < len(vocab):
            next_word = vocab[next_index]
        else:
            next_word = "<UNK>"  # Handle out-of-vocabulary index (in case of model mistakes)
        
        generated_text += next_word
        # Update the input for the next prediction (shifting the window)
        input_text = np.append(input_text[1:], [next_index])
    
    return generated_text

# Example of text generation
seed_text = "To be or not to be, that is the question"
generated_text = generate_text(seed_text, num_tokens=50, temperature=0.8)
print("Generated text:\n", generated_text)


Epoch 1/3




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.0000e+00 - loss: 9.9169
Epoch 2/3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 243ms/step - accuracy: 0.0612 - loss: 9.6689
Epoch 3/3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 215ms/step - accuracy: 0.3673 - loss: 9.4372
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 244ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[

# Conclusion:
While the output is not perfect, it shows that the model is learning and generating text based on its training. The <UNK> tokens are a result of limitations in the vocabulary or training data, and with more epochs and adjustments, the model should improve in generating more sensible text.

If you're aiming for better quality, consider the improvements mentioned above, especially in terms of training time, data, and vocabulary size.