In [None]:
# Imports
import os
import pickle

import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.utils import pad_sequences
from keras_preprocessing.text import Tokenizer

# Show the GPUs TensorFlow can see; an empty list means none was detected.
visible_gpus = tf.config.list_physical_devices('GPU')
print(visible_gpus)

# Run `conda activate tf-gpu` to activate the GPU environment

In [1]:
# Dataset Preparation
#
# prompts.txt and code.txt are read line by line. The prompts become the
# encoder inputs and the code lines become the decoder targets, paired by
# line number, so both files must contain the same number of lines.


def read_lines(path):
  """Return the lines of the text file at `path` without line terminators.

  The file is decoded as UTF-8 explicitly so the result does not depend on
  the platform's default encoding.
  """
  with open(path, encoding="utf-8") as f:
    return f.read().splitlines()


input_texts = read_lines("Buttons/prompts.txt")
output_texts = read_lines("Buttons/code.txt")

# Fail early with a clear message instead of training on misaligned pairs.
if len(input_texts) != len(output_texts):
  raise ValueError(
    f"Buttons/prompts.txt has {len(input_texts)} lines but "
    f"Buttons/code.txt has {len(output_texts)}; the files must be parallel."
  )


In [None]:
# Tokenizer
#
# A Tokenizer maps every whitespace-separated word to an integer token ID.
# Two separate vocabularies are used:
#   - input_tokenizer:  the prompt, e.g.   Button, blue-500, round, "Click me"
#   - output_tokenizer: the model answer, e.g.   <Button classname="" ... >
# num_words is the number of most frequent words that keep their own ID;
# everything else is encoded as <OOV>.
# filters="" and lower=False leave punctuation and casing untouched.

input_tokenizer = Tokenizer(num_words= 3000, oov_token="<OOV>", char_level=False,split=" ",lower=False,filters="")
output_tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>", char_level=False,split=" ", filters="", lower=False)

# Every target is wrapped in <Start> ... <End>. The list is built once so that
# fitting and encoding are guaranteed to see exactly the same strings.
decoder_texts = ["<Start> " + t + " <End>" for t in output_texts]

# Build the vocabularies.
input_tokenizer.fit_on_texts(texts=input_texts)
output_tokenizer.fit_on_texts(decoder_texts)

print(f"Actual input vocab: {len(input_tokenizer.word_index)}")
print(f"Actual output vocab: {len(output_tokenizer.word_index)}")

# Shift every ID up by one and give ID 0 an explicit <PAD> entry.
# (The Keras Tokenizer never assigns ID 0 itself; this only adds a name for it.)
input_tokenizer.word_index = {"<PAD>": 0, **{k: v+1 for k, v in input_tokenizer.word_index.items()}}
input_tokenizer.index_word = {v: k for k, v in input_tokenizer.word_index.items()}

output_tokenizer.word_index = {"<PAD>": 0, **{k: v+1 for k, v in output_tokenizer.word_index.items()}}
output_tokenizer.index_word = {v: k for k, v in output_tokenizer.word_index.items()}

# Persist the tokenizers only AFTER the ID shift above. Pickling them earlier
# stores the unshifted vocabulary, so a tokenizer loaded later would produce
# IDs that are off by one compared with the IDs used for training.
with open("input_tokenizer.pkl", "wb") as f:
    pickle.dump(input_tokenizer, f)

with open("output_tokenizer.pkl", "wb") as f:
    pickle.dump(output_tokenizer, f)

# texts_to_sequences replaces every word (token) with its ID.
encoder_sequences = input_tokenizer.texts_to_sequences(input_texts)
decoder_sequences = output_tokenizer.texts_to_sequences(decoder_texts)

# All sequences in a batch need the same length, so every sequence is padded
# with zeros up to the longest one.
max_encoder_len = max(len(seq) for seq in encoder_sequences)
max_decoder_len = max(len(seq) for seq in decoder_sequences)

# Measured before padding; after padding every row has the maximum length.
average_encoder_len = np.mean([len(seq) for seq in encoder_sequences])

# padding='post' appends the zeros, so the real tokens stay at the front and
# <Start> is always the first column of decoder_sequences.
encoder_sequences = pad_sequences(encoder_sequences,maxlen=max_encoder_len,padding='post')
decoder_sequences = pad_sequences(decoder_sequences,maxlen=max_decoder_len,padding='post')

print(f"Max encoder len: {max_encoder_len}")
print(f"Max decoder len: {max_decoder_len}")
print(f"Average encoder len: {average_encoder_len}")

# Teacher forcing: the decoder is fed the target without its last column
# (<Start> + tokens) and has to predict the target shifted left by one
# (tokens + <End>).
decoder_input = decoder_sequences[:,:-1]
decoder_target = decoder_sequences[:,1:]


NameError: name 'Tokenizer' is not defined

In [None]:
# Model hyperparameters
embed_dim = 128
num_heads = 4  # attention heads in every multi-head attention layer
ff_dim = 512  # inner dimension of the Transformer feed-forward network
num_layers = 3 # number of stacked encoder blocks and of stacked decoder blocks

# +1 keeps one spare row so the largest token ID is always a valid embedding index
input_vocab_size = len(input_tokenizer.word_index) +1
output_vocab_size = len(output_tokenizer.word_index) +1

# The decoder input is the padded target without its last column, hence -1.
decoder_seq_len = max_decoder_len -1
encoder_seq_len = max_encoder_len


class AddPositionEmbedding(layers.Layer):
  """Adds a learned position vector to every token embedding.

  The position table is a weight owned by this layer, so it is part of the
  model graph: it is trained together with the rest of the network and saved
  with it. (Calling an Embedding layer on a plain `tf.range` tensor outside
  the graph only yields a constant that never receives gradient updates.)

  Args:
    sequence_length: number of positions in the table.
    output_dim: size of each position vector; must equal the last dimension
      of the layer input.

  Input:  tensor of shape (batch, length, output_dim), length <= sequence_length.
  Output: tensor of the same shape; an incoming padding mask is passed through.
  """

  def __init__(self, sequence_length, output_dim, **kwargs):
    super().__init__(**kwargs)
    self.sequence_length = sequence_length
    self.output_dim = output_dim
    # Forward the padding mask created by Embedding(mask_zero=True) unchanged.
    self.supports_masking = True

  def build(self, input_shape):
    self.position_table = self.add_weight(
      name="position_table",
      shape=(self.sequence_length, self.output_dim),
      initializer="random_uniform",
      trainable=True,
    )
    super().build(input_shape)

  def call(self, inputs):
    # Take as many rows as there are time steps and broadcast over the batch.
    length = tf.shape(inputs)[1]
    return inputs + self.position_table[:length]

  def get_config(self):
    config = super().get_config()
    config.update({
      "sequence_length": self.sequence_length,
      "output_dim": self.output_dim,
    })
    return config


# Register the layer by name so keras.models.load_model can rebuild saved
# models that contain it. Plain dict assignment is safe when the cell is re-run.
keras.utils.get_custom_objects()["AddPositionEmbedding"] = AddPositionEmbedding


encoder_inputs = layers.Input(shape=(max_encoder_len,), name="encoder_inputs")
decoder_inputs = layers.Input(shape=(max_decoder_len-1,),name="decoder_inputs")

# Embedding: every token ID becomes a vector with embed_dim entries.
# Neural networks only work on numbers, and the learned vectors can place
# related tokens (e.g. "blue" and "red") close to each other.
# mask_zero=True marks ID 0 (padding) so later layers can ignore it.
encoder_embedding = layers.Embedding(input_dim=input_vocab_size, output_dim=embed_dim, mask_zero=True,name="encoder_embedding")(encoder_inputs)
decoder_embedding = layers.Embedding(input_dim=output_vocab_size,output_dim=embed_dim, mask_zero=True, name="decoder_embedding")(decoder_inputs)

# Positional encoding: attention itself has no notion of order, so a learned
# vector per position is added to the token embedding.
encoder_pos = AddPositionEmbedding(
  encoder_seq_len, embed_dim, name="encoder_position_embedding"
)(encoder_embedding)

decoder_pos = AddPositionEmbedding(
  decoder_seq_len, embed_dim, name="decoder_position_embedding"
)(decoder_embedding)

# ---------------------------------------------------------------- Encoder
# Each block: self-attention -> add & norm -> feed-forward -> add & norm.
# Residual sums use layers.Add (a layer that supports masking) instead of the
# `+` operator so the padding mask is carried along explicitly.
encoder_output = encoder_pos
for i in range(num_layers):
  encoder_selfattention = layers.MultiHeadAttention(
    num_heads = num_heads,
    key_dim= embed_dim // num_heads,
    dropout=0.1,
    name=f"encoder_self_attention_{i}"
  )(
    query=encoder_output,
    value=encoder_output,
    key=encoder_output,
  )

  x0 = layers.Add(name=f"encoder_attention_residual_{i}")([encoder_output, encoder_selfattention])
  x0 = layers.LayerNormalization(epsilon=1e-6, name=f"encoder_attention_norm_{i}")(x0)

  ffn_encoder = keras.Sequential([
    layers.Dense(ff_dim,activation="relu"),
    layers.Dense(embed_dim),
  ], name=f"encoder_feed_forward_{i}")

  ffn_encoder_output = ffn_encoder(x0)

  encoder_output = layers.Add(name=f"encoder_ffn_residual_{i}")([ffn_encoder_output, x0])
  encoder_output = layers.LayerNormalization(epsilon=1e-6, name=f"encoder_ffn_norm_{i}")(encoder_output)

# Encoder self-attention with residual connections lets every prompt token
# take the context of all other prompt tokens into account.

# ---------------------------------------------------------------- Decoder
decoder_outputs = decoder_pos

for i in range(num_layers):
  # Masked self-attention: query = current token, key/value = all tokens of
  # the sequence. use_causal_mask=True hides every later position so the
  # model cannot look into the future while training or predicting.
  self_attention = layers.MultiHeadAttention(
    num_heads = num_heads,
    key_dim= embed_dim // num_heads,
    dropout=0.1,
    name=f"masked_self_attention_{i}"
  )(
    query=decoder_outputs,
    value=decoder_outputs,
    key=decoder_outputs,
    use_causal_mask=True
  )

  # Residual connection followed by normalisation, which stabilises training.
  x1 = layers.Add(name=f"decoder_self_attention_residual_{i}")([decoder_outputs, self_attention])
  x1 = layers.LayerNormalization(epsilon=1e-6, name=f"decoder_self_attention_norm_{i}")(x1)

  # Encoder-decoder attention: every decoder position (query) can look at the
  # complete encoded prompt (key/value).
  encoder_decoder_attention = layers.MultiHeadAttention(
    num_heads= num_heads,
    key_dim=embed_dim // num_heads,
    dropout=0.1,
    name=f"encoder_decoder_attention_{i}"
  )(
    query=x1,
    value=encoder_output,
    key=encoder_output,
  )

  # x2 now combines the token meaning, its position, the context of the
  # previously generated tokens and the context of the prompt.
  x2 = layers.Add(name=f"decoder_cross_attention_residual_{i}")([x1, encoder_decoder_attention])
  x2 = layers.LayerNormalization(epsilon=1e-6, name=f"decoder_cross_attention_norm_{i}")(x2)

  # Position-wise feed-forward network: applied to every token separately to
  # learn non-linear combinations of its features.
  ffn = tf.keras.Sequential([
    layers.Dense(ff_dim,activation="gelu"),
    layers.Dense(embed_dim)
  ], name=f"feed_forward_{i}")

  ffn_output = ffn(x2)

  decoder_outputs = layers.Add(name=f"decoder_ffn_residual_{i}")([x2, ffn_output])
  decoder_outputs = layers.LayerNormalization(epsilon=1e-6, name=f"decoder_ffn_norm_{i}")(decoder_outputs)

# Final projection: a probability distribution over the output vocabulary
# for every decoder position.
decoder_outputs = layers.Dense(
  output_vocab_size,
  activation="softmax",
  name="decoder_output_dense"
)(decoder_outputs)

transformer_model = keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
    name="transformer_decoder_model"
)

transformer_model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

# Stop as soon as the validation loss stops improving and keep the best weights.
earlyStopping = keras.callbacks.EarlyStopping(
  monitor="val_loss",
  patience=1,
  restore_best_weights=True
)

# Halve the learning rate whenever the validation loss plateaus.
reduce_lr = keras.callbacks.ReduceLROnPlateau(
  monitor='val_loss',
  factor=0.5,
  patience=1,
  min_lr=1e-6,
  verbose=1
)

history = transformer_model.fit(
    [encoder_sequences, decoder_input],
    decoder_target,
    batch_size=128,
    epochs=2,
    validation_split=0.15,
    callbacks=[earlyStopping,reduce_lr],
    verbose=1
)

# Scheduled Sampling Training - paste at the bottom of your code




In [None]:
#Inference
encoder_model = keras.Model(encoder_inputs,encoder_output, name="encoder_model")


decoder_inputs_inf = layers.Input(shape=(1,), name="decoder_inputs_inf")
encoder_outputs_input = layers.Input(shape=(max_encoder_len, embed_dim), name="encoder_outputs_input")


decoder_embedding_inf = layers.Embedding(
    input_dim=output_vocab_size,
    output_dim=embed_dim,
    mask_zero=True,
    name="decoder_embedding_inf"
)(decoder_inputs_inf)

decoder_position_inf = layers.Embedding(
    input_dim=max_decoder_len-1,
    output_dim=embed_dim,
    name="decoder_position_embedding_inf"
)(tf.constant([0]))

decoder_pos_inf = decoder_embedding_inf + decoder_position_inf
decoder_outputs_inf = decoder_pos_inf
for i in range(num_layers):
    
    self_attention_inf = layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=embed_dim // num_heads,
        dropout=0.1,
        name=f"masked_self_attention_inf_{i}"
    )(query=decoder_outputs_inf, value=decoder_outputs_inf, key=decoder_outputs_inf, use_causal_mask=True)

    x1_inf = layers.LayerNormalization(epsilon=1e-6)(decoder_outputs_inf + self_attention_inf)

    enc_dec_attention_inf = layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=embed_dim // num_heads,
        dropout=0.1,
        name=f"encoder_decoder_attention_inf_{i}"
    )(query=x1_inf, value=encoder_outputs_input, key=encoder_outputs_input)

    x2_inf = layers.LayerNormalization(epsilon=1e-6)(x1_inf + enc_dec_attention_inf)

    ffn_inf = tf.keras.Sequential([
        layers.Dense(ff_dim, activation="gelu"),
        layers.Dense(embed_dim)
    ], name=f"feed_forward_inf_{i}")

    ffn_output_inf = ffn_inf(x2_inf)
    decoder_outputs_inf = layers.LayerNormalization(epsilon=1e-6)(x2_inf + ffn_output_inf)

decoder_outputs_inf = layers.Dense(
    output_vocab_size,
    activation="softmax",
    name="decoder_output_dense_inf"
)(decoder_outputs_inf)

decoder_model = keras.Model(
    inputs=[decoder_inputs_inf, encoder_outputs_input],
    outputs=decoder_outputs_inf,
    name="decoder_inference_model"
)

encoder_model.save("encoderTransformer.keras")
decoder_model.save("decoderTransformer.keras")

In [None]:
# Test Step

encoder_model = keras.models.load_model("encoderTransformer.keras")
decoder_model = keras.models.load_model("decoderTransformer.keras")

with open("input_tokenizer.pkl", "rb") as f:
    input_tokenizer = pickle.load(f)

with open("output_tokenizer.pkl", "rb") as f:
    output_tokenizer = pickle.load(f)
    
prompt = "Button in Dark Slate with 'Contact' text"

input_seq = input_tokenizer.texts_to_sequences([prompt])
input_seq = pad_sequences(input_seq, maxlen=max_encoder_len, padding='post')

encoder_outputs = encoder_model.predict(input_seq)
output_seq = [output_tokenizer.word_index["<Start>"]]

for _ in range(max_decoder_len):
    decoder_input = np.array([output_seq[-1]]).reshape(1, 1)  
    predictions = decoder_model.predict([decoder_input, encoder_outputs])
    
    next_token = np.argmax(predictions[0, 0, :])  
    output_seq.append(next_token)
    
    if next_token == output_tokenizer.word_index["<End>"]:
        break

decoded_words = [output_tokenizer.index_word.get(i, '') for i in output_seq[1:]]  
decoded_sentence = ' '.join(decoded_words)
print(decoded_sentence)


# Notes

- The encoder-decoder model does not perform very well at inference time, likely because it is trained with teacher forcing
- Made a mistake during training that resulted in a val_accuracy of 91 -> the problem was that padding was not mapped to 0 correctly, so the model did not ignore the padding positions and learned to predict them