In [21]:
# Imports
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.utils import pad_sequences
from keras_preprocessing.text import Tokenizer
import os
import numpy as np
import pickle
print(tf.config.list_physical_devices('GPU'))

# Run `conda activate tf-gpu` first to activate the GPU environment

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [22]:
# Dataset Preparation
#
# Reads the prompts and the matching code snippets, one example per line.
# Line i of prompts.txt is paired with line i of code.txt further down
# (the tokenizer cell zips the two lists together).

# An explicit encoding keeps the decoded text independent of the platform's
# default locale encoding.
with open("Button_Dataset_2_5m/prompts.txt", encoding="utf-8") as f:
    input_texts = f.read().splitlines()

with open("Button_Dataset_2_5m/code.txt", encoding="utf-8") as f:
    output_texts = f.read().splitlines()

# zip() would silently drop the surplus lines of the longer file, and a length
# mismatch usually means the pairs are misaligned, so fail loudly instead.
if len(input_texts) != len(output_texts):
    raise ValueError(
        f"prompts.txt has {len(input_texts)} lines but code.txt has "
        f"{len(output_texts)}; the two files must be line-aligned"
    )


In [23]:
# Tokenizer
#
# A single word-level tokenizer is shared by prompt and code: every training
# example is one string of the form "<Start> prompt <SEP> code <End>".
# Text is split on single spaces only; case is kept and no characters are
# filtered out. num_words caps how many of the most frequent words keep their
# own ID; "<OOV>" is the configured out-of-vocabulary token.
tokenizer = Tokenizer(
    num_words=20000,
    oov_token="<OOV>",
    filters="",
    lower=False,
    split=" ",
    char_level=False,
)

combined_texts = [
    f"<Start> {prompt_text} <SEP> {code_text} <End>"
    for prompt_text, code_text in zip(input_texts, output_texts)
]

# Learn the vocabulary (including the <Start>/<SEP>/<End> markers).
tokenizer.fit_on_texts(combined_texts)

# Put "<PAD>" at ID 0 and move every learned ID up by one, then rebuild the
# reverse lookup so both directions stay in sync.
shifted_index = {"<PAD>": 0}
shifted_index.update(
    (word, word_id + 1) for word, word_id in tokenizer.word_index.items()
)
tokenizer.word_index = shifted_index
tokenizer.index_word = {word_id: word for word, word_id in shifted_index.items()}

# Persist the tokenizer (with the shifted IDs) so it can be reused later.
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

# Convert every combined text into its list of token IDs.
sequences = tokenizer.texts_to_sequences(combined_texts)

# Length of the longest example; every sequence is padded to this length.
max_seq_len = max(map(len, sequences))

# Pad at the end with zeros (the "<PAD>" ID) so all rows have equal length.
sequences = pad_sequences(
    sequences,
    maxlen=max_seq_len,
    padding='post',
    truncating="post",
)
print(max_seq_len)

# One more than the number of word_index entries ("<PAD>" included).
vocab_size = len(tokenizer.word_index) + 1




33


In [24]:
# Decoder-only transformer: token + position embeddings, followed by
# `num_layers` blocks of causal self-attention and a feed-forward network,
# and a softmax over the vocabulary at every position.
embed_dim = 128   # width of the embeddings and of every block's output
num_heads = 4     # attention heads per block
ff_dim = 512      # hidden width of the feed-forward network
num_layers = 4    # number of stacked blocks

# Logged result of an earlier run, kept for reference:
#3125/3125 [==============================] - 459s 147ms/step - loss: 0.7670 - accuracy: 0.8050 - val_loss: 0.8075 - val_accuracy: 0.7951 - lr: 0.0010


# The model sees every sequence without its last token (see X = sequences[:, :-1]
# in the training cell), hence max_seq_len - 1 positions.
# `shape` must be a tuple: `(n)` is just the int n, `(n,)` is a 1-tuple.
inputs = layers.Input(shape=(max_seq_len - 1,))

embeddinglayer = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim,mask_zero=True,name="embedding_layer")(inputs)

# One embedding vector per position 0 .. max_seq_len-2, added to every sequence.
# NOTE(review): this layer is called on a concrete tf.range tensor rather than on
# a symbolic Keras tensor, and the printed model summary lists no
# "positional_encoding_layer" -- its weights may therefore not be part of the
# model and not be trained. Confirm before relying on learned positions.
positional_encoding = layers.Embedding(input_dim=max_seq_len-1,output_dim=embed_dim,name="positional_encoding_layer")(tf.range(start=0, limit=max_seq_len-1,delta=1))

x = embeddinglayer + positional_encoding

# Only Self attention no encoder decoder attention
for i in range(num_layers):
  # Causal mask: each position may only attend to itself and earlier positions.
  self_attention = layers.MultiHeadAttention(
  num_heads = num_heads,
  key_dim= embed_dim // num_heads,
  dropout=0.1,
  name=f"masked_self_attention_{i}"
  )(
    query=x,
    value=x,
    key=x,
    use_causal_mask=True
  )

  # Residual connection around the attention, then layer normalisation.
  x1 = layers.LayerNormalization(epsilon=1e-6)(x + self_attention)

  # Position-wise feed-forward network projecting back to embed_dim.
  ffn = tf.keras.Sequential([
    layers.Dense(ff_dim,activation="gelu"),
    layers.Dropout(0.1),
    layers.Dense(embed_dim)
  ], name=f"feed_forward_{i}")

  ffn_output = ffn(x1)

  # Second residual connection + normalisation; output feeds the next block.
  x = layers.LayerNormalization(epsilon=1e-6)(ffn_output + x1)

# Probability distribution over the vocabulary for every position
# (softmax here, so the loss must be used with from_logits=False).
outputs = layers.Dense(
  vocab_size,
  activation="softmax",
  name="decoder_output_dense"
)(x)


In [25]:
# Build, compile and train the decoder-only model.
model = keras.Model(inputs=inputs, outputs=outputs, name="decoder_only_transformer")

# NOTE(review): this cosine schedule is defined but deliberately not passed to
# the optimizer. The ReduceLROnPlateau callback below adjusts the optimizer's
# learning rate itself, which presumably conflicts with a schedule object --
# confirm before wiring both in together.
initial_learning_rate = 1e-4
lr_schedule = keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=1000,
    alpha=0.1
)


model.compile(
    # Same learning rate as the previous optimizer="adam" (the logged run shows
    # lr: 0.0010), plus real gradient clipping: global_clipnorm rescales the
    # gradients of all weights so that their global norm is at most 1.0.
    optimizer=keras.optimizers.Adam(learning_rate=1e-3, global_clipnorm=1.0),
    # The output layer already applies softmax, hence from_logits=False.
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy']
)

model.summary()

# Both the best checkpoint and the final model are written below Models/;
# make sure the directory exists before training starts.
os.makedirs("Models", exist_ok=True)

callbacks = [
    # Stop when val_loss has not improved for 5 epochs and roll back to the best weights.
    keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=5,
        restore_best_weights=True,
        verbose=1
    ),

    # Halve the learning rate after 3 epochs without val_loss improvement.
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        min_lr=1e-7,
        verbose=1
    ),

    # Keep only the best model so far, at the path the test cell loads from.
    keras.callbacks.ModelCheckpoint(
        filepath='Models/best_decoder_model1.keras',
        monitor='val_loss',
        save_best_only=True,
        verbose=1
    ),

    # The former LambdaCallback called tf.clip_by_global_norm on the model's
    # weights (not its gradients) and discarded the result, so it clipped
    # nothing; gradient clipping is now done by the optimizer (see compile).
]


# Next-token prediction: the input is the sequence without its last token,
# the target is the same sequence shifted left by one.
X = sequences[:, :-1]
y = sequences[:, 1:]


batch_size = 256
validation_split = 0.2

history = model.fit(
    X,
    y,
    batch_size=batch_size,
    epochs=20,
    validation_split=validation_split,
    callbacks=callbacks,
    verbose=1,
    shuffle=True
)

model.save("Models/Decoder_Transforer.keras")


Model: "decoder_only_transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 32)]         0           []                               
                                                                                                  
 embedding_layer (Embedding)    (None, 32, 128)      2617856     ['input_3[0][0]']                
                                                                                                  
 tf.__operators__.add_18 (TFOpL  (None, 32, 128)     0           ['embedding_layer[0][0]']        
 ambda)                                                                                           
                                                                                                  
 masked_self_attention_0 (Multi  (None, 32, 128)     66048       ['tf.__ope

KeyboardInterrupt: 

In [None]:
# Test Model
#
# Load the model stored at Models/best_decoder_model1.keras.
# generate_button_code reads it through the module-level name
# `decoder_transformer`.

decoder_transformer = keras.models.load_model('Models/best_decoder_model1.keras')
def generate_button_code(prompt, max_new_tokens=50):
    """Greedily decode button code for ``prompt`` with the loaded decoder model.

    The prompt is wrapped as ``"<Start> " + prompt + " <SEP>"`` and tokenized
    with the module-level ``tokenizer``. At every step the padded sequence is
    fed to ``decoder_transformer`` and the highest-scoring token at the last
    filled position is appended. Decoding stops at the ``<End>`` token, at the
    padding ID 0, after ``max_new_tokens`` tokens, or when the model's fixed
    input length (``max_seq_len - 1``) is used up.

    Args:
        prompt: Natural-language description of the button.
        max_new_tokens: Upper bound on the number of generated tokens
            (default 50, the previously hard-coded value).

    Returns:
        The generated tokens joined by single spaces (without ``<End>``).

    Raises:
        ValueError: If the wrapped prompt has more tokens than the model's
            input length, so there is no position left to predict from.
    """
    model_input_len = max_seq_len - 1
    end_token = tokenizer.word_index.get("<End>", 0)

    input_sequence = tokenizer.texts_to_sequences(["<Start> " + prompt + " <SEP>"])[0]
    if not input_sequence:
        # Nothing to condition on; indexing position -1 would read the
        # prediction of the last padded slot instead.
        return ""
    if len(input_sequence) > model_input_len:
        raise ValueError(
            f"prompt has {len(input_sequence)} tokens but the model accepts "
            f"at most {model_input_len}"
        )

    generated_tokens = []
    while len(generated_tokens) < max_new_tokens:
        current_tokens = input_sequence + generated_tokens
        # Stop once every input position is filled. Padding an over-long
        # sequence would cut tokens off its front (pad_sequences truncates
        # 'pre' by default) and the position index below would run past the
        # model output.
        if len(current_tokens) > model_input_len:
            break

        current_sequence = pad_sequences([current_tokens], maxlen=model_input_len, padding='post')
        predictions = decoder_transformer.predict(current_sequence, verbose=0)

        # The output at the last filled position is the distribution over the
        # next token.
        next_token = int(np.argmax(predictions[0, len(current_tokens) - 1]))

        if next_token == end_token or next_token == 0:
            break

        generated_tokens.append(next_token)

    return " ".join(tokenizer.index_word.get(token, "") for token in generated_tokens)

# Smoke test: generate code for one example prompt and print the result.
test_prompt = "Generate a Blue Button saying Subscribe"
generated = generate_button_code(test_prompt)
print("Generated Button Code:", generated)

Generated Button Code: <Button className="bg-blue-500 py-2 shadow-md">Subscribe</Button>


# Notes

- Mapping the padding token to ID 0 in the tokenizer first was a good fix
- A decoder-only model is very powerful, but if it is too big the results are not good
- Dataset quality is the second most important part

# Further Improvements

- Fine-tuning the model on specific problems
- More data with more variation, especially in the prompt input