In [None]:
# Imports
import tensorflow as tf
import keras 
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import os

 Goal:

- Build an Encoder-Decoder model with an attention LSTM (the little brother of the Transformer)

Roadmap:

- Tokenizer: takes the input text and converts it into token IDs, e.g. [152323, 11]
- Embedding Layer: takes the token IDs and maps them to vectors

In [None]:
# Dataset Preparation
# Reads paired prompt/code text files into two parallel lists:
# input_texts[i] is the prompt and output_texts[i] is the code expected for it.
input_texts = []
output_texts = []

# Dataset paths
dataset_path = "/Dataset"
code_path = os.path.join(dataset_path, "Code")
prompt_path = os.path.join(dataset_path, "prompts")

# Both listings are sorted the same way so that files with matching names
# (0.txt, 1.txt, 2.txt, ...) end up at the same position and are paired by zip().
prompt_files = sorted(os.listdir(prompt_path))
code_files = sorted(os.listdir(code_path))

# zip() silently stops at the shorter listing, which would hide missing files
# and could pair a prompt with the wrong code file.
if len(prompt_files) != len(code_files):
    raise ValueError(
        f"Found {len(prompt_files)} prompt files in {prompt_path!r} but "
        f"{len(code_files)} code files in {code_path!r}; they must match one-to-one."
    )

for p_file, c_file in zip(prompt_files, code_files):
    # Open each file and read its content
    with open(os.path.join(prompt_path, p_file), "r", encoding="utf-8") as f:
        prompt_text = f.read().strip()
    with open(os.path.join(code_path, c_file), "r", encoding="utf-8") as f:
        code_text = f.read().strip()

    # Append the pair inside the loop so every file is kept, not just the last one
    input_texts.append(prompt_text)
    output_texts.append(code_text)

# Fail here with a clear message instead of later with an obscure error
# when the tokenizer cell tries to work on an empty dataset.
if not input_texts:
    raise ValueError(f"No prompt/code pairs were found under {dataset_path!r}.")




In [None]:
# Tokenizer
# Turns the raw prompt/code strings into padded integer arrays and splits the
# decoder data into teacher-forcing inputs and targets.

if not input_texts or not output_texts:
    # Without this guard the max() calls below fail with
    # "max() arg is an empty sequence", which hides the real cause.
    raise ValueError("input_texts / output_texts are empty - run the dataset cell first and check the dataset path.")

# The Tokenizer converts words to token IDs.
# There are two tokenizers: one for the prompt (e.g. 'Button, blue-500, round, "Click me"')
# and one for the model's answer (e.g. '<Button classname="" ... >').
# num_words is the number of most frequent words the tokenizer keeps.
# NOTE(review): the default Tokenizer settings lowercase the text and strip
# punctuation such as < > = " / . That means "<Start>" / "<End>" are stored as
# "start" / "end", and most code punctuation is dropped from the output vocabulary.
# Consider passing filters="" (and lower=False) to output_tokenizer if the
# generated code must keep its punctuation.
input_tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
output_tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")

# Wrap every target with start/end markers once and reuse the list below.
# The space before "<End>" keeps the marker a separate token even when
# punctuation filtering is turned off.
decoder_texts = ["<Start> " + t + " <End>" for t in output_texts]

# Train the tokenizers on the vocabulary (this builds the word -> ID tables)
input_tokenizer.fit_on_texts(texts=input_texts)
output_tokenizer.fit_on_texts(decoder_texts)

# texts_to_sequences converts each word (token) to its corresponding ID
encoder_sequences = input_tokenizer.texts_to_sequences(input_texts)
decoder_sequences = output_tokenizer.texts_to_sequences(decoder_texts)

# Longest sequence on each side; everything shorter is padded up to this length
max_encoder_len = max(len(seq) for seq in encoder_sequences)
max_decoder_len = max(len(seq) for seq in decoder_sequences)

# Padding: zeros are appended ('post') until every sequence has the same length,
# so the sequences can be fed to the model as one array
encoder_sequences = pad_sequences(encoder_sequences, maxlen=max_encoder_len, padding='post')
decoder_sequences = pad_sequences(decoder_sequences, maxlen=max_decoder_len, padding='post')

# Teacher forcing:
# decoder_input is what the decoder is fed during training (<Start> + tokens),
# decoder_target is what it should predict at each step (tokens + <End>),
# i.e. the same sequence shifted one position to the left.
decoder_input = decoder_sequences[:, :-1]
decoder_target = decoder_sequences[:, 1:]


ValueError: max() arg is an empty sequence

In [None]:
# LSTM model no attention
# Encoder-decoder (seq2seq) model: the encoder LSTM summarizes the prompt into
# its final states, and the decoder LSTM starts from those states to predict
# the output tokens.

# embed_dim is the size of the word embeddings
# lstm_units is the number of hidden units in each LSTM
embed_dim = 128
lstm_units = 256

# Vocabulary sizes; +1 because index 0 is used for padding
input_vocab_size = len(input_tokenizer.word_index) + 1
output_vocab_size = len(output_tokenizer.word_index) + 1


# --- Encoder ---
# Input is a variable-length sequence of token IDs
encoder_inputs = keras.Input(shape=(None,))

# The Embedding layer transforms token IDs to vectors.
# mask_zero=True marks the padding ID 0 as "skip", so the LSTM's final states
# come from the last real token instead of from the trailing padding.
x = layers.Embedding(input_vocab_size, embed_dim, mask_zero=True)(encoder_inputs)

# The LSTM reads the sequence step by step; with return_state=True it also
# returns its final hidden and cell state, which carry the prompt information
# to the decoder.
encoder_outputs, state_h, state_c = layers.LSTM(lstm_units, return_state=True)(x)
encoder_states = [state_h, state_c]


# --- Decoder ---
# Input is the target token sequence shifted by one (teacher forcing)
decoder_inputs = keras.Input(shape=(None,))

# Embedding again maps token IDs to vectors; padding is masked here as well
y = layers.Embedding(output_vocab_size, embed_dim, mask_zero=True)(decoder_inputs)

# The decoder LSTM starts from the encoder's final states and returns one
# output per timestep (return_sequences=True)
decoder_lstm = layers.LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(y, initial_state=encoder_states)

# The Dense softmax layer turns every timestep into a probability
# distribution over the output vocabulary
decoder_dense = layers.Dense(output_vocab_size, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)


model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)




In [None]:
# Model Training and Compiling

# Training hyperparameters
batch_size = 32
epochs = 30

# Compile with Adam as the optimizer and track accuracy during training.
# The targets are integer token IDs, hence the sparse cross-entropy loss.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Train on (encoder input, decoder input) -> decoder target,
# holding out 10% of the samples for validation.
model.fit(
    x=[encoder_sequences, decoder_input],
    y=decoder_target,
    validation_split=0.1,
    epochs=epochs,
    batch_size=batch_size,
)

Summary:

- Encoder processes the input sequence and creates states

- Decoder starts processing with the initial states from the Encoder
 -> outputs predictions for each target token

