# Decoder Only Transformer

- 18 Million trainable Params

In [99]:
# imports
from datasets import load_dataset
import json
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import layers
from keras import models
import numpy as np
from keras import mixed_precision
import re

In [100]:
print(tf.config.list_physical_devices('GPU'))


policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    tf.config.experimental.set_memory_growth(gpus[0], True)
else:
    print("Bruh")

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [101]:
# Dataset preparation
dataset = load_dataset("Clinton/Text-to-sql-v1")
training = dataset["train"]

In [102]:

input_text = []
target_text = []

for sample in training:
  input = "Question: " + sample["instruction"] + "\n" + "Context: " + sample["input"]
  input_text.append(input)
  target_text.append(sample["response"])

print(input_text[0])
print(target_text[0])


Question: Name the home team for carlton away team
Context: CREATE TABLE table_name_77 (
    home_team VARCHAR,
    away_team VARCHAR
)
SELECT home_team FROM table_name_77 WHERE away_team = "carlton"


In [103]:

tokenizer = Tokenizer(num_words= 25000, oov_token="<OOV>", char_level=False,lower=False,filters="",split=" " )

texts = [f"<Start> {inp} <Sep> {sql} <End>" for inp, sql in zip(input_text, target_text)]


tokenizer.fit_on_texts(texts) # train on both at same time because Decoder only model
print(len(tokenizer.word_index))

sequences = tokenizer.texts_to_sequences(texts)
max_seq_len = max(len(seq) for seq in sequences)
print(max_seq_len)
sequences = pad_sequences(sequences,max_seq_len,padding="post") 

vocab_size = 25000



397386
876


In [104]:
# Trainings settings
embed_dim = 256
num_heads = 8 
ff_dim = 1024
num_layers = 4

inputs = layers.Input(shape=(max_seq_len-1))
embedding_layer = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim,mask_zero=True, name="Embedding_layer")(inputs)
positional_encoding = layers.Embedding(input_dim=max_seq_len-1,output_dim=embed_dim, name="postional_encoding_layer")(tf.range(start=0, limit=max_seq_len-1,delta=1))

x = embedding_layer + positional_encoding

for i in range(num_layers):
  selfAttention = layers.MultiHeadAttention(num_heads=num_heads,key_dim=embed_dim//num_heads,dropout=0.1,name=f"self_attemtion_{i}")(query=x,value=x,key=x, use_causal_mask=True)

  x1 = layers.LayerNormalization(epsilon=1e-6)(selfAttention + x)

  ffn = keras.Sequential([
    layers.Dense(ff_dim,activation="gelu"),
    layers.Dropout(0.1),
    layers.Dense(embed_dim)
  ], name=f"feed_forward_{i}")

  ffn_output = ffn(x1)

  x = layers.LayerNormalization(epsilon=1e-6)(ffn_output + x1)

outputs = layers.Dense(
  vocab_size,
  activation="softmax",
  name="decoder_output_dense"
)(x)

model = keras.Model(inputs=inputs, outputs=outputs, name="decoder_only_transformer")

initial_learning_rate = 1e-5
lr_schedule = keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=1000,
    alpha=0.1
)

model.compile(
    optimizer="adam",
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy']
)

model.summary()

callbacks = [
    keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=5,  
        restore_best_weights=True,
        verbose=1
    ),
    
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        min_lr=1e-7,
        verbose=1
    ),
    
    keras.callbacks.ModelCheckpoint(
        filepath='TextToSQL.keras',
        monitor='val_loss',
        save_best_only=True,
        verbose=1
    ),
    
    keras.callbacks.LambdaCallback(
        on_batch_end=lambda batch, logs: tf.clip_by_global_norm([v for v in model.trainable_variables], 1.0)
    )
]

X = sequences[:, :-1]  
y = sequences[:, 1:]   


batch_size = 4
validation_split = 0.2 

history = model.fit(
    X,
    y,
    batch_size=batch_size,
    epochs=1,
    validation_split=validation_split,
    callbacks=callbacks,
    verbose=1,
    shuffle=True
)

Model: "decoder_only_transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_13 (InputLayer)          [(None, 875)]        0           []                               
                                                                                                  
 Embedding_layer (Embedding)    (None, 875, 256)     6400000     ['input_13[0][0]']               
                                                                                                  
 tf.__operators__.add_99 (TFOpL  (None, 875, 256)    0           ['Embedding_layer[0][0]']        
 ambda)                                                                                           
                                                                                                  
 self_attemtion_0 (MultiHeadAtt  (None, 875, 256)    263168      ['tf.__ope

KeyboardInterrupt: 

In [None]:
# Inference Step
textToSQL = keras.models.load_model('TextToSQL.keras')
def generateTextToSQL(prompt):
    prompt = "<Start> " + prompt + "<Sep>"
    
    input_sequence = tokenizer.texts_to_sequences([prompt])[0]
    current_sequence = pad_sequences([input_sequence], maxlen=max_seq_len-1, padding='post')
    
    generated_tokens = []
    for _ in range(100):  
        predictions = textToSQL.predict(current_sequence, verbose=0)
        next_token = np.argmax(predictions[0, len(input_sequence) + len(generated_tokens) - 1])
        
        if next_token == tokenizer.word_index.get("<End<", 0) or next_token == 0:
            break
            
        generated_tokens.append(next_token)
        
        new_sequence = input_sequence + generated_tokens
        current_sequence = pad_sequences([new_sequence], maxlen=max_seq_len-1, padding='post')
    
    result = [tokenizer.index_word.get(token, "") for token in generated_tokens]
    return " ".join(result)

test_prompt = """
 Question: How many heads of the departments are older than 56 ? 
 Table department: Department_ID number, Name text, Creation text, Ranking number, Budget_in_Billions number, Num_Employees number | Table head: head_ID number, name text, born_state text, age number | Table management: department_ID number, head_ID number, temporary_acting text
 """
generated = generateTextToSQL(test_prompt)
print("Generated SQL Code:", generated)