# Decoder-Only Transformer

- 18 million trainable parameters

In [None]:
# imports
from datasets import load_dataset
import json
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import layers
from keras import models
import numpy as np
from keras import mixed_precision
import re

In [None]:
# List the GPUs TensorFlow can see (queried once and reused below).
gpus = tf.config.list_physical_devices('GPU')
print(gpus)

if gpus:
    try:
        # Memory growth has to be configured identically on every visible GPU,
        # and before any of them has been initialised.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Raised when the cell is re-run after the GPUs were already
        # initialised; the earlier setting stays in effect.
        print(f"Could not change GPU memory growth: {err}")
else:
    print("Bruh")

# Compute in float16 while keeping variables in float32.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

In [None]:
# Dataset preparation: load the "spider" dataset and keep a handle on its
# training split, which the later cells iterate over.
dataset = load_dataset(path="spider")
training = dataset["train"]

In [None]:
# Schemas
#
# Read the database descriptions from tables.json and index them by db_id so
# each training example can look up the schema of the database it refers to.

# Explicit UTF-8 so parsing does not depend on the platform's default encoding.
with open("tables.json", "r", encoding="utf-8") as f:
    tables_json = json.load(f)

# db_id -> the subset of schema fields used when building prompts.
schemas = {
    db["db_id"]: {
        "tables": db["table_names_original"],
        "columns": db["column_names_original"],
        "column_types": db["column_types"],
        "foreign_keys": db["foreign_keys"],
        "primary_keys": db["primary_keys"],
    }
    for db in tables_json
}



In [None]:
def formatDatabaseSchema(schema):
    """Render a database schema as human-readable text.

    Args:
        schema: Mapping with the keys ``"tables"`` (list of table names),
            ``"columns"`` (list of ``[table_index, column_name]`` entries) and
            ``"column_types"`` (list of type names, parallel to ``"columns"``).

    Returns:
        A newline-joined string with one ``Table: <name>`` line per table,
        each followed by one ``  - <column> (<type>)`` line per column that
        belongs to it. Columns whose table index matches no table are skipped.
    """
    tables = schema["tables"]
    columns = schema["columns"]
    col_types = schema["column_types"]

    # Group the column lines by owning table in a single pass instead of
    # rescanning the full column list once per table.
    lines_per_table = [[] for _ in tables]
    for col_idx, column in enumerate(columns):
        table_idx = column[0]
        if isinstance(table_idx, int) and 0 <= table_idx < len(tables):
            lines_per_table[table_idx].append(
                f"  - {column[1]} ({col_types[col_idx]})"
            )

    schema_lines = []
    for table, column_lines in zip(tables, lines_per_table):
        schema_lines.append(f"Table: {table}")
        schema_lines.extend(column_lines)
    return "\n".join(schema_lines)
  

In [None]:
# Build the model inputs (question + schema text) and targets (SQL query),
# one pair per training example.
input_text = []
target_text = []

# Many examples refer to the same database, so each schema is formatted once
# and reused instead of being re-rendered for every example.
schema_text_by_db = {}

for example in training:
    db_id = example["db_id"]
    schemaText = schema_text_by_db.get(db_id)
    if schemaText is None:
        schemaText = formatDatabaseSchema(schemas[db_id])
        schema_text_by_db[db_id] = schemaText

    question = example["question"]
    sqlQuery = example["query"]

    input_text.append(f"Question: {question} \n {schemaText}")
    target_text.append(sqlQuery)

# Show one formatted prompt as a sanity check (skipped for an empty split).
if input_text:
    print(input_text[0])


In [None]:
# Tokenization
# Token pattern for SQL text. Alternatives are tried left to right, so quoted
# literals are matched before their contents can be split up.
_SQL_TOKEN_RE = re.compile(
    r"'(?:[^']|'')*'"            # single-quoted literal ('' escapes a quote)
    r'|"(?:[^"]|"")*"'           # double-quoted literal / identifier
    r"|\d+(?:\.\d+)?"            # integer or decimal number
    r"|[A-Za-z_][A-Za-z0-9_]*"   # keyword or identifier
    r"|[><=!]+"                  # comparison operators (>=, !=, <>, ...)
    r"|[(),;*.+\-/%]"            # punctuation, qualifier dot, arithmetic
)


def sqlTokenization(query):
    """Split a SQL query string into a list of token strings.

    Quoted literals, decimal numbers, the qualifier dot (``T1.name``) and
    arithmetic operators are kept as tokens so that the query can be rebuilt
    from its tokens. Whitespace and any other characters are dropped.

    Args:
        query: SQL text.

    Returns:
        The tokens in the order they appear; an empty list for empty input.
    """
    return _SQL_TOKEN_RE.findall(query)

# Space-separated SQL tokens so the word-level tokenizer sees one token each.
sql_tokenized_seq = [" ".join(sqlTokenization(q)) for q in target_text]

# Newlines/tabs are treated as separators; otherwise they stay glued to the
# neighbouring word ("singer\n" vs "singer") and inflate the vocabulary.
# "<" and ">" are deliberately not filtered so the special markers survive.
tokenizer = Tokenizer(
    num_words=10000,
    oov_token="<OOV>",
    char_level=False,
    lower=False,
    filters="\n\t",
)

# One training string per example: prompt, separator, target SQL.
texts = [
    f"<Start> {inp} <Sep> {sql} <End>"
    for inp, sql in zip(input_text, sql_tokenized_seq)
]

tokenizer.fit_on_texts(texts) # train on both at same time because Decoder only model
print(len(tokenizer.word_index))

# Encode the joined prompt+SQL strings. Encoding `input_text + sql_tokenized_seq`
# would concatenate the two lists and yield prompts and queries as separate,
# unrelated sequences without the <Start>/<Sep>/<End> markers.
sequences = tokenizer.texts_to_sequences(texts)
max_seq_len = max(len(seq) for seq in sequences)
print(max_seq_len)
sequences = pad_sequences(sequences, maxlen=max_seq_len, padding="post", truncating="post")

# Index 0 is reserved for padding. The tokenizer maps every word ranked at or
# beyond `num_words` to the OOV index, so emitted ids never reach `num_words`.
vocab_size = min(len(tokenizer.word_index) + 1, tokenizer.num_words)


In [None]:
# Training settings
embed_dim = 256
num_heads = 8
ff_dim = 1024
num_layers = 4


class LearnedPositionalEmbedding(layers.Layer):
    """Add a trainable position embedding to a batch of token embeddings.

    The position ids are derived from the layer's input inside ``call``, so the
    position table is a tracked, trainable weight of any model using the layer.

    NOTE: this is a custom layer; loading a saved model that contains it needs
    ``custom_objects={"LearnedPositionalEmbedding": LearnedPositionalEmbedding}``.

    Args:
        max_len: Number of distinct positions in the embedding table.
        embed_dim: Size of each position vector; must match the token
            embedding size because the two are summed.
    """

    def __init__(self, max_len, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.max_len = max_len
        self.embed_dim = embed_dim
        # Pass the incoming padding mask through unchanged.
        self.supports_masking = True
        self.position_embedding = layers.Embedding(input_dim=max_len, output_dim=embed_dim)

    def call(self, token_embeddings):
        seq_len = tf.shape(token_embeddings)[1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        # (seq, dim) position vectors broadcast over the batch axis.
        return token_embeddings + self.position_embedding(positions)

    def get_config(self):
        config = super().get_config()
        config.update({"max_len": self.max_len, "embed_dim": self.embed_dim})
        return config


# `shape` must be a tuple; the model consumes all tokens but the last one.
inputs = layers.Input(shape=(max_seq_len - 1,), dtype="int32")
embedding_layer = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim,mask_zero=True, name="Embedding_layer")(inputs)
positional_encoding = LearnedPositionalEmbedding(max_len=max_seq_len, embed_dim=embed_dim, name="postional_encoding_layer")

x = positional_encoding(embedding_layer)

# Stack of post-norm decoder blocks: causal self-attention + feed-forward,
# each followed by a residual connection and layer normalisation.
for i in range(num_layers):
  selfAttention = layers.MultiHeadAttention(num_heads=num_heads,key_dim=embed_dim//num_heads,dropout=0.1,name=f"self_attemtion_{i}")(query=x,value=x,key=x, use_causal_mask=True)

  x1 = layers.LayerNormalization(epsilon=1e-6)(selfAttention + x)

  ffn = keras.Sequential([
    layers.Dense(ff_dim,activation="gelu"),
    layers.Dropout(0.1),
    layers.Dense(embed_dim)
  ], name=f"feed_forward_{i}")

  ffn_output = ffn(x1)

  x = layers.LayerNormalization(epsilon=1e-6)(ffn_output + x1)

# Per-position distribution over the vocabulary. The layer is pinned to
# float32 so the softmax stays numerically stable under a float16 policy.
outputs = layers.Dense(
  vocab_size,
  activation="softmax",
  dtype="float32",
  name="decoder_output_dense"
)(x)

model = keras.Model(inputs=inputs, outputs=outputs, name="decoder_only_transformer")

initial_learning_rate = 1e-4

# NOTE(review): this schedule is kept for reference but is not passed to the
# optimizer. ReduceLROnPlateau below adjusts the optimizer's learning rate at
# run time, which is expected to fail when the learning rate is driven by a
# schedule object - confirm before switching to it, and drop the callback then.
lr_schedule = keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=1000,
    alpha=0.1
)

# Use the configured starting learning rate (the "adam" string shortcut would
# silently fall back to the optimizer default) and clip the global gradient
# norm inside the optimizer, where the gradients are actually available.
optimizer = keras.optimizers.Adam(
    learning_rate=initial_learning_rate,
    global_clipnorm=1.0
)

model.compile(
    optimizer=optimizer,
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy']
)

model.summary()

callbacks = [
    # Stop when validation loss stops improving and roll back to the best weights.
    keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=5,
        restore_best_weights=True,
        verbose=1
    ),

    # Halve the learning rate after 3 epochs without validation improvement.
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        min_lr=1e-7,
        verbose=1
    ),

    # Keep only the checkpoint with the lowest validation loss.
    keras.callbacks.ModelCheckpoint(
        filepath='TextToSQL.keras',
        monitor='val_loss',
        save_best_only=True,
        verbose=1
    ),
]

# Next-token prediction: the model reads every token except the last and is
# trained to emit the same sequence shifted one position to the left.
X, y = sequences[:, :-1], sequences[:, 1:]

batch_size = 4
validation_split = 0.2

# Run training; `history` holds the per-epoch metrics returned by fit().
history = model.fit(
    x=X,
    y=y,
    epochs=1,
    batch_size=batch_size,
    validation_split=validation_split,
    shuffle=True,
    callbacks=callbacks,
    verbose=1,
)



In [None]:
# Inference Step