# Decoder Only Transformer

In [None]:
# imports
from datasets import load_dataset
import json
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import layers
from keras import models
import numpy as np
import re

In [None]:
# Dataset preparation
# Load the "spider" dataset through the `datasets` loader imported above and
# keep its "train" split; the cells below iterate over `training` to build the
# prompt/SQL pairs.
dataset = load_dataset("spider")
training = dataset["train"]

In [None]:
# Schemas: index the table metadata in tables.json by database id.

# JSON text is UTF-8 by specification; naming the encoding keeps the read from
# depending on the platform's default locale encoding (e.g. cp1252 on Windows).
with open("tables.json", "r", encoding="utf-8") as f:
    tables_json = json.load(f)

# Maps db_id -> the subset of that database's metadata used by the cells below.
schemas = {}
for db in tables_json:
    db_id = db["db_id"]
    schemas[db_id] = {
        "tables": db["table_names_original"],
        "columns": db["column_names_original"],
        "column_types": db["column_types"],
        "foreign_keys": db["foreign_keys"],
        "primary_keys": db["primary_keys"],
    }



In [None]:
def formatDatabaseSchema(schema):
  """Render one database schema as indented plain text.

  Args:
    schema: mapping with the keys "tables" (sequence of table names),
      "columns" (sequence of entries whose item 0 is the owning table's
      index and item 1 is the column name) and "column_types" (sequence
      indexed in parallel with "columns").

  Returns:
    A newline-joined string with one "Table: <name>" line per table, each
    followed by one "  - <column> (<type>)" line per column whose table
    index equals that table's position. Entries whose table index matches
    no table position are left out.
  """
  table_names = schema["tables"]
  column_entries = schema["columns"]
  type_names = schema["column_types"]

  lines = []
  for table_idx, table_name in enumerate(table_names):
    lines.append(f"Table: {table_name}")
    for position, entry in enumerate(column_entries):
      # Skip columns that belong to another table.
      if entry[0] != table_idx:
        continue
      lines.append(f"  - {entry[1]} ({type_names[position]})")
  return "\n".join(lines)
  

In [None]:
# Build one (prompt, SQL) pair per training example: `inputs` holds the
# question followed by the rendered schema, `targets` holds the gold query.
inputs = []
targets = []

# Many examples point at the same database, so render each schema once and
# reuse the string instead of re-formatting it for every example.
schema_text_by_db = {}

for example in training:
  db_id = example["db_id"]
  schemaText = schema_text_by_db.get(db_id)
  if schemaText is None:
    schemaText = formatDatabaseSchema(schemas[db_id])
    schema_text_by_db[db_id] = schemaText
  question = example["question"]
  sqlQuery = example["query"]

  inputs.append(f"Question: {question} \n {schemaText}")
  targets.append(sqlQuery)

# Sanity check: show the first assembled prompt.
print(inputs[0])


In [None]:
# Tokenization
# Compiled once at import time so the pattern is not looked up per query.
_SQL_TOKEN_RE = re.compile(
  r"""
    [A-Za-z_][A-Za-z0-9_]*   # keywords and identifiers
  | \d+(?:\.\d+)?            # integer or decimal literals (3.5 stays one token)
  | [><=!]+                  # comparison operators such as >=, != and <>
  | [(),;*.+\-/%'"`]         # punctuation, arithmetic operators, quote marks
  """,
  re.VERBOSE,
)


def sqlTokenization(query):
  """Split a SQL string into a flat list of token strings.

  Besides identifiers, integers, comparison operators and `( ) , ; *`, the
  pattern also keeps `.`, quote characters and arithmetic operators, and it
  keeps decimals whole. Without these, qualified names (`T1.name`), string
  literals (`'Bob'`) and values like `3.5` cannot be reconstructed from the
  token stream. Whitespace and any other character are skipped.

  Args:
    query: the SQL text to tokenize.

  Returns:
    The matched tokens in their order of appearance; an empty list when
    nothing matches.
  """
  return _SQL_TOKEN_RE.findall(query)

# Re-join the SQL tokens with single spaces so the space-splitting Keras
# tokenizer sees exactly the tokens produced by sqlTokenization.
sql_tokenized_seq = [" ".join(sqlTokenization(q)) for q in targets]

# lower=False keeps identifier casing. filters="\n" turns the newlines in the
# schema text into separators; with an empty filter the tokenizer splits on
# spaces only, which leaves "\n" glued to neighbouring words (for example
# "(text)\nTable:") and inflates the vocabulary with one-off tokens. All other
# punctuation is kept so SQL symbols remain tokens.
tokenizer = Tokenizer(
  num_words=10000,
  oov_token="<OOV>",
  char_level=False,
  lower=False,
  filters="\n",
)

# Prompts and SQL share one vocabulary because a decoder-only model reads and
# writes both.
corpus = inputs + sql_tokenized_seq
tokenizer.fit_on_texts(corpus)
print(len(tokenizer.word_index))

sequences = tokenizer.texts_to_sequences(corpus)
max_seq_len = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, maxlen=max_seq_len, padding="post", truncating="post")

# texts_to_sequences never emits an id >= num_words (rarer words map to the
# OOV id), so the model needs no more than num_words rows. The +1 covers the
# padding id 0 when the vocabulary is smaller than the cap.
vocab_size = min(tokenizer.num_words, len(tokenizer.word_index) + 1)


In [None]:
# Training settings

embed_dim = 384
num_heads = 12
ff_dim = 1536
num_layers = 6


class PositionalEmbedding(layers.Layer):
  """Add a learned position vector to every token embedding.

  The position table lives inside a layer that is called on a symbolic tensor,
  so its weights are tracked by the model and updated during training. Calling
  an Embedding directly on a concrete `tf.range(...)` tensor runs it outside
  the functional graph and leaves those weights out of the model.
  """

  def __init__(self, max_len, embed_dim, **kwargs):
    super().__init__(**kwargs)
    self.max_len = max_len
    self.embed_dim = embed_dim
    # Let the padding mask produced by the token embedding pass through.
    self.supports_masking = True
    self.position_embedding = layers.Embedding(input_dim=max_len, output_dim=embed_dim)

  def call(self, token_embeddings):
    # token_embeddings is (batch, seq_len, embed_dim); positions broadcast
    # over the batch axis.
    seq_len = tf.shape(token_embeddings)[1]
    positions = tf.range(start=0, limit=seq_len, delta=1)
    return token_embeddings + self.position_embedding(positions)

  def get_config(self):
    config = super().get_config()
    config.update({"max_len": self.max_len, "embed_dim": self.embed_dim})
    return config


# NOTE: this rebinds `inputs`, which held the list of prompt strings in the
# data-preparation cell; from here on it is the model's symbolic input.
# The shape must be a tuple, hence the trailing comma.
inputs = layers.Input(shape=(max_seq_len - 1,))
embedding_layer = layers.Embedding(
  input_dim=vocab_size,
  output_dim=embed_dim,
  mask_zero=True,
  name="Embedding_layer",
)(inputs)
positional_encoding = PositionalEmbedding(
  max_len=max_seq_len,
  embed_dim=embed_dim,
  name="postional_encoding_layer",
)

x = positional_encoding(embedding_layer)

# Stack of post-norm decoder blocks: causal self-attention, then a
# position-wise feed-forward network, each wrapped in a residual connection.
for i in range(num_layers):
  selfAttention = layers.MultiHeadAttention(
    num_heads=num_heads,
    key_dim=embed_dim // num_heads,
    dropout=0.1,
    name=f"self_attemtion_{i}",
  )(query=x, value=x, key=x, use_causal_mask=True)

  x1 = layers.LayerNormalization(epsilon=1e-6)(selfAttention + x)

  ffn = keras.Sequential([
    layers.Dense(ff_dim, activation="gelu"),
    layers.Dropout(0.1),
    layers.Dense(embed_dim),
  ], name=f"feed_forward_{i}")

  ffn_output = ffn(x1)

  x = layers.LayerNormalization(epsilon=1e-6)(ffn_output + x1)

# Apply the projection to the final hidden states so `outputs` is the
# per-position distribution over the vocabulary, not an unconnected layer.
outputs = layers.Dense(
  vocab_size,
  activation="softmax",
  name="decoder_output_dense"
)(x)

