🔹 Mini Transformer Block (TensorFlow / Keras)

Large Language Models (LLMs) are a subcategory of machine learning models. More specifically, they are deep learning models built on a neural network architecture called the Transformer, introduced in 2017 in the paper "Attention Is All You Need".

So the hierarchy looks like this:

* Artificial Intelligence (AI)

  * Machine Learning (ML)

    * Deep Learning (DL)

      * Neural Networks

        * Transformers

          * LLMs (like GPT, LLaMA, PaLM, etc.)

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

# --- Transformer Block ---
class TransformerBlock(layers.Layer):
    """Post-norm Transformer block: self-attention, then a feed-forward net.

    Each sub-layer is wrapped in a residual connection followed by layer
    normalization, so the output has the same shape as the input.

    Args:
        embed_dim: Size of the last axis of the input. The residual additions
            require the attention and feed-forward outputs to match it.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden layer of the feed-forward sub-network.
        causal: When True (the default) each position may only attend to
            itself and earlier positions. This is required for next-token
            prediction, where later positions hold the training targets.
            Pass False for bidirectional (encoder-style) attention.
        **kwargs: Standard ``layers.Layer`` arguments such as ``name``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, causal=True, **kwargs):
        super().__init__(**kwargs)
        # Kept so that get_config() can rebuild an identical layer.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.causal = causal

        # key_dim is the per-head projection size; here every head projects
        # to the full embed_dim rather than embed_dim // num_heads.
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.norm2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, x):
        """Apply attention and feed-forward sub-layers to ``x``."""
        # Self-attention: the sequence is both query and value.
        attn_out = self.att(x, x, use_causal_mask=self.causal)
        out1 = self.norm1(x + attn_out)
        ffn_out = self.ffn(out1)
        return self.norm2(out1 + ffn_out)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "causal": self.causal,
        })
        return config

# --- Tiny GPT-like Model ---
class _PositionalEmbedding(layers.Layer):
    """Add a learned per-position vector to a sequence of token embeddings."""

    def __init__(self, max_len, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.max_len = max_len
        self.embed_dim = embed_dim
        self.pos_emb = layers.Embedding(input_dim=max_len, output_dim=embed_dim)

    def call(self, x):
        positions = tf.range(start=0, limit=self.max_len)
        # The (max_len, embed_dim) position table broadcasts over the
        # leading batch axis of x.
        return x + self.pos_emb(positions)

    def get_config(self):
        config = super().get_config()
        config.update({"max_len": self.max_len, "embed_dim": self.embed_dim})
        return config


def build_tiny_gpt(vocab_size, max_len, embed_dim=32, num_heads=2, ff_dim=64):
    """Build a one-block Transformer that predicts a token at every position.

    Args:
        vocab_size: Number of distinct token ids; also the size of the
            softmax output at each position.
        max_len: Fixed sequence length the model accepts.
        embed_dim: Size of the token and position embeddings.
        num_heads: Number of attention heads in the Transformer block.
        ff_dim: Hidden width of the block's feed-forward sub-network.

    Returns:
        An uncompiled ``tf.keras.Model`` mapping integer token ids of shape
        ``(batch, max_len)`` to probabilities of shape
        ``(batch, max_len, vocab_size)``.
    """
    # Token ids are integers; declaring the dtype avoids a float round-trip.
    token_ids = layers.Input(shape=(max_len,), dtype="int32")
    tok_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)(token_ids)
    # The position embedding must run inside a layer of the model. Calling an
    # Embedding eagerly on a constant range would bake its (random) output in
    # as a constant and leave its weights outside the model, untrained.
    x = _PositionalEmbedding(max_len, embed_dim)(tok_emb)
    x = TransformerBlock(embed_dim, num_heads, ff_dim)(x)
    outputs = layers.Dense(vocab_size, activation="softmax")(x)
    return tf.keras.Model(inputs=token_ids, outputs=outputs)

# --- Prepare Toy Dataset ---
# Corpus: the same two-word phrase repeated 100 times.
text = "hello world " * 100
# Word -> id lookup and its inverse (id -> word).
vocab = {"hello": 0, "world": 1}
reverse_vocab = dict(zip(vocab.values(), vocab.keys()))

# Map every word of the corpus to its id: [0, 1, 0, 1, ...]
encoded = list(map(vocab.__getitem__, text.split()))

# Slide a window of seq_len ids over the corpus. Each target window is the
# input window shifted one step to the right (input → next word).
seq_len = 2
_window_starts = range(len(encoded) - seq_len)
inputs = [encoded[start:start + seq_len] for start in _window_starts]
targets = [encoded[start + 1:start + seq_len + 1] for start in _window_starts]

# Convert the Python lists of id windows into tensors for Keras.
inputs = tf.constant(inputs)
targets = tf.constant(targets)

# --- Build and Train ---
# Take the vocabulary size from the vocab mapping instead of repeating it as
# a literal, so the output layer stays in sync if words are added.
model = build_tiny_gpt(vocab_size=len(vocab), max_len=seq_len)
# Targets are integer ids (not one-hot), hence the sparse loss.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

model.fit(inputs, targets, epochs=10, batch_size=8)

# --- Test prediction ---
import numpy as np

def predict_next(seed):
    """Return the word the model ranks most likely to follow ``seed``.

    Args:
        seed: One or more whitespace-separated words, all present in ``vocab``.

    Returns:
        The predicted next word, looked up in ``reverse_vocab``.

    Raises:
        KeyError: If a word of ``seed`` is not in ``vocab``.
        ValueError: If ``seed`` contains no words.
    """
    token_ids = [vocab[w] for w in seed.split()]
    if not token_ids:
        raise ValueError("seed must contain at least one word")

    # The model was built for a fixed sequence length, so feed it exactly
    # that many ids: keep only the most recent ones, then pad on the right.
    window = model.input_shape[1]
    token_ids = token_ids[-window:]
    last_real = len(token_ids) - 1
    # The vocab has no dedicated padding token, so id 0 is used as filler.
    # NOTE(review): whether the filler can influence the prediction depends
    # on the model's attention masking — confirm the model is causal.
    padded = token_ids + [0] * (window - len(token_ids))

    batch = tf.constant([padded])  # batch of 1
    # Read the distribution at the last real token, not at the padding.
    probs = model(batch)[0, last_real].numpy()
    next_id = int(np.argmax(probs))
    return reverse_vocab[next_id]

# Show the predicted continuation for each single-word seed.
for seed_word in ("hello", "world"):
    print(f"Input: '{seed_word}' → Next word:", predict_next(seed_word))
