In [None]:
import numpy as np
def dropout(x, rate=0.1, training=True):
    if not training or rate == 0:
        return x
    mask = (np.random.rand(*x.shape) > rate).astype(np.float32)
    return x * mask / (1.0 - rate)

In [None]:
class Tokenizer:
    def __init__(self, text):
        cleaned = ''.join(c.lower() if c.isalnum() or c.isspace() else ' ' for c in text)
        words = cleaned.split()

        unique_words = sorted(list(set(words)))
        unique_words.append("<unk>")  

        self.word2idx = {w: i for i, w in enumerate(unique_words)}
        self.idx2word = {i: w for w, i in self.word2idx.items()}
        self.vocab_size = len(unique_words)

    def encode(self, text):
        cleaned = ''.join(c.lower() if c.isalnum() or c.isspace() else ' ' for c in text)
        words = cleaned.split()
        return [self.word2idx.get(w, self.word2idx["<unk>"]) for w in words]

    def decode(self, indices):
        return ' '.join([self.idx2word[i] for i in indices])


In [20]:
class Embedding:
    def __init__(self, vocab_size, embed_dim):
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.weights = np.random.randn(vocab_size, embed_dim) * 0.01

    def forward(self, indices):
        return self.weights[indices] 

In [None]:
class PositionalEncoding:
    def __init__(self, max_len, embed_dim):
        self.encoding = np.zeros((max_len, embed_dim))
        for pos in range(max_len):
            for i in range(0, embed_dim, 2):
                self.encoding[pos, i] = np.sin(pos / (10000 ** ((2 * i)/embed_dim)))
                if i + 1 < embed_dim:
                    self.encoding[pos, i + 1] = np.cos(pos / (10000 ** ((2 * i)/embed_dim)))

    def forward(self, x):
        seq_len = x.shape[0]
        return x + self.encoding[:seq_len]
def softmax(x, axis=-1):
    x = x - np.max(x, axis=axis, keepdims=True)  
    exp_x = np.exp(x)
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

In [22]:
class ScaledDotProductAttention:
    def __init__(self, embed_dim):
        self.embed_dim = embed_dim
        self.scale = np.sqrt(embed_dim)

    def forward(self, Q, K, V, mask=None):
        scores = np.dot(Q, K.T) / self.scale 
        
        if mask is not None:
            scores = np.where(mask == 0, -1e9, scores)  

        weights = softmax(scores, axis=-1)  
        output = np.dot(weights, V) 
        return output, weights

In [23]:
class MultiHeadAttention:
    def __init__(self, embed_dim, num_heads):
        assert embed_dim % num_heads == 0
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.W_q = np.random.randn(embed_dim, embed_dim) * 0.01
        self.W_k = np.random.randn(embed_dim, embed_dim) * 0.01
        self.W_v = np.random.randn(embed_dim, embed_dim) * 0.01
        self.W_o = np.random.randn(embed_dim, embed_dim) * 0.01

        self.attn = ScaledDotProductAttention(self.head_dim)

    def split_heads(self, x):
        return x.reshape(x.shape[0], self.num_heads, self.head_dim)

    def combine_heads(self, x):
        return x.reshape(x.shape[0], self.embed_dim)

    def forward(self, x, mask=None):
        Q = np.dot(x, self.W_q)
        K = np.dot(x, self.W_k)
        V = np.dot(x, self.W_v)

        Q = self.split_heads(Q)
        K = self.split_heads(K)
        V = self.split_heads(V)

        heads_output = []
        for i in range(self.num_heads):
            out, _ = self.attn.forward(Q[:, i], K[:, i], V[:, i], mask)
            heads_output.append(out)

        concat = np.concatenate(heads_output, axis=-1)
        return np.dot(concat, self.W_o)

In [24]:
class FeedForward:
    def __init__(self, embed_dim, hidden_dim):
        self.W1 = np.random.randn(embed_dim, hidden_dim) * 0.01
        self.b1 = np.zeros((hidden_dim,))
        self.W2 = np.random.randn(hidden_dim, embed_dim) * 0.01
        self.b2 = np.zeros((embed_dim,))

    def relu(self, x):
        return np.maximum(0, x)

    def forward(self, x):
        x = self.relu(np.dot(x, self.W1) + self.b1)
        return np.dot(x, self.W2) + self.b2

In [25]:
class LayerNorm:
    def __init__(self, embed_dim, eps=1e-5):
        self.gamma = np.ones((embed_dim,))
        self.beta = np.zeros((embed_dim,))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(axis=-1, keepdims=True)
        var = ((x - mean) ** 2).mean(axis=-1, keepdims=True)
        norm = (x - mean) / np.sqrt(var + self.eps)
        return self.gamma * norm + self.beta

In [26]:
class DecoderBlock:
    def __init__(self, embed_dim, num_heads, hidden_dim, dropout_rate=0.1):
        self.ln1 = LayerNorm(embed_dim)
        self.attn = MultiHeadAttention(embed_dim, num_heads)
        self.ln2 = LayerNorm(embed_dim)
        self.ffn = FeedForward(embed_dim, hidden_dim)
        self.dropout_rate = dropout_rate

    def forward(self, x, mask, training=True):
        attn_out = self.attn.forward(self.ln1.forward(x), mask)
        attn_out = dropout(attn_out, rate=self.dropout_rate, training=training)
        x = x + attn_out
        ffn_out = self.ffn.forward(self.ln2.forward(x))
        ffn_out = dropout(ffn_out, rate=self.dropout_rate, training=training)
        x = x + ffn_out

        return x

In [None]:
class DecoderOnlyTransformer:
    def __init__(self, vocab_size, max_len, embed_dim, num_heads, hidden_dim, num_layers):
        self.embed = Embedding(vocab_size, embed_dim)
        self.pos_enc = PositionalEncoding(max_len, embed_dim)
        self.blocks = [DecoderBlock(embed_dim, num_heads, hidden_dim, dropout_rate=0.1) for _ in range(num_layers)]
        self.ln = LayerNorm(embed_dim)
        self.output_layer = np.random.randn(embed_dim, vocab_size) * 0.01
        
    def forward(self, x_indices, training=True):
        x = self.embed.forward(x_indices)
        x = self.pos_enc.forward(x)
        seq_len = x.shape[0]
        mask = np.tril(np.ones((seq_len, seq_len)))

        for block in self.blocks:
            x = block.forward(x, mask, training=training)

        x = self.ln.forward(x)
        logits = np.dot(x, self.output_layer)  
        return logits, x

    def backward_output_layer(self, x, dlogits, lr):
        dW_out = np.dot(x.T, dlogits)              
        dx = np.dot(dlogits, self.output_layer.T)  
        self.output_layer -= lr * dW_out
        return dx  

In [None]:
def softmax(x, axis=-1):
    x = x - np.max(x, axis=axis, keepdims=True)
    exp_x = np.exp(x)
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

def cross_entropy_loss_with_grad(logits, targets):
    probs = softmax(logits)
    loss = -np.log(probs[np.arange(len(targets)), targets] + 1e-9)
    avg_loss = np.mean(loss)
    dlogits = probs.copy()
    dlogits[np.arange(len(targets)), targets] -= 1
    dlogits /= len(targets) 

    return avg_loss, dlogits

In [29]:
with open("Story.txt", "r", encoding="utf-8") as f:
    raw_text = f.read().strip()

In [30]:
tokenizer = Tokenizer(raw_text)

if "<sep>" not in tokenizer.word2idx:
    idx = len(tokenizer.word2idx)
    tokenizer.word2idx["<sep>"] = idx
    tokenizer.idx2word[idx] = "<sep>"
    tokenizer.vocab_size += 1

encoded = tokenizer.encode(raw_text)

In [None]:
def create_dataset(encoded, seq_len):
    inputs, targets = [], []
    for i in range(len(encoded) - seq_len):
        inputs.append(encoded[i:i+seq_len])
        targets.append(encoded[i+1:i+seq_len+1])
    return np.array(inputs), np.array(targets)

In [32]:
seq_len = 128  
X, Y = create_dataset(encoded, seq_len)
embed_dim = 32
num_heads = 2
hidden_dim = 64
num_layers = 4
max_len = seq_len 
model = DecoderOnlyTransformer(
    vocab_size=tokenizer.vocab_size,
    max_len=max_len,
    embed_dim=embed_dim,
    num_heads=num_heads,
    hidden_dim=hidden_dim,
    num_layers=num_layers
)

In [None]:
x_example = X[0]  
y_true = Y[0]    
logits, x_out = model.forward(x_example, training=True)
loss, dlogits = cross_entropy_loss_with_grad(logits, y_true)  
dx = model.backward_output_layer(x_out, dlogits, lr=0.01)  
print("Logits shape:", logits.shape)
print("Loss:", loss)

Logits shape: (128, 669)
Loss: 6.500964440275334


In [None]:
def sample(model, tokenizer, start_text, length, k=5):
    model_input = tokenizer.encode(start_text)
    for _ in range(length):
        x = np.array(model_input[-seq_len:])
        logits, _ = model.forward(x, training=False)

        last_logits = logits[-1]
        
        top_k_indices = np.argsort(last_logits)[-k:]  
        top_k_probs = softmax(last_logits[top_k_indices])
        next_token = np.random.choice(top_k_indices, p=top_k_probs)

        model_input.append(next_token)
    return tokenizer.decode(model_input)

In [35]:
def get_batches(X, Y, batch_size):
    for i in range(0, len(X), batch_size):
        yield np.array(X[i:i+batch_size]), np.array(Y[i:i+batch_size])

num_epochs = 500
batch_size = 32
learning_rate = 0.01
losses = []
for epoch in range(num_epochs):
    total_loss = 0
    batch_count = 0

    for x_batch, y_batch in get_batches(X, Y, batch_size):
        for x_seq, y_seq in zip(x_batch, y_batch):
            logits, x_out = model.forward(x_seq)
            loss, dlogits = cross_entropy_loss_with_grad(logits, y_seq)
            model.backward_output_layer(x_out, dlogits, lr=learning_rate)
            total_loss += loss
            batch_count += 1

    avg_loss = total_loss / batch_count
    losses.append(avg_loss)

    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {avg_loss:.4f}")

Epoch 0, Loss: 5.7499
Epoch 20, Loss: 5.5097
Epoch 40, Loss: 5.5057
Epoch 60, Loss: 5.5026
Epoch 80, Loss: 5.4998
Epoch 100, Loss: 5.4972
Epoch 120, Loss: 5.4948
Epoch 140, Loss: 5.4924
Epoch 160, Loss: 5.4901
Epoch 180, Loss: 5.4878
Epoch 200, Loss: 5.4856
Epoch 220, Loss: 5.4834
Epoch 240, Loss: 5.4813
Epoch 260, Loss: 5.4791
Epoch 280, Loss: 5.4770
Epoch 300, Loss: 5.4750
Epoch 320, Loss: 5.4729
Epoch 340, Loss: 5.4709
Epoch 360, Loss: 5.4688
Epoch 380, Loss: 5.4669
Epoch 400, Loss: 5.4649
Epoch 420, Loss: 5.4629
Epoch 440, Loss: 5.4609
Epoch 460, Loss: 5.4590
Epoch 480, Loss: 5.4571


In [41]:
prompt = "beauty"
input_seq = prompt + " <sep>"
output = sample(model, tokenizer, start_text=input_seq, length=14)
print("Generated:", output)

Generated: beauty <unk> the the the is and is in in the the the the of in
