In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
import mmap
import random
import pickle

import os
# Debugging aid: CUDA_LAUNCH_BLOCKING=1 asks CUDA to run kernels synchronously, so a
# failing kernel is reported at the call that launched it. This slows training down.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# --- Hyperparameters (read as notebook globals by the cells below) ---
block_size = 64  # context length: tokens per sampled sequence (see get_batch)
batch_size = 128  # number of sequences sampled per batch (see get_batch)
max_iters = 3000  # optimisation steps performed by the training loop
learning_rate = 3e-4  # learning rate passed to AdamW
eval_iters = 50  # batches averaged per split in estimate_loss; also the evaluation interval in the training loop
n_embd = 384  # embedding / residual width
n_head = 8  # attention heads per Block
n_layer = 8  # number of stacked Blocks
dropout = 0.2  # probability used by every nn.Dropout layer

In [None]:
# Read the whole training corpus into memory as a single string.
with open('romeo_juliet.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# The vocabulary is every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(chars)

In [None]:
# Character-level tokenizer: every character is identified by its position in `chars`.
int_to_string = dict(enumerate(chars))
string_to_int = {ch: idx for idx, ch in int_to_string.items()}


def encode(s):
    """Map a string to the list of its integer token ids (KeyError on an unknown character)."""
    return [string_to_int[c] for c in s]


def decode(l):
    """Map an iterable of integer token ids back to the string they spell."""
    return ''.join(int_to_string[i] for i in l)


# Encode the full corpus once; later cells slice this tensor into train/val splits.
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])

In [9]:
# Hold out the final 20% of the encoded corpus for validation (contiguous split, no shuffling).
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    """Sample a random batch of (input, target) sequences from one data split.

    Reads the notebook globals ``train_data``, ``val_data``, ``block_size``,
    ``batch_size`` and ``device``.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        Tuple ``(x, y)`` of tensors, each of shape ``(batch_size, block_size)``
        and placed on ``device``. ``y`` is ``x`` shifted one position to the
        right, i.e. ``y[b, t]`` is the token that follows ``x[b, t]``.

    Raises:
        ValueError: if the selected split holds ``block_size`` tokens or fewer,
            so no (input, target) window fits inside it.
    """
    # A distinct local name avoids shadowing the notebook-global `data` tensor.
    source = train_data if split == 'train' else val_data

    if len(source) <= block_size:
        raise ValueError(
            f"'{split}' split has {len(source)} tokens; need more than block_size={block_size}"
        )

    # Start offsets; the exclusive upper bound keeps the shifted target window in range.
    ix = torch.randint(0, len(source) - block_size, (batch_size,))

    # Inputs are windows of block_size tokens; targets are the same windows shifted by one.
    x = torch.stack([source[i:i+block_size] for i in ix])
    y = torch.stack([source[i+1:i+block_size+1] for i in ix])

    try:
        x, y = x.to(device), y.to(device)
    except Exception as e:
        print(f"Error when transferring data to device: {e}")
        raise  # bare re-raise keeps the original traceback intact

    return x, y

# Smoke test: draw one training batch and show the inputs next to their targets.
# NOTE: this binds the notebook-global names `x` and `y`.
x, y = get_batch('train')
print('inputs:')
print(x.shape)
print(x)
print('targets:')
print(y)

inputs:
torch.Size([128, 64])
tensor([[59,  2, 60,  ..., 73, 10,  2],
        [74,  2, 74,  ..., 38, 37, 30],
        [72,  2, 66,  ..., 72, 59, 55],
        ...,
        [68, 74, 72,  ..., 58, 67, 63],
        [ 2, 67, 75,  ..., 77, 62, 59],
        [ 2, 72, 59,  ..., 58, 63, 73]], device='cuda:0')
targets:
tensor([[ 2, 60, 66,  ..., 10,  2, 55],
        [ 2, 74, 62,  ..., 37, 30, 45],
        [ 2, 66, 69,  ..., 59, 55, 58],
        ...,
        [74, 72, 55,  ..., 67, 63, 74],
        [67, 75, 57,  ..., 62, 59, 68],
        [72, 59, 74,  ..., 63, 73, 74]], device='cuda:0')


In [10]:
# Illustrate the next-token objective on the first block of the training data:
# y[t] is the target that follows the prefix x[:t+1].
# NOTE: this rebinds the notebook-global names `x` and `y`.
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print('when input is', context, 'target is', target)

when input is tensor([90]) target is tensor(1)
when input is tensor([90,  1]) target is tensor(33)
when input is tensor([90,  1, 33]) target is tensor(26)
when input is tensor([90,  1, 33, 26]) target is tensor(38)
when input is tensor([90,  1, 33, 26, 38]) target is tensor(37)
when input is tensor([90,  1, 33, 26, 38, 37]) target is tensor(30)
when input is tensor([90,  1, 33, 26, 38, 37, 30]) target is tensor(45)
when input is tensor([90,  1, 33, 26, 38, 37, 30, 45]) target is tensor(12)
when input is tensor([90,  1, 33, 26, 38, 37, 30, 45, 12]) target is tensor(1)
when input is tensor([90,  1, 33, 26, 38, 37, 30, 45, 12,  1]) target is tensor(31)
when input is tensor([90,  1, 33, 26, 38, 37, 30, 45, 12,  1, 31]) target is tensor(69)
when input is tensor([90,  1, 33, 26, 38, 37, 30, 45, 12,  1, 31, 69]) target is tensor(72)
when input is tensor([90,  1, 33, 26, 38, 37, 30, 45, 12,  1, 31, 69, 72]) target is tensor(2)
when input is tensor([90,  1, 33, 26, 38, 37, 30, 45, 12,  1, 31, 6

In [11]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [15]:
# Repeats the `os` import and CUDA_LAUNCH_BLOCKING setting made in the first cell.
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

class Head(nn.Module):
    """A single head of causal (masked) self-attention.

    Reads the notebook globals ``n_embd``, ``block_size`` and ``dropout`` at
    construction time.
    """

    def __init__(self, head_size):
        super().__init__()
        # Bias-free projections from the embedding width down to the head width.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular ones matrix, stored as a (non-parameter) buffer.
        lower_triangle = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', lower_triangle)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over ``x`` of shape (batch, time, channels); returns (batch, time, head_size)."""
        _, seq_len, _ = x.shape
        keys = self.key(x)        # (B, T, hs)
        queries = self.query(x)   # (B, T, hs)

        # Scaled dot-product affinities between every query and every key: (B, T, T).
        scale = keys.shape[-1] ** -0.5
        scores = queries @ keys.transpose(-2, -1) * scale

        # Positions above the diagonal (the future) get -inf, so softmax gives them weight 0.
        is_future = self.tril[:seq_len, :seq_len] == 0
        weights = F.softmax(scores.masked_fill(is_future, float('-inf')), dim=-1)
        weights = self.dropout(weights)

        # Weighted sum of the value vectors: (B, T, T) @ (B, T, hs) -> (B, T, hs).
        return weights @ self.value(x)

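# Sketch of the lower-triangular (causal) pattern that Head's mask produces:
# row t has non-zero entries only in columns 0..t (values are illustrative, not normalised).
# [1, 0, 0]
# [1, 0.6, 0]
# [1, 0.6, 0.4]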
class MultiHeadAttention(nn.Module):
    """Several ``Head`` modules run side by side, concatenated and projected back to ``n_embd``.

    Reads the notebook globals ``n_embd`` and ``dropout`` at construction time.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList(Head(head_size) for _ in range(num_heads))
        # The concatenated head outputs have width num_heads * head_size.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to ``x`` and merge the results along the feature axis."""
        per_head = [head(x) for head in self.heads]
        # Features are laid out head by head: [h1 ..., h2 ..., h3 ...].
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)
    

class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4x the width, apply ReLU, project back, then dropout.

    Reads the notebook global ``dropout`` at construction time.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        return self.net(x)
    
class Block(nn.Module):
    """One transformer layer: self-attention, then a feed-forward MLP.

    Each sub-layer's output is added to its input and the sum is layer-normalised.
    """

    def __init__(self, n_embd, n_head):
        """``n_embd`` is the embedding width; ``n_head`` is the number of attention heads."""
        super().__init__()
        # The embedding width is divided (integer division) across the heads.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Run attention and the MLP, each followed by residual add and LayerNorm."""
        attended = self.ln1(x + self.sa(x))
        return self.ln2(attended + self.ffwd(attended))
    
class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model over integer token ids.

    Token embeddings and learned position embeddings are summed, passed
    through ``n_layer`` ``Block`` modules and a final LayerNorm, then projected
    to per-position logits over the vocabulary.

    The notebook globals ``n_embd``, ``n_head``, ``n_layer`` and ``block_size``
    are read at construction time only. ``forward`` and ``generate`` take the
    vocabulary size, context length and device from the module's own layers
    and from their inputs, so they keep working if those globals change later.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            index: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without ``targets``, ``logits`` has shape
            (B, T, vocab) and ``loss`` is ``None``. With ``targets``, ``logits``
            is returned flattened to (B*T, vocab) and ``loss`` is the scalar
            cross-entropy.

        Raises:
            IndexError: if ``index`` holds an id outside ``[0, vocab)``.
        """
        B, T = index.shape

        # Fail with a readable message instead of an opaque embedding/CUDA assert.
        # The bound comes from the embedding table, not from a notebook global.
        vocab = self.token_embedding_table.num_embeddings
        lowest, highest = int(index.min()), int(index.max())
        if lowest < 0 or highest >= vocab:
            raise IndexError(
                f"token id out of range for vocabulary of size {vocab}: "
                f"max index {highest}, min index {lowest}"
            )

        tok_emb = self.token_embedding_table(index)  # (B,T,C)
        # Build positions on the input's device so the model works wherever it is placed.
        positions = torch.arange(T, device=index.device)
        pos_emb = self.position_embedding_table(positions)  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C); pos_emb broadcasts over the batch
        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, C) logits and (N,) targets.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, index, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``index``.

        Sampling runs with autograd disabled and with the module temporarily
        in eval mode (dropout off); the previous train/eval mode is restored
        before returning.

        Args:
            index: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by the samples.
        """
        # The usable context is bounded by the learned position table.
        context_len = self.position_embedding_table.num_embeddings
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # crop to the last context_len tokens
                index_cond = index[:, -context_len:]
                logits, _ = self(index_cond)
                # focus only on the last time step
                logits = logits[:, -1, :] # becomes (B, C)
                probs = F.softmax(logits, dim=-1) # (B, C)
                # sample from the distribution
                index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                # append sampled index to the running sequence
                index = torch.cat((index, index_next), dim=1) # (B, T+1)
        finally:
            self.train(was_training)
        return index


# Build the network, then resume from the pickled checkpoint when one exists.
# On a fresh run the checkpoint has not been written yet (the training cell creates it),
# so a missing file falls back to the freshly initialised weights instead of crashing.
model = GPTLanguageModel(vocab_size)
checkpoint_path = 'model-01.pkl'
if os.path.exists(checkpoint_path):
    print('loading model parameters...')
    # SECURITY: pickle.load can execute arbitrary code; only open checkpoints you created yourself.
    with open(checkpoint_path, 'rb') as f:
        model = pickle.load(f)
    print('loaded successfully!')
else:
    print(f"no checkpoint found at {checkpoint_path!r}; starting from freshly initialised weights")

# Move whichever instance is in use to `device`, and bind the alias `m` only afterwards,
# so `m` and `model` always refer to the same (loaded or fresh) network.
model = model.to(device)
m = model

loading model parameters...
loaded successfully!


In [13]:
# Create a PyTorch Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    #sample a batch of data
    xb, yb = get_batch('train')

    #evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

with open('model-01.pkl', 'wb') as f:
    pickle.dump(model, f)

step: 0, train loss: 4.610, val loss: 4.630
step: 50, train loss: 2.548, val loss: 2.540
step: 100, train loss: 2.380, val loss: 2.351
step: 150, train loss: 2.170, val loss: 2.140
step: 200, train loss: 2.032, val loss: 2.007
step: 250, train loss: 1.918, val loss: 1.897
step: 300, train loss: 1.842, val loss: 1.825
step: 350, train loss: 1.778, val loss: 1.759
step: 400, train loss: 1.736, val loss: 1.721
step: 450, train loss: 1.677, val loss: 1.669
step: 500, train loss: 1.649, val loss: 1.656
step: 550, train loss: 1.611, val loss: 1.618
step: 600, train loss: 1.594, val loss: 1.604
step: 650, train loss: 1.550, val loss: 1.569
step: 700, train loss: 1.533, val loss: 1.558
step: 750, train loss: 1.512, val loss: 1.525
step: 800, train loss: 1.491, val loss: 1.516
step: 850, train loss: 1.471, val loss: 1.506
step: 900, train loss: 1.452, val loss: 1.494
step: 950, train loss: 1.441, val loss: 1.489
step: 1000, train loss: 1.424, val loss: 1.475
step: 1050, train loss: 1.420, val l

In [14]:
# Sample from `model`, the object the training loop optimises. Sampling through a
# separate alias risks generating from a different, untrained instance.
context = torch.zeros((1,1), dtype=torch.long, device=device)
model.eval()  # switch dropout off while sampling
with torch.no_grad():  # no gradients are needed for generation
    generated_ids = model.generate(context, max_new_tokens=500)[0].tolist()
generated_chars = decode(generated_ids)
print(generated_chars)

	your Gonzalo; and she faces all itself
To line in’s ganer away, and for all one to o’erher
Of heaven, from his power faith art through well be asleep,
And help that lives to sudden their senses are poor, foul that go streen are
they I find: thou’dst empt here pawn
Indion mortal are: and the fice of love o’er-happen cruelly
Him to hears not a round: they were well go a pleasure.

_Ste._ Thy king's a king of sister, bore, if he is the which flies, sir?

_Pros._ By, nympher, if you fancience that s
