# Imports and Hyperparameters

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# ---- hyperparameters ----

# data / batching
batch_size = 64   # number of independent sequences processed in parallel
block_size = 256  # maximum context length (in characters) used for a prediction

# optimisation schedule
max_iters = 100       # number of optimizer steps
eval_interval = 10    # estimate train/val loss every this many steps
eval_iters = 50       # batches averaged for each loss estimate
learning_rate = 3e-4

# model size
n_embd = 384   # embedding width
n_head = 6     # attention heads per block
n_layer = 6    # number of stacked blocks
dropout = 0.2

# run on the GPU when one is visible to torch, otherwise on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# -------------------------

# fix the global torch RNG so runs are repeatable
torch.manual_seed(1)

<torch._C.Generator at 0x7e5a2af799d0>

# Loading the Dataset

In [None]:
# download the shakespeare dataset

!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-12-03 08:12:06--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.6’


2023-12-03 08:12:06 (16.0 MB/s) - ‘input.txt.6’ saved [1115394/1115394]



In [None]:
# read the whole downloaded corpus into memory as a single UTF-8 decoded string
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

### Data Exploration

How long is the dataset in characters?
What does the text look like?

In [None]:
# report the corpus size in characters
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [None]:
# print(text[:1000])

### Make Vocab

Find all unique characters, map them to index and back to character

Index can be inputted into model, itos allows decoding model output

In [None]:
# vocabulary = every distinct character of the corpus, in sorted order
# (set() deduplicates any iterable, here the characters of the string)
chars = sorted(set(text))
vocab_size = len(chars)

# show the vocabulary on one line, then its size on the next
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [None]:
# lookup tables between characters and integer ids (an id is the character's position in `chars`)
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}

def encode(s):
    """Encoder: take a string, return the list of integer ids of its characters."""
    return [stoi[c] for c in s]

def decode(l):
    """Decoder: take a list of integer ids, return the string they spell."""
    return ''.join(itos[i] for i in l)

# round-trip sanity check
print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


### Preprocess Data
Just convert everything into indexes and put into tensor

In [None]:
# encode the entire dataset (every character mapped through stoi) and store the ids in one 1-D tensor
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
# print(data[:1000]) # the 1000 characters we looked at earlier will look like this to the GPT

torch.Size([1115394]) torch.int64


### Train Val Split

In [None]:
# hold out the tail of the encoded corpus for validation:
# the first 90% of the ids are used for training, the remainder for validation
n = int(0.9*len(data)) # index at which the validation part starts
train_data, val_data = data[:n], data[n:]

# Set up Batch function
We get random batches of blocks as offsets to starting index using torch.randint

The number of sampled blocks is our batch dimension (B).

#### Explanation of individual blocks:
For each block we grab (blocksize) number of characters and then we increment by 1 to get the subsequent character after each index of the x array as our labels. We want the model to predict the next character given all previous characters in sequence.

Now for each individual block, we provide the context xb[b, :t+1] (all characters up to and including the current input) and our target, the next character yb[b, t], where b indexes the batch dimension and t the position within the block.


In [None]:
# data loading
def get_batch(split):
  """
  Sample a random batch of (input, target) blocks.

  split: 'train' selects train_data, anything else selects val_data.
  Returns (x, y), both of shape (batch_size, block_size) and moved to `device`;
  y is x shifted one position to the right, i.e. the next character at every position.
  """
  source = train_data if split == 'train' else val_data
  # one random start offset per sequence, chosen so that start + block_size + 1 stays in range
  starts = torch.randint(len(source) - block_size, (batch_size,))
  inputs, targets = [], []
  for start in starts:
    stop = start + block_size
    inputs.append(source[start:stop])
    targets.append(source[start + 1:stop + 1])
  return torch.stack(inputs).to(device), torch.stack(targets).to(device)

# Evaluate Loss
Since we are sampling batches, we take the average across several sample batches for less fluctuation

In [None]:
@torch.no_grad()
def estimate_loss():
  """
  Estimate the mean loss on the 'train' and 'val' splits.

  For each split, eval_iters random batches are drawn with get_batch and their losses
  averaged. The model is put in eval mode for the measurement and back in train mode
  afterwards. Returns a dict {'train': mean_loss, 'val': mean_loss} of 0-d tensors.
  """
  def _mean_loss(split):
    # average the loss over eval_iters freshly sampled batches of this split
    per_batch = torch.zeros(eval_iters)
    for step in range(eval_iters):
      xb, yb = get_batch(split)
      _, batch_loss = model(xb, yb)
      per_batch[step] = batch_loss.item()
    return per_batch.mean()

  model.eval()
  averages = {split: _mean_loss(split) for split in ('train', 'val')}
  model.train()
  return averages

# Define Model

Self-attention head

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# reset the global torch RNG to the same seed used in the first cell before the model code below
torch.manual_seed(1)

class Head(nn.Module):
  """
  Single head of causal (masked) self-attention.

  Every position of the input is projected by three bias-free linear layers into a
  key, a query and a value vector of width head_size. Attention scores are the dot
  products of each query with every key, scaled by 1/sqrt(head_size) so the scores
  keep roughly unit variance and the softmax does not sharpen as the head gets wider.

  Because this is a decoder, scores for future positions are set to -inf before the
  softmax so a position can only attend to itself and earlier positions. The
  resulting weights are used to take a weighted sum of the value vectors.

  head_size: width of the key/query/value projections (and of the output).
  """
  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    # lower-triangular causal mask; registered as a buffer, so it is not a trainable parameter
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

  def forward(self, x):
    """Map x of shape (B, T, n_embd), with T <= block_size, to (B, T, head_size)."""
    B, T, C = x.shape
    k = self.key(x)    # (B, T, head_size)
    q = self.query(x)  # (B, T, head_size)
    # scale by the key/query width (head_size), not by the input width C
    scale = k.shape[-1] ** -0.5
    wei = q @ k.transpose(-2, -1) * scale  # (B, T, hs) @ (B, hs, T) -> (B, T, T)
    # hide the future: position t may only look at positions <= t
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
    wei = F.softmax(wei, dim=-1)  # (B, T, T)
    # perform the weighted aggregation of the values
    v = self.value(x)  # (B, T, head_size)
    out = wei @ v  # (B, T, T) @ (B, T, hs) -> (B, T, hs)
    return out

class MultiHeadAttention(nn.Module):
  """
  Several attention heads run in parallel on the same input.

  The head outputs are concatenated along the channel dimension and mixed by a
  linear projection back to n_embd channels.

  n_head: number of heads.
  head_size: output width of each head.
  """
  def __init__(self, n_head, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(n_head)])
    # the concatenated width is n_head * head_size; this equals n_embd only when
    # n_embd is divisible by n_head, so size the projection from the real width
    self.proj = nn.Linear(n_head * head_size, n_embd)

  def forward(self, x):
    """Run every head on x, concatenate on the last dim and project to n_embd."""
    out = torch.cat([h(x) for h in self.heads], dim=-1)  # (..., n_head * head_size)
    out = self.proj(out)  # (..., n_embd)
    return out

class FeedForward(nn.Module):
  """
  Position-wise feed-forward network.

  Expands the channels 4x with a linear layer, applies ReLU, projects back to
  n_embd channels with a second linear layer (no activation after it), then applies
  dropout to limit overfitting.

  n_embd: number of input and output channels.
  """

  def __init__(self, n_embd):
    super().__init__()
    hidden = 4 * n_embd  # widened inner dimension
    layers = [
        nn.Linear(n_embd, hidden),   # grow the channels
        nn.ReLU(),
        nn.Linear(hidden, n_embd),   # project back; purely linear
        nn.Dropout(dropout),
    ]
    self.net = nn.Sequential(*layers)

  def forward(self, x):
    out = self.net(x)
    return out

class Block(nn.Module):
  """
  Transformer decoder block (pre-norm).

  These are stacked sequentially to form the full model. Each block applies
  multi-head self-attention and then a feed-forward network; both sub-layers receive
  a layer-normed input and their output is added back to the residual stream.

  n_embd: channel width of the residual stream.
  n_head: number of attention heads; each head gets n_embd // n_head channels.
  """

  def __init__(self, n_embd, n_head):
    super().__init__()
    head_size = n_embd // n_head
    self.sa_heads = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedForward(n_embd)
    # LayerNorm(n_embd) normalizes each token's feature vector over its n_embd channels
    # (independently for every batch element and time step)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    # residual connections skipping over self attention and ffwd
    # (the attention module is stored as `sa_heads`; `self.sa` does not exist)
    x = x + self.sa_heads(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x

class GPTLanguageModel(nn.Module):
  """
  Decoder-only, character-level transformer language model.

  Token ids are embedded, a learned position embedding is added, the result is passed
  through n_layer stacked Blocks and a final LayerNorm, and a linear head maps every
  position to vocab_size logits for the next token.
  """

  def __init__(self):
    super().__init__()
    # every token id retrieves its embedding vector; every position 0..block_size-1 too
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embd) # final layer norm
    self.lm_head = nn.Linear(n_embd, vocab_size) # embedding dims -> vocab size logits

  def forward(self, idx, targets=None):
    """
    idx: (B, T) tensor of token ids, with T <= block_size.
    targets: optional (B, T) tensor of next-token ids.

    Returns (logits, loss). Without targets, logits is (B, T, vocab_size) and loss is
    None. With targets, logits is flattened to (B*T, vocab_size) and loss is the
    cross-entropy against the flattened targets.
    """
    B, T = idx.shape

    # map idx to embedding
    tok_emb = self.token_embedding_table(idx) # (B,T,C), C is the embedding dimensions
    # build the position ids on the same device as the input instead of the global `device`,
    # so the model works wherever it and its input have been placed
    positions = torch.arange(T, device=idx.device)
    pos_emb = self.position_embedding_table(positions) # (T,C), positions 0..T-1 in order
    x = tok_emb + pos_emb # (B, T, C)
    x = self.blocks(x)
    x = self.ln_f(x) # (B,T,C)
    logits = self.lm_head(x) # (B,T,vocab_size)

    # targets are optional so the same forward pass can be used for generation
    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      # F.cross_entropy expects (N, C) logits and (N,) targets, so flatten batch and time
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  @torch.no_grad()
  def generate(self, idx, max_new_tokens):
    """
    Autoregressively sample max_new_tokens tokens after the context idx of shape (B, T).

    Returns a (B, T + max_new_tokens) tensor. Sampling runs without gradient tracking
    and in eval mode (so dropout is off); the previous train/eval mode is restored
    before returning.
    """
    was_training = self.training
    self.eval()
    try:
      for _ in range(max_new_tokens):
        # crop idx to the last block_size tokens (the position table has no more entries)
        idx_cond = idx[:, -block_size:]
        # get the predictions
        logits, _ = self(idx_cond)
        # focus only on the last time step
        logits = logits[:, -1, :] # becomes (B, C)
        # apply softmax to get probabilities
        probs = F.softmax(logits, dim=-1) # (B, C)
        # sample from the distribution
        idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
        # append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    finally:
      self.train(was_training)
    return idx


# Initialize Model and Train

In [None]:
model = GPTLanguageModel()
m = model.to(device)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# the loop variable is `step`, not `iter`: a top-level `iter` would shadow the builtin
# iter() for every cell executed after this one
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss and take one optimizer step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
# switch to eval mode so dropout is disabled while sampling, and skip gradient tracking
m.eval()
# start from a single token with id 0 (the first entry of the vocabulary)
context = torch.zeros((1, 1), dtype=torch.long, device=device)
with torch.no_grad():
    generated = m.generate(context, max_new_tokens=500)[0].tolist()
print(decode(generated))

step 0: train loss 4.5064, val loss 4.5199
step 10: train loss 2.9636, val loss 2.9959
step 20: train loss 2.7465, val loss 2.7748
step 30: train loss 2.6402, val loss 2.6641
step 40: train loss 2.5837, val loss 2.6023
step 50: train loss 2.5508, val loss 2.5736
step 60: train loss 2.5254, val loss 2.5522
step 70: train loss 2.5099, val loss 2.5397
step 80: train loss 2.4982, val loss 2.5220
step 90: train loss 2.4910, val loss 2.5211

Mu thith him h tes hor theredowo ysod ombch aktre mise dageanth trh,


THhis mpr hobloanimblore tes ous winddraisak'shemben ore

Cl t heath m-hsean ieth' wis pro thawore ME: lse:

Harshoosend, y e d, nor g sheque! ollt fs. n s memhe an fine ind be
I'ly. ie
Hed, witomar twe s te thond, wcoom odswhe garccere k.
FAnan thir woman's at wapolowin,
Thenthesis w t mea kld methe ven 'I f eieve n arand hinongissomy th h whathe ter me itl gorsthio her p ower os.
Me s we d angs d s w'd d he, tobled:

Hifid 
