# Import libraries

In [45]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Hyperparameters

In [33]:
batch_size = 16
block_size = 32
max_iteration = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iteration = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
torch.manual_seed(1337) # Random number generator

<torch._C.Generator at 0x7a3fd13a8c50>

# Read text file

In [21]:
file_path = '/content/drive/MyDrive/harryPotter/hp1_ss.txt'

with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read()

# Data Processing

In [22]:
char = sorted(list(set(text)))
vocab_size = len(char)
print(''.join(char))
print(vocab_size)

	
 !"'()*,-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ\abcdefghijklmnopqrstuvwxyz~
79


In [23]:
stoi = { ch:i for i,ch in enumerate(char)}
itos = { i:ch for i,ch in enumerate(char)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

# Split train and test set

In [24]:
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

# Data loading


In [25]:
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [38]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iteration)
        for k in range(eval_iteration):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# Classes for self-attention and transformer

In [43]:
class Head(nn.Module):

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    B, T, C = x.shape
    k = self.key(x)                                                               # (B, T, C)
    q = self.query(x)                                                             # (B, T, C)

    w = q @ k.transpose(-2, -1) * C**-0.5                                         # (B, T, C) @ (B, C, T) -> (B, T, T) and scale down with C**-0.5
    w = w.masked_fill(self.tril[:T, :T] == 0, float('-inf'))                      # (B, T, T)
    w = F.softmax(w, dim=-1)                                                      # (B, T, T)
    w = self.dropout(w)
    v = self.value(x)                                                             # (B, T, C)
    out = w @ v                                                                   # (B, T, T) @ (B, T, C) -> (B, T, C)

    return out

In [40]:
class MultiHeadAttention(nn.Module):

  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.projection = nn.Linear(n_embd, n_embd)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.dropout(self.projection(out))
    return out

In [29]:
class FeedForward(nn.Module):

  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, 4 * n_embd),
        nn.ReLU(),
        nn.Linear(4 * n_embd, n_embd),
        nn.Dropout(dropout)
    )

  def forward(self, x):
    return self.net(x)

In [30]:
class Block(nn.Module):

  def __init__(self, n_embd, n_head):
    super().__init__()
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedForward(n_embd)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x

# Bigram Model

In [51]:
class BigramModel(nn.Module):

  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embd)
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape

    token_emb = self.token_embedding_table(idx)                                   # (B, T, C)
    position_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C)
    x = token_emb + position_emb                                                  # (B, T, C)
    x = self.blocks(x)                                                            # (B, T, C)
    x = self.ln_f(x)                                                              # (B, T, C)
    logits = self.lm_head(x)                                                      # (B, T, vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B * T, C)
      targets = targets.view(B * T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      # Ensure that the input to the model doesn't exceed block_size
      idx_cond = idx[:, -block_size:]
      logits, loss = self(idx_cond)
      logits = logits[:, -1, :]                                                   # (B, C)
      probs = F.softmax(logits, dim=-1)                                           # (B, C)
      idx_next = torch.multinomial(probs, num_samples=1)                          # (B, 1)
      idx = torch.cat((idx, idx_next), dim=1)                                     # (B, T + 1)
    return idx

# Shall we train?

In [53]:
model = BigramModel()
m = model.to(device)
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iteration):

  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  xb, yb = get_batch('train')

  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

0.211535 M parameters
step 0: train loss 4.5778, val loss 4.5828
step 100: train loss 2.6603, val loss 2.6433
step 200: train loss 2.5191, val loss 2.5136
step 300: train loss 2.4372, val loss 2.4376
step 400: train loss 2.3319, val loss 2.3287
step 500: train loss 2.2572, val loss 2.2507
step 600: train loss 2.1928, val loss 2.1875
step 700: train loss 2.1530, val loss 2.1386
step 800: train loss 2.0813, val loss 2.0695
step 900: train loss 2.0395, val loss 2.0423
step 1000: train loss 1.9930, val loss 1.9903
step 1100: train loss 1.9526, val loss 1.9515
step 1200: train loss 1.9303, val loss 1.9317
step 1300: train loss 1.8998, val loss 1.9112
step 1400: train loss 1.8765, val loss 1.8865
step 1500: train loss 1.8502, val loss 1.8621
step 1600: train loss 1.8359, val loss 1.8412
step 1700: train loss 1.7989, val loss 1.8064
step 1800: train loss 1.7848, val loss 1.7890
step 1900: train loss 1.7654, val loss 1.7779
step 2000: train loss 1.7665, val loss 1.7745
step 2100: train loss 1.

# Generate from the model

In [54]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))

	pered in alnother bust of himselt spripperming away he showed lievos around If Professor McOnand It's look ever nobe
leed, me kit up there wasn't see Malfoy, Mr, All was a
 heard up abount. Chreel?"

Hagrid Guzpes, I
roddmember there.

If chaspeled someone coart.

"Thank dread the hall over nearn to cat -- see Mistice from the flitter; He'd
bed from you, was around mat to ask heard his stears.

On his from stimed Huddordy hese. Filch on Cade. "

"Yes' yecle off, I began watch
once.

"Fivand altry get ary suggelly.

Harry live every snake the really worl, they get ween at all usuady.

"Cangely to heard admagal not this hand rasom, a Vernon playle spagetaked a tent three had down.

"Frices. every use, -- ghost going an, though he had in up. Tut he Fant, I villonge his best event it this head,
Hagrid, and Wood, I'd trittents go teled.

"I wand very silects told off his sinuth
a Pilvus, Sniffle's spuff, Weasleys.

"You! I've behalt believe. Hermione was, alodd comporing to
casin? Thessy w