# Setup

In [1]:
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
import wandb
from utilities import *
from config import *
from dataloading import *
from tqdm import tqdm
from transformer import *
import os

Loading data...


Loading data: 100%|██████████| 100000/100000 [00:00<00:00, 190006.75it/s]
Loading data: 100%|██████████| 100000/100000 [00:00<00:00, 220385.15it/s]
Loading data: 100%|██████████| 100000/100000 [00:00<00:00, 200607.52it/s]


In [2]:
for x in train_dataloader:
    data, train = x
    break

In [11]:
data[:10]

tensor([[10,  5,  5,  3,  6,  5, 16,  3,  2,  1, 17],
        [ 5,  9, 14,  1,  2,  2, 16,  2, 17, 17, 17],
        [ 2,  9, 14,  6,  1,  8, 16,  0,  2,  3, 17],
        [ 1, 13,  4,  6,  9,  4, 16,  1,  3,  2, 17],
        [14,  2, 13,  4,  1,  6, 16,  3,  0,  2, 17],
        [12, 13, 10, 11,  0,  1, 16,  0, 17, 17, 17],
        [13, 14,  3, 12,  2,  5, 16,  1,  3,  0, 17],
        [13, 12,  3,  6,  8,  2, 16,  0, 17, 17, 17],
        [10, 12, 14, 13,  2, 13, 16,  0,  1,  3, 17],
        [ 4,  7,  9,  5,  3,  5, 16,  0,  2, 17, 17]])

In [12]:
train[:10]

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [3]:
model = BigramLanguageModel()

In [4]:
torch.no_grad()
model.eval()

BigramLanguageModel(
  (token_embedding_table): Embedding(18, 384)
  (position_embedding): Embedding(11, 384)
  (sa_heads): MultiHeadAttention(
    (heads): ModuleList(
      (0-5): 6 x Head(
        (key): Linear(in_features=384, out_features=64, bias=False)
        (query): Linear(in_features=384, out_features=64, bias=False)
        (value): Linear(in_features=384, out_features=64, bias=False)
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (proj): Linear(in_features=384, out_features=384, bias=True)
    (dropout): Dropout(p=0, inplace=False)
  )
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-5): 6 x Head(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=False)
            (value): Linear(in_features=384, out_features=64, bias=False)
            (dropout): Dropout(p=0, inplace=False)
          )
        )
      

In [6]:
# calculate validation stats
criterion = nn.CrossEntropyLoss()

total_accuracy = 0.0
total_loss = 0.0

num_batches = 0

print("Evaluating...")
for inputs, targets in tqdm(val_dataloader):
    outputs = model(inputs)

    # calculate the val accuracy
    accuracy = calculate_accuracy(outputs, targets)
    total_accuracy += accuracy

    # Calculate the val loss
    loss = criterion(outputs, targets)
    total_loss += loss.item()
    num_batches += 1

average_accuracy = total_accuracy / num_batches
val_loss = total_loss / num_batches

Evaluating...


100%|██████████| 6250/6250 [04:00<00:00, 26.02it/s]


In [7]:
val_loss

2.890560711326599

In [8]:
vocab_size

18

In [5]:
model(data)

tensor([[-0.2297,  0.0810,  0.9202,  ...,  0.1275, -0.8816,  1.3060],
        [ 0.1212,  0.0911,  0.2244,  ..., -0.1222, -0.6712,  0.9082],
        [ 0.4794, -0.1039,  0.6033,  ...,  0.3061, -0.6988,  0.7974],
        ...,
        [ 0.5819, -0.0972,  0.4204,  ...,  0.3567, -0.7478,  1.0166],
        [ 0.1484,  0.0101,  0.7274,  ...,  0.4728, -0.5825,  0.9725],
        [ 0.4157, -0.0944,  0.6138,  ...,  0.7540, -0.7934,  0.9451]],
       grad_fn=<SliceBackward0>)

In [6]:
train

tensor([1, 0, 2, 0, 2, 2, 0, 2, 3, 2, 0, 0, 3, 1, 0, 1, 0, 1, 0, 2, 1, 3, 3, 1,
        2, 2, 0, 0, 2, 0, 2, 0, 2, 3, 1, 2, 3, 2, 2, 3, 0, 2, 0, 2, 2, 2, 3, 0,
        0, 0, 2, 2, 2, 3, 1, 0, 0, 0, 2, 2, 3, 0, 3, 3])