In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
! cd /kaggle/working/
! cp -r /kaggle/input/erav1-s17/S17 .

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
cd /kaggle/working/S17

/kaggle/working/S17


In [15]:
import torch
from src.model import Transformer
from transformers import AutoTokenizer
from src.gpt_utils import (
    BATCH_SIZE,
    BLOCK_SIZE,
    DEVICE,
    DROPOUT,
    LEARNING_RATE,
    NUM_EMBED,
    NUM_HEAD,
    NUM_LAYER,
    MAX_ITER,
    EVAL_INTER,
    encode,
    decode,
    get_batch,
    save_model_to_chekpoint,
    estimate_loss,
)
import torch.nn as nn
import torch.nn.functional as F
# Try to get torchinfo, install it if it doesn't work
try:
    from torchinfo import summary
except:
    print("[INFO] Couldn't find torchinfo... installing it.")
    !pip install -q torchinfo
    from torchinfo import summary

In [16]:
# raw data
path_do_data = "data/english.txt"
# Read the whole corpus through a context manager so the file handle is
# closed as soon as the text is in memory instead of being left open.
with open(path_do_data, encoding="utf-8") as corpus_file:
    data_raw = corpus_file.read()
# we use pretrained BERT tokenizer for performance improvements
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# The tokenizer's vocabulary size is reused later to size the embedding table
# and the output projection.
vocab_size = tokenizer.vocab_size
# Optional: uncomment to train on a shorter slice of the corpus.
# data_raw = data_raw[4000000:] # short dataset

In [17]:
# Tokenize the whole corpus once, then split it by position:
# the leading 90% is used for training and the trailing 10% for validation.
data = encode(text_seq=data_raw, tokenizer=tokenizer)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

Token indices sequence length is longer than the specified maximum sequence length for this model (37443 > 512). Running this sequence through the model will result in indexing errors


In [18]:
# Build the transformer backbone from the hyper-parameters imported from
# src.gpt_utils and the tokenizer loaded above.
model = Transformer(
    # token / positional embedding
    need_embedding=True,
    embed_dict_size=vocab_size,
    embed_dim=NUM_EMBED,
    max_seq_len=BLOCK_SIZE,
    pad_idx=tokenizer.pad_token_id,
    add_cls_token=False,
    pe_requires_grad=False,
    # attention
    num_heads=NUM_HEAD,
    attn_dropout=DROPOUT,
    # feed-forward: hidden width is 4x the embedding width
    mlp_dim=4 * NUM_EMBED,
    mlp_dropout=DROPOUT,
    mlp_activation=nn.ReLU(),
    # depth
    num_layers=NUM_LAYER,
)

In [19]:
class GPT(nn.Module):
    """Language model: a backbone module followed by a linear vocabulary head.

    The output projection is sized from the notebook-level names ``NUM_EMBED``
    and ``vocab_size``, so both must be defined before the class is
    instantiated.

    Args:
        model: backbone module, called as ``model(idx, attn_mask=...)``; its
            output is fed to ``nn.Linear(NUM_EMBED, vocab_size, bias=False)``.
        block_size: side length of the pre-computed lower-triangular mask,
            i.e. the longest sequence length ``forward`` can build a mask for.
    """

    def __init__(self, model, block_size):
        super().__init__()
        self.model = model
        self.linear = nn.Linear(NUM_EMBED, vocab_size, bias=False)
        # Weight tying with the token embedding is currently disabled:
        # self.linear.weight = self.model.token_embed_layer.weight
        # Lower-triangular matrix of ones, registered as a buffer so it is
        # part of the module's state (and moves with `.to(device)`) without
        # being a trainable parameter.
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

    def forward(self, idx, targets=None):
        """Compute vocabulary logits and, when targets are given, the loss.

        Args:
            idx: 2-D tensor of shape ``(B, T)`` passed to the backbone.
            targets: optional tensor with ``B * T`` elements holding the
                class index expected at every position.

        Returns:
            ``(logits, loss)``. Without ``targets`` the logits keep the shape
            produced by the linear head and ``loss`` is ``None``. With
            ``targets`` the logits are flattened to ``(B * T, C)`` and
            ``loss`` is the cross-entropy against the flattened targets.
        """
        _, T = idx.shape
        # Boolean mask that is True strictly above the diagonal (future
        # positions); presumably the backbone treats True as "do not attend"
        # -- confirm against src.model.
        causal_mask = self.tril[:T, :T] == 0
        logits = self.linear(self.model(idx, attn_mask=causal_mask))

        # Identity check: `!= None` on a tensor goes through the tensor's
        # comparison operator instead of testing for absence.
        if targets is None:
            return logits, None

        # cross_entropy expects (N, num_classes) inputs and (N,) targets, so
        # the batch and time dimensions are merged.
        B, T, C = logits.shape
        logits = torch.reshape(logits, (B * T, C))
        targets = torch.reshape(targets, (B * T,))
        loss = F.cross_entropy(logits, targets)
        return logits, loss

In [20]:
# Wrap the backbone in the GPT head and move the result to DEVICE.
m = GPT(model, BLOCK_SIZE)
m = m.to(DEVICE)
# Report the total number of parameters, in millions.
print("Model with {:.2f}M parameters".format(sum(map(torch.numel, m.parameters())) / 1e6))

Model with 89.41M parameters


In [21]:
# `summary` is already imported (with an install fallback) in the imports
# cell, so it is not imported a second time here.
# Print the per-layer parameter counts of the model.
summary(m)

Layer (type:depth-idx)                                                 Param #
GPT                                                                    --
├─Transformer: 1-1                                                     --
│    └─ReLU: 2-1                                                       --
│    └─Embedding: 2-2                                                  23,440,896
│    └─PositionalEmbedding: 2-3                                        --
│    └─Sequential: 2-4                                                 --
│    │    └─TransformerBlock: 3-1                                      7,087,872
│    │    └─TransformerBlock: 3-2                                      7,087,872
│    │    └─TransformerBlock: 3-3                                      7,087,872
│    │    └─TransformerBlock: 3-4                                      7,087,872
│    │    └─TransformerBlock: 3-5                                      7,087,872
│    │    └─TransformerBlock: 3-6                               

In [22]:
# optimizer takes the model's parameters and the learning rate as input,
# and updates the parameters during the training process in order to
# minimize the loss function.
optimizer = torch.optim.AdamW(m.parameters(), lr=LEARNING_RATE)
# NOTE: this rebinds the MAX_ITER imported from src.gpt_utils, so the loop
# below runs for 500 steps regardless of the configured value.
MAX_ITER = 500
for step in range(MAX_ITER):

    # every EVAL_INTER steps (and on the last step) evaluate the loss on the
    # train and val sets
    if step % EVAL_INTER == 0 or step == MAX_ITER - 1:
        loss_train = estimate_loss(
            data=train_data, model=m, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
        )
        loss_val = estimate_loss(
            data=val_data, model=m, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
        )
        print("step {:10} | train loss {:6.4f} | val loss {:6.4f}".format(step, loss_train, loss_val))

    # sample a batch of data
    xb, yb = get_batch(data=train_data, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)
    # Call the module itself rather than `.forward` so that nn.Module's
    # `__call__` machinery (registered forward/backward hooks) is not bypassed.
    logits, loss = m(xb, yb)
    print("step {:10} | loss {:6.4f}".format(step, loss.item()))
    # zero_grad() method sets the gradients of all parameters in the optimizer
    # to None (set_to_none=True) so stale gradients are not accumulated
    optimizer.zero_grad(set_to_none=True)
    # backward() method on the loss variable calculates the gradients
    # of the loss with respect to the model's parameters.
    loss.backward()
    # step() method on the optimizer updates the model's parameters
    # using the calculated gradients, in order to minimize the loss.
    optimizer.step()



step          0 | train loss 10.5290 | val loss 10.5405
step          0 | loss 10.5447
step          1 | loss 9.5983
step          2 | loss 9.1837
step          3 | loss 8.8108
step          4 | loss 8.6106
step          5 | loss 8.2534
step          6 | loss 8.1428
step          7 | loss 7.9234
step          8 | loss 7.6984
step          9 | loss 7.4967
step         10 | loss 7.2215
step         11 | loss 7.1858
step         12 | loss 6.8793
step         13 | loss 6.8041
step         14 | loss 6.7497
step         15 | loss 6.6594
step         16 | loss 6.5199
step         17 | loss 6.5680
step         18 | loss 6.3590
step         19 | loss 6.4314
step         20 | loss 6.2648
step         21 | loss 6.4353
step         22 | loss 6.3949
step         23 | loss 6.4226
step         24 | loss 6.3417
step         25 | loss 6.1645
step         26 | loss 6.2899
step         27 | loss 6.2696
step         28 | loss 6.1990
step         29 | loss 6.2662
step         30 | loss 6.3897
step         