In [15]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
! cd /kaggle/working/
! cp -r /kaggle/input/erav1-s17/S17 .

In [17]:
cd /kaggle/working/S17

/kaggle/working/S17


In [18]:
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import torch

from src.bert_utils import get_batch, save_model_embeddings
from src.datasets import SentencesDataset, prepare_sentences_dataset
from src.model import Transformer
from src.engines import bert_training_step
try:
    from torchinfo import summary
except:
    print("[INFO] Couldn't find torchinfo... installing it.")
    !pip install -q torchinfo
    from torchinfo import summary

In [19]:
# ---- Hyperparameters --------------------------------------------------------
# Data / batching
seq_len = 20          # sequence length given to the dataset and the model
batch_size = 512
n_workers = 2
n_vocab = 40000       # vocabulary size

# Transformer dimensions
embed_size = 128
inner_ff_size = 4 * embed_size   # hidden width of the feed-forward (MLP) block
n_heads = 8
n_code = 8            # number of stacked transformer layers (passed as num_layers)
dropout = 0.1

# Keyword arguments forwarded to the optimizer constructor
optim_kwargs = dict(lr=1e-4, weight_decay=1e-4, betas=(0.9, 0.999))

In [20]:
# Load the training text and the vocabulary file; both paths are relative to
# the project directory entered above (/kaggle/working/S17).
sentences, vocab = prepare_sentences_dataset(
    dataset_path='data/training.txt',
    vocab_path='data/vocab.txt',
    vocab_size=n_vocab,
)

Loading dataset from data/training.txt
Tokenizing dataset!
Loading vocabulary from data/vocab.txt


In [21]:
# Wrap the loaded sentences and vocabulary in the project's dataset class,
# using the sequence length from the config cell.
dataset = SentencesDataset(sentences, vocab, seq_len=seq_len)

In [22]:
# DataLoader options: reshuffle each pass, drop the final short batch, and use
# pinned host memory.
# NOTE(review): `n_workers` from the config cell is not forwarded here, so
# loading runs in the main process — confirm this is intentional.
kwargs = dict(
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    pin_memory=True,
)
data_loader = DataLoader(dataset, **kwargs)

In [23]:
# Build the transformer encoder from the config-cell hyperparameters.
model = Transformer(
    # vocabulary / sequence
    embed_dict_size=n_vocab,
    max_seq_len=seq_len,
    pad_idx=dataset.IGNORE_IDX,
    add_cls_token=False,
    pe_requires_grad=False,
    # width / depth
    embed_dim=embed_size,
    num_layers=n_code,
    # attention
    num_heads=n_heads,
    attn_dropout=dropout,
    # feed-forward block
    mlp_dim=inner_ff_size,
    mlp_activation=nn.ReLU(),
    mlp_dropout=dropout,
)

In [24]:
class BERT(nn.Module):
    """Transformer encoder plus a bias-free linear head producing per-token vocabulary logits.

    Args:
        model: Encoder module. It must expose ``token_embed_layer`` (an
            ``nn.Embedding``) and be callable as ``model(x, attn_mask)``.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        embed_layer = model.token_embed_layer
        # Size the head from the encoder's own embedding table rather than from
        # notebook globals, so the two can never drift apart.
        # Assumes the encoder's output width equals the embedding width.
        self.output_linear = nn.Linear(
            embed_layer.embedding_dim,
            embed_layer.num_embeddings,
            bias=False,
        )
        # Same module object as inside ``model``, re-exposed at the top level.
        self.token_embed_layer = embed_layer

    def forward(self, x, attn_mask):
        """Run the encoder on ``x`` with ``attn_mask`` and project its output to vocabulary logits."""
        hidden = self.model(x, attn_mask)
        return self.output_linear(hidden)
    

In [25]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model = BERT(model)
# Last expression of the cell: moves the model to `device` and displays its module tree.
bert_model.to(device)

BERT(
  (model): Transformer(
    (mlp_activation): ReLU()
    (token_embed_layer): Embedding(40000, 128, padding_idx=23945)
    (pos_embed_layer): PositionalEmbedding()
    (transformer_blocks): Sequential(
      (0): TransformerBlock(
        (mha_block): MultiheadSelfAttentionBlock(
          (layer_norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (multihead_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
          )
        )
        (mlp_block): MultiLayerPerceptronBlock(
          (layer_norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (0): Linear(in_features=128, out_features=512, bias=True)
            (1): ReLU()
            (2): Dropout(p=0.1, inplace=False)
            (3): Linear(in_features=512, out_features=128, bias=True)
            (4): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (1): Transform

In [26]:
# Display torchinfo's per-layer parameter-count table for the wrapped model.
summary(bert_model)

Layer (type:depth-idx)                                                 Param #
BERT                                                                   --
├─Transformer: 1-1                                                     --
│    └─ReLU: 2-1                                                       --
│    └─Embedding: 2-2                                                  5,120,000
│    └─PositionalEmbedding: 2-3                                        --
│    └─Sequential: 2-4                                                 --
│    │    └─TransformerBlock: 3-1                                      198,272
│    │    └─TransformerBlock: 3-2                                      198,272
│    │    └─TransformerBlock: 3-3                                      198,272
│    │    └─TransformerBlock: 3-4                                      198,272
│    │    └─TransformerBlock: 3-5                                      198,272
│    │    └─TransformerBlock: 3-6                                      198,

In [27]:
# Cross-entropy loss; targets equal to dataset.IGNORE_IDX are excluded from it.
loss_fn = nn.CrossEntropyLoss(ignore_index=dataset.IGNORE_IDX)
# Adam over the wrapped model's parameters, configured by the config cell's optim_kwargs.
optimizer = optim.Adam(
    bert_model.parameters(),
    **optim_kwargs,
)

In [28]:
print('Training...')
print_each = 10       # logging interval, forwarded to the training step
n_iteration = 10000   # total number of training steps

# Switch the module that is actually trained (the BERT wrapper, which contains
# the encoder) to training mode, not just the inner encoder.
bert_model.train()
batch_iter = iter(data_loader)
for itr in range(n_iteration):
    bert_training_step(
        itr=itr,
        model=bert_model,
        data_loader=data_loader,
        batch_iter=batch_iter,
        loss_fn=loss_fn,
        optimizer=optimizer,
        print_each=print_each,
        device=device,
    )

Training...
it: 0  | loss 10.72  | Δw: 1.44
it: 10  | loss 10.05  | Δw: 0.753
it: 20  | loss 9.79  | Δw: 0.508
it: 30  | loss 9.67  | Δw: 0.422
it: 40  | loss 9.52  | Δw: 0.333
it: 50  | loss 9.33  | Δw: 0.312
it: 60  | loss 9.1  | Δw: 0.297
it: 70  | loss 8.97  | Δw: 0.288
it: 80  | loss 8.76  | Δw: 0.256
it: 90  | loss 8.63  | Δw: 0.232
it: 100  | loss 8.49  | Δw: 0.228
it: 110  | loss 8.36  | Δw: 0.232
it: 120  | loss 8.15  | Δw: 0.225
it: 130  | loss 8.05  | Δw: 0.214
it: 140  | loss 7.95  | Δw: 0.212
it: 150  | loss 7.77  | Δw: 0.2
it: 160  | loss 7.67  | Δw: 0.191
it: 170  | loss 7.55  | Δw: 0.188
it: 180  | loss 7.41  | Δw: 0.183
it: 190  | loss 7.29  | Δw: 0.181
it: 200  | loss 7.22  | Δw: 0.182
it: 210  | loss 7.19  | Δw: 0.178
it: 220  | loss 7.03  | Δw: 0.185
it: 230  | loss 6.92  | Δw: 0.176
it: 240  | loss 6.88  | Δw: 0.176
it: 250  | loss 6.85  | Δw: 0.173
it: 260  | loss 6.9  | Δw: 0.177
it: 270  | loss 6.79  | Δw: 0.175
it: 280  | loss 6.84  | Δw: 0.186
it: 290  | loss 