In [1]:
import os, sys

# Put the parent of the current working directory on sys.path
# (presumably so that the `src.*` packages imported below can be resolved).
dir2 = os.path.abspath('')
dir1 = os.path.split(dir2)[0]
if dir1 not in sys.path:
    sys.path.append(dir1)

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
import tqdm
import tqdm.auto
from IPython.display import clear_output

from src.data.make_dataset import TextDetoxificationDataset, Evaluator
from src.models.train_model import BaselineTranslationModel

In [3]:
!nvidia-smi

Sat Oct 21 17:41:18 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 536.67                 Driver Version: 536.67       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...  WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   80C    P0              35W /  35W |   3192MiB /  4096MiB |    100%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
# Build the training split first (the log output below shows it collecting a vocab),
# then build the validation split on top of the *training* vocabulary so both splits
# share the same token ids.
train_dataset = TextDetoxificationDataset(mode='train')
val_dataset = TextDetoxificationDataset(mode='val', vocab=train_dataset.vocab)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mirak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[32m2023-10-21 17:41:21.836[0m | [1mINFO    [0m | [36msrc.data.make_dataset[0m:[36m__init__[0m:[36m212[0m - [1mStarted building vocab[0m


Collecting vocab: 0it [00:00, ?it/s]

[32m2023-10-21 17:43:13.940[0m | [1mINFO    [0m | [36msrc.data.make_dataset[0m:[36m__init__[0m:[36m219[0m - [1mVocab built successfully[0m
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mirak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Baseline model over the training vocabulary: 200-dim embeddings, 200-dim hidden state,
# one layer. The output below shows the constructor filling the embedding matrix from
# pretrained vectors.
model = BaselineTranslationModel(vocab=train_dataset.vocab, emb_dim=200, hidden_dim=200, n_layers=1)

Initializing with pretrained embeddings:   0%|          | 0/20085 [00:00<?, ?it/s]

  self.embeddings.weight.data[self.vocab[word]] = torch.Tensor(glove_model.get_vector(word))


In [6]:
def _embedding_of(word):
    """Look `word` up in the model's embedding layer and return it as a squeezed numpy array."""
    token_id = torch.Tensor([model.vocab[word]]).long()
    return model.embeddings(token_id).numpy().squeeze()


def _cosine(a, b):
    """Cosine similarity of two embedding vectors."""
    return a.T @ b / np.linalg.norm(a) / np.linalg.norm(b)


# Classic analogy check on the loaded embeddings: king - man + woman should be
# much closer to "queen" than to an unrelated word such as "carrot".
with torch.no_grad():
    w1, w2, w3, w4, w5 = (
        _embedding_of(word) for word in ('king', 'queen', 'man', 'woman', 'carrot')
    )

royal = w1 - w3
analogy = royal + w4
queen2queen = _cosine(analogy, w2)
queen2carrot = _cosine(analogy, w5)
print(queen2queen, queen2carrot)

0.65601796 0.19445348


In [7]:
# code taken from https://discuss.pytorch.org/t/pytorch-model-size-in-mbs/149002
def _param_bits(param):
    """Number of bits occupied by all elements of one parameter tensor."""
    dtype = param.data.dtype
    if param.data.is_floating_point():
        width = torch.finfo(dtype).bits
    else:
        width = torch.iinfo(dtype).bits
    return param.numel() * width


# Total parameter storage: element count times element width, summed over all parameters.
size_model = sum(_param_bits(param) for param in model.parameters())
print(f"model size: {size_model} bit | {size_model / 8e6:.2f} MB")

model size: 145909920 bit | 18.24 MB


### DataLoaders

In [19]:
def compute_loss(logits_seq, out, pad_idx=None):
    """Average token-level cross-entropy of the reference `out`, ignoring padding.

    Args:
        logits_seq: unnormalised model outputs, [batch_size, out_len, num_tokens].
        out: reference token ids (long), [batch_size, out_len].
        pad_idx: id of the padding token. When omitted, falls back to
            `model.vocab['<pad>']` of the notebook-level `model`, which is what
            this function always used before the parameter existed.

    Returns:
        Scalar tensor: mean negative log-probability of the non-padding reference tokens.
    """
    if pad_idx is None:
        pad_idx = model.vocab['<pad>']
    mask = out != pad_idx  # [batch_size, out_len]

    # log-probabilities of all tokens at all steps, [batch_size, out_len, num_tokens];
    # the small epsilon keeps the log finite when a probability underflows to zero
    logprobs_seq = torch.log(logits_seq.softmax(dim=-1) + 1e-6)

    # log-probabilities of correct outputs, [batch_size, out_len].
    # Picking them with gather yields the same values as multiplying by a one-hot
    # target and summing, without allocating a float tensor as large as the logits.
    logp_out = logprobs_seq.gather(dim=-1, index=out.unsqueeze(-1)).squeeze(-1)

    # average cross-entropy over non-padding tokens
    return -torch.masked_select(logp_out, mask).mean()  # average loss, scalar

In [9]:
def collate_batch(batch, max_len=64):
    """Collate dataset items into padded (source, target) id tensors on `device`.

    Args:
        batch: iterable of 3-tuples `(src_sentence, tgt_sentence, _)`; the third
            element is ignored. Each sentence must support slicing and `.tolist()`.
        max_len: maximum number of tokens kept from each sentence *before* the
            BOS/EOS markers are added, so rows can be up to `max_len + 2` long.

    Returns:
        Tuple `(source, target)` of long tensors shaped [batch_size, longest_row],
        padded with `train_dataset.PAD_IDX` and moved to the notebook-level `device`.
    """
    def _encode(sentence):
        # Build the ids as integers directly: going through a float tensor
        # (torch.Tensor(...).long()) is lossy for ids above 2**24.
        ids = [train_dataset.BOS_IDX] + sentence[:max_len].tolist() + [train_dataset.EOS_IDX]
        return torch.tensor(ids, dtype=torch.long)

    source, target = [], []
    for src_sentence, tgt_sentence, _ in batch:
        source.append(_encode(src_sentence))
        target.append(_encode(tgt_sentence))

    source = torch.nn.utils.rnn.pad_sequence(source, batch_first=True, padding_value=train_dataset.PAD_IDX)
    target = torch.nn.utils.rnn.pad_sequence(target, batch_first=True, padding_value=train_dataset.PAD_IDX)

    return source.to(device), target.to(device)

In [13]:
batch_size = 32
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Both loaders share the same batching, shuffling and collation settings.
loader_kwargs = dict(batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
train_loader = torch.utils.data.DataLoader(train_dataset, **loader_kwargs)
val_loader = torch.utils.data.DataLoader(val_dataset, **loader_kwargs)

In [16]:
# Sanity check: push a single training batch through the model and the loss.
# The batch is taken with next(iter(...)) instead of a progress-bar loop that
# breaks after its first iteration.
model.to(device)
src, tgt = next(iter(train_loader))
with torch.no_grad():
    out = model(src, tgt)
    loss = compute_loss(out, tgt)
print(src.shape)
print(tgt.shape)
# the last dimension of the output is expected to match the vocabulary size
print(out.shape, len(train_dataset.vocab))
print(loss.item())

  0%|          | 0/14445 [00:00<?, ?it/s]

torch.Size([32, 45])
torch.Size([32, 43])
torch.Size([32, 43, 20085]) 20085
11.783910751342773


In [None]:
# Overfit a single batch: if a small model cannot drive the loss down on one
# fixed batch, something in the model or the loss is broken.
batch = next(iter(train_loader))
# Train on the batch fetched here rather than on whatever `src`/`tgt` an
# earlier cell happened to leave behind in the kernel.
src, tgt = batch
dummy_model = BaselineTranslationModel(vocab=train_dataset.vocab, emb_dim=100, hidden_dim=100, n_layers=1).to(device)
optimizer = torch.optim.Adam(dummy_model.parameters(), lr=3e-4)
loss_fn = compute_loss
for i in range(1000):
    optimizer.zero_grad()
    out = dummy_model(src, tgt)
    loss = loss_fn(out, tgt)
    loss.backward()
    optimizer.step()
    if i % 200 == 0:
        print(loss.item())

11.750530242919922
1.4347864389419556


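The model successfully overfits a single batch (the loss drops from about 11.75 to about 1.43), so the model and the loss function are wired together correctly.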

### Training

In [None]:
# Optimisation setup for the full model: Adam with a cosine-annealed learning
# rate whose half-period (T_max) equals the number of epochs.
epochs = 5
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
model = model.to(device)

In [None]:
# Train for `epochs` passes over the training set, recording the loss of every
# optimisation step and the mean validation loss after each epoch.
#
# NOTE(review): 'dev_bleu' is kept so the layout of `metrics` is unchanged, but it
# is not filled here. The previous version called `bleu_score(model, dev_inp, dev_out)`,
# and none of those names is defined in the visible notebook. TODO: plug in a BLEU
# evaluation (possibly via the imported `Evaluator` - confirm its API).
# NOTE(review): model.train()/model.eval() are deliberately not toggled because the
# forward pass of BaselineTranslationModel is not visible here; confirm whether it
# behaves differently in eval mode before adding them.
metrics = {'train_loss': [], 'dev_bleu': [], 'dev_loss': []}
for _ in tqdm.auto.tqdm(range(epochs)):
    for src, tgt in train_loader:
        step = len(metrics['train_loss']) + 1
        optimizer.zero_grad()
        # same call convention as the sanity check: the model takes source and target
        loss = compute_loss(model(src, tgt), tgt)
        loss.backward()
        optimizer.step()
        metrics['train_loss'].append((step, loss.item()))

    # The cosine schedule was created with T_max=epochs, so it advances once per
    # epoch; stepping it per batch would make the learning rate cycle every few steps.
    scheduler.step()

    # mean loss over the validation set, without tracking gradients
    with torch.no_grad():
        dev_losses = [compute_loss(model(src, tgt), tgt).item() for src, tgt in val_loader]
    metrics['dev_loss'].append((step, float(np.mean(dev_losses))))

    # redraw the learning curves in place; metrics with no data yet are skipped
    clear_output(True)
    plotted = [(name, history) for name, history in sorted(metrics.items()) if history]
    plt.figure(figsize=(12,4))
    for i, (name, history) in enumerate(plotted):
        plt.subplot(1, len(plotted), i + 1)
        plt.title(name)
        plt.plot(*zip(*history))
        plt.grid()
    plt.show()
    print("Mean loss=%.3f" % np.mean(metrics['train_loss'][-10:], axis=0)[1], flush=True)

### Evaluation on test set