In [1]:
import os, sys

# Make the parent of the current working directory importable, so that the
# `src.*` packages imported in the next cell resolve when the notebook is
# launched from a sub-directory of the project.
dir2 = os.path.abspath('')        # absolute path of the current working directory
dir1 = os.path.split(dir2)[0]     # its parent directory

# Append only once, so re-running this cell does not pile up duplicate entries.
if dir1 not in sys.path:
    sys.path.append(dir1)

In [6]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
from IPython.display import clear_output
from tqdm.auto import tqdm

from src.data.make_dataset import TextDetoxificationDataset, bleu_score
from src.models.train_model import BaselineTranslationModel

In [3]:
# Shell escape: print the GPU, driver and CUDA version visible to this kernel.
!nvidia-smi

Tue Oct 17 21:25:54 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 536.67                 Driver Version: 536.67       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...  WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A    0C    P3              11W /  30W |      0MiB /  4096MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
# Training split. Judging by the "Building vocab" progress bar it prints, this
# is where the vocabulary gets built.
train_dataset = TextDetoxificationDataset(mode='train')

# Validation split shares the training vocabulary instead of building its own.
val_dataset = TextDetoxificationDataset(
    mode='val',
    vocab=train_dataset.vocab,
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mirak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Building vocab:   0%|          | 0/462221 [00:00<?, ?it/s]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mirak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Baseline model over the training token->index mapping.
# NOTE(review): the cell output mentions initialisation from pretrained GloVe
# vectors, so emb_dim=200 presumably has to match the GloVe dimensionality -
# confirm against BaselineTranslationModel.
model = BaselineTranslationModel(
    token2idx=train_dataset.token2idx,
    emb_dim=200,
    hidden_dim=200,
    n_layers=1,
)

Initializing with pretrained embeddings:   0%|          | 0/21707 [00:00<?, ?it/s]

  self.embeddings.weight.data[self.token2idx[word]] = torch.Tensor(glove_model.get_vector(word))


In [18]:
def _embedding_of(word):
    """Return the model's embedding of `word` as a squeezed NumPy array."""
    token_ix = torch.Tensor([model.token2idx[word]]).long()
    with torch.no_grad():
        return model.embeddings(token_ix).numpy().squeeze()


# Sanity check of the embeddings with the classic analogy:
# king - man + woman should be closer to "queen" than to an unrelated word.
w1, w2, w3, w4, w5 = map(_embedding_of, ('king', 'queen', 'man', 'woman', 'carrot'))

royal = w1 - w3
analogy = royal + w4
analogy_norm = np.linalg.norm(analogy)

# Cosine similarity of the analogy vector with "queen" (s1) and "carrot" (s2).
s1 = analogy.T @ w2 / analogy_norm / np.linalg.norm(w2)
s2 = analogy.T @ w5 / analogy_norm / np.linalg.norm(w5)
print(s1, s2)

0.65601796 0.19445348


In [19]:
def compute_loss(model, inp, out, **inference_params):
    """
    Average cross-entropy of the model's predictions over non-padding tokens.

    :param model: callable as ``model(inp, out, **inference_params)`` returning
        logits of shape [batch_size, out_len, num_tokens]; must expose
        ``token2idx`` containing the ``'<pad>'`` token.
    :param inp: input token indices, passed to the model unchanged.
    :param out: int64 target token indices, [batch_size, out_len].
    :param inference_params: extra keyword arguments forwarded to the model.
    :returns: scalar tensor, the mean negative log-likelihood of the reference
        tokens, ignoring positions equal to ``'<pad>'``. NaN if every target
        position is padding (mean over an empty set).
    """
    pad_ix = model.token2idx['<pad>']

    # outputs of the model, [batch_size, out_len, num_tokens]
    logits_seq = model(inp, out, **inference_params)

    # cross_entropy applies log-softmax internally, so very confident logits do
    # not underflow to log(0) = -inf (which, multiplied by a one-hot zero, used
    # to produce NaN). It also avoids building a dense one-hot tensor of shape
    # [batch_size, out_len, num_tokens].
    # ignore_index drops padding positions from the sum and the denominator.
    return F.cross_entropy(
        logits_seq.reshape(-1, logits_seq.shape[-1]),  # [batch_size * out_len, num_tokens]
        out.reshape(-1),                               # [batch_size * out_len]
        ignore_index=pad_ix,
    )

In [22]:
# --- Training configuration ---------------------------------------------
num_steps = 15000      # number of optimisation steps; also the cosine schedule horizon
batch_size = 16
loss_fn = compute_loss

# Use the GPU when one is available. The model has to live on the same device
# as the batches fed to it, so move it before training starts.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
# Anneal the learning rate along a cosine curve over the whole run.
# (No trailing line-continuation backslash here: it would splice this statement
# into the following line and make the cell a SyntaxError.)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, num_steps)

### Training

In [None]:
# Metric histories as lists of (step, value) pairs; they drive the live plots.
# Re-running this cell starts the histories from scratch.
metrics = {'train_loss': [], 'dev_bleu': []}

# NOTE(review): train_inp / train_out / dev_inp / dev_out are not defined in any
# visible cell. They are presumably arrays of source / target sentences taken
# from train_dataset / val_dataset - define them before running this cell.
for _ in tqdm(range(num_steps)):
    step = len(metrics['train_loss']) + 1

    # Sample a random mini-batch (with replacement) and convert it to index matrices.
    batch_ix = np.random.randint(len(train_inp), size=batch_size)
    batch_inp = train_dataset.vocab.to_matrix(train_inp[batch_ix]).to(device)
    batch_out = train_dataset.vocab.to_matrix(train_out[batch_ix]).to(device)

    # One optimisation step. The loss tensor gets its own name (loss_t) so that
    # loss_fn keeps pointing at the loss function on the next iteration.
    optimizer.zero_grad()
    loss_t = loss_fn(model, batch_inp, batch_out)
    loss_t.backward()
    optimizer.step()
    scheduler.step()

    metrics['train_loss'].append((step, loss_t.item()))

    # Every 100 steps: evaluate BLEU on the dev data and redraw the curves.
    if step % 100 == 0:
        metrics['dev_bleu'].append((step, bleu_score(model, dev_inp, dev_out)))

        clear_output(True)
        plt.figure(figsize=(12,4))
        for i, (name, history) in enumerate(sorted(metrics.items())):
            plt.subplot(1, len(metrics), i + 1)
            plt.title(name)
            plt.plot(*zip(*history))
            plt.grid()
        plt.show()
        # Mean of the last 10 recorded training losses (column 1 of the (step, loss) pairs).
        print("Mean loss=%.3f" % np.mean(metrics['train_loss'][-10:], axis=0)[1], flush=True)

### Evaluation on test set