In [17]:
import warnings
warnings.filterwarnings('ignore')

# Python 
import os
import warnings
import logging
from typing import Mapping, List
from pprint import pprint

# Numpy and Pandas 
import numpy as np
import pandas as pd

# PyTorch 
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchtext.data import Field, BucketIterator
from torchtext import data
from torchtext import vocab

# Transformers 
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Catalyst
from catalyst.dl import SupervisedRunner
from catalyst.dl.callbacks import AccuracyCallback, F1ScoreCallback, OptimizerCallback
from catalyst.dl.callbacks import CheckpointCallback, InferCallback
from catalyst.utils import set_global_seed, prepare_cudnn

from arxiv_title_generation import generate_csv
from arxiv_title_generation import tokenize
from arxiv_title_generation import Encoder, Attention, Decoder, Seq2Seq, init_weights
from arxiv_title_generation import train, evaluate, count_parameters, epoch_time
from arxiv_title_generation import translate_sentence, display_attention
from arxiv_title_generation import BatchGenerator

In [9]:
SEED = 42
set_global_seed(SEED)
prepare_cudnn(deterministic=True)

In [11]:
ROOT_PATH = "../../data/raw/title-generation/"
MODEL_NAME = 'base-seq2seq-01' # pretrained model from Transformers
LOG_DIR = "./logdir"                   # for training logs and tensorboard visualizations
NUM_EPOCHS = 5                         # smth around 2-6 epochs is typically fine when finetuning transformers
BATCH_SIZE = 64                       # depends on your available GPU memory (in combination with max seq length)
LEARN_RATE = 5e-5                      # learning rate is typically ~1e-5 for transformers
ACCUM_STEPS = 1                        # one optimization step for that many backward passes
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
TEXT = Field(tokenize=tokenize,
            init_token = '<sos>', 
            eos_token = '<eos>', 
            include_lengths = True,
            lower = True)

trn_data_fields = [("src", TEXT),
                   ("trg", TEXT)]

dataset = data.TabularDataset(
    path=f"{ROOT_PATH}train.csv",
    format='csv',
    skip_header=True,
    fields=trn_data_fields
)

train_data, valid_data, test_data = dataset.split(split_ratio=[0.98, 0.01, 0.01])

In [13]:
# python -m gensim.downloader --download glove-wiki-gigaword-300
vec = vocab.Vectors('/home/science/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz')
TEXT.build_vocab(train_data, valid_data, test_data, vectors=vec, min_freq = 7)
print(f"Unique tokens in vocabulary: {len(TEXT.vocab)}")

In [35]:
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
     batch_size = BATCH_SIZE,
     sort_within_batch = True,
     sort_key = lambda x : len(x.src),
     device = device)

loaders = {
    "train": train_iterator,
    "valid": valid_iterator
}

In [None]:
def generate_batch(batch):
    label = torch.tensor([entry[0] for entry in batch])
    text = [entry[1] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    # torch.Tensor.cumsum returns the cumulative sum
    # of elements in the dimension dim.
    # torch.Tensor([1.0, 2.0, 3.0]).cumsum(dim=0)

    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    output = {
        "text": text,
        "offsets": offsets,
        "label": label
    }
    return output


train_loader = torch.utils.data.DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True,
    collate_fn=generate_batch,
)

valid_loader = torch.utils.data.DataLoader(
    valid_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=False,
    collate_fn=generate_batch,
)

loaders = {
    "train": train_batch_it,
    "valid": valid_batch_it
}

In [31]:
train_batch_it = BatchGenerator(train_iterator, 'src', 'trg')
valid_batch_it = BatchGenerator(valid_iterator, 'src', 'trg')
loaders = {
    "train": train_batch_it,
    "valid": valid_batch_it
}

In [25]:
%load_ext tensorboard
%tensorboard --logdir {LOG_DIR}

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6011 (pid 5401), started 0:14:22 ago. (Use '!kill 5401' to kill it.)

In [36]:
INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = len(TEXT.vocab)
ENC_EMB_DIM = 300
DEC_EMB_DIM = 300
ENC_HID_DIM = 64
DEC_HID_DIM = 64
ENC_DROPOUT = 0.8
DEC_DROPOUT = 0.8
PAD_IDX = TEXT.vocab.stoi['<pad>']
SOS_IDX = TEXT.vocab.stoi['<sos>']
EOS_IDX = TEXT.vocab.stoi['<eos>']

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)
model = Seq2Seq(enc, dec, PAD_IDX, SOS_IDX, EOS_IDX, device).to(device)
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(40938, 300)
    (rnn): GRU(300, 64, bidirectional=True)
    (fc): Linear(in_features=128, out_features=64, bias=True)
    (dropout): Dropout(p=0.8, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=192, out_features=64, bias=True)
    )
    (embedding): Embedding(40938, 300)
    (rnn): GRU(428, 64)
    (out): Linear(in_features=492, out_features=40938, bias=True)
    (dropout): Dropout(p=0.8, inplace=False)
  )
)

In [37]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARN_RATE)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

In [38]:
%%time


runner = SupervisedRunner(device=device)

runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    # our dataloaders
    loaders=loaders,
    # We can specify the callbacks list for the experiment;
    # For this task, we will check accuracy, AUC and F1 metrics
#     callbacks=callbacks,
    # path to save logs
    logdir=LOG_DIR,
    num_epochs=NUM_EPOCHS,
    # save our best checkpoint by AUC metric
#     main_metric="auc/_mean",
    # AUC needs to be maximized.
    minimize_metric=False,
    # for FP16. It uses the variable from the very first cell
#     fp16=fp16_params,
    # prints train logs
    verbose=True,
)

AttributeError: 'BucketIterator' object has no attribute 'sampler'