In [None]:
# Configuration cell.
# Folder that holds the dataset CSV files. The path is relative, so it is
# resolved against whatever the working directory is when the data is loaded.
PATH_DATA = "Masked Corpus"

In [3]:
import math

import torch
import torch.optim as optim

from torchtext.data import Field, LabelField
from torchtext.data import TabularDataset
from torchtext.data import Iterator, BucketIterator

import seq2seq_multilayer_gru_with_pad
from sequence_model_trainer import TrainModel

# Enable IPython's autoreload extension in mode 2 so that edits to the local
# helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

/content/gdrive/My Drive/Colab Notebooks/Notebooks/Luodingo


In [0]:
def _whitespace_tokenize(text):
    """Return the whitespace-separated tokens of ``text`` as a list."""
    return text.split()


# Source side (the 'keywords' column). Text is lower-cased, split on
# whitespace and wrapped in <sos>/<eos>. include_lengths=True asks the field
# to return the sequence lengths together with the padded batch.
MASKED_TEXT = Field(
    sequential=True,
    tokenize=_whitespace_tokenize,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
    include_lengths=True,
)

# Target side (the 'target' column). Same preprocessing as the source field,
# but without the lengths.
TARGET_TEXT = Field(
    sequential=True,
    tokenize=_whitespace_tokenize,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)

# Column-to-field mapping, in CSV column order. The 'id' column is bound to
# None, i.e. no field processes it.
fields = [
    ('id', None),
    ('keywords', MASKED_TEXT),
    ('target', TARGET_TEXT),
]

In [5]:
# Read the train / validation / test CSV files found under PATH_DATA.
# Each file's header row is skipped and its columns are bound to `fields`.
train, val, test = TabularDataset.splits(
    path=PATH_DATA,
    format='csv',
    fields=fields,
    skip_header=True,
    train='train.csv',
    validation='val.csv',
    test='test.csv',
)

/content/gdrive/My Drive/Colab Notebooks/Datasets


In [0]:
# Build both vocabularies from the training split only; the validation and
# test splits are not passed in, so they contribute no tokens.
for _text_field in (MASKED_TEXT, TARGET_TEXT):
    _text_field.build_vocab(train)

In [0]:
BATCH_SIZE = 32

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


def _keywords_length(example):
    """Sort key: number of tokens in the example's ``keywords`` field."""
    return len(example.keywords)


# One bucketing iterator per split, all sharing the batch size, the sort key
# and the target device. Examples are also sorted inside every batch.
train_iter, val_iter, test_iter = BucketIterator.splits(
    (train, val, test),
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=_keywords_length,
    sort_within_batch=True,
)

In [0]:
# Sizes derived from the vocabularies built above.
ENC_INPUT_DIM = len(MASKED_TEXT.vocab)
DEC_INPUT_DIM = len(TARGET_TEXT.vocab)
OUTPUT_DIM = DEC_INPUT_DIM  # the model's output size equals the target vocabulary size
# Index of the padding token in the target vocabulary.
TRG_PAD_IDX = TARGET_TEXT.vocab.stoi[TARGET_TEXT.pad_token]

# Tunable hyper-parameters.
EMB_DIM = 256
HID_DIM = 1024  # used for both enc_hid_dim and dec_hid_dim
N_LAYER = 4
DROPOUT = 0.3

# Build the sequence-to-sequence model from the project module, then move it
# to the selected device.
model = seq2seq_multilayer_gru_with_pad.Seq2Seq(
    enc_input_dim=ENC_INPUT_DIM,
    dec_input_dim=DEC_INPUT_DIM,
    output_dim=OUTPUT_DIM,
    emb_dim=EMB_DIM,
    enc_hid_dim=HID_DIM,
    dec_hid_dim=HID_DIM,
    n_layers=N_LAYER,
    dropout=DROPOUT,
    device=device,
)
model = model.to(device)

In [0]:
LEARNING_RATE = 1e-4

# Adam over every model parameter.
adam = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Cross-entropy loss; targets equal to the padding index are ignored.
cross_e = torch.nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

In [0]:
# Bundle the model, the data iterators, the optimizer and the loss into the
# project's training helper.
trainer = TrainModel(
    model=model,
    optimizer=adam,
    criterion=cross_e,
    train_iterator=train_iter,
    val_iterator=val_iter,
    output_dim=OUTPUT_DIM,
)

In [12]:
%cd /content/gdrive/My\ Drive/Colab\ Notebooks/Notebooks/Luodingo
N_EPOCHS = 200
CLIP = 1
trainer.epoch(n_epochs=N_EPOCHS, clip=CLIP, model_name='seq2seq-multilayer-gru.pt')

/content/gdrive/My Drive/Colab Notebooks/Notebooks/Luodingo
Epoch: 01 | Time: 0m 59s
	Train Loss: 5.035 | Train PPL: 153.761
	 Val. Loss: 4.743 |  Val. PPL: 114.774


KeyboardInterrupt: ignored

In [0]:
# Evaluate on the test iterator, passing the same model file name that was
# given to trainer.epoch().
test_loss = trainer.test(
    model_name='seq2seq-multilayer-gru.pt',
    iterator=test_iter,
)

In [14]:
# Report the test loss and the corresponding perplexity (exp of the loss),
# each rounded to four decimal places. `math` comes from the import cell at
# the top of the notebook.
print(f'| Test Loss: {round(test_loss, 4)} | Test PPL: {round(math.exp(test_loss),4)} |')

| Test Loss: 4.743 | Test PPL: 114.7747 |
