In [1]:
# Add the src module directory to the system path for subsequent imports.
import os
import sys

# Path is relative to the notebook's working directory.
SRC_DIR = '../src'
# Guard the insert so that re-running this cell does not stack duplicate
# entries on sys.path.
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

In [2]:
from util import is_notebook

# Interactive-only settings: the magics below are IPython syntax, so they are
# guarded by is_notebook() and skipped when it returns a falsy value.
if is_notebook():
    # Load the autoreload extension; this must happen before `%autoreload`.
    %load_ext autoreload
    # Mode 2: reload modules before executing code, so edits to the project
    # sources are picked up without restarting the kernel.
    # TODO: consider `%aimport` to restrict reloading to selected modules.
    %autoreload 2
    # Render matplotlib figures inline in the notebook output.
    %matplotlib inline

In [3]:
# Imports.
import pytorch_lightning as pl
import torch
from pytorch_lightning import Trainer
from torch.utils.data import DataLoader
from datasets import load_metric

from constants import *
from util import *
from transformer import Transformer
from tokenizer import load_tokenizer
from data import download_data, load_data
from plotting import plot_metric
from metric_logging import MetricLogger

In [4]:
# Fix the global random seed so that repeated runs are comparable.
# The call is left as the cell's last expression, so the seed it returns is
# shown as the cell output.
from pytorch_lightning import seed_everything

seed_everything(seed=0, workers=True)

Global seed set to 0


0

In [6]:
# Experiment parameters.
# NOTE(review): the model-construction cell also reads `hparams.lr`,
# `hparams.num_encoder_layers`, `hparams.num_decoder_layers`,
# `hparams.emb_size`, `hparams.n_head` and `hparams.ffn_hid_dim`, none of
# which are set here -- confirm where those values are supposed to come from.
hparams = dotdict({
    # Language pair; passed to the data download, tokenizer and data loading.
    'src_lang': 'de',
    'tgt_lang': 'en',
    # File handed to `torch.load` when restoring the model weights.
    # NOTE(review): empty by default -- must be set before running the
    # model-loading cell.
    'model_path': '',
    # Used for all three data loaders.
    'batch_size': 80,
    'max_epochs': 10,
    # Passed to `load_data`; presumably -1 means "no limit" -- TODO confirm.
    'max_examples': -1,
    # Only used by the trainer when `device` is 'cuda'; otherwise 0 GPUs.
    'gpus': 1,
    'num_workers': 4,
    # Not read by any cell visible in this notebook.
    'ckpt_path': None,
})

print('Experiment parameters:')
print(hparams)

In [8]:
# Constant directories (relative to the notebook's working directory).
data_dir = os.path.join('./', 'data')
tokenizers_dir = os.path.join('./', 'tokenizers')
runs_dir = os.path.join('./', 'runs')

# Experiment directories: one timestamped folder per evaluation run.
# The language pair is read from `hparams`, like every other cell, instead of
# relying on bare `src_lang` / `tgt_lang` names that no cell defines.
run_dir = os.path.join(
    runs_dir,
    f'eval-{hparams.src_lang}-{hparams.tgt_lang}-{get_time_as_string()}',
)
results_dir = os.path.join(run_dir, 'results')

dirs = [data_dir, tokenizers_dir, runs_dir, run_dir, results_dir]
# Parents come before children in `dirs`, so each directory's parent has
# already been handled when it is reached.
for directory in dirs:  # `directory`, not `dir`, to avoid shadowing the builtin
    create_dir(directory)

print('Created directories.')

Dir "./data" already exists.
Dir "./runs" already exists.
Dir "./runs\eval-de-en-2022.08.09-11.28.26" does not exist, creating it.
Dir "./runs\eval-de-en-2022.08.09-11.28.26\results" does not exist, creating it.


In [9]:
# Load the scoring metric (SacreBLEU); it is handed to the model constructor
# further down.
score_metric = load_metric(path='sacrebleu')

print('Loaded metrics.')

Loaded metrics


In [10]:
# Download the data for the configured language pair.
download_data(
    hparams.src_lang,
    hparams.tgt_lang,
)

Loaded tokenizers.


In [12]:
# Load one tokenizer per language. Each call names its own language first and
# the other language of the pair second.
src_tokenizer, tgt_tokenizer = (
    load_tokenizer(hparams.src_lang, hparams.tgt_lang),
    load_tokenizer(hparams.tgt_lang, hparams.src_lang),
)

print('Loaded tokenizers.')

Preprocessed data exists, loading from disk...
Splitting de-en data...
Data (de-en) proprocessed.
Preprocessed data (de-en)
	Training data:   441000
	Validation data: 4500
	Test data:       4500


In [None]:
# Load the train / validation / test splits for the configured language pair.
train_dataset, val_dataset, test_dataset = load_data(
    hparams.src_lang,
    hparams.tgt_lang,
    src_tokenizer,
    tgt_tokenizer,
    hparams.max_examples,
)

# Report the split sizes: one line per item, emitted by a single print call.
print(
    f'Preprocessed data ({hparams.src_lang}-{hparams.tgt_lang})',
    f'\tTraining data:   {len(train_dataset)}',
    f'\tValidation data: {len(val_dataset)}',
    f'\tTest data:       {len(test_dataset)}',
    sep='\n',
)

In [13]:
# Create one data loader per split; all three share the same batch size and
# worker count from `hparams`.
train_dataloader, val_dataloader, test_dataloader = (
    DataLoader(
        split,
        batch_size=hparams.batch_size,
        num_workers=hparams.num_workers,
    )
    for split in (train_dataset, val_dataset, test_dataset)
)

print('Created data loaders.')

Created data loaders.


In [14]:
# Create model.
# The tokenizers are the `src_tokenizer` / `tgt_tokenizer` objects created by
# the tokenizer-loading cell; the previous `*_lang_tokenizer` names are not
# defined by any cell in this notebook.
# NOTE(review): `LitSeq2SeqTransformer` is not among the names imported
# explicitly at the top (only `Transformer` is) -- confirm it is provided by
# one of the star imports, or switch to the intended class.
# NOTE(review): `lr`, `num_encoder_layers`, `num_decoder_layers`, `emb_size`,
# `n_head` and `ffn_hid_dim` are not set in the `hparams` cell -- confirm.
model = LitSeq2SeqTransformer(hparams.lr,
                                 hparams.num_encoder_layers,
                                 hparams.num_decoder_layers,
                                 hparams.emb_size,
                                 hparams.n_head,
                                 src_tokenizer,
                                 tgt_tokenizer,
                                 dim_feedforward = hparams.ffn_hid_dim,
                                 # The tokenizer's CLS / SEP tokens are used as
                                 # the target-side BOS / EOS markers.
                                 tgt_lang_bos_token_id = tgt_tokenizer.cls_token_id,
                                 tgt_lang_bos_token = tgt_tokenizer.cls_token,
                                 tgt_lang_eos_token_id = tgt_tokenizer.sep_token_id,
                                 tgt_lang_eos_token = tgt_tokenizer.sep_token,
                                 score_metric = score_metric)

In [None]:
# Load the saved weights into the model.
# `map_location=device` remaps the stored tensors onto the current device, so
# a state dict saved on a GPU machine can still be restored on a CPU-only one.
model.load_state_dict(torch.load(hparams.model_path, map_location=device))

# Move the model itself to the same device.
model.to(device)

print('Loaded model.')

In [18]:
# Create the trainer, its metric logger and its checkpoint callback.
# NOTE(review): `model_checkpoints_dir` is not defined by any cell visible in
# this notebook -- confirm it is provided by one of the star imports.
metric_logger = MetricLogger()
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    save_last=True,
    verbose=True,
    dirpath=model_checkpoints_dir,
)

trainer = Trainer(
    # Hardware: request GPUs only when the selected device is CUDA.
    gpus=(hparams.gpus if str(device) == 'cuda' else 0),
    # Run configuration.
    max_epochs=hparams.max_epochs,
    fast_dev_run=False,
    deterministic=True,
    # Logging.
    logger=metric_logger,
    log_every_n_steps=1,
    # Checkpointing.
    enable_checkpointing=True,
    callbacks=[checkpoint_callback],
    default_root_dir=model_checkpoints_dir,
)

print('Created trainer.')

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [19]:
# Evaluate the model on the test split and print the returned metrics.
test_metrics = trainer.test(
    model=model,
    dataloaders=test_dataloader,
)
print(test_metrics)

Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     test_loss_epoch        10.323444366455078
    test_score_epoch      0.00020494360069278628
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
[{'test_loss_epoch': 10.323444366455078, 'test_score_epoch': 0.00020494360069278628}]


In [23]:
# Save the hyperparameters of this run into the run directory.
save_dict(
    run_dir,
    hparams,
    'hparams',
)

In [25]:
# Save the metrics recorded by the trainer's logger into the run's results
# directory.
metric_logger.manual_save(results_dir)