# Train Baseline

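This notebook trains a baseline Transformer translation model for a single language pair (default: German → Dutch), evaluates it on held-out test data, and exports the recorded metrics.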

## Setup

### Environment

In [None]:
# If this is a notebook which is executed in colab [in_colab=True]:
#  1. Mount google drive and use the repository in there [mount_drive=True] (the repository must be in your google drive root folder).
#  2. Clone repository to remote machine [mount_drive=False].
in_colab = False
mount_drive = True

try:
    # Check if running in colab.
    in_colab = 'google.colab' in str(get_ipython())
except:
    pass

if in_colab:
    if mount_drive:
        # Mount google drive and navigate to it.
        from google.colab import drive
        drive.mount('/content/drive')
        %cd drive/MyDrive
    else:
        # Pull repository.
        !git clone https://github.com/HenningBuhl/low-resource-machine-translation

    # Workaround for problem with undefined symbols (https://github.com/scverse/scvi-tools/issues/1464).
    !pip install --quiet scvi-colab
    from scvi_colab import install
    install()

    # Navigate to the repository and install requirements.
    %cd low-resource-machine-translation
    !pip install -r requirements.txt

    # Navigate to notebook location.
    %cd experiments

In [2]:
# Add src module directory to system path for subsequent imports.
# The path is relative, so it is resolved against the current working directory.
import sys
sys.path.insert(0, '../src')

In [45]:
# Enable IPython's autoreload extension so edited source modules are picked up
# without restarting the kernel (mode 2: reload all modules before running code).
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [40]:
# If this is a notebook, execute this cell in order to reload changes made to the source files.
# `is_notebook` comes from the repository's `util` module (found via the '../src' path entry).
from util import is_notebook

# Settings and module reloading (only in Jupyter Notebooks).
if is_notebook():
    # Module reloading.
    %load_ext autoreload
    %autoreload 2

    # Plot settings.
    # Render matplotlib figures inline in the notebook output.
    %matplotlib inline

### Imports

In [60]:
# From packages.
import pytorch_lightning as pl

# From repository.
from arguments import *
from benchmark import *
from calc import *
from constants import *
from data import *
from layers import *
from metric_logging import *
from plotting import *
from path_management import *
from tokenizer import *
from transformer import *
from util import *

### Arguments

In [91]:
# Define arguments with argparse.
#
# All arguments are optional and have defaults, so the cell also works inside a
# notebook where no command line is available. The parsed namespace is stored in
# `args` and printed at the end of the cell.
import argparse


def _str_to_bool(value):
    """Convert a command line string to a bool.

    `type=bool` must not be used with argparse: `bool('False')` is `True`, so any
    non-empty value would enable the flag. This converter accepts the usual
    spellings instead (case-insensitive).

    Args:
        value: The raw command line value (or an actual bool).

    Returns:
        The parsed boolean. The empty string maps to False.

    Raises:
        argparse.ArgumentTypeError: If the value is not a recognized boolean spelling.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ('true', 't', 'yes', 'y', '1'):
        return True
    if normalized in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f'Expected a boolean value, got {value!r}.')


parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter
)

# TODO help string: explain function \n show arguments that are ignored or overwritten by it (when true or some other value)
# NOTE: Boolean arguments must use `type=_str_to_bool` (also when re-enabling the commented-out arguments below).

# Experiment.
parser.add_argument('--dev-run', default=False, type=_str_to_bool, help='')
parser.add_argument('--fresh-run', default=False, type=_str_to_bool, help='Ignores all cashed data on disk, reruns generation and overwrites data')
parser.add_argument('--seed', default=0, type=int, help='')
parser.add_argument('--src-lang', default='de', type=str, help='if help string is empty, default value will not be shown...')
parser.add_argument('--tgt-lang', default='nl', type=str, help='')
#parser.add_argument('--eval-before-train', action='store_true', help='Evaluation (train + val) before training')

# Metrics.
#parser.add_argument('--track-batch', default=True, type=bool, help='if additional metrics are evaluated on batches')  # Add option to only eval on validation and test data to speed up training?
#parser.add_argument('--track-epoch', default=True, type=bool, help='')
parser.add_argument('--track-score', default=True, type=_str_to_bool, help='SacreBLEU score')
#parser.add_argument('--track-ter', default=False, type=bool, help='translation error rate')
#parser.add_argument('--track-tp', default=False, type=bool, help='translation perplexity')

# Data.
#parser.add_argument('--dataset', default='WikiMatrix', type=str, help='Only one dataset. ALL setups that want to use more than one file as input should use data-dir because the cached data cannot be atributed to the correct experiments settings otherwise (or that would be bothersome)')
# TODO build data-dir from CONST_DATA_DIR and src-tgt key combination (look in which order it is present) if argument is none.
parser.add_argument('--data-dir', default=None, type=str, help='1. empty dir, populated with data from datasets arg 2. en.txt + de.txt 3. en/**.txt + de/**.txt 4. en/train.txt, en/val.txt, en/test.txt + de/train.txt, de/val.txt, de/test.txt')
parser.add_argument('--shuffle-before-split', default=False, type=_str_to_bool, help='')
parser.add_argument('--val-test-examples', default=[3000, 3000], type=int, nargs=2, help='val, test...') # NOT IMPLEMENTED float (percentages) or int (count( [-1 for remaining] for (val, test) data
#parser.add_argument('--cache-combined-data', default=False, type=bool, help='save combined (single .txt file) data to disk')
#parser.add_argument('--cache-split-data', default=False, type=bool, help='save split data (train, val, test) data to disk')
#parser.add_argument('--cache-tokenized-data', default=True, type=bool, help='save tokenized data to disk')
#parser.add_argument('--cache-shifted-data', default=False, type=bool, help='save preprocessed to disk')
#parser.add_argument('--cache-padded-data', default=False, type=bool, help='save preprocessed to disk')
#parser.add_argument('--use-collate-fn', default=False, type=bool, help='(basically, everything that is not already cached is done in collate_fn instead of fully loading the data) If the fully collated (preprocessed) data is not cached and loaded, use collate_fn (for either padding+shifting OR tokenization+padding+shifting depending on cache settings)')
#parser.add_argument('--lazy-load-data', default=False, type=bool, type=str, help='!use custom dataset with chunksize (or similar method [add another param for max size MB/GB memory available?]) in getitem (also for saving? data would not fit all in memory to save tokenized or padded+shifted data at once)! if the data is so big, that not even the raw text can be used in memory completely, this will dynamically load batches from disk if set to false')

# Tokenization.
parser.add_argument('--mono-data-dir', default=None, type=str, help='dir containing dirs for each lang with file or files of monlingual data for tokenization')
# Add more tokenizers (+different set of arguments for src, pvt and tgt tokenizer each...)?
#parser.add_argument('--tokenizer-dir', default=None, type=str, help='Use tokenizer in path or save to the location if not exist.')
#parser.add_argument('--pad-id', default=0, type=int, help='')
#parser.add_argument('--sos-id', default=1, type=int, help='')
#parser.add_argument('--eos-id', default=2, type=int, help='')
#parser.add_argument('--unk-id', default=3, type=int, help='')
#parser.add_argument('--vocab-size', default=16000, type=int, help='')
#parser.add_argument('--character-coverage', default=1.0, type=float, help='')
#parser.add_argument('--model-type', default='unigram', type=str, choices=['unigram', 'bpe', 'char'], help='')

# Architecture.
parser.add_argument('--num-layers', default=6, type=int, help='')
parser.add_argument('--d-model', default=512, type=int, help='')
parser.add_argument('--drop-out-rate', default=0.1, type=float, help='')
parser.add_argument('--num-heads', default=8, type=int, help='')
parser.add_argument('--d-ff', default=2048, type=int, help='')
#parser.add_argument('--max-len', default=128, type=int, help='')

# Optimizer.
parser.add_argument('--learning-rate', default=1e-4, type=float, help='')
parser.add_argument('--weight-decay', default=0, type=float, help='')
#parser.add_argument('--beta-1', default=0.9, type=float, help='')
#parser.add_argument('--beta-2', default=0.999, type=float, help='')

# Training.
parser.add_argument('--batch-size', default=80, type=int, help='')
parser.add_argument('--max-epochs', default=10, type=int, help='')
parser.add_argument('--max-examples', default=-1, type=int, help='')
parser.add_argument('--shuffle-train-data', default=True, type=_str_to_bool, help='')
parser.add_argument('--gpus', default=1, type=int, help='')
parser.add_argument('--num-workers', default=4, type=int, help='')
parser.add_argument('--ckpt-path', default=None, type=str, help='')
#parser.add_argument('--resume-last-ckpt', default=False, type=bool, help='uses last.ckpt file...')

# Early Stopping + Model Checkpoint.
parser.add_argument('--monitor', default='val_loss', type=str, help='')
parser.add_argument('--min-delta', default=0, type=float, help='')
parser.add_argument('--patience', default=3, type=int, help='')
parser.add_argument('--mode', default='min', type=str, help='')
# TODO save last model, best model, last checkpoint.

# Result Exporting.
#parser.add_argument('--export-batch-data', default=False, type=bool, help='export batch data (not only epoch data)')
#parser.add_argument('--export-svgs', default=True, type=bool, help='exports an svg with plt for each actively tracked metric')
#parser.add_argument('--export-metric-files', default=True, type=bool, help='Additionally saves each metric (per key in metrics dict) to a separate file. Values are separated by a new line.')

# Parse args.
if is_notebook():
    # Used to make argparse work in jupyter notebooks (all args must be optional).
    # argparse reads sys.argv[1:], so a one-element argv hides the kernel's own arguments.
    sys.argv = ['-f']
# Unknown arguments are tolerated and discarded instead of raising an error.
args, _ = parser.parse_known_args()
print('Arguments:')
print(args)

Arguments:
Namespace(batch_size=80, ckpt_path=None, d_ff=2048, d_model=512, data_dir=None, dev_run=False, drop_out_rate=0.1, fresh_run=False, gpus=1, learning_rate=0.0001, max_epochs=10, max_examples=-1, min_delta=0, mode='min', monitor='val_loss', mono_data_dir=None, num_heads=8, num_layers=6, num_workers=4, patience=3, shuffle_before_split=False, shuffle_train_data=True, src_lang='de', tgt_lang='nl', track_score=True, val_test_examples=[3000, 3000], weight_decay=0)


In [92]:
# Auto-infer args.
# `auto_infer_args` comes from the repository star imports; presumably it fills in
# values on `args` that were left unset (e.g. `data_dir`) — TODO confirm.
auto_infer_args(args)

In [None]:
# Adjust arguments for test purposes.
# The trailing `and False` keeps this override disabled; switch it to `and True`
# to force a fresh dev run when working in the notebook.
if is_notebook() and False:  # Quickly turn on and off with 'and True/False'.
    args.dev_run = True
    args.fresh_run = True

In [None]:
# Sanity check args.
# `sanity_check_args` comes from the repository star imports; presumably it
# validates the argument combination before any work is done — TODO confirm.
sanity_check_args(args)

### Seed

In [None]:
# Set seed.
# Seeds the random number generators through PyTorch Lightning for reproducibility;
# `workers=True` asks Lightning to also seed dataloader worker processes.
from pytorch_lightning import seed_everything
seed_everything(args.seed, workers=True)

### Paths

In [58]:
# Create directories and create file names.
# The run name is built from the language pair, e.g. 'baseline-de-nl'; the second
# argument names the experiment ('baseline') and is later accessed as `pm.baseline`.
pm = ExperimentPathManager(f'baseline-{args.src_lang}-{args.tgt_lang}', 'baseline')
pm.init()

Dir "./data" already exists.
Dir "./models" already exists.
Dir "./runs" already exists.
Dir "./tokenizers" already exists.
Dir "./runs/baseline-de-nl-2022.09.29-19.38.28" does not exist, creating it.
Dir "./runs/baseline-de-nl-2022.09.29-19.38.28/baseline" does not exist, creating it.
Dir "./runs/baseline-de-nl-2022.09.29-19.38.28/checkpoints" does not exist, creating it.
Dir "./runs/baseline-de-nl-2022.09.29-19.38.28/metrics" does not exist, creating it.


## Data Preprocessing

In [61]:
# Create the data preprocessor for the configured language pair and data directory.
pp = PreProcessor(args.src_lang, args.tgt_lang, args.data_dir)

### Data Split

In [69]:
# Split data into (train, val, test) sets.
# `val_test_examples` holds two ints (argparse `nargs=2`, default [3000, 3000]);
# presumably the number of validation and test examples — TODO confirm.
pp.split_data(args.shuffle_before_split, args.val_test_examples)

### Tokenizers

In [73]:
# Load tokenizers.
# One tokenizer is built per language, each from the same data and mono-data directories.
src_tokenizer = TokenizerBuilder(args.src_lang, args.data_dir, args.mono_data_dir).build()
tgt_tokenizer = TokenizerBuilder(args.tgt_lang, args.data_dir, args.mono_data_dir).build()

Tokenizer exists. Skipping training.
Tokenizer exists. Skipping training.


### Preprocessing

In [None]:
# Load dataloaders.
# Returns one dataloader each for the train, validation and test split.
# NOTE(review): `args.num_workers` is parsed but not passed here — confirm whether
# `pre_process` obtains the worker count some other way.
train_dataloader, val_dataloader, test_dataloader = pp.pre_process(src_tokenizer, tgt_tokenizer, args.batch_size, args.shuffle_train_data, args.max_examples)

## Experiment

### Model

In [81]:
# Create model.
# All arguments are passed positionally, so their order must match the
# `Transformer` constructor signature (defined in the repository, not shown here).
model = Transformer(src_tokenizer,
                    tgt_tokenizer,
                    args.learning_rate,
                    args.weight_decay,
                    args.num_layers,
                    args.d_model,
                    args.drop_out_rate,
                    args.num_heads,
                    args.d_ff,
                    )

print('Created model.')

  score_metric = load_metric('sacrebleu')


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

Created model.


### Training

In [89]:
# Create trainer.
# Collects the metrics logged during training, validation and testing.
metric_logger = MetricLogger()

# Keep the single best checkpoint according to the monitored metric, plus the most
# recent one (`save_last`). `mode` must agree with the early-stopping callback;
# otherwise a metric that should be maximized would keep its worst checkpoint.
# The file name embeds whichever metric is monitored (default: 'val_loss').
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor=args.monitor,
    mode=args.mode,
    dirpath=pm.baseline.checkpoint_dir,
    filename='{epoch}-{' + args.monitor + ':.2f}',
    save_top_k=1,
    save_last=True,
    every_n_epochs=1,
    verbose=True,
)

# Stop training once the monitored metric has not improved by at least
# `min_delta` for `patience` consecutive validation checks.
early_stop_callback = pl.callbacks.EarlyStopping(
    monitor=args.monitor,
    min_delta=args.min_delta,
    patience=args.patience,
    mode=args.mode,
    verbose=True,
)

# NOTE(review): `gpus` is reported as deprecated since pytorch_lightning v1.7;
# migrate to `accelerator`/`devices` once the minimum supported version is confirmed.
# `device` is expected to come from the repository star imports.
trainer = pl.Trainer(deterministic=True,
                     fast_dev_run=args.dev_run,
                     max_epochs=args.max_epochs,
                     logger=metric_logger,
                     log_every_n_steps=1,
                     enable_checkpointing=True,
                     default_root_dir=pm.baseline.checkpoint_dir,
                     callbacks=[checkpoint_callback, early_stop_callback],
                     gpus=args.gpus if str(device) == 'cuda' else 0)

print('Created trainer.')

  f"Setting `Trainer(gpus={gpus!r})` is deprecated in v1.7 and will be removed"
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Created trainer.


In [99]:
# Training.
# `ckpt_path` comes from --ckpt-path (default None); when a checkpoint path is
# given, Lightning resumes training from it.
trainer.fit(model,
            train_dataloaders=train_dataloader,
            val_dataloaders=val_dataloader,
            ckpt_path=args.ckpt_path)

INFO:pytorch_lightning.callbacks.model_summary:
  | Name               | Type              | Params
---------------------------------------------------------
0 | src_embedding      | Embedding         | 8.2 M 
1 | tgt_embedding      | Embedding         | 8.2 M 
2 | positional_encoder | PositionalEncoder | 0     
3 | encoder            | Encoder           | 18.9 M
4 | decoder            | Decoder           | 25.2 M
5 | output_linear      | Linear            | 8.2 M 
6 | softmax            | LogSoftmax        | 0     
---------------------------------------------------------
68.7 M    Trainable params
0         Non-trainable params
68.7 M    Total params
274.930   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

[2428, 77, 3445, 940, 31, 5098, 628, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[802, 292, 10845, 14, 147, 3104, 37, 47, 914, 94, 2573, 405, 196, 37, 2342, 177, 14, 6895, 5, 3023, 346, 917, 2583, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [None]:
# Save model.
# Only the state dict (the weights) is written, not the full module object.
# NOTE(review): `run_dir` is not defined in any cell of this notebook (paths are
# otherwise obtained from `pm`), and `os`/`torch` are only in scope if one of the
# star imports re-exports them — confirm before running this cell.
model_path = os.path.join(run_dir, 'model.pt')
torch.save(model.state_dict(), model_path)

### Testing

In [None]:
# Testing.
# Evaluates the model on the test dataloader and prints the returned metrics.
test_metrics = trainer.test(model, dataloaders=test_dataloader)
print(test_metrics)

## Exporting Results

In [None]:
# Plot loss metrics.
# NOTE(review): `results_dir` is not defined in any cell of this notebook —
# presumably it should be derived from `pm`; confirm before running.
save_path = os.path.join(results_dir, 'loss.svg')
plot_metric(metric_logger.metrics, 'loss', 'Loss', save_path=save_path)

In [None]:
# Plot score metric.
# NOTE(review): `results_dir` is not defined in any cell of this notebook —
# presumably it should be derived from `pm`; confirm before running.
save_path = os.path.join(results_dir, 'score.svg')
plot_metric(metric_logger.metrics, 'score', 'Score', save_path=save_path)

In [None]:
# Save hyper parameters.  # TODO move to after folders are created + format json to make it readable.
# NOTE(review): `run_dir` is not defined in any cell of this notebook —
# presumably it should be derived from `pm`; confirm before running.
save_dict(run_dir, args, 'args')

In [None]:
# Save recorded metrics.
# NOTE(review): `results_dir` is not defined in any cell of this notebook —
# presumably it should be derived from `pm`; confirm before running.
metric_logger.manual_save(results_dir)