# Comparison of optimizers using language translation

In [2]:
%matplotlib inline
! pip install madgrad
! pip install adabelief-pytorch==0.2.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting madgrad
  Downloading madgrad-1.2-py3-none-any.whl (11 kB)
Installing collected packages: madgrad
Successfully installed madgrad-1.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting adabelief-pytorch==0.2.0
  Downloading adabelief_pytorch-0.2.0-py3-none-any.whl (5.7 kB)
Collecting colorama>=0.4.0
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: colorama, adabelief-pytorch
Successfully installed adabelief-pytorch-0.2.0 colorama-0.4.4


In [3]:
import math
import torch
from torch import Tensor
import torch.nn as nn
import torchtext
from torchtext.vocab import vocab
from torchtext.utils import download_from_url, extract_archive
from torchtext.data.utils import get_tokenizer
from collections import Counter
import io
import time
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch.nn import (TransformerEncoder, TransformerDecoder,
                      TransformerEncoderLayer, TransformerDecoderLayer)
from adabelief_pytorch import AdaBelief
from madgrad import MADGRAD
import numpy as np


### Warning

This code is inspired from https://torchtutorialstaging.z5.web.core.windows.net/beginner/translation_transformer.html. 

### Goal

The purpose of this code is to operate translation from French to English, using a model trained with transformers.

### Download and process the data

In [4]:
# Install spacy and import download the data

!pip install -U pip setuptools wheel
!pip install -U 'spacy[cuda113]'
#!pip install -U spacy

!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm

!spacy download fr_core_news_sm
!spacy download en_core_web_sm

import spacy
import en_core_web_sm
import fr_core_news_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-22.1.2-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 8.9 MB/s 
Collecting setuptools
  Downloading setuptools-62.3.3-py3-none-any.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 60.0 MB/s 
Installing collected packages: setuptools, pip
  Attempting uninstall: setuptools
    Found existing installation: setuptools 57.4.0
    Uninstalling setuptools-57.4.0:
      Successfully uninstalled setuptools-57.4.0
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy[cuda113]
  Downloading spacy-3.3.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-legacy<3.1.0,>=3.0.9
  Downloading spacy_legacy-3.0.9-py2.py3-none-any.whl (20 kB)
Collecting srsly<3.0.0,>=2.4.3
  Downloading srsly-2.4.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (457 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.1/457.1 kB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m92.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting thinc<8.1.0,>=8.0.14
  Downloading thinc-8.0.17-cp3

In [5]:
from google.colab import drive
drive.mount('/content/drive')
import sys
sys.path.append('/content/drive/MyDrive/EPFL/OptML/')

Mounted at /content/drive


In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.get_device_name(0)

'Tesla T4'

In [7]:
#torch.use_deterministic_algorithms(True)

# Download the datasets

url = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'

train_paths = [extract_archive(download_from_url(url + u))[0] for u in ['train.fr.gz', 'train.en.gz']]
val_paths = [extract_archive(download_from_url(url + u))[0] for u in ['val.fr.gz', 'val.en.gz']]
test_paths = [extract_archive(download_from_url(url + u))[0] for u in ['test_2016_flickr.fr.gz', 'test_2016_flickr.en.gz']]

fr_tokenizer = get_tokenizer('spacy', language='fr_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

def build_vocab(path, tokenizer):
  counter = Counter()
  with io.open(path, encoding="utf8") as f:
    for string_ in f:
      counter.update(tokenizer(string_))
  v = vocab(counter, min_freq = 1, specials = ['<unk>', '<pad>', '<bos>', '<eos>'])
  v.set_default_index(v['<unk>'])
  return v

# Build the vocabularies

fr_vocab = build_vocab(train_paths[0], fr_tokenizer)
en_vocab = build_vocab(train_paths[1], en_tokenizer)

# Process the datasets

def data_process(paths):
  raw_fr_data = iter(io.open(paths[0], encoding="utf8"))
  raw_en_data = iter(io.open(paths[1], encoding="utf8"))
  data = []
  for (raw_fr, raw_en) in zip(raw_fr_data, raw_en_data):
    fr_tensor_ = torch.tensor([fr_vocab[token] for token in fr_tokenizer(raw_fr.rstrip("\n"))],
                            dtype=torch.long)
    en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en.rstrip("\n"))],
                            dtype=torch.long)
    data.append((fr_tensor_, en_tensor_))
  return data


train_process = data_process(train_paths)
val_process = data_process(val_paths)
test_process = data_process(test_paths)

batch_size = 128
pad_i = fr_vocab['<pad>']
bos_i = fr_vocab['<bos>']
eos_i = fr_vocab['<eos>']

100%|██████████| 604k/604k [00:00<00:00, 22.1MB/s]
100%|██████████| 569k/569k [00:00<00:00, 21.4MB/s]
100%|██████████| 23.0k/23.0k [00:00<00:00, 8.81MB/s]
100%|██████████| 21.6k/21.6k [00:00<00:00, 5.21MB/s]
100%|██████████| 22.3k/22.3k [00:00<00:00, 12.9MB/s]
100%|██████████| 21.1k/21.1k [00:00<00:00, 8.14MB/s]


In [8]:
# Load the training set and the testing set

def generate_batch(data_batch):
  fr_batch, en_batch = [], []
  for (fr_item, en_item) in data_batch:
    fr_batch.append(torch.cat([torch.tensor([bos_i]), fr_item, torch.tensor([eos_i])], dim=0))
    en_batch.append(torch.cat([torch.tensor([bos_i]), en_item, torch.tensor([eos_i])], dim=0))
  fr_batch = pad_sequence(fr_batch, padding_value=pad_i)
  en_batch = pad_sequence(en_batch, padding_value=pad_i)
  return fr_batch, en_batch

train_data = DataLoader(train_process, batch_size=batch_size,
                        shuffle=True, collate_fn=generate_batch)

valid_iter = DataLoader(val_process, batch_size=batch_size,
                        shuffle=True, collate_fn=generate_batch)
                        
test_data = DataLoader(test_process, batch_size=batch_size,
                       shuffle=True, collate_fn=generate_batch)

### Implement the Seq2Seq transformer

In [9]:
class Seq2SeqTransformer(nn.Module):
    def __init__(self, num_enc_layers: int, num_dec_layers: int,
                 embedding_size: int, fr_vocab_size: int, en_vocab_size: int,
                 dim_feedforward:int = 512, dropout:float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        encoder_layer = TransformerEncoderLayer(d_model=embedding_size, nhead=nhead,
                                                dim_feedforward=dim_feedforward)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=num_enc_layers)
        decoder_layer = TransformerDecoderLayer(d_model=embedding_size, nhead=nhead,
                                                dim_feedforward=dim_feedforward)
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers=num_dec_layers)
                
        self.generator = nn.Linear(embedding_size, en_vocab_size)
        self.fr_tok_emb = TokenEmbedding(fr_vocab_size, embedding_size)
        self.en_tok_emb = TokenEmbedding(en_vocab_size, embedding_size)
        self.positional_encoding = PositionalEncoding(embedding_size, dropout=dropout)

    def forward(self, fr: Tensor, trg: Tensor, fr_mask: Tensor,
                en_mask: Tensor, fr_padding_mask: Tensor,
                en_padding_mask: Tensor, memory_key_padding_mask: Tensor):
        fr_emb = self.positional_encoding(self.fr_tok_emb(fr))
        en_emb = self.positional_encoding(self.en_tok_emb(trg))
        memory = self.transformer_encoder(fr_emb, fr_mask, fr_padding_mask)
        outs = self.transformer_decoder(en_emb, memory, en_mask, None,
                                        en_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, fr: Tensor, fr_mask: Tensor):
        return self.transformer_encoder(self.positional_encoding(
                            self.fr_tok_emb(fr)), fr_mask)

    def decode(self, en: Tensor, memory: Tensor, en_mask: Tensor):
        return self.transformer_decoder(self.positional_encoding(
                          self.en_tok_emb(en)), memory,
                          en_mask)

class PositionalEncoding(nn.Module):
    def __init__(self, embedding_size: int, dropout, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, embedding_size, 2) * math.log(10000)/embedding_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, embedding_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + 
                            self.pos_embedding[:token_embedding.size(0),:])

class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, embedding_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.embedding_size = embedding_size
    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.embedding_size)
    
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

def create_mask(fr, en):
  fr_seq_len = fr.shape[0]
  en_seq_len = en.shape[0]
  en_mask = generate_square_subsequent_mask(en_seq_len)
  fr_mask = torch.zeros((fr_seq_len, fr_seq_len), device=DEVICE).type(torch.bool)
  fr_padding_mask = (fr == pad_i).transpose(0, 1)
  en_padding_mask = (en == pad_i).transpose(0, 1)
  return fr_mask, en_mask, fr_padding_mask, en_padding_mask

### Implement the model

In [10]:
fr_vocab_size = len(fr_vocab)
en_vocab_size = len(en_vocab)
embedding_size = 512
nhead = 8
ffn_dimension = 512
batch_size = 128
num_enc_layers = 3
num_dec_layers = 3
num_epochs = 100

DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

### Train the model

In [11]:
def train_epoch(model, train_data, optimizer):
  model.train()
  losses = 0
  for i, (fr, en) in enumerate(train_data):
      fr = fr.to(device)
      en = en.to(device)
      en_input = en[:-1, :]
      en_expected = en[1:,:]

      fr_mask, en_mask, fr_padding_mask, en_padding_mask = create_mask(fr, en_input)

      en_predicted = model(fr, en_input, fr_mask, en_mask,
                                fr_padding_mask, en_padding_mask, fr_padding_mask)
      
      optimizer.zero_grad()
      loss = loss_fn(en_predicted.reshape(-1, en_predicted.shape[-1]), en_expected.reshape(-1))
      loss.backward()
      optimizer.step()
      losses += loss.item()

  return losses/len(train_data)


def evaluate(model, val_data):
  model.eval()
  losses = 0
  for i, (fr, en) in (enumerate(valid_iter)):
    fr = fr.to(device)
    en = en.to(device)
    en_input = en[:-1, :]
    en_expected = en[1:,:]

    fr_mask, en_mask, fr_padding_mask, en_padding_mask = create_mask(fr, en_input)

    en_predicted = model(fr, en_input, fr_mask, en_mask,
                              fr_padding_mask, en_padding_mask, fr_padding_mask)
    
    loss = loss_fn(en_predicted.reshape(-1, en_predicted.shape[-1]), en_expected.reshape(-1))
    losses += loss.item()
  return losses/len(val_data)

In [None]:
for i in range(5):
    print(i) 
    start_time = time.time()
    torch.manual_seed(i) 

    model = Seq2SeqTransformer(num_enc_layers, num_dec_layers, 
                                 embedding_size, fr_vocab_size, en_vocab_size,
                                 ffn_dimension)

    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    model = model.to(device)
    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_i)

    optimizer = AdaBelief(model.parameters(), lr=0.001, betas=(0.9,0.999), eps=1e-16, weight_decouple=False, rectify=False) #fixed_decay=False, amsgrad=False, weight_decay=5e-4
    #optimizer = MADGRAD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=0, eps=1e-6, decouple_decay=False)
    #optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
    #optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

    loss_trial = []
    acc_trial = []

    for epoch in range(num_epochs):
        train_loss = train_epoch(model, train_data, optimizer)
        print((f"Epoch: {epoch + 1}, Train loss: {train_loss:.3f}"))
        loss_trial.append(train_loss)
        # Test the model
        model.eval()
        with torch.no_grad():
            correct = 0
            total = 0
            for fr, en in test_data:
                fr = fr.to(device)
                en = en.to(device)
                en_input = en[:-1, :]
                en_expected = en[1:,:].reshape(-1)

                fr_mask, en_mask, fr_padding_mask, en_padding_mask = create_mask(fr, en_input)
                en_predicted = model(fr, en_input, fr_mask, en_mask,
                                      fr_padding_mask, en_padding_mask, fr_padding_mask)
        
                _, predicted = torch.max(en_predicted.reshape(-1, en_predicted.shape[-1]).data, 1)
                total += en_expected.size(0)
                correct += (en_expected == predicted).sum().item()

            print('Accuracy of the model: {} %'.format(100*correct/total))
            acc_trial.append(100*correct/total)
    train_time = time.time() - start_time
    print(train_time)
    np.save('/content/drive/MyDrive/EPFL/OptML/loss_translation_adabelief_episode_{}.npy'.format(i), loss_trial)
    np.save('/content/drive/MyDrive/EPFL/OptML/acc_translation_adabelief_episode_{}.npy'.format(i), acc_trial)

0
[31mPlease check your arguments if you have upgraded adabelief-pytorch from version 0.0.5.
[31mModifications to default arguments:
[31m                           eps  weight_decouple    rectify
-----------------------  -----  -----------------  ---------
adabelief-pytorch=0.0.5  1e-08  False              False
>=0.1.0 (Current 0.2.0)  1e-16  True               True
[34mSGD better than Adam (e.g. CNN for Image Classification)    Adam better than SGD (e.g. Transformer, GAN)
----------------------------------------------------------  ----------------------------------------------
Recommended eps = 1e-8                                      Recommended eps = 1e-16
[34mFor a complete table of recommended hyperparameters, see
[34mhttps://github.com/juntang-zhuang/Adabelief-Optimizer
[32mYou can disable the log message by setting "print_change_log = False", though it is recommended to keep as a reminder.
[0m
Epoch: 1, Train loss: 5.741
Accuracy of the model: 3.759875846501129 %
Epoch

In [None]:
# Test the model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for fr, en in test_data:
        fr = fr.to(device)
        en = en.to(device)
        en_input = en[:-1, :]
        en_expected = en[1:,:].reshape(-1)

        fr_mask, en_mask, fr_padding_mask, en_padding_mask = create_mask(fr, en_input)
        en_predicted = model(fr, en_input, fr_mask, en_mask,
                              fr_padding_mask, en_padding_mask, fr_padding_mask)
        
        _, predicted = torch.max(en_predicted.reshape(-1, en_predicted.shape[-1]).data, 1)

        total += en_expected.size(0)
        correct += (en_expected == predicted).sum().item()

    print('Accuracy of the model: {} %'.format(100 * correct/total))

NameError: ignored