In [1]:
import torch
import torch.nn as nn
import os
from tokenizers import Tokenizer
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from pathlib import Path
from xformers.factory.model_factory import xFormer, xFormerConfig

A matching Triton is not available, some optimizations will not be enabled.
Error caught was: No module named 'triton'
Triton is not available, some optimizations will not be enabled.
Triton is not available, FusedMLP will not be enabled.
Either FairScale or torch distributed is not available, MixtureOfExperts will not be exposed. Please install them if you would like to use MoE


In [2]:
# Language pair used throughout the notebook: Chinese source, English target.
SRC_LANGUAGE = 'zh'
TGT_LANGUAGE = 'en'

# Fetch the WMT19 zh-en translation corpus through the Hugging Face `datasets` library.
# The flag below asks for offline (cache-only) mode; comment it out if the dataset
# still has to be downloaded.
# NOTE(review): `datasets` was already imported in the first cell, and the library
# appears to read this variable at import time, so setting it here may have no
# effect -- confirm, and if so set it before the import.
os.environ['HF_DATASETS_OFFLINE'] = '1'
dataset = load_dataset('wmt19', 'zh-en')
print(dataset)

Found cached dataset wmt19 (D:/Archives/HuggingfaceCache/datasets/wmt19/zh-en/1.0.0/29e210fae5690e843cae5dc43b53db36c4e02f927db50cd5235a22ab42dde90a)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 25984574
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3981
    })
})


In [3]:
# ---- Data ----
SUBSET_SIZE = 50_000          # number of training rows exposed to the DataLoader
BATCH_SIZE = 48

# ---- Optimisation ----
LEARNING_RATE = 1e-4
NUM_EPOCHS = 15
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

# ---- Model architecture ----
EMB_SIZE = 512                # model / embedding width
NHEAD = 8                     # attention heads per block
FFN_HID_DIM = 512             # NOTE(review): not referenced by the model config in the visible cells
NUM_ENCODER_LAYERS = 5
NUM_DECODER_LAYERS = 5
DROPOUT = 0.1
MAX_LEN = 512                 # sequence length handed to the position encodings and attentions

# ---- Output ----
MODEL_SAVE_PATH = './Model/AdvancedTranslationModel.pth'

In [4]:
# Load the tokenizers pretrained in Preprocessing/BuildWordPieceTokenizerUsingTokenizersLibrary.IPYNB
tokenizer = {SRC_LANGUAGE: Tokenizer.from_file('../Preprocessing/Model/tokenizer-wmt19-zh.json'),
             TGT_LANGUAGE: Tokenizer.from_file('../Preprocessing/Model/tokenizer-wmt19-en.json')}
SPECIAL_TOKENS = ['[UNK]', '[PAD]', '[BOS]', '[EOS]'] # Don't change this, it's defined in the tokenizer

# Special-token ids are looked up in the source tokenizer (the original author
# noted them as 0, 1, 2, 3 respectively).
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = (
    tokenizer[SRC_LANGUAGE].token_to_id(token) for token in SPECIAL_TOKENS
)


def _check_special_tokens(tokenizers, tokens, expected_ids):
    """Raise ValueError unless every tokenizer maps each special token to the expected id.

    The ids above are reused for target-side sequences (padding mask, loss
    `ignore_index`), so a tokenizer that is missing a token or disagrees on its id
    would silently corrupt training.
    """
    for lang, tok in tokenizers.items():
        for token, expected in zip(tokens, expected_ids):
            actual = tok.token_to_id(token)
            if actual is None or actual != expected:
                raise ValueError(
                    f"Tokenizer '{lang}' maps special token {token} to {actual}, expected {expected}"
                )


_check_special_tokens(tokenizer, SPECIAL_TOKENS, (UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX))

# Vocabulary sizes including added (special) tokens; the original author noted 8623 / 8000.
SRC_VOCAB_SIZE = tokenizer[SRC_LANGUAGE].get_vocab_size(with_added_tokens=True)
TGT_VOCAB_SIZE = tokenizer[TGT_LANGUAGE].get_vocab_size(with_added_tokens=True)

In [5]:
class WMT19Dataset(Dataset):
    """Map-style dataset yielding (source text, target text) pairs.

    Wraps an indexable collection whose rows look like
    ``{'translation': {SRC_LANGUAGE: ..., TGT_LANGUAGE: ...}}`` and optionally
    exposes only its first ``subset_size`` rows.

    Args:
        dataset: Indexable, sized collection of translation rows.
        subset_size: Maximum number of rows to expose; ``None`` exposes all rows.

    Raises:
        ValueError: If ``subset_size`` is negative.
    """

    def __init__(self, dataset, subset_size = None):
        if subset_size is not None and subset_size < 0:
            raise ValueError(f"subset_size must be non-negative, got {subset_size}")
        self.dataset = dataset
        self.subset_size = subset_size

    def __len__(self):
        """Return the number of exposed rows, never more than the wrapped dataset holds."""
        total = len(self.dataset)
        if self.subset_size is None:
            return total
        # Clamp so that an oversized subset request cannot advertise rows that do not exist.
        return min(self.subset_size, total)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` strings of row ``idx``.

        Negative indices count from the end of the exposed subset.

        Raises:
            IndexError: If ``idx`` falls outside the exposed subset; this is also
                what lets plain ``for`` iteration stop at the subset boundary.
        """
        length = len(self)
        if idx < 0:
            idx += length
        if not 0 <= idx < length:
            raise IndexError(f"index {idx} is out of range for dataset of size {length}")
        # Fetch the row once; indexing the wrapped dataset twice would load it twice.
        pair = self.dataset[idx]['translation']
        return pair[SRC_LANGUAGE], pair[TGT_LANGUAGE]

def collate_fn(batch):
    """Turn a list of (source text, target text) pairs into two id tensors.

    Trailing newlines are stripped from every sentence, each side is encoded with
    the tokenizer of its language, and the resulting id lists are stacked.

    Args:
        batch: Iterable of ``(src_text, tgt_text)`` string pairs.

    Returns:
        Tuple ``(src_ids, tgt_ids)`` of tensors, one row per input pair.

    NOTE(review): ``torch.tensor`` needs equally long id lists within a batch, so
    this presumably relies on the tokenizers padding their batches -- confirm
    against the tokenizer files.
    """
    cleaned = [(src_text.rstrip("\n"), tgt_text.rstrip("\n")) for src_text, tgt_text in batch]
    src_texts = [src_text for src_text, _ in cleaned]
    tgt_texts = [tgt_text for _, tgt_text in cleaned]

    src_encodings = tokenizer[SRC_LANGUAGE].encode_batch(src_texts)
    tgt_encodings = tokenizer[TGT_LANGUAGE].encode_batch(tgt_texts)

    src_ids = torch.tensor([encoding.ids for encoding in src_encodings])
    tgt_ids = torch.tensor([encoding.ids for encoding in tgt_encodings])
    return src_ids, tgt_ids
    
# Training uses a SUBSET_SIZE-row view of the train split; validation uses the whole split.
train_dataset = WMT19Dataset(dataset['train'], SUBSET_SIZE)
valid_dataset = WMT19Dataset(dataset['validation'])

print(f'Train dataset size: {len(train_dataset)}')
print(f'Validation dataset size: {len(valid_dataset)}')

# Shuffle the training data so each epoch sees the batches in a different order;
# validation keeps a fixed order so its loss is comparable across epochs.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

# Optional sanity check -- uncomment to inspect one batch and its decoded text:
# a, b = next(iter(train_dataloader))
# print(a, a.shape)
# print(b, b.shape)
# print(tokenizer[SRC_LANGUAGE].decode_batch(a.tolist()))
# print(tokenizer[TGT_LANGUAGE].decode_batch(b.tolist()))

Train dataset size: 50000
Validation dataset size: 3981


In [6]:
def _position_encoding(vocab_size):
    """Build a fresh "vocab" position-encoding config for the given vocabulary size."""
    return {
        "name": "vocab",
        "seq_len": MAX_LEN,
        "vocab_size": vocab_size,
    }


def _multi_head(attention_name, causal):
    """Build a fresh multi-head config around the named attention mechanism."""
    return {
        "num_heads": NHEAD,
        "residual_dropout": 0,
        "attention": {
            "name": attention_name,
            "dropout": 0,
            "causal": causal,
            "seq_len": MAX_LEN,
        },
    }


def _feedforward():
    """Build a fresh MLP feed-forward config (ReLU, 4x hidden multiplier)."""
    return {
        "name": "MLP",
        "dropout": DROPOUT,
        "activation": "relu",
        "hidden_layer_multiplier": 4,
    }


# Block configurations that make up the Transformer: one encoder entry followed by
# one decoder entry. "num_layers" repeats an entry that many times.
model_config = [
    {
        "reversible": True,  # reversible layers, to save memory
        "block_type": "encoder",
        "num_layers": NUM_ENCODER_LAYERS,
        "dim_model": EMB_SIZE,
        "residual_norm_style": "pre",  # pre/post
        "position_encoding_config": _position_encoding(SRC_VOCAB_SIZE),
        "multi_head_config": _multi_head("linformer", causal=False),
        "feedforward_config": _feedforward(),
    },
    {
        "reversible": False,
        "block_type": "decoder",
        "num_layers": NUM_DECODER_LAYERS,
        "dim_model": EMB_SIZE,
        "residual_norm_style": "pre",  # pre/post
        "position_encoding_config": _position_encoding(TGT_VOCAB_SIZE),
        # Self-attention over the target sequence is causal ...
        "multi_head_config_masked": _multi_head("nystrom", causal=True),
        # ... while cross-attention is not.
        "multi_head_config_cross": _multi_head("favor", causal=False),
        "feedforward_config": _feedforward(),
    },
]


class Seq2SeqTransformer(nn.Module):
    """xFormer encoder-decoder followed by a linear projection onto the target vocabulary.

    Args:
        xformer_config: List of block-config dicts passed to ``xFormerConfig``.
            Entry 1 supplies ``dim_model`` and the position-encoding ``vocab_size``
            used to size the output projection (in this notebook entry 1 is the
            decoder block).
    """

    def __init__(self, xformer_config):
        super().__init__()
        self.xformers_config = xFormerConfig(xformer_config)
        self.xformer = xFormer.from_config(self.xformers_config)
        output_block_cfg = xformer_config[1]
        self.generator = nn.Linear(
            output_block_cfg['dim_model'],
            output_block_cfg['position_encoding_config']['vocab_size'],
        )

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """Run the xFormer on ``(src, tgt)`` with the given masks and project the result to logits."""
        return self.generator(self.xformer(src, tgt, src_mask, tgt_mask))

    def encode(self, src, src_mask=None):
        """Feed a copy of ``src`` through every encoder block in order and return the result."""
        hidden = src.clone()
        for encoder_block in self.xformer.encoders:
            hidden = encoder_block(hidden, input_mask=src_mask)
        return hidden

    def decode(self, tgt, memory, tgt_mask=None):
        """Feed ``tgt`` through every decoder block in order, attending to ``memory``.

        Returns the output of the last decoder block; the vocabulary projection
        is not applied here.
        """
        hidden = tgt
        for decoder_block in self.xformer.decoders:
            hidden = decoder_block(target=hidden, memory=memory, input_mask=tgt_mask)
        return hidden
        


# Build the network and report how many parameters will be trained.
model = Seq2SeqTransformer(model_config)
trainable_params = sum(param.numel() for param in model.parameters() if param.requires_grad)
print(f'Model Params: {trainable_params/1e6:.2f} M')

# Move the weights to the training device before creating the optimizer.
model = model.to(DEVICE)

# Cross-entropy that skips padded target positions, optimised with AdamW.
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

Model Params: 50.58 M


In [7]:
def create_mask(src, tgt):
    """Build boolean padding masks for a source and a target id tensor.

    A mask entry is ``True`` where the token is NOT ``PAD_IDX`` (i.e. "keep this
    position") and ``False`` on padding.

    Returns:
        Tuple ``(src_padding_mask, tgt_padding_mask)`` with the same shapes as the inputs.
    """
    return src.ne(PAD_IDX), tgt.ne(PAD_IDX)

In [8]:
def train_epoch(model, optimizer):
    """Train ``model`` for one pass over ``train_dataloader``.

    Each batch is used with teacher forcing: the model receives the target
    without its last token and is scored against the target without its first.

    Args:
        model: Module called as ``model(src, tgt_input, src_mask=..., tgt_mask=...)``.
        optimizer: Optimizer updating ``model``'s parameters.

    Returns:
        Mean of the per-batch losses.
    """
    model.train()
    running_loss = 0
    num_batches = 0

    for src_ids, tgt_ids in tqdm(train_dataloader):
        src_ids, tgt_ids = src_ids.to(DEVICE), tgt_ids.to(DEVICE)

        # Shift by one along the sequence axis: decoder input vs. expected output.
        decoder_input = tgt_ids[:, :-1]
        expected = tgt_ids[:, 1:]
        src_keep, tgt_keep = create_mask(src_ids, decoder_input)

        optimizer.zero_grad()
        scores = model(src_ids, decoder_input, src_mask=src_keep, tgt_mask=tgt_keep)

        # Flatten to (tokens, vocab) vs. (tokens,) for the cross-entropy.
        batch_loss = loss_fn(scores.reshape(-1, scores.shape[-1]), expected.reshape(-1))
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        num_batches += 1

    return running_loss / num_batches


def evaluate(model):
    """Compute the mean per-batch loss of ``model`` over ``valid_dataloader``.

    Batches are handled exactly as in ``train_epoch`` (batch-first tensors,
    target shifted by one along the sequence axis), but the model is put in eval
    mode and no gradients are tracked.

    Args:
        model: Module called as ``model(src, tgt_input, src_mask=..., tgt_mask=...)``.

    Returns:
        Mean of the per-batch losses.
    """
    model.eval()
    losses = 0

    total_steps = 0
    # Validation never calls backward(), so skip building the autograd graph.
    with torch.no_grad():
        for src, tgt in valid_dataloader:
            # Keep the tensors batch-first, as produced by the collate function:
            # transposing them would make the `[:, :-1]` slice below drop a batch
            # sample instead of the last target token.
            src = src.to(DEVICE)
            tgt = tgt.to(DEVICE)

            tgt_input = tgt[:, :-1]

            src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

            logits = model(src, tgt_input, src_mask=src_padding_mask, tgt_mask=tgt_padding_mask)

            tgt_out = tgt[:, 1:]
            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            losses += loss.item()
            total_steps += 1
    return losses / total_steps

In [9]:
from timeit import default_timer as timer

# Create the checkpoint directory up front: otherwise torch.save() would fail
# only after the first full epoch has already been trained.
Path(MODEL_SAVE_PATH).parent.mkdir(parents=True, exist_ok=True)

for epoch in range(NUM_EPOCHS):
    start_time = timer()
    print("-" * 40)
    print("Start epoch {}/{}".format(epoch + 1, NUM_EPOCHS))
    print("-" * 40)
    train_loss = train_epoch(model, optimizer)
    # The reported epoch time covers training only; validation runs after the timer stops.
    end_time = timer()
    val_loss = evaluate(model)
    # The checkpoint is overwritten after every epoch, so the file always holds the latest weights.
    torch.save(model.state_dict(), MODEL_SAVE_PATH)
    print(f"Finished epoch: {epoch + 1}| Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, Epoch time = {(end_time - start_time):.3f}s")


----------------------------------------
Start epoch 1/15
----------------------------------------


  0%|          | 0/1042 [00:00<?, ?it/s]