In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import jieba
import os
import pickle
import math
from tqdm.notebook import tqdm
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from torch.nn.utils.rnn import pad_sequence
from pathlib import Path
from torch import Tensor
from torch.nn import Transformer
from xformers.factory.model_factory import xFormer, xFormerConfig

A matching Triton is not available, some optimizations will not be enabled.
Error caught was: No module named 'triton'
Triton is not available, some optimizations will not be enabled.
Triton is not available, FusedMLP will not be enabled.
Either FairScale or torch distributed is not available, MixtureOfExperts will not be exposed. Please install them if you would like to use MoE


In [2]:
# Load translation dataset from huggingface
os.environ['HF_DATASETS_OFFLINE'] = '1' # Comment this line if you need to download the dataset from huggingface
dataset = load_dataset('wmt19', 'zh-en')
print(dataset)
SRC_LANGUAGE = 'zh'
TGT_LANGUAGE = 'en'

Found cached dataset wmt19 (D:/Archives/HuggingfaceCache/datasets/wmt19/zh-en/1.0.0/29e210fae5690e843cae5dc43b53db36c4e02f927db50cd5235a22ab42dde90a)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 25984574
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3981
    })
})


In [3]:
# Hyper-parameters
SUBSET_SIZE = 50000
VOCAB_MIN_FREQ = 10

UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
SPECIAL_SYMBOLS = ['<UNK>', '<PAD>', '<BOS>', '<EOS>']
VOCAB_PATH = './Model/Vocab.pkl'

BATCH_SIZE = 12
LEARNING_RATE = 0.0001
NUM_EPOCHS = 15
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
NUM_ENCODER_LAYERS = 5
NUM_DECODER_LAYERS = 5
DROPOUT = 0.1
MAX_LEN = 256

In [4]:
# Make token transformers that can be used to tokenize text into list of tokens(words).
# I use the basic english tokenizer from torchtext for English.
# And use jieba for Chinese.
token_transform = {}
token_transform[TGT_LANGUAGE] = get_tokenizer('basic_english')
token_transform[SRC_LANGUAGE] = lambda text: ([x for x in jieba.lcut(text) if x not in {' ', '\t'}])

# test_sentence_zh = '但后来他们逐渐意识到所探测到的信号可能完全来源于星际尘埃。'
# test_sentence_en = 'It was later realized that the signal they had detected could be entirely attributed to interstellar dust.'
# assert token_transform[SRC_LANGUAGE](test_sentence_zh) == ['但', '后来', '他们', '逐渐', '意识', '到', '所', '探测', '到', '的', '信号', '可能', '完全', '来源于', '星际', '尘埃', '。']
# assert token_transform[TGT_LANGUAGE](test_sentence_en) == ['it', 'was', 'later', 'realized', 'that', 'the', 'signal', 'they', 'had', 'detected', 'could', 'be', 'entirely', 'attributed', 'to', 'interstellar', 'dust', '.']

# Yield tokens from data iterator. For each data {'en':'...', 'zh':'...'} in data_iter, yield a list of tokens in corresponding language using token_transform.
def yield_tokens(data_iter, language):
    for data in data_iter:
        yield token_transform[language](data[language])

# Build the vocabulary that can be used to encode token(word) into integer.
if Path(VOCAB_PATH).exists():
    # If we already have the vocab, load it
    with open(VOCAB_PATH, 'rb') as f:
        vocab_transform = pickle.load(f)
else:
    # Otherwise, build the vocab
    vocab_transform = {}
    for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
        train_iter = iter(dataset['train'][:SUBSET_SIZE]['translation'])
        vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln), min_freq=VOCAB_MIN_FREQ, specials=SPECIAL_SYMBOLS, special_first=True)
    for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
        vocab_transform[ln].set_default_index(UNK_IDX)
    with open(VOCAB_PATH, 'wb') as f:
        pickle.dump(vocab_transform, f)

SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
print(f'Vocab({TGT_LANGUAGE}) Size: {TGT_VOCAB_SIZE}')
print(f'Vocab({SRC_LANGUAGE}) Size: {SRC_VOCAB_SIZE}')

Vocab(en) Size: 7884
Vocab(zh) Size: 7751


In [5]:
# Now we need to make the real tokenizer that can tokenize a string of text into a sequence of integer tensors.

# Helper function that passes a string into a list of transforms.
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# Helper function that adds a BOS and EOS token to a list of tokens. E.g. [BOS_IDX, 5, 7, ..., 456, EOS_IDX]
def tensor_transform(token_ids):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# The real tokenizer.
tokenizer = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    tokenizer[ln] = sequential_transforms(token_transform[ln], vocab_transform[ln], tensor_transform)

# print(tokenizer[TGT_LANGUAGE](test_sentence_en))
# Output:
# tensor([   2,   17,   38,  660, 3413,   12,    5, 3510,   37,  103,    0,   60, 18, 1667, 4340,    7,    0, 5568,    6,    3])

# print(tokenizer[SRC_LANGUAGE](test_sentence_zh))
# Output:
# tensor([   2,   13, 2221,   36,  843, 1092,   47,   49,    0,   47,    4, 1358, 37,  361, 3490,    0,    0,    6,    3])

In [6]:
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(tokenizer[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(tokenizer[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

class WMT19Dataset(Dataset):
    def __init__(self, dataset, subset_size = None):
        self.dataset = dataset
        self.subset_size = subset_size

    def __len__(self):
        if self.subset_size is None:
            return len(self.dataset)
        return self.subset_size

    def __getitem__(self, idx):
        return self.dataset[idx]['translation'][SRC_LANGUAGE], self.dataset[idx]['translation'][TGT_LANGUAGE]
    
train_dataset = WMT19Dataset(dataset['train'], SUBSET_SIZE)
valid_dataset = WMT19Dataset(dataset['validation'])

print(f'Train dataset size: {len(train_dataset)}')
print(f'Validation dataset size: {len(valid_dataset)}')

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

Train dataset size: 50000
Validation dataset size: 3981


In [7]:
model_config = [
    # A list of the encoder or decoder blocks which constitute the Transformer.
    # Note that a sequence of different encoder blocks can be used, same for decoders
    {
        "reversible": True,  # Optionally make these layers reversible, to save memory
        "block_type": "encoder",
        "num_layers": NUM_ENCODER_LAYERS,  # Optional, this means that this config will repeat N times
        "dim_model": EMB_SIZE,
        "residual_norm_style": "pre",  # Optional, pre/post
        "position_encoding_config": {
            "name": "vocab",  # whatever position encodinhg makes sense
            "seq_len": MAX_LEN,
            "vocab_size": SRC_VOCAB_SIZE,
        },
        "multi_head_config": {
            "num_heads": NHEAD,
            "residual_dropout": 0,
            "attention": {
                "name": "linformer",  # whatever attention mechanism
                "dropout": 0,
                "causal": False,
                "seq_len": MAX_LEN,
            },
        },
        "feedforward_config": {
            "name": "MLP",
            "dropout": DROPOUT,
            "activation": "relu",
            "hidden_layer_multiplier": 4,
        },
    },
    {
        "reversible": False,  # Optionally make these layers reversible, to save memory
        "block_type": "decoder",
        "num_layers": NUM_DECODER_LAYERS,  # Optional, this means that this config will repeat N times
        "dim_model": EMB_SIZE,
        "residual_norm_style": "pre",  # Optional, pre/post
        "position_encoding_config": {
            "name": "vocab",  # whatever position encodinhg makes sense
            "seq_len": MAX_LEN,
            "vocab_size": TGT_VOCAB_SIZE,
        },
        "multi_head_config_masked": {
            "num_heads": NHEAD,
            "residual_dropout": 0,
            "attention": {
                "name": "nystrom",  # whatever attention mechanism
                "dropout": 0,
                "causal": True,
                "seq_len": MAX_LEN,
            },
        },
        "multi_head_config_cross": {
            "num_heads": NHEAD,
            "residual_dropout": 0,
            "attention": {
                "name": "favor",  # whatever attention mechanism
                "dropout": 0,
                "causal": True,
                "seq_len": MAX_LEN,
            },
        },
        "feedforward_config": {
            "name": "MLP",
            "dropout": DROPOUT,
            "activation": "relu",
            "hidden_layer_multiplier": 4,
        },
    },
]

class Seq2SeqTransformer(nn.Module):
    def __init__(self, xformer_config):
        super(Seq2SeqTransformer, self).__init__()
        self.xformers_config = xFormerConfig(xformer_config)
        self.xformer = xFormer.from_config(self.xformers_config)
        self.generator = nn.Linear(xformer_config[1]['dim_model'], xformer_config[1]['position_encoding_config']['vocab_size'])

    def forward(self, src, tgt, src_mask, tgt_mask):
        xformer_out = self.xformer(src, tgt, src_mask, tgt_mask)
        return self.generator(xformer_out)
    
    def encode(self, src, src_mask):
        return self.xformer.encoders(src, src_mask)
    
    def decode(self, tgt, memory, tgt_mask):
        return self.xformer.decoders(tgt, memory, tgt_mask)
        


# This part of xFormers is entirely type checked and needs a config object,
# could be changed in the future
config = xFormerConfig(model_config)
transformer = xFormer.from_config(config)

print(f'Model params: {sum(p.numel() for p in transformer.parameters() if p.requires_grad)/1000000:.2f}M', transformer)
transformer = transformer.to(DEVICE)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.AdamW(transformer.parameters(), lr=LEARNING_RATE)

Model params: 45.21M xFormer(
  (rev_enc_pose_encoding): VocabEmbedding(
    (dropout): Dropout(p=0.0, inplace=False)
    (position_embeddings): Embedding(256, 512)
    (word_embeddings): Embedding(7751, 512)
  )
  (encoders): ReversibleSequence(
    (blocks): ModuleList(
      (0-4): 5 x ReversibleBlock(
        (f): Deterministic(
          (net): PreNorm(
            (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
            (sublayer): MultiHeadDispatch(
              (attention): LinformerAttention(
                (E): Linear(in_features=256, out_features=64, bias=False)
                (F): Linear(in_features=256, out_features=64, bias=False)
                (attn_drop): Dropout(p=0, inplace=False)
              )
              (in_proj_container): InputProjection(
                (q_proj): Linear(in_features=512, out_features=512, bias=True)
                (k_proj): Linear(in_features=512, out_features=512, bias=True)
                (v_proj): Linear(in_features

In [8]:
def create_mask(src, tgt):
    src_padding_mask = (src == PAD_IDX)
    tgt_padding_mask = (tgt == PAD_IDX)
    return src_padding_mask, tgt_padding_mask

In [9]:
def train_epoch(model, optimizer):
    model.train()
    losses = 0
    
    total_steps = 0
    for src, tgt in tqdm(train_dataloader):
        src = src.transpose(0, 1).to(DEVICE)
        tgt = tgt.transpose(0, 1).to(DEVICE)
        # After transpose, the shape is (BATCH_SIZE, SEQ_LEN)
        tgt_input = tgt[:, :-1]
        src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)
        logits = model(src, tgt_input, encoder_input_mask=src_padding_mask, decoder_input_mask=tgt_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[:, 1:]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()
        total_steps += 1
    return losses / total_steps


def evaluate(model):
    model.eval()
    losses = 0

    total_steps = 0
    for src, tgt in valid_dataloader:
        src = src.transpose(0, 1).to(DEVICE)
        tgt = tgt.transpose(0, 1).to(DEVICE)

        tgt_input = tgt[:, :-1]

        src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, encoder_input_mask=src_padding_mask, decoder_input_mask=tgt_padding_mask)

        tgt_out = tgt[:, 1:]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()
        total_steps += 1
    return losses / total_steps

In [12]:
x1 = (torch.rand((BATCH_SIZE, MAX_LEN))*SRC_VOCAB_SIZE).abs().to(torch.int).to(DEVICE)
x2 = (torch.rand((BATCH_SIZE, MAX_LEN))*TGT_VOCAB_SIZE).abs().to(torch.int).to(DEVICE)
y = transformer(src=x1, tgt=x2)
print(y.shape)

OutOfMemoryError: CUDA out of memory. Tried to allocate 384.00 MiB (GPU 0; 8.00 GiB total capacity; 6.98 GiB already allocated; 0 bytes free; 7.00 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF