In [1]:
!pip install datasets



-----------------
# **Libraries**

In [2]:
import os
from tokenizers import Tokenizer, pre_tokenizers, trainers, models
from datasets import load_dataset

-------------
# **Data Samples**

In [3]:
data = load_dataset("thainq107/iwslt2015-en-vi")

README.md:   0%|          | 0.00/522 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/181k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/133317 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1268 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1268 [00:00<?, ? examples/s]

In [4]:
data

DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 133317
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
})

In [5]:
print(f"This is the data type of English Data: {type(data['train']['en'])}")
print(f"Length of Englosh train Dataset: {len(data['train']['en'])}")
print(f"Samples Sentence: {data['train']['en'][0]}")

print("-"*59)

print(f"This is the data type of Vietnamese Data: {type(data['train']['vi'])}")
print(f"Length of Vietnamese train Dataset: {len(data['train']['vi'])}")
print(f"Samples Sentence: {data['train']['vi'][0]}")

This is the data type of English Data: <class 'list'>
Length of Englosh train Dataset: 133317
Samples Sentence: Rachel Pike : The science behind a climate headline
-----------------------------------------------------------
This is the data type of Vietnamese Data: <class 'list'>
Length of Vietnamese train Dataset: 133317
Samples Sentence: Khoa học đằng sau một tiêu đề về khí hậu


-------------
# **Build Tokenizers**

In [6]:
# Init Tokenizers for English and Vietnamese
tokenizer_en = Tokenizer(models.WordLevel(unk_token = "<unk>"))
tokenizer_vi = Tokenizer(models.WordLevel(unk_token = "<unk>"))

tokenizer_en.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer_vi.pre_tokenizer = pre_tokenizers.Whitespace()

In [7]:
trainer = trainers.WordLevelTrainer(
    vocab_size = 15000,
    mn_frequency = 2,
    special_tokens = ["<pad>", "<unk>", "<bos>", "<eos>"] # Begin of Sentence = bos
)

# Train Tokenizers
tokenizer_en.train_from_iterator(data["train"]["en"], trainer)
tokenizer_vi.train_from_iterator(data["train"]["vi"], trainer)

# Save Trained Tokenizers
tokenizer_en.save("tokenizer_en.json")
tokenizer_vi.save("tokenizer_vi.json")

-------------
# **Data Preprocessing**

In [8]:
from transformers import PreTrainedTokenizerFast
seq_len = 75

# Load Tokenizers
tokenizer_en = PreTrainedTokenizerFast(
    tokenizer_file = "tokenizer_en.json",
    unk_token = "<unk>", pad_token = '<pad>', bos_token = "<bos>",
    eos_token = "<eos>"
)

tokenizer_vi = PreTrainedTokenizerFast(
    tokenizer_file = "tokenizer_vi.json",
    unk_token = "<unk>", pad_token = '<pad>', bos_token = "<bos>",
    eos_token = "<eos>"
)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [9]:
def data_preprocessing(data):
    """
        We translate English Sentences -> Vietnamese
            -> Input = English
            -> Target/Label = Vietnamese
    """
    ipt = data['en']
    label = []
    for sentence in data['vi']:
        
        # Full Sentence = <bos> <Sentence> <eos>
        full = "<bos>" + sentence + "<eos>"
        label.append(full)

    ipt_tokenized = tokenizer_en(
        ipt, padding = "max_length", truncation = True, max_length = seq_len
    )

    label_tokenized = tokenizer_en(
        label, padding = "max_length", truncation = True, max_length = seq_len
    )

    # Return applicable format for HuggingFace Trainers
    return{
        "input_ids": ipt_tokenized['input_ids'],
        "labels": label_tokenized['input_ids']
    }

pp_data = data.map(data_preprocessing, batched=True)

Map:   0%|          | 0/133317 [00:00<?, ? examples/s]

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

------
# **Models**

In [10]:
import torch
import torch.nn as nn
from transformers import PreTrainedModel, PretrainedConfig

class Seq2SeqRNNConfig(PretrainedConfig):
    def __init__(self, vocab_size_ipt = 10000, vocab_size_label = 10000,
                embed_dim = 128, hidden_dim = 128, dropout_p = 0.1, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size_ipt = vocab_size_ipt
        self.vocab_size_label = vocab_size_label
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.dropout = dropout_p

class EncoderRNN(nn.Module):
    def __init__(self, inpt_size, embed_dim, hidden_dim, dropout_p):
        super(EncoderRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.inpt_size = inpt_size
        self.embed_dim = embed_dim

        self.embedding = nn.Embedding(inpt_size, self.embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first = True)
        self.dropout = nn.Dropout(dropout_p)
        
    def forward(self, x):
        x = self.embedding(x) # batch_size x seq_len x embed_dim
        x = self.dropout(x)
        
        out, hidden = self.gru(x) 
        # Each out size: batch_size x seq_len x embed_dim
        # Each hidden (state) size: batch_size x embed_dim
        
        return out, hidden

class DecoderRNN(nn.Module):
    def __init__(self, hidden_dim, embed_dim, out_size):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(out_size, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first = True)
        self.fc = nn.Linear(hidden_dim, out_size) # Probability for every word in vocab

    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.gru(x, hidden)
        out = self.fc(out) # batch_size x 1 x vocab_size
        return out, hidden

class Seq2SeqRNNModel(PreTrainedModel):
    config_class = Seq2SeqRNNConfig

    def __init__(self, config, tokenizer_en):
        super().__init__(config)
        self.encoder = EncoderRNN(
            config.vocab_size_ipt, config.embed_dim,
            config.hidden_dim, config.dropout
        )
        self.decoder = DecoderRNN(
            config.hidden_dim, config.embed_dim, config.vocab_size_ipt
        )
        self.BOS_IDX = tokenizer_en.bos_token_id
        self.loss_fn = nn.CrossEntropyLoss(ignore_index = 0) # Ignore <PAD>

    def forward(self, input_ids, labels):
        
        batch_size, seq_len = labels.shape
        
        decoder_input = torch.full(
            (batch_size, 1), self.BOS_IDX, dtype=torch.long
        ).to(input_ids.device)
        
        encoder_output, decoder_hidden = self.encoder(input_ids)
        decoder_outputs = []

        for i in range(seq_len):
            """
                Teacher Forcing so we take last hidden state of Encoder/GRU
                the Last Hidden State from Encoder acts as h_0

                NOT using the word WE PREDICTED
                we USED the word FROM THE LABEL + Past Hidden State (From input and from the words before)
                To predict the CURRENT word.

            I.e:
            I go to School
            => Go through Encoder
            => Take <eos> token 
            => Input <eos> token as h_0 for decoder

            Decoder: Tôi đi đến trường
            We use "Tôi" and h_0 to predict "đi"
            We then compute the loss from the predicted word (output of decoder) with the Label word
            We then use the label word "đi" as "context" to predict "đến" 
            we ONLY do this for TRAINING, at inference, we just take the last predicted word as input
            """
            decoder_output, decoder_hidden = self.decoder(decoder_input, 
                                                          decoder_hidden)
            decoder_outputs.append(decoder_output)
            # Take word at i index
            decoder_input = labels[:, i].unsqueeze(1)
            
        logits = torch.cat(decoder_outputs, dim=1) # batch_size x seq_len x vocab_size
        loss = self.loss_fn(logits.view(
            -1, logits.shape[-1]
        ), labels.view(-1))
        
        return{
            "loss": loss, "logits": logits
        }

config = Seq2SeqRNNConfig(
    vocab_size_ipt = len(tokenizer_en), vocab_size_label = len(tokenizer_vi)
)
model = Seq2SeqRNNModel(config, tokenizer_en)

![image.png](attachment:ec3e92a6-4707-4b5e-a5d3-c246af8c8b19.png)

In [11]:
"""
Basically masking, diagnol = 1 
=> Diagnol at index 0
=> Everything above diagnol index 0 is retained 
(Diagnol with index 1 contains [0,0], [1,1], [2, 2])
"""

'\nBasically masking, diagnol = 1 \n=> Diagnol at index 0\n=> Everything above diagnol index 0 is retained \n(Diagnol with index 1 contains [0,0], [1,1], [2, 2])\n'

![image.png](attachment:1c03f6a7-11db-4bb4-9414-90677989812c.png)

In [12]:
import torch
import torch.nn as nn
from transformers import PreTrainedModel, PretrainedConfig

def generate_square_subsequent_mask(sz, device):
    mask = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

def create_mask(src, tgt):
    src_seq_len = src.shape[1]
    tgt_seq_len = tgt.shape[1]
    device = src.device

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len, device).to(torch.bool)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=device).type(torch.bool)
    src_padding_mask = (src == 0)
    tgt_padding_mask = (tgt == 0)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

class Seq2SeqTransformerConfig(PretrainedConfig):
    def __init__(
            self, vocab_size_src=10000, vocab_size_tgt=10000, max_seq_length=50,
            d_model=256, num_heads=8, num_layers=6, dropout=0.1, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size_src = vocab_size_src
        self.vocab_size_tgt = vocab_size_tgt
        self.max_seq_length = max_seq_length
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.dropout = dropout

class Seq2SeqTransformerModel(PreTrainedModel):
    config_class = Seq2SeqTransformerConfig

    def __init__(self, config):
        super().__init__(config)

        self.embedding_src = nn.Embedding(
            config.vocab_size_src, config.d_model)
        self.embedding_tgt = nn.Embedding(
            config.vocab_size_tgt, config.d_model)

        self.position_embedding_src = nn.Embedding(
            config.max_seq_length, config.d_model)
        self.position_embedding_tgt = nn.Embedding(
            config.max_seq_length, config.d_model)

        self.transformer = nn.Transformer(
            d_model=config.d_model,
            nhead=config.num_heads,
            num_encoder_layers=config.num_layers,
            num_decoder_layers=config.num_layers,
            dropout=config.dropout,
            batch_first=True
        )

        self.generator = nn.Linear(
            config.d_model, config.vocab_size_tgt
            )
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=0)  # Ignore PAD token

    def forward(self, input_ids, labels):
        tgt_input = labels[:, :-1]
        tgt_output = labels[:, 1:]
        batch_size, seq_len_src = input_ids.shape
        _, seq_len_tgt = tgt_input.shape

        src_positions = torch.arange(seq_len_src, device=input_ids.device).unsqueeze(0)
        tgt_positions = torch.arange(seq_len_tgt, device=labels.device).unsqueeze(0)

        src_embedded = self.embedding_src(input_ids) + self.position_embedding_src(src_positions)
        tgt_embedded = self.embedding_tgt(tgt_input) + self.position_embedding_tgt(tgt_positions)

        src_mask, tgt_mask, src_key_padding_mask, tgt_key_padding_mask = create_mask(input_ids, tgt_input)

        outs = self.transformer(
            src_embedded, tgt_embedded, src_mask, tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask
        )

        logits = self.generator(outs)
        loss = self.loss_fn(logits.permute(0, 2, 1), tgt_output)

        return {"loss": loss, "logits": logits}

    def encode(self, src, src_mask):
        _, seq_len_src = src.shape
        src_positions = torch.arange(
            seq_len_src, device=src.device).unsqueeze(0)
        src_embedded = self.embedding_src(src) + self.position_embedding_src(
            src_positions)
        return self.transformer.encoder(src_embedded, src_mask)

    def decode(self, tgt, encoder_output, tgt_mask):
        _, seq_len_tgt = tgt.shape
        tgt_positions = torch.arange(
            seq_len_tgt, device=tgt.device).unsqueeze(0)
        tgt_embedded = self.embedding_tgt(tgt) + self.position_embedding_tgt(
            tgt_positions)
        return self.transformer.decoder(
            tgt_embedded, encoder_output, tgt_mask
        )

In [13]:
config = Seq2SeqTransformerConfig(
    vocab_size_src=len(tokenizer_en), vocab_size_tgt=len(tokenizer_vi), max_seq_length=75
)


model_trans = Seq2SeqTransformerModel(config)

---------------
# **Test**

In [14]:
input_ids = torch.tensor([
    pp_data['train'][0]['input_ids']
])

labels = torch.tensor(
    [
        pp_data['train'][0]['labels']
    ]
)

pred = model(input_ids, labels)
pred_trans = model_trans(input_ids, labels)

In [15]:
print(pred)
print("-"*59)
print(pred_trans)

{'loss': tensor(9.5345, grad_fn=<NllLossBackward0>), 'logits': tensor([[[ 0.0701, -0.1100,  0.1259,  ..., -0.2321, -0.0897, -0.0462],
         [ 0.0635, -0.1687,  0.0957,  ..., -0.2677, -0.0220,  0.0632],
         [ 0.0549,  0.0120,  0.0450,  ..., -0.5827, -0.0942,  0.0932],
         ...,
         [-0.0127, -0.2222,  0.0109,  ...,  0.0687,  0.0936,  0.2562],
         [-0.0127, -0.2222,  0.0109,  ...,  0.0687,  0.0936,  0.2562],
         [-0.0127, -0.2222,  0.0109,  ...,  0.0687,  0.0936,  0.2562]]],
       grad_fn=<CatBackward0>)}
-----------------------------------------------------------
{'loss': tensor(9.9907, grad_fn=<NllLoss2DBackward0>), 'logits': tensor([[[ 0.2179, -0.5312, -0.8604,  ..., -1.1784, -0.3494, -0.3013],
         [ 0.7199, -0.5920, -0.4174,  ..., -0.7122, -0.7052, -0.5252],
         [ 0.4757, -0.1074, -0.2754,  ..., -0.5292, -0.9356, -0.4243],
         ...,
         [ 0.7746, -0.1446, -0.0945,  ..., -0.6869, -0.7405, -0.1688],
         [ 0.4360, -0.1890, -0.2646,  ..

--------------
# **Train**

In [16]:
from huggingface_hub import login

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HUGGINGFACE_TOKEN = user_secrets.get_secret("HF_TOKEN")

# Login to Hugging Face
login(HUGGINGFACE_TOKEN)

In [17]:
from transformers import Trainer, TrainingArguments

import warnings

warnings.filterwarnings(
    "ignore",
    message="Was asked to gather along dimension 0, but all input tensors were scalars",
    category=UserWarning
)


training_args = TrainingArguments(
    output_dir = "./EN-VI-Machine-Translation-RNN",
    logging_dir = "logs",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    logging_strategy = "epoch",
    per_device_train_batch_size = 128,
    per_device_eval_batch_size = 128,
    num_train_epochs = 5,
    learning_rate = 5e-5,
    weight_decay = 0.0005,
    save_total_limit = 1,
    report_to = "none",
    greater_is_better = False,
    metric_for_best_model = 'loss',
    logging_steps=1,
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = pp_data['train'],
    eval_dataset = pp_data['validation']
)

# Train the Model
trainer.train()
    
# Save the best model locally
trainer.save_model("./EN-VI-Machine-Translation-RNN")

# Push best model to Hugging Face Hub
trainer.push_to_hub("KanWasTaken/EN-VI-Machine-Translation-RNN")

Epoch,Training Loss,Validation Loss
1,4.1067,1.140927
2,0.9338,0.784412
3,0.8163,0.736664
4,0.7872,0.719417
5,0.777,0.714809


model.safetensors:   0%|          | 0.00/23.9M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/KanWasTaken/EN-VI-Machine-Translation-RNN/commit/e7c5ca4d610d30372868fc344457a7ed352fa301', commit_message='KanWasTaken/EN-VI-Machine-Translation-RNN', commit_description='', oid='e7c5ca4d610d30372868fc344457a7ed352fa301', pr_url=None, repo_url=RepoUrl('https://huggingface.co/KanWasTaken/EN-VI-Machine-Translation-RNN', endpoint='https://huggingface.co', repo_type='model', repo_id='KanWasTaken/EN-VI-Machine-Translation-RNN'), pr_revision=None, pr_num=None)

In [18]:
warnings.filterwarnings(
    "ignore",
    message="Was asked to gather along dimension 0, but all input tensors were scalars",
    category=UserWarning
)

training_args_trans = TrainingArguments(
    output_dir = "./EN-VI-Machine-Translation-Transformer",
    logging_dir = "logs",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    logging_strategy = "epoch",
    per_device_train_batch_size = 128,
    per_device_eval_batch_size = 128,
    num_train_epochs = 5,
    learning_rate = 5e-5,
    weight_decay = 0.0005,
    save_total_limit = 1,
    report_to = "none",
    greater_is_better = False,
    metric_for_best_model = 'loss',
    logging_steps=1,
)

trainer_trans = Trainer(
    model = model_trans,
    args = training_args_trans,
    train_dataset = pp_data['train'],
    eval_dataset = pp_data['validation']
)

# Train the Model
trainer_trans.train()
    
# Save the best model locally
trainer_trans.save_model("./EN-VI-Machine-Translation-Transformer")

# Push best model to Hugging Face Hub
trainer_trans.push_to_hub("KanWasTaken/EN-VI-Machine-Translation-Transformer")

Epoch,Training Loss,Validation Loss
1,1.0705,0.597961
2,0.6527,0.566007
3,0.6288,0.553006
4,0.6174,0.546025
5,0.6117,0.543288


training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/116M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/KanWasTaken/EN-VI-Machine-Translation-Transformer/commit/0680e870b3c5ac5463385f6420fbf635de1a40d7', commit_message='KanWasTaken/EN-VI-Machine-Translation-Transformer', commit_description='', oid='0680e870b3c5ac5463385f6420fbf635de1a40d7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/KanWasTaken/EN-VI-Machine-Translation-Transformer', endpoint='https://huggingface.co', repo_type='model', repo_id='KanWasTaken/EN-VI-Machine-Translation-Transformer'), pr_revision=None, pr_num=None)

-----------
# **Infer**

In [19]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.1.1 sacrebleu-2.5.1


In [20]:
def greedy_decode(model, src, src_mask, max_len, start_symbol, device="cpu"):
    src = src.to(device)
    src_mask = src_mask.to(device)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
    print('v')
    for i in range(max_len-1):
        memory = memory.to(device)
        tgt_mask = (generate_square_subsequent_mask(ys.size(1), device)
                    .type(torch.bool)).to(device)
        out = model.decode(ys, memory, tgt_mask)
        prob = model.generator(out[:, -1, :]) # LM Head
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word[-1].item() # index

        ys = torch.cat([ys,torch.ones(1, 1).type_as(
            src.data).fill_(next_word)], dim=1)
        if next_word == 3: #EOS : 3
            break
    return ys

def translate(model, src_sentence, device):
    model.eval()
    input_ids = tokenizer_en([src_sentence], return_tensors='pt')['input_ids'].to(device)
    num_tokens = input_ids.shape[1]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).to(device)
    tgt_tokens = greedy_decode(
        model,  input_ids, src_mask, max_len=num_tokens + 5, start_symbol=2, device=device)
    return tokenizer_vi.decode(tgt_tokens.detach().cpu()[0])

In [21]:
translate(model_trans, "i go to school", model_trans.device)

v


'<bos> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>'

In [22]:
!pip install -q sacrebleu==2.5.1

In [23]:
from tqdm import tqdm
import sacrebleu

pred_sentences, tgt_sentences = [], []
for sample in tqdm(data['test']):
    src_sentence = sample['en']
    tgt_sentence = sample['vi']

    pred_sentence = translate(model_trans, src_sentence, model_trans.device)
    pred_sentences.append(pred_sentence)

    tgt_sentences.append(tgt_sentence)

bleu_score = sacrebleu.corpus_bleu(pred_sentences, [tgt_sentences], force=True)
bleu_score

  0%|          | 0/1268 [00:00<?, ?it/s]

v


  0%|          | 2/1268 [00:00<04:34,  4.61it/s]

v
v


  0%|          | 3/1268 [00:00<06:52,  3.07it/s]

v


  0%|          | 4/1268 [00:01<06:37,  3.18it/s]

v


  0%|          | 6/1268 [00:01<05:34,  3.78it/s]

v
v


  1%|          | 7/1268 [00:01<05:29,  3.83it/s]

v


  1%|          | 8/1268 [00:02<06:30,  3.23it/s]

v


  1%|          | 9/1268 [00:02<06:16,  3.35it/s]

v
v


  1%|          | 11/1268 [00:02<04:40,  4.49it/s]

v


  1%|          | 12/1268 [00:03<05:07,  4.09it/s]

v


  1%|          | 13/1268 [00:03<05:22,  3.89it/s]

v


  1%|          | 15/1268 [00:03<04:39,  4.48it/s]

v
v


  1%|▏         | 16/1268 [00:04<05:01,  4.16it/s]

v


  1%|▏         | 18/1268 [00:04<05:10,  4.02it/s]

v
v


  1%|▏         | 19/1268 [00:04<04:51,  4.29it/s]

v


  2%|▏         | 20/1268 [00:05<04:58,  4.17it/s]

v


  2%|▏         | 21/1268 [00:05<05:15,  3.95it/s]

v
v


  2%|▏         | 23/1268 [00:05<03:54,  5.30it/s]

v


  2%|▏         | 24/1268 [00:06<05:47,  3.58it/s]

v


  2%|▏         | 26/1268 [00:06<05:02,  4.10it/s]

v
v


  2%|▏         | 27/1268 [00:06<04:40,  4.42it/s]

v


  2%|▏         | 28/1268 [00:07<06:31,  3.16it/s]

v


  2%|▏         | 29/1268 [00:07<06:24,  3.22it/s]

v


  2%|▏         | 30/1268 [00:07<06:18,  3.27it/s]

v


  3%|▎         | 32/1268 [00:08<05:15,  3.92it/s]

v
v


  3%|▎         | 33/1268 [00:08<04:36,  4.46it/s]

v


  3%|▎         | 34/1268 [00:08<04:46,  4.31it/s]

v


  3%|▎         | 36/1268 [00:09<04:17,  4.78it/s]

v
v


  3%|▎         | 37/1268 [00:09<03:52,  5.30it/s]

v


  3%|▎         | 38/1268 [00:09<04:15,  4.81it/s]

v
v


  3%|▎         | 41/1268 [00:10<03:18,  6.19it/s]

v
v


  3%|▎         | 42/1268 [00:10<04:05,  4.99it/s]

v


  3%|▎         | 43/1268 [00:10<04:13,  4.83it/s]

v


  3%|▎         | 44/1268 [00:10<04:24,  4.63it/s]

v


  4%|▎         | 45/1268 [00:11<04:39,  4.37it/s]

v


  4%|▎         | 47/1268 [00:11<05:11,  3.92it/s]

v
v


  4%|▍         | 49/1268 [00:12<04:35,  4.42it/s]

v
v


  4%|▍         | 51/1268 [00:12<04:01,  5.04it/s]

v
v


  4%|▍         | 53/1268 [00:12<03:13,  6.29it/s]

v
v


  4%|▍         | 55/1268 [00:13<03:29,  5.79it/s]

v
v


  4%|▍         | 56/1268 [00:13<03:13,  6.28it/s]

v


  4%|▍         | 57/1268 [00:13<03:39,  5.52it/s]

v


  5%|▍         | 59/1268 [00:13<04:33,  4.43it/s]

v


  5%|▍         | 60/1268 [00:14<04:12,  4.79it/s]

v
v


  5%|▍         | 61/1268 [00:14<05:24,  3.72it/s]

v


  5%|▍         | 63/1268 [00:15<05:17,  3.79it/s]

v
v


  5%|▌         | 64/1268 [00:15<04:48,  4.17it/s]

v


  5%|▌         | 65/1268 [00:15<04:44,  4.23it/s]

v


  5%|▌         | 66/1268 [00:15<04:43,  4.24it/s]

v


  5%|▌         | 68/1268 [00:16<04:46,  4.19it/s]

v
v


  5%|▌         | 69/1268 [00:16<05:11,  3.85it/s]

v


  6%|▌         | 70/1268 [00:16<05:02,  3.96it/s]

v


  6%|▌         | 72/1268 [00:17<04:46,  4.18it/s]

v
v


  6%|▌         | 73/1268 [00:17<05:21,  3.71it/s]

v


  6%|▌         | 74/1268 [00:18<06:16,  3.17it/s]

v
v


  6%|▌         | 77/1268 [00:18<04:48,  4.13it/s]

v
v


  6%|▌         | 78/1268 [00:19<06:07,  3.24it/s]

v


  6%|▋         | 80/1268 [00:19<05:23,  3.67it/s]

v


  6%|▋         | 81/1268 [00:19<04:59,  3.96it/s]

v


  6%|▋         | 82/1268 [00:19<04:26,  4.45it/s]

v
v


  7%|▋         | 83/1268 [00:20<05:49,  3.39it/s]

v


  7%|▋         | 84/1268 [00:20<05:46,  3.41it/s]

v


  7%|▋         | 85/1268 [00:20<05:43,  3.45it/s]

v


  7%|▋         | 86/1268 [00:21<05:24,  3.64it/s]

v


  7%|▋         | 87/1268 [00:21<06:38,  2.97it/s]

v


  7%|▋         | 88/1268 [00:22<06:50,  2.87it/s]

v


  7%|▋         | 91/1268 [00:22<04:17,  4.57it/s]

v
v
v


  7%|▋         | 93/1268 [00:22<03:54,  5.01it/s]

v
v


  7%|▋         | 94/1268 [00:23<03:31,  5.56it/s]

v


  7%|▋         | 95/1268 [00:23<03:51,  5.06it/s]

v
v


  8%|▊         | 98/1268 [00:23<03:20,  5.82it/s]

v
v


  8%|▊         | 99/1268 [00:23<03:43,  5.23it/s]

v


  8%|▊         | 100/1268 [00:24<04:55,  3.96it/s]

v


  8%|▊         | 102/1268 [00:24<03:58,  4.88it/s]

v
v


  8%|▊         | 103/1268 [00:24<03:27,  5.61it/s]

v


  8%|▊         | 105/1268 [00:25<04:44,  4.09it/s]

v
v


  8%|▊         | 106/1268 [00:25<04:06,  4.71it/s]

v


  8%|▊         | 107/1268 [00:25<04:57,  3.90it/s]

v


  9%|▊         | 108/1268 [00:26<06:18,  3.06it/s]

v


  9%|▊         | 110/1268 [00:27<05:51,  3.29it/s]

v
v


  9%|▉         | 111/1268 [00:27<06:37,  2.91it/s]

v


  9%|▉         | 112/1268 [00:27<05:55,  3.26it/s]

v


  9%|▉         | 114/1268 [00:28<05:04,  3.79it/s]

v
v
v


  9%|▉         | 116/1268 [00:28<04:37,  4.16it/s]

v


  9%|▉         | 118/1268 [00:29<05:06,  3.75it/s]

v
v


  9%|▉         | 119/1268 [00:29<05:52,  3.26it/s]

v
v


 10%|▉         | 121/1268 [00:30<04:55,  3.89it/s]

v


 10%|▉         | 122/1268 [00:30<05:42,  3.34it/s]

v


 10%|▉         | 124/1268 [00:31<05:08,  3.71it/s]

v
v


 10%|▉         | 125/1268 [00:31<05:29,  3.47it/s]

v
v


 10%|█         | 127/1268 [00:31<05:31,  3.44it/s]

v


 10%|█         | 129/1268 [00:32<04:47,  3.96it/s]

v
v


 10%|█         | 131/1268 [00:32<03:42,  5.12it/s]

v
v


 10%|█         | 133/1268 [00:33<04:14,  4.46it/s]

v
v


 11%|█         | 135/1268 [00:33<03:54,  4.84it/s]

v
v


 11%|█         | 137/1268 [00:34<04:07,  4.57it/s]

v
v


 11%|█         | 139/1268 [00:34<05:12,  3.61it/s]

v
v


 11%|█         | 140/1268 [00:35<05:18,  3.54it/s]

v


 11%|█         | 141/1268 [00:35<04:57,  3.79it/s]

v


 11%|█         | 142/1268 [00:35<04:40,  4.01it/s]

v


 11%|█▏        | 143/1268 [00:35<05:08,  3.65it/s]

v


 11%|█▏        | 145/1268 [00:36<04:32,  4.12it/s]

v
v


 12%|█▏        | 147/1268 [00:36<03:32,  5.27it/s]

v
v


 12%|█▏        | 149/1268 [00:36<03:31,  5.28it/s]

v
v


 12%|█▏        | 150/1268 [00:37<04:05,  4.56it/s]

v


 12%|█▏        | 151/1268 [00:37<05:57,  3.12it/s]

v


 12%|█▏        | 153/1268 [00:38<04:42,  3.95it/s]

v
v


 12%|█▏        | 154/1268 [00:38<04:23,  4.22it/s]

v


 12%|█▏        | 156/1268 [00:38<04:43,  3.92it/s]

v
v


 12%|█▏        | 158/1268 [00:39<04:20,  4.26it/s]

v
v


 13%|█▎        | 160/1268 [00:40<04:49,  3.83it/s]

v
v


 13%|█▎        | 161/1268 [00:40<04:01,  4.58it/s]

v


 13%|█▎        | 163/1268 [00:40<03:39,  5.05it/s]

v
v


 13%|█▎        | 164/1268 [00:40<03:41,  4.98it/s]

v


 13%|█▎        | 165/1268 [00:41<05:13,  3.52it/s]

v


 13%|█▎        | 166/1268 [00:41<05:25,  3.39it/s]

v


 13%|█▎        | 168/1268 [00:42<05:33,  3.30it/s]

v
v


 13%|█▎        | 169/1268 [00:42<05:39,  3.24it/s]

v
v


 13%|█▎        | 171/1268 [00:43<06:14,  2.93it/s]

v


 14%|█▎        | 173/1268 [00:44<06:27,  2.83it/s]

v
v


 14%|█▎        | 174/1268 [00:44<05:37,  3.24it/s]

v


 14%|█▍        | 176/1268 [00:44<04:11,  4.34it/s]

v
v


 14%|█▍        | 177/1268 [00:44<03:45,  4.84it/s]

v
v


 14%|█▍        | 179/1268 [00:45<03:20,  5.44it/s]

v


 14%|█▍        | 181/1268 [00:45<03:41,  4.90it/s]

v
v


 14%|█▍        | 183/1268 [00:45<03:39,  4.95it/s]

v
v


 15%|█▍        | 184/1268 [00:46<03:27,  5.23it/s]

v


 15%|█▍        | 185/1268 [00:46<03:48,  4.75it/s]

v


 15%|█▍        | 186/1268 [00:46<04:00,  4.50it/s]

v


 15%|█▍        | 187/1268 [00:46<04:41,  3.84it/s]

v


 15%|█▍        | 188/1268 [00:47<04:54,  3.66it/s]

v


 15%|█▍        | 190/1268 [00:47<04:24,  4.07it/s]

v
v


 15%|█▌        | 191/1268 [00:47<04:02,  4.44it/s]

v


 15%|█▌        | 192/1268 [00:48<04:09,  4.32it/s]

v


 15%|█▌        | 193/1268 [00:48<04:41,  3.82it/s]

v


 15%|█▌        | 195/1268 [00:48<04:12,  4.24it/s]

v
v


 16%|█▌        | 197/1268 [00:49<03:43,  4.79it/s]

v
v


 16%|█▌        | 198/1268 [00:49<04:11,  4.25it/s]

v
v


 16%|█▌        | 200/1268 [00:50<04:40,  3.81it/s]

v


 16%|█▌        | 201/1268 [00:50<05:11,  3.43it/s]

v


 16%|█▌        | 202/1268 [00:51<06:39,  2.67it/s]

v


 16%|█▌        | 204/1268 [00:51<05:04,  3.50it/s]

v
v


 16%|█▌        | 205/1268 [00:51<04:40,  3.78it/s]

v
v


 16%|█▋        | 207/1268 [00:52<04:07,  4.28it/s]

v


 16%|█▋        | 208/1268 [00:52<04:23,  4.03it/s]

v


 16%|█▋        | 209/1268 [00:52<04:26,  3.98it/s]

v


 17%|█▋        | 211/1268 [00:53<04:05,  4.31it/s]

v
v


 17%|█▋        | 213/1268 [00:53<04:39,  3.77it/s]

v
v


 17%|█▋        | 214/1268 [00:54<04:27,  3.95it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
import sacrebleu

pred_sentences = ['tôi đang đi học']
tgt_sentences = ['tôi đang đi tới trường']
bleu_score = sacrebleu.corpus_bleu(
    pred_sentences, [tgt_sentences], force=True
)
bleu_score