In [None]:
# Installs Hugging Face Transformers without a version pin; the config dump in the
# training output recorded further down reports "transformers_version": "4.41.1".
!pip install transformers



In [1]:
!pip install -q datasets sacrebleu accelerate>=0.20.1

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.31.0, but you have requests 2.32.3 which is incompatible.[0m[31m
[0m

In [2]:
import os
import numpy as np

import sacrebleu

import torch
from torch.utils.data import Dataset

from datasets import load_dataset, load_metric
from transformers import *

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
import pandas as pd

In [5]:
# Mount Google Drive: the dataset CSVs and every checkpoint path used below live
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 1. Prepare Data

In [6]:
class NMTDataset(Dataset):
    """Parallel-sentence dataset that tokenizes the whole CSV once, at construction.

    The CSV at ``file_path`` must have an ``en`` column (source sentences) and a
    ``hu`` column (target sentences). ``cfg`` must provide ``src_tokenizer``,
    ``tgt_tokenizer``, ``src_max_len``, ``tgt_max_len`` and ``add_special_tokens``.

    Every item is a dict with ``input_ids``, ``attention_mask``,
    ``decoder_input_ids``, ``decoder_attention_mask`` and ``labels``.
    """

    def __init__(self, cfg, file_path):
        """Read the CSV and pre-compute all source/target tensors."""
        super().__init__()
        self.cfg = cfg
        self.data = pd.read_csv(file_path)
        self.src_texts, self.tgt_texts = self.get_src_tgt_texts()
        self.src_input_ids, self.src_attention_mask = self.texts_to_sequences(self.src_texts)
        self.tgt_input_ids, self.tgt_attention_mask, self.labels = self.texts_to_sequences(
            self.tgt_texts,
            is_src=False
        )

    def get_src_tgt_texts(self):
        """Return the ``en`` and ``hu`` columns as two parallel Python lists."""
        src_texts = self.data["en"].tolist()
        tgt_texts = self.data["hu"].tolist()
        return src_texts, tgt_texts

    def texts_to_sequences(self, texts, is_src=True):
        """Tokenize ``texts`` into fixed-length tensors.

        Args:
            texts: list of sentences.
            is_src: when True use the source tokenizer and return
                ``(input_ids, attention_mask)``; otherwise use the target
                tokenizer and return ``(input_ids, attention_mask, labels)``.

        Returns:
            Tensors padded/truncated to ``cfg.src_max_len`` or
            ``cfg.tgt_max_len``. ``labels`` is a copy of the target ids in which
            every pad-token position is replaced by -100.
        """
        if is_src:
            src_inputs = self.cfg.src_tokenizer(
                texts,
                padding='max_length',
                truncation=True,
                max_length=self.cfg.src_max_len,
                return_tensors='pt'
            )
            return src_inputs.input_ids, src_inputs.attention_mask

        tgt_tokenizer = self.cfg.tgt_tokenizer
        if self.cfg.add_special_tokens:
            # The target tokenizer does not add BOS/EOS by itself in this mode,
            # so they are spliced into the raw text before tokenizing.
            texts = [
                ' '.join([tgt_tokenizer.bos_token, text, tgt_tokenizer.eos_token])
                for text in texts
            ]
        tgt_inputs = tgt_tokenizer(
            texts,
            padding='max_length',
            truncation=True,
            max_length=self.cfg.tgt_max_len,
            return_tensors='pt'
        )

        # Build the labels: same ids as the target sequence, with padding
        # positions set to -100. Done as one tensor operation instead of a
        # round-trip through nested Python lists.
        pad_positions = tgt_inputs.input_ids == tgt_tokenizer.pad_token_id
        labels = tgt_inputs.input_ids.masked_fill(pad_positions, -100).long()
        return tgt_inputs.input_ids, tgt_inputs.attention_mask, labels

    def __getitem__(self, idx):
        """Return the pre-computed tensors of example ``idx`` as a dict."""
        return {
            "input_ids": self.src_input_ids[idx],
            "attention_mask": self.src_attention_mask[idx],
            "decoder_input_ids": self.tgt_input_ids[idx],
            "decoder_attention_mask": self.tgt_attention_mask[idx],
            "labels": self.labels[idx]
        }

    def __len__(self):
        """Number of sentence pairs (rows of the source id tensor)."""
        return self.src_input_ids.shape[0]

### 2. Load Tokenizer and Model

In [7]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Each reference is additionally wrapped in its own one-element list, so the
    second return value is a list of lists.

    Returns:
        ``(predictions, references)``.
    """
    cleaned_preds = []
    for hypothesis in preds:
        cleaned_preds.append(hypothesis.strip())

    wrapped_refs = []
    for reference in labels:
        wrapped_refs.append([reference.strip()])

    return cleaned_preds, wrapped_refs

def load_tokenizer(model_name_or_path):
    """Return the tokenizer `AutoTokenizer.from_pretrained` builds for the given name or path."""
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    return tokenizer

In [8]:
class SaveBestModelCallback(TrainerCallback):
    """Trainer callback that saves the model whenever the evaluation BLEU improves.

    After each evaluation it reads ``eval_bleu_score`` from the reported metrics
    and, if the value is higher than the best seen so far, writes the model to
    ``output_dir`` with ``save_pretrained``.
    """

    def __init__(self, output_dir):
        """Remember where to save and start with a best score of 0."""
        self.output_dir = output_dir
        self.best_bleu_score = 0

    def on_evaluate(self, args, state, control, **kwargs):
        """Save the model if this evaluation produced a new best BLEU score.

        Only ``metrics`` and ``model`` are read from ``kwargs``. They are looked
        up with ``.get`` so a missing key skips the save instead of raising
        ``KeyError`` in the middle of training. The earlier ``kwargs["tokenizer"]``
        lookup was dropped: its value was never used.
        """
        metrics = kwargs.get("metrics") or {}
        model = kwargs.get("model")

        bleu_score = metrics.get("eval_bleu_score")
        if bleu_score is not None and model is not None and bleu_score > self.best_bleu_score:
            self.best_bleu_score = bleu_score
            print(f"New best BLEU score: {bleu_score}. Saving model.")
            model.save_pretrained(self.output_dir)
        return control

In [9]:
# class CustomDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
#     def __init__(self, tokenizer, model, decoder_start_token_id):
#         super().__init__(tokenizer, model)
#         self.tokenizer = tokenizer
#         self.model = model
#         self.decoder_start_token_id = decoder_start_token_id  # Lưu trữ giá trị decoder_start_token_id

#     def __call__(self, features):
#         batch = super().__call__(features)

#         if "labels" in batch:
#             labels = batch["labels"]
#             decoder_input_ids = self._shift_tokens_right(labels, self.tokenizer.pad_token_id, self.decoder_start_token_id)
#             batch["decoder_input_ids"] = decoder_input_ids

#         return batch

#     def _shift_tokens_right(self, input_ids, pad_token_id, decoder_start_token_id):
#         shifted_input_ids = input_ids.new_zeros(input_ids.shape)
#         shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()

#         if decoder_start_token_id is None:
#             raise ValueError("decoder_start_token_id has to be defined.")

#         shifted_input_ids[..., 0] = decoder_start_token_id

#         if pad_token_id is None:
#             raise ValueError("pad_token_id has to be defined.")

#         shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

#         return shifted_input_ids

In [10]:
class Manager():
    """Set up tokenizers, model, metric and datasets for one fold, then train and score it.

    All settings come from ``cfg``. The tokenizers and the derived token ids are
    written back onto ``cfg`` so that ``NMTDataset`` can read them.
    """

    def __init__(self, cfg, file_path, fold_index=1, best_bleu_score = 0, is_train=True):
        """Load everything needed for one fold.

        Args:
            cfg: configuration object; it is mutated (tokenizers and token ids
                are attached to it).
            file_path: prefix to which ``fold_<i>_train.csv`` and
                ``fold_<i>_valid.csv`` are appended. It is concatenated as-is,
                so a directory must end with a path separator.
            fold_index: fold number used in file and checkpoint names.
            best_bleu_score: score to beat before ``compute_metrics`` saves a
                "best" model.
            is_train: when False the datasets are not loaded.
        """
        self.cfg = cfg
        self.fold_index = fold_index
        self.best_bleu_score = best_bleu_score

        print("Loading Tokenizer...")
        self.get_tokenizer()

        print("Loading Model...")
        self.get_model()

        print("Loading Metric...")
        self.bleu_metric = load_metric("sacrebleu")

        print("Check Save Model Path")
        # makedirs also creates missing parents and tolerates an existing directory.
        os.makedirs(self.cfg.ckpt_dir, exist_ok=True)

        if is_train:
            print("Loading Dataset...")
            file_train_path = file_path + f'fold_{fold_index}_train.csv'
            file_valid_path = file_path + f'fold_{fold_index}_valid.csv'
            self.train_dataset = NMTDataset(self.cfg, file_train_path)
            self.valid_dataset = NMTDataset(self.cfg, file_valid_path)

        print("Setting finished.")

    def _tokenizer_dir(self, lang, model_name):
        """Directory inside ``ckpt_dir`` where the tokenizer for ``lang`` is stored."""
        return os.path.join(self.cfg.ckpt_dir, f"{lang}_tokenizer_{model_name}")

    def get_tokenizer(self):
        """Load both tokenizers, derive the special-token settings and save the tokenizers.

        Sets on ``cfg``: ``src_tokenizer``, ``tgt_tokenizer``,
        ``add_special_tokens``, ``bos_token_id``, ``eos_token_id`` and
        ``pad_token_id``.
        """
        src_dir = self._tokenizer_dir(self.cfg.src_lang, self.cfg.src_model_name)
        tgt_dir = self._tokenizer_dir(self.cfg.tgt_lang, self.cfg.tgt_model_name)

        if self.cfg.load_model_from_path:
            # Prefer the per-language directories this method writes below;
            # fall back to ckpt_dir itself (the previous behaviour) if absent.
            self.cfg.src_tokenizer = load_tokenizer(
                src_dir if os.path.isdir(src_dir) else self.cfg.ckpt_dir
            )
            self.cfg.tgt_tokenizer = load_tokenizer(
                tgt_dir if os.path.isdir(tgt_dir) else self.cfg.ckpt_dir
            )
        else:
            self.cfg.src_tokenizer = load_tokenizer(self.cfg.src_model_name)
            self.cfg.tgt_tokenizer = load_tokenizer(self.cfg.tgt_model_name)

        # The special-token settings are derived in both branches: NMTDataset
        # and get_model read them from cfg regardless of where the tokenizers
        # were loaded from.
        tgt_tokenizer = self.cfg.tgt_tokenizer
        if "bert" in self.cfg.tgt_model_name.split('-'):
            # BERT-style tokenizer: reuse its [CLS]/[SEP] ids as BOS/EOS.
            self.cfg.add_special_tokens = False
            self.cfg.bos_token_id = tgt_tokenizer.cls_token_id
            self.cfg.eos_token_id = tgt_tokenizer.sep_token_id
        else:
            # Other tokenizers get explicit BOS/EOS/PAD tokens registered.
            self.cfg.add_special_tokens = True
            tgt_tokenizer.add_special_tokens(
                {
                    "bos_token": "[BOS]",
                    "eos_token": "[EOS]",
                    "pad_token": "[PAD]"
                }
            )
            self.cfg.bos_token_id = tgt_tokenizer.bos_token_id
            self.cfg.eos_token_id = tgt_tokenizer.eos_token_id
        self.cfg.pad_token_id = tgt_tokenizer.pad_token_id

        # Paths are built from self.cfg, not from a module-level `cfg` global.
        self.cfg.src_tokenizer.save_pretrained(src_dir)
        self.cfg.tgt_tokenizer.save_pretrained(tgt_dir)

    def get_model(self):
        """Create ``self.model`` from a saved checkpoint or from two pretrained models.

        For a fresh model the decoder embeddings are resized to the target
        tokenizer and the generation-related config values are taken from ``cfg``.
        """
        if self.cfg.load_model_from_path:
            # NOTE(review): cfg.ckpt_name is not defined by the config classes
            # visible in this notebook; it must be set before using this branch.
            save_model_path = os.path.join(self.cfg.ckpt_dir, self.cfg.ckpt_name)
            self.model = EncoderDecoderModel.from_pretrained(save_model_path)
        else:
            self.model = EncoderDecoderModel.from_encoder_decoder_pretrained(
                self.cfg.src_model_name,
                self.cfg.tgt_model_name
            )
            self.model.decoder.resize_token_embeddings(len(self.cfg.tgt_tokenizer))
            self.model.config.decoder_start_token_id = self.cfg.bos_token_id
            self.model.config.eos_token_id = self.cfg.eos_token_id
            self.model.config.pad_token_id = self.cfg.pad_token_id
            self.model.config.vocab_size = len(self.cfg.tgt_tokenizer)
            self.model.config.max_length = self.cfg.max_length_decoder
            self.model.config.min_length = self.cfg.min_length_decoder
            self.model.config.no_repeat_ngram_size = 3
            self.model.config.early_stopping = True
            self.model.config.length_penalty = 1.0
            self.model.config.num_beams = self.cfg.beam_size

    def train(self):
        """Train with ``Seq2SeqTrainer``, evaluating per epoch or every ``cfg.eval_steps`` steps."""
        print("Training...")
        if self.cfg.use_eval_steps:
            schedule_kwargs = dict(
                evaluation_strategy="steps",
                save_strategy='steps',
                save_steps=self.cfg.eval_steps,
                eval_steps=self.cfg.eval_steps,
            )
        else:
            schedule_kwargs = dict(
                evaluation_strategy="epoch",
                save_strategy='epoch',
            )

        training_args = Seq2SeqTrainingArguments(
            predict_with_generate=True,
            output_dir=self.cfg.ckpt_dir,
            per_device_train_batch_size=self.cfg.train_batch_size,
            per_device_eval_batch_size=self.cfg.eval_batch_size,
            learning_rate=self.cfg.learning_rate,
            weight_decay=0.005,
            num_train_epochs=self.cfg.num_train_epochs,
            # Optional: set cfg.save_total_limit to cap how many checkpoint-*
            # directories are kept. None (the default) keeps them all.
            save_total_limit=getattr(self.cfg, "save_total_limit", None),
            **schedule_kwargs
        )

        # NOTE(review): with `model` given, the collator is expected to rebuild
        # decoder_input_ids from the labels -- confirm against the installed
        # transformers version.
        data_collator = DataCollatorForSeq2Seq(
            tokenizer=self.cfg.tgt_tokenizer,
            model=self.model,
        )

        trainer = Seq2SeqTrainer(
            self.model,
            training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.valid_dataset,
            data_collator=data_collator,
            compute_metrics=self.compute_metrics,
            callbacks=[SaveBestModelCallback(output_dir=self.cfg.ckpt_dir)]
        )

        trainer.train()

    def compute_metrics(self, eval_preds):
        """Score generated predictions and save the model when the score improves.

        Args:
            eval_preds: ``(predictions, labels)`` id arrays; ``predictions`` may
                be a tuple whose first element holds the ids.

        Returns:
            ``{"bleu_score": ..., "gen_len": ...}``, both rounded to 4 decimals.

        Side effect: when ``bleu_score`` exceeds ``self.best_bleu_score`` the
        model is written to ``<ckpt_dir>/best_model_fold_<fold_index>``.
        """
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]

        pad_id = self.cfg.tgt_tokenizer.pad_token_id
        # -100 marks ignored positions and is not a vocabulary id; map it to the
        # pad id in predictions as well as labels before decoding or counting.
        preds = np.where(preds != -100, preds, pad_id)
        labels = np.where(labels != -100, labels, pad_id)

        decoded_preds = self.cfg.tgt_tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = self.cfg.tgt_tokenizer.batch_decode(labels, skip_special_tokens=True)

        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

        result = self.bleu_metric.compute(
            predictions=decoded_preds,
            references=decoded_labels
        )

        result = {"bleu_score": result["score"]}

        # Mean number of non-pad tokens per prediction.
        prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
        result["gen_len"] = np.mean(prediction_lens)
        result = {k: round(v, 4) for k, v in result.items()}

        # Check if the BLEU score is the best and save the model if it is
        if result["bleu_score"] > self.best_bleu_score:
            print('Save model with bleu score: ', result["bleu_score"])
            self.best_bleu_score = result["bleu_score"]
            self.model.save_pretrained(os.path.join(self.cfg.ckpt_dir, f"best_model_fold_{self.fold_index}"))

        return result

### 3. Config

In [13]:
class BaseConfig:
    """Minimal config holder: every keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        # Copy each keyword onto the instance under the same name.
        for name in kwargs:
            setattr(self, name, kwargs[name])

class NMTConfig(BaseConfig):
    """Settings for the training run; values are class attributes read through `cfg`."""
    # Data
    # Language tags; Manager uses them to name the saved tokenizer folders.
    src_lang = 'en'
    tgt_lang = 'hu'
    # Fixed token lengths NMTDataset pads/truncates source and target sentences to.
    src_max_len = 100
    tgt_max_len = 100 #####################################

    # Model
    # Pretrained checkpoints used for the encoder (src) and the decoder (tgt).
    src_model_name = "bert-base-multilingual-cased"
    tgt_model_name = "bert-base-multilingual-cased"
    # src_model_name = "FacebookAI/xlm-roberta-base"
    # tgt_model_name = "FacebookAI/xlm-roberta-base"

    # Training
    # False: build the model from the pretrained names above instead of a saved checkpoint.
    load_model_from_path = False
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    learning_rate = 3e-5
    train_batch_size = 4
    eval_batch_size = 4
    num_train_epochs = 20
    # Output folder on Drive; only the part after the last '/' of each model name is used.
    ckpt_dir = '/content/drive/MyDrive/jax and flax/' + src_model_name.split('/')[-1] + '_to_' + tgt_model_name.split('/')[-1]
    # False: evaluate and checkpoint once per epoch; True: every `eval_steps` steps.
    use_eval_steps = False
    eval_steps = 2000

    # Inference
    # Copied by Manager.get_model into the model config as max_length / min_length / num_beams.
    max_length_decoder = 75
    min_length_decoder = 25  ######################################
    beam_size = 3

# Module-level config instance shared by the cells below.
cfg = NMTConfig()

### 4. Training

In [14]:
# # Remove the leftover folder
# import shutil
# shutil.rmtree("/content/bert-base-multilingual-cased_to_bert-base-multilingual-cased")

In [15]:
file_path = "/content/drive/MyDrive/jax and flax/data1000/"


def run_cross_validation(cfg, file_path, k = 1):
    """Train folds 1..k one after another and report which fold scored best.

    Args:
        cfg: configuration object passed to every ``Manager``.
        file_path: prefix of the ``fold_<i>_train.csv`` / ``fold_<i>_valid.csv`` files.
        k: number of folds to run.

    The best score so far is handed to each new ``Manager``, so a later fold
    only counts as an improvement when it beats every earlier fold.
    """
    import gc

    best_bleu_score = 0
    best_fold_index = 0

    for fold_index in range(1, k + 1):
        print(f'Running Fold {fold_index}')
        manager = Manager(cfg, file_path, fold_index, best_bleu_score, is_train=True)
        manager.train()

        bleu_score = manager.best_bleu_score

        # Release this fold's model and datasets before the next fold builds its
        # own; otherwise both would be alive at the same time.
        del manager
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        if bleu_score > best_bleu_score:
            best_bleu_score = bleu_score
            best_fold_index = fold_index

    if best_fold_index == 0:
        # No fold beat the starting score, so there is no best_model_fold_* to point to.
        print(f"No fold improved on the starting BLEU score of {best_bleu_score}; no best model was saved.")
        return

    print(f"Best BLEU score: {best_bleu_score} found in fold {best_fold_index}")
    print(f"Best model saved at: {os.path.join(cfg.ckpt_dir, f'best_model_fold_{best_fold_index}')}")

In [16]:
# Train a single fold (k = 1). The output recorded below ends with a RuntimeError
# raised while checkpoint files were being written.
run_cross_validation(cfg, file_path, k = 1)

Running Fold 1
Loading Tokenizer...


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-multilingual-cased/snapshots/3f076fdb1ab68d5b2880cb87a0886f315b8146f8/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size":

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-multilingual-cased/snapshots/3f076fdb1ab68d5b2880cb87a0886f315b8146f8/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--bert-base-multilingual-cased/snapshots/3f076fdb1ab68d5b2880cb87a0886f315b8146f8/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--bert-base-multilingual-cased/snapshots/3f076fdb1ab68d5b2880cb87a0886f315b8146f8/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-multilingual-cased/snapshots/3f076fdb1ab68d5b2880cb87a0886f315b8146f8/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout"

Loading Model...


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--bert-base-multilingual-cased/snapshots/3f076fdb1ab68d5b2880cb87a0886f315b8146f8/model.safetensors
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassificatio

Loading Metric...


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

Check Save Model Path
Loading Dataset...


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Setting finished.
Training...


***** Running training *****
  Num examples = 810
  Num Epochs = 20
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 4,060
  Number of trainable parameters = 384,194,811
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss,Bleu Score,Gen Len
1,No log,4.085449,1.9443,34.8778
2,No log,3.778748,1.7099,37.5556


***** Running Evaluation *****
  Num examples = 90
  Batch size = 4
Generate config GenerationConfig {
  "decoder_start_token_id": 101,
  "early_stopping": true,
  "eos_token_id": 102,
  "max_length": 75,
  "min_length": 25,
  "no_repeat_ngram_size": 3,
  "num_beams": 3,
  "pad_token_id": 0
}

Non-default generation parameters: {'max_length': 75, 'min_length': 25, 'early_stopping': True, 'num_beams': 3, 'no_repeat_ngram_size': 3}
Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/best_model_fold_1/config.json
Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/best_model_fold_1/generation_config.json


Save model with bleu score:  1.9443


Model weights saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/best_model_fold_1/model.safetensors
Non-default generation parameters: {'max_length': 75, 'min_length': 25, 'early_stopping': True, 'num_beams': 3, 'no_repeat_ngram_size': 3}
Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/config.json
Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/generation_config.json


New best BLEU score: 1.9443. Saving model.


Model weights saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/model.safetensors
Saving model checkpoint to /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/checkpoint-203
Non-default generation parameters: {'max_length': 75, 'min_length': 25, 'early_stopping': True, 'num_beams': 3, 'no_repeat_ngram_size': 3}
Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/checkpoint-203/config.json
Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/checkpoint-203/generation_config.json
Model weights saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/checkpoint-203/model.safetensors
***** Running Evaluation *****
  Num examples = 90
  Batch size = 4
Saving model checkpoint to /content/drive/MyDrive/jax and 

Save model with bleu score:  1.9721


Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/best_model_fold_1/generation_config.json
Model weights saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/best_model_fold_1/model.safetensors
Non-default generation parameters: {'max_length': 75, 'min_length': 25, 'early_stopping': True, 'num_beams': 3, 'no_repeat_ngram_size': 3}
Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/config.json


New best BLEU score: 1.9721. Saving model.


Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/generation_config.json
Model weights saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/model.safetensors


Epoch,Training Loss,Validation Loss,Bleu Score,Gen Len
1,No log,4.085449,1.9443,34.8778
2,No log,3.778748,1.7099,37.5556
3,4.421300,3.689551,1.9721,34.8556
4,4.421300,3.707013,1.9887,37.8889
5,3.000700,3.752311,2.7653,37.1556
6,3.000700,3.975467,2.9905,37.8333
7,3.000700,4.113042,2.692,39.6444
8,2.109700,4.33392,2.3083,35.8111
9,2.109700,4.501572,2.2084,37.5889
10,1.395400,4.639874,2.6743,33.3


Saving model checkpoint to /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/checkpoint-609
Non-default generation parameters: {'max_length': 75, 'min_length': 25, 'early_stopping': True, 'num_beams': 3, 'no_repeat_ngram_size': 3}
Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/checkpoint-609/config.json
Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/checkpoint-609/generation_config.json
Model weights saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/checkpoint-609/model.safetensors
***** Running Evaluation *****
  Num examples = 90
  Batch size = 4
Non-default generation parameters: {'max_length': 75, 'min_length': 25, 'early_stopping': True, 'num_beams': 3, 'no_repeat_ngram_size': 3}
Configuration saved in /content/drive/MyDrive/jax and fla

Save model with bleu score:  1.9887


Model weights saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/best_model_fold_1/model.safetensors
Non-default generation parameters: {'max_length': 75, 'min_length': 25, 'early_stopping': True, 'num_beams': 3, 'no_repeat_ngram_size': 3}
Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/config.json
Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/generation_config.json


New best BLEU score: 1.9887. Saving model.


Model weights saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/model.safetensors
Saving model checkpoint to /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/checkpoint-812
Non-default generation parameters: {'max_length': 75, 'min_length': 25, 'early_stopping': True, 'num_beams': 3, 'no_repeat_ngram_size': 3}
Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/checkpoint-812/config.json
Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/checkpoint-812/generation_config.json
Model weights saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/checkpoint-812/model.safetensors
***** Running Evaluation *****
  Num examples = 90
  Batch size = 4
Non-default generation parameters: {'max_length': 75, 'min

Save model with bleu score:  2.7653


Model weights saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/best_model_fold_1/model.safetensors
Non-default generation parameters: {'max_length': 75, 'min_length': 25, 'early_stopping': True, 'num_beams': 3, 'no_repeat_ngram_size': 3}
Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/config.json
Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/generation_config.json


New best BLEU score: 2.7653. Saving model.


Model weights saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/model.safetensors
Saving model checkpoint to /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/checkpoint-1015
Non-default generation parameters: {'max_length': 75, 'min_length': 25, 'early_stopping': True, 'num_beams': 3, 'no_repeat_ngram_size': 3}
Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/checkpoint-1015/config.json
Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/checkpoint-1015/generation_config.json
Model weights saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/checkpoint-1015/model.safetensors
***** Running Evaluation *****
  Num examples = 90
  Batch size = 4
Non-default generation parameters: {'max_length': 75, 

Save model with bleu score:  2.9905


Model weights saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/best_model_fold_1/model.safetensors
Non-default generation parameters: {'max_length': 75, 'min_length': 25, 'early_stopping': True, 'num_beams': 3, 'no_repeat_ngram_size': 3}


New best BLEU score: 2.9905. Saving model.


Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/config.json
Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/generation_config.json
Model weights saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/model.safetensors
Saving model checkpoint to /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/checkpoint-1218
Non-default generation parameters: {'max_length': 75, 'min_length': 25, 'early_stopping': True, 'num_beams': 3, 'no_repeat_ngram_size': 3}
Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/checkpoint-1218/config.json
Configuration saved in /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/checkpoint-1218/generation_config.json
M

RuntimeError: [enforce fail at inline_container.cc:595] . unexpected pos 1161708736 vs 1161708628

### 5. Evaluate

In [17]:
def load_model(cfg, checkpoint_name):
    """Load the saved tokenizers and a saved model, and choose the inference device.

    Args:
        cfg: config providing ``ckpt_dir``, ``src_lang``, ``tgt_lang``,
            ``src_model_name`` and ``tgt_model_name``.
        checkpoint_name: sub-folder of ``cfg.ckpt_dir`` that holds the model.

    Returns:
        ``(src_tokenizer, tgt_tokenizer, model, device)``; ``device`` is CUDA
        when available, otherwise CPU. The model is not moved to the device here.
    """
    root = cfg.ckpt_dir

    # Tokenizers are stored in "<ckpt_dir>/<lang>_tokenizer_<model name>".
    src_tokenizer = AutoTokenizer.from_pretrained(
        f"{root}/{cfg.src_lang}_tokenizer_{cfg.src_model_name}"
    )
    tgt_tokenizer = AutoTokenizer.from_pretrained(
        f"{root}/{cfg.tgt_lang}_tokenizer_{cfg.tgt_model_name}"
    )

    # Model weights are stored in "<ckpt_dir>/<checkpoint_name>".
    model = EncoderDecoderModel.from_pretrained(f"{root}/{checkpoint_name}")

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    return src_tokenizer, tgt_tokenizer, model, device

In [18]:
from tqdm import tqdm
def inference(text, src_tokenizer, tgt_tokenizer, model, device="cpu", max_length=75, beam_size=5):
    """Translate ``text`` with beam search and return the decoded strings.

    ``max_length`` is used both as the padded/truncated length of the tokenized
    source and as the generation length limit. The model is moved to ``device``
    on every call.

    Returns:
        The list produced by ``tgt_tokenizer.batch_decode`` with special tokens skipped.
    """
    encoded = src_tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    source_ids = encoded.input_ids.to(device)
    source_mask = encoded.attention_mask.to(device)
    model.to(device)

    generation_options = {
        "attention_mask": source_mask,
        "max_length": max_length,
        "early_stopping": True,
        "num_beams": beam_size,
        "length_penalty": 2.0,
    }
    generated = model.generate(source_ids, **generation_options)

    return tgt_tokenizer.batch_decode(generated, skip_special_tokens=True)

def inference_bath(
    texts,
    src_tokenizer,
    tgt_tokenizer,
    model,
    device="cpu",
    max_length=75,
    beam_size=5,
    batch_size=32
    ):
    """Translate a list of texts in mini-batches and return one string per input.

    Args:
        texts: list of source sentences; an empty list yields ``[]``.
        src_tokenizer: tokenizer applied to the source sentences.
        tgt_tokenizer: tokenizer used to decode the generated ids.
        model: model exposing ``to`` and ``generate``.
        device: device the inputs and the model are moved to.
        max_length: padded/truncated source length and generation length limit.
        beam_size: number of beams for generation.
        batch_size: maximum number of sentences per ``generate`` call.

    Returns:
        Decoded predictions in the same order as ``texts``.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    pred_texts = []

    # Without this guard an empty input made batch_size 0 and range() raised.
    if len(texts) == 0:
        return pred_texts
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    batch_size = min(batch_size, len(texts))

    # Moving the model does not depend on the batch, so it is done once.
    model.to(device)

    for start in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[start:start + batch_size]

        inputs = src_tokenizer(
            batch_texts,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
            )

        input_ids = inputs.input_ids.to(device)
        attention_mask = inputs.attention_mask.to(device)

        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            early_stopping=True,
            num_beams=beam_size,
            length_penalty=2.0
        )

        pred_texts.extend(tgt_tokenizer.batch_decode(outputs, skip_special_tokens=True))

        # Return cached GPU blocks between batches.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return pred_texts

In [19]:
class BaseConfig:
    """Minimal config holder: every keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        # Copy each keyword onto the instance under the same name.
        for name in kwargs:
            setattr(self, name, kwargs[name])

class NMTConfig(BaseConfig):
    """Settings used by the evaluation cells; this redefinition replaces the earlier NMTConfig."""
    # Data
    # Language tags; load_model uses them to locate the saved tokenizer folders.
    src_lang = 'en'
    tgt_lang = 'hu'
    src_max_len = 75
    tgt_max_len = 75

    # Model
    src_model_name = "bert-base-multilingual-cased"
    tgt_model_name = "bert-base-multilingual-cased"

    # Training
    load_model_from_path = False
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    learning_rate = 3e-5
    train_batch_size = 16
    eval_batch_size = 8
    num_train_epochs =15
    # NOTE(review): the training config builds this path with `.split('/')[-1]` on each
    # model name; the two expressions only agree for model names without a '/'.
    ckpt_dir = '/content/drive/MyDrive/jax and flax/' + src_model_name + '_to_' + tgt_model_name
    use_eval_steps = False
    eval_steps = 2000

    # Inference
    max_length_decoder = 75
    min_length_decoder = 25
    beam_size = 5

# Rebinds the module-level `cfg` used by the evaluation cells below.
cfg = NMTConfig()

In [24]:
data_path = '/content/drive/MyDrive/jax and flax/data1000/test_data.csv'  # Path to the test_data.csv file
test_df = pd.read_csv(data_path)

# Parallel lists: source sentences (`en` column) and reference translations (`hu` column).
src_texts = test_df['en'].tolist()
tgt_texts = test_df['hu'].tolist()

In [25]:
# Load the saved tokenizers and the model stored under <ckpt_dir>/best_model_fold_1.
src_tokenizer, tgt_tokenizer, model, device = load_model(cfg, checkpoint_name="best_model_fold_1")

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file /content/drive/MyDrive/jax and flax/bert-base-multilingual-cased_to_bert-base-multilingual-cased/best_model_fold_1/config.json
Model config EncoderDecoderConfig {
  "architectures": [
    "EncoderDecoderModel"
  ],
  "decoder": {
    "_name_or_path": "bert-base-multilingual-cased",
    "add_cross_attention": true,
    "architectures": [
      "BertForMaskedLM"
    ],
    "attention_probs_dropout_prob": 0.1,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "classifier_dropout": null,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "directionality": "

In [26]:
# Translate the whole test set in batches; beam_size=2 overrides the function's default of 5.
pred_texts = inference_bath(src_texts, src_tokenizer, tgt_tokenizer, model, device, beam_size=2)

100%|██████████| 4/4 [00:25<00:00,  6.26s/it]


In [27]:
# Corpus-level BLEU of the predictions against a single reference per sentence.
sacrebleu.corpus_bleu(pred_texts, [tgt_texts])

BLEU = 2.13 22.1/5.0/0.8/0.2 (BP = 1.000 ratio = 1.258 hyp_len = 2503 ref_len = 1990)