# Install packages, connect to google drive

In [2]:
# Install required packages
# !pip install protobuf==3.20.3

!pip install transformers datasets sentencepiece sacrebleu evaluate langdetect optuna unbabel-comet

# Install Ray, but note that it may have issues in Colab
!pip install ray[tune]

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting unbabel-comet
  Downloading unbabel_comet-2.2.2-py3-none-any.whl.metadata (15 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64

In [3]:
import os
from datasets import load_dataset, concatenate_datasets, Dataset
from transformers import (MarianMTModel, MarianTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments)
from langdetect import detect, LangDetectException
import evaluate
import torch
from ray import tune

from google.colab import drive
drive.mount('/content/drive')
output_dir = './result'
logging_dir = './logs'


Mounted at /content/drive


# Define the functions that preprocess the dataset

In [4]:
def sentence_length_filter(example, max_diff_ratio=0.5):
    source_sentence = example['translation']['zh']
    target_sentence = example['translation']['en']
    len_src = len(source_sentence)
    len_tgt = len(target_sentence.split())
    length_ratio = abs(len_src - len_tgt) / max(len_src, len_tgt)
    return length_ratio < max_diff_ratio

def preprocess_dataset(dataset, max_diff_ratio=0.5):
    dataset = dataset.filter(lambda example: sentence_length_filter(example, max_diff_ratio))
    return dataset

# Synonyms replacement


In [5]:
import nltk
nltk.download('wordnet')
import random
from nltk.corpus import wordnet

def synonym_replacement(sentence, replacement_prob):
    words = sentence.split()
    new_words = []
    for word in words:
        if random.random() < replacement_prob:
            synonyms = wordnet.synsets(word)
            if synonyms:
                synonym = synonyms[0].lemmas()[0].name()
                new_words.append(synonym)
            else:
                new_words.append(word)
        else:
            new_words.append(word)
    return ' '.join(new_words)


sentence = "This is a test sentence for synonym replacement."
print(synonym_replacement(sentence, 0.5))


def preprocess_with_augmentation(dataset, max_diff_ratio=0.5, augmentation_prob=0.8, augment_times=1):
    dataset = dataset.filter(lambda example: sentence_length_filter(example, max_diff_ratio))

    augmented_examples = []
    for example in dataset['translation']:  # example is dict{'zh': '...', 'en': '...'}, dataset['translation'] is list of dict
        augmented_examples.append(example)
        zh_text = example['zh']
        en_text = example['en']

        augmented_en_text = synonym_replacement(en_text, augmentation_prob)

        augmented_examples.append({'zh': zh_text,'en': augmented_en_text})

    augmented_dataset = Dataset.from_dict({
        'translation': augmented_examples
    })

    return augmented_dataset


[nltk_data] Downloading package wordnet to /root/nltk_data...


This is angstrom test sentence for synonym replacement.


# Load and preprocess the data, prepare the training and validation dataset

In [6]:
def load_and_prepare_datasets(selected_datasets):
    subsets = []
    for ds in selected_datasets:
        name = ds['name']
        config = ds.get('config', None)
        split = ds.get('split', 'train')
        proportion = ds['proportion']

        dataset = load_dataset(name, config, split=split)
        subset_size = int(len(dataset) * proportion)

        if (proportion < 1):
          subset = dataset.select(range(subset_size))
          subset = preprocess_with_augmentation(dataset)
        else:
          subset = preprocess_with_augmentation(dataset)

        subsets.append(subset)

    if subsets:
        print("Merge all selected subsets.")
        combined_dataset = concatenate_datasets(subsets)

        print("Split into training set and validation set.")
        train_testvalid = combined_dataset.train_test_split(test_size=0.1, seed=42)
        train_data = train_testvalid['train']
        valid_data = train_testvalid['test']

        return train_data, valid_data
    else:
        return None, None

selected_datasets = [
    {
        "name": "haoranxu/ALMA-Human-Parallel",
        "config": "zh-en",
        "split": "train",
        "proportion": 1
    },
    {
        "name": "haoranxu/X-ALMA-Parallel-Data",
        "config": "zh-en",
        "split": "train",
        "proportion": 1
    }
]
train_data, valid_data = load_and_prepare_datasets(selected_datasets)


# {'translation': [{'en': '...', 'zh': '...'},
#          {'en': '...', 'zh': '...'}] }
# Check format
if train_data and valid_data:
    print("Training dataset example:")
    print(train_data[0:2])
    print("Validation dataset example:")
    print(valid_data[0])
else:
  print("No training dataset or validation dataset.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/3.03k [00:00<?, ?B/s]

(…)-00000-of-00001-6bd744feceb30dbf.parquet:   0%|          | 0.00/3.06M [00:00<?, ?B/s]

(…)-00000-of-00001-d1cc83e30e3dcdb2.parquet:   0%|          | 0.00/196k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15406 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1002 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15406 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/17.1k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6906 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6906 [00:00<?, ? examples/s]

Merge all selected subsets.
Split into training set and validation set.
Training dataset example:
{'translation': [{'en': 'The statistics Canada state on Friday that 12 of the twenty industries have achieve growth.', 'zh': '加拿大统计局周五称，20个行业中有12个行业实现增长。'}, {'en': 'nourish the liver and the eyes, tonify the liver and the kidney, whiten and nourish the skin, nourish deficiency, produce secretion and delay aging.', 'zh': '益肝明目，滋补肝肾，美白养颜，补虚生津，延缓衰老。'}]}
Validation dataset example:
{'translation': {'en': 'Wai Tau Tsuen boast angstrom alone Marine environment, with two leisure travel resource include natural ecology and cultural ecology.', 'zh': '围头村具有得天独厚的海洋环境，坐拥自然生态和人文生态两大休闲旅游资源。'}}


# Tokenization

In [None]:
def preprocess_function(examples):
    # Extract lists of source and target sentences
    src_texts = [ex['zh'] for ex in examples['translation']]
    tgt_texts = [ex['en'] for ex in examples['translation']]

    # Tokenize the source sentences
    model_inputs = tokenizer(src_texts, truncation=True, padding=True)

    # Tokenize the target sentences using 'text_target'
    labels = tokenizer(text_target=tgt_texts, truncation=True, padding=True)

    # Add labels to the inputs
    model_inputs['labels'] = labels['input_ids']
    return model_inputs


In [None]:
# Load the tokenizer
tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-zh-en')

# Tokenize the datasets
tokenized_train = train_data.map(preprocess_function, batched=True)
tokenized_valid = valid_data.map(preprocess_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



Map:   0%|          | 0/19224 [00:00<?, ? examples/s]

Map:   0%|          | 0/2136 [00:00<?, ? examples/s]

# Define the Data collector & Evaluation function

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=None)

def compute_metrics(eval_preds):
    metric = evaluate.load("sacrebleu")
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    labels = [[label] for label in labels]
    result = metric.compute(predictions=decoded_preds, references=labels)
    return {"bleu": result["score"]}

# Back translation

In [None]:
# Step 1: Extract English sentences from your existing dataset
# Assuming 'train_data' is your original training dataset
# and has the format {'translation': {'zh': ..., 'en': ...}}

monolingual_en = train_data.map(lambda x: {'text': x['translation']['en']})
monolingual_en = monolingual_en.remove_columns(['translation'])

# Step 2: Load the English-to-Chinese translation model and tokenizer
from transformers import MarianMTModel, MarianTokenizer

en_zh_model_name = 'Helsinki-NLP/opus-mt-en-zh'
en_zh_tokenizer = MarianTokenizer.from_pretrained(en_zh_model_name)
en_zh_model = MarianMTModel.from_pretrained(en_zh_model_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
en_zh_model.to(device)

# Step 3: Translate English sentences to Chinese
def back_translate_en_to_zh(examples):
    inputs = en_zh_tokenizer(examples['text'], return_tensors='pt', padding=True, truncation=True).to(device)
    with torch.no_grad():
        translated_tokens = en_zh_model.generate(**inputs)
    translations = en_zh_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    return {'translation_zh': translations}

monolingual_en = monolingual_en.map(back_translate_en_to_zh, batched=True, batch_size=16)

# Step 4: Create synthetic parallel data with matching field order
monolingual_en = monolingual_en.map(lambda x: {'translation': {'zh': x['translation_zh'], 'en': x['text']}})
synthetic_dataset = monolingual_en.remove_columns(['text', 'translation_zh'])

# Step 5: Cast the synthetic_dataset to have the same features as train_data
synthetic_dataset = synthetic_dataset.cast(train_data.features)

# Verify that the field orders match
print(train_data.features)
print(synthetic_dataset.features)

# Expected Output:
# {'translation': Translation(languages=('zh', 'en'), id=None)}
# {'translation': Translation(languages=('zh', 'en'), id=None)}

# Step 6: Combine synthetic data with original training data
combined_train_data = concatenate_datasets([train_data, synthetic_dataset])

# Step 7: Preprocess and tokenize the combined dataset
tokenized_combined_train = combined_train_data.map(preprocess_function, batched=True)
tokenized_valid = valid_data.map(preprocess_function, batched=True)


Map:   0%|          | 0/19224 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/806k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Map:   0%|          | 0/19224 [00:00<?, ? examples/s]

Map:   0%|          | 0/19224 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/19224 [00:00<?, ? examples/s]

{'translation': {'en': Value(dtype='string', id=None), 'zh': Value(dtype='string', id=None)}}
{'translation': {'en': Value(dtype='string', id=None), 'zh': Value(dtype='string', id=None)}}


Map:   0%|          | 0/38448 [00:00<?, ? examples/s]

Map:   0%|          | 0/2136 [00:00<?, ? examples/s]

# Optuna tuning

In [None]:
!pip install optuna



In [None]:
from transformers import EarlyStoppingCallback
import random
import numpy as np
import evaluate

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

def model_init():
    return MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-zh-en')

def hp_space_optuna(trial):
    return {
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 5e-5),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 4),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16]),
        "weight_decay": trial.suggest_uniform("weight_decay", 0.0, 0.1),
    }

def compute_metrics_optuna(eval_preds):

    metric = evaluate.load("sacrebleu")
    tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-zh-en')
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Prepare references for sacrebleu
    decoded_labels = [[label] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,  # Initial value, will be overridden by hyperparameter search
    per_device_train_batch_size=16,  # Initial value, will be overridden
    per_device_eval_batch_size=16,
    weight_decay=0.01,  # Initial value, will be overridden
    save_total_limit=3,
    num_train_epochs=3,  # Initial value, will be overridden
    predict_with_generate=True,
    logging_dir=logging_dir,
    logging_steps=100,
    report_to="none",
    load_best_model_at_end=True,
)

tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-zh-en')
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=None)


trainer = Seq2SeqTrainer(
    model_init = model_init,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_optuna,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

best_run = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    n_trials=6,
    hp_space = hp_space_optuna,
    compute_objective=lambda metrics: metrics["eval_bleu"],
)

print("Best hyperparameters found:")
for key, value in best_run.hyperparameters.items():
    print(f"{key}: {value}")


training_args.learning_rate = best_run.hyperparameters["learning_rate"]
training_args.num_train_epochs = best_run.hyperparameters["num_train_epochs"]
training_args.per_device_train_batch_size = best_run.hyperparameters["per_device_train_batch_size"]
training_args.weight_decay = best_run.hyperparameters["weight_decay"]

trainer.train()
trainer.save_model()


[I 2024-11-10 12:25:50,178] A new study created in memory with name: no-name-e60317dd-a92a-4adf-aa6b-4eb309a4ffba
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 5e-5),
  "weight_decay": trial.suggest_uniform("weight_decay", 0.0, 0.1),


Epoch,Training Loss,Validation Loss,Bleu
1,0.3716,0.342142,20.255826
2,0.3316,0.327033,21.00865


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].
[I 2024-11-10 12:43:32,044] Trial 0 finished with value: 21.008649604615847 and parameters: {'learning_rate': 1.5086681733808356e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'weight_decay': 0.08724462017800183}. Best is trial 0 with value: 21.008649604615847.
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 5e-5),
  "weight_decay": trial.suggest_uniform("we

Epoch,Training Loss,Validation Loss,Bleu
1,0.3717,0.341944,20.210792
2,0.3237,0.317507,21.477363
3,0.3057,0.311233,21.893914


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].
[I 2024-11-10 13:09:22,470] Trial 1 finished with value: 21.893914115587965 and parameters: {'learning_rate': 1.364917552133475e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'weight_decay': 0.005971024169409412}. Best is trial 1 with value: 21.893914115

Epoch,Training Loss,Validation Loss,Bleu
1,0.3298,0.295919,22.756041
2,0.2615,0.27678,23.757643


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].
[I 2024-11-10 13:26:26,475] Trial 2 finished with value: 23.757643288409067 and parameters: {'learning_rate': 3.156663840507841e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 8, 'weight_decay': 0.09191287310105578}. Best is trial 2 with value: 23.757643288409067.
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 5e-5),
  "weight_decay": trial.suggest_uniform("weig

Epoch,Training Loss,Validation Loss,Bleu
1,0.3821,0.352229,20.076112
2,0.3345,0.325444,21.186147
3,0.3126,0.313504,21.809582
4,0.3273,0.310372,21.979585


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].
[I 2024-11-10 14:00:57,160] Trial 3 finished with value: 21.979585292247375 and parameters: {'learning_rate': 1.0271541538742113e-05, 'num_train_

Epoch,Training Loss,Validation Loss,Bleu
1,0.3601,0.330351,20.228904
2,0.3081,0.304964,22.272606
3,0.2874,0.298437,22.537512


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].
[I 2024-11-10 14:27:04,029] Trial 4 finished with value: 22.537512452975424 and parameters: {'learning_rate': 1.7747553615438413e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'weight_decay': 0.08594297980783579}. Best is trial 2 with value: 23.757643288

Epoch,Training Loss,Validation Loss,Bleu
1,0.3167,0.283318,23.454971
2,0.2377,0.260728,24.847312


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].
[I 2024-11-10 14:44:18,144] Trial 5 finished with value: 24.847312184532395 and parameters: {'learning_rate': 4.506030975094382e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 8, 'weight_decay': 0.062179222404497296}. Best is trial 5 with value: 24.847312184532395.


Best hyperparameters found:
learning_rate: 4.506030975094382e-05
num_train_epochs: 2
per_device_train_batch_size: 8
weight_decay: 0.062179222404497296


Epoch,Training Loss,Validation Loss,Bleu
1,0.3167,0.283318,23.454971
2,0.2377,0.260728,24.847312


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
