# 1. Install packages, connect to dirve

In [1]:
# Install required packages
!pip install transformers datasets sentencepiece sacrebleu evaluate langdetect optuna unbabel-comet ray[tune] jieba peft adapter-transformers adapters
!pip install --upgrade peft transformers accelerate
!pip install --upgrade git+https://github.com/huggingface/transformers

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting unbabel-comet
  Downloading unbabel_comet-2.2.2-py3-none-any.whl.metadata (15 kB)
Collecting adapter-transformers
  Downloading adapter_transformers-4.0.0.tar.gz (2.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting adapters
  Downloading adapters-1.0.1-py3-none

In [2]:
import os
from datasets import load_dataset, concatenate_datasets
from transformers import (MarianMTModel, MarianTokenizer, DataCollatorForSeq2Seq,
                          Seq2SeqTrainer, Seq2SeqTrainingArguments)
from langdetect import detect, LangDetectException
import evaluate
import torch
from ray import tune
import random
from peft import PrefixTuningConfig, get_peft_model, PeftModel, TaskType
from tqdm import tqdm
import numpy as np
from google.colab import drive

In [3]:
drive.mount('/content/drive')
output_dir = '/content/drive/MyDrive//full_fine_tuning'
logging_dir = '/content/drive/MyDrive/full_fine_tuning_log'
model_path='/content/drive/MyDrive/final_model'

Mounted at /content/drive


# 2. Filter and preprocess dataset

In [None]:
def sentence_length_filter(example, max_diff_ratio=0.5):
    source_sentence = example['translation']['zh']
    target_sentence = example['translation']['en']
    len_src = len(source_sentence)  # Character count for Chinese
    len_tgt = len(target_sentence.split())  # Word count for English
    length_ratio = abs(len_src - len_tgt) / max(len_src, len_tgt)
    return length_ratio < max_diff_ratio


def preprocess_dataset(dataset, max_diff_ratio=0.5):
    # Apply sentence length filter
    dataset = dataset.filter(lambda example: sentence_length_filter(example, max_diff_ratio))
    return dataset

def excessive_punctuation_filter(example, max_punct_ratio=0.3):
    source_sentence = example['translation']['zh']
    target_sentence = example['translation']['en']

    def punct_ratio(text):
        punct_count = sum(1 for char in text if char in string.punctuation)
        return punct_count / max(len(text), 1)

    if punct_ratio(source_sentence) > max_punct_ratio or punct_ratio(target_sentence) > max_punct_ratio:
        return False
    return True

### Define preprocess_with_augmentation(),and synonym_replacement function

In [None]:
import nltk
nltk.download('wordnet')
import random
from nltk.corpus import wordnet

def synonym_replacement(sentence, replacement_prob):
    words = sentence.split()
    new_words = []
    for word in words:
        if random.random() < replacement_prob:
            synonyms = wordnet.synsets(word)
            if synonyms:
                synonym = synonyms[0].lemmas()[0].name()
                new_words.append(synonym)
            else:
                new_words.append(word)
        else:
            new_words.append(word)
    return ' '.join(new_words)

sentence = "This is a test sentence for synonym replacement."
print(synonym_replacement(sentence, 0.5))

def preprocess_with_augmentation(dataset, max_diff_ratio=0.5, augmentation_prob=0.5, augment_times=1):

    dataset = dataset.filter(lambda example: sentence_length_filter(example, max_diff_ratio))

    augmented_examples = []
    for example in dataset['translation']:
        augmented_examples.append(example)
        zh_text = example['zh']
        en_text = example['en']

        augmented_en_text = synonym_replacement(en_text, augmentation_prob)
        augmented_examples.append({'zh': zh_text,'en': augmented_en_text})
    augmented_dataset = Dataset.from_dict({
        'translation': augmented_examples
    })

    # combined_dataset = concatenate_datasets([dataset, augmented_dataset])
    return augmented_dataset


[nltk_data] Downloading package wordnet to /root/nltk_data...


This is angstrom trial sentence for synonym replacement.


# 3. Load and prepare the training and validation dataset

##Availabel dataset :
'haoranxu/ALMA-Human-Parallel' 16.4k rows,
'haoranxu/X-ALMA-Parallel-Data' 6.9k rows,
opus 1M rows,
wmt19 26M rows

In [None]:
def load_and_prepare_datasets(selected_datasets):
    subsets = []
    for ds in selected_datasets:
        name = ds['name']
        config = ds.get('config', None)
        split = ds.get('split', 'train')
        proportion = ds['proportion']

        print(f"load dataset: {name}, config: {config}, split: {split}, proportion: {proportion}")


        dataset = load_dataset(name, config, split=split)
        subset_size = int(len(dataset) * proportion)

        if (proportion < 1):
          print(f"pick {subset_size} samples")
          subset = dataset.select(range(subset_size))
          subset = preprocess_dataset(subset)
          # subset = preprocess_with_augmentation(dataset)
        else:
          print(f"full dataset，{subset_size} samples")
          subset = preprocess_dataset(dataset)
          # subset = preprocess_with_augmentation(dataset)
          print(f"after preprocessing {len(subset)}")

        subsets.append(subset)

    if subsets:
        combined_dataset = concatenate_datasets(subsets)
        train_testvalid = combined_dataset.train_test_split(test_size=0.1, seed=42)
        train_data = train_testvalid['train']
        valid_data = train_testvalid['test']

        return train_data, valid_data
    else:
        print("no dataset")
        return None, None

selected_datasets = [
    {
        "name": "haoranxu/ALMA-Human-Parallel",
        "config": "zh-en",
        "split": "train",
        "proportion": 1
    },
    {
        "name": "haoranxu/X-ALMA-Parallel-Data",
        "config": "zh-en",
        "split": "train",
        "proportion": 1
    }
    # {
    #     "name": "wmt19",
    #     "config": "zh-en",
    #     "split": "train",
    #     "proportion": 0.001
    # }
    # {
    #     "name": "librakevin/wmt19-short",
    #     "config": "zh-en-50-small",
    #     "split": "train",
    #     "proportion": 1
    # }
]
train_data, valid_data = load_and_prepare_datasets(selected_datasets)


# {'translation': [{'en': '...', 'zh': '...'},
#          {'en': '...', 'zh': '...'}] }
if train_data and valid_data:
    print(train_data[0:2])
    print(valid_data[0])
else:
  print("no data collected")

load dataset: haoranxu/ALMA-Human-Parallel, config: zh-en, split: train, proportion: 1


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/3.03k [00:00<?, ?B/s]

(…)-00000-of-00001-6bd744feceb30dbf.parquet:   0%|          | 0.00/3.06M [00:00<?, ?B/s]

(…)-00000-of-00001-d1cc83e30e3dcdb2.parquet:   0%|          | 0.00/196k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15406 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1002 [00:00<?, ? examples/s]

full dataset，15406 samples


Filter:   0%|          | 0/15406 [00:00<?, ? examples/s]

after preprocessing 10680
load dataset: haoranxu/X-ALMA-Parallel-Data, config: zh-en, split: train, proportion: 1


README.md:   0%|          | 0.00/17.1k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6906 [00:00<?, ? examples/s]

full dataset，6906 samples


Filter:   0%|          | 0/6906 [00:00<?, ? examples/s]

after preprocessing 4376
{'translation': [{'zh': '从深入推进简政放权、建立权力清单制度到营造保护企业家合法权益的法治环境，打造公平竞争的市场环境，塑造全社会尊重企业家的氛围，近年来，党中央对企业家的重视程度、制度保障力度空前。', 'en': 'In recent years, from deepening the promotion of streamline administration and delegate more power to lower-level governments, establishing a power list system, and creating a legal environment that protects the legitimate rights and interests of entrepreneurs, creating a market environment of fair competition and shaping the atmosphere of respecting entrepreneurs in the whole society, the Party Central Committee has attached unprecedented emphasis on entrepreneurs and institutional guarantees.'}, {'zh': '真心推荐大家去宣汉金夫人拍婚纱照！', 'en': 'I sincerely recommend choosing Xuanhan Mrs. Gold to take your wedding dress photoshoot!'}]}
{'translation': {'zh': '韩国9月出口同比降幅为2016年7月以来最大，降幅大于路透调查11位分析师所得预估的下滑5.7 % 。', 'en': 'The year-on-year drop of South Korean exports in September was the largest since July 2016, which was more than the decrease of 5.7% esti

# 4. Tokenization and define Data collector & Evaluation function

In [None]:
def preprocess_function(examples):
    # Extract lists of source and target sentences
    src_texts = [ex['zh'] for ex in examples['translation']]
    tgt_texts = [ex['en'] for ex in examples['translation']]
    # Tokenize the source sentences
    model_inputs = tokenizer(src_texts, truncation=True, padding=True)
    # Tokenize the target sentences using 'text_target'
    labels = tokenizer(text_target=tgt_texts, truncation=True, padding=True)
    # Add labels to the inputs
    model_inputs['labels'] = labels['input_ids']
    return model_inputs
tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-zh-en')

# Tokenize the datasets
tokenized_train = train_data.map(preprocess_function, batched=True)
tokenized_valid = valid_data.map(preprocess_function, batched=True)



metric = evaluate.load("sacrebleu")
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    labels = [[label] for label in labels]
    result = metric.compute(predictions=decoded_preds, references=labels)
    return {"bleu": result["score"]}


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



Map:   0%|          | 0/13550 [00:00<?, ? examples/s]

Map:   0%|          | 0/1506 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

## Data augumentation with back_translation



In [None]:
# Extract English sentences from your existing dataset, Assuming 'train_data' is your original training dataset, and has the format {'translation': {'zh': ..., 'en': ...}}
monolingual_en = train_data.map(lambda x: {'text': x['translation']['en']})
monolingual_en = monolingual_en.remove_columns(['translation'])

# Load the English-to-Chinese translation model and tokenizer
from transformers import MarianMTModel, MarianTokenizer

en_zh_model_name = 'Helsinki-NLP/opus-mt-en-zh'
en_zh_tokenizer = MarianTokenizer.from_pretrained(en_zh_model_name)
en_zh_model = MarianMTModel.from_pretrained(en_zh_model_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
en_zh_model.to(device)

# Translate English sentences to Chinese
def back_translate_en_to_zh(examples):
    inputs = en_zh_tokenizer(examples['text'], return_tensors='pt', padding=True, truncation=True).to(device)
    with torch.no_grad():
        translated_tokens = en_zh_model.generate(**inputs)
    translations = en_zh_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    return {'translation_zh': translations}

monolingual_en = monolingual_en.map(back_translate_en_to_zh, batched=True, batch_size=16)

# Create synthetic parallel data with matching field order
monolingual_en = monolingual_en.map(lambda x: {'translation': {'zh': x['translation_zh'], 'en': x['text']}})
synthetic_dataset = monolingual_en.remove_columns(['text', 'translation_zh'])

# Cast the synthetic_dataset to have the same features as train_data
synthetic_dataset = synthetic_dataset.cast(train_data.features)

# Verify that the field orders match
print(train_data.features)
print(synthetic_dataset.features)

# Combine synthetic data with original training data
combined_train_data = concatenate_datasets([train_data, synthetic_dataset])

# Preprocess and tokenize the combined dataset
tokenized_combined_train = combined_train_data.map(preprocess_function, batched=True)
tokenized_valid = valid_data.map(preprocess_function, batched=True)


Map:   0%|          | 0/13550 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/806k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Map:   0%|          | 0/13550 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

Map:   0%|          | 0/13550 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/13550 [00:00<?, ? examples/s]

{'translation': {'zh': Value(dtype='string', id=None), 'en': Value(dtype='string', id=None)}}
{'translation': {'zh': Value(dtype='string', id=None), 'en': Value(dtype='string', id=None)}}


Map:   0%|          | 0/27100 [00:00<?, ? examples/s]

#5. Set training parameters and training model


## Default training

In [None]:
model_name = 'Helsinki-NLP/opus-mt-zh-en'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Default training
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir=logging_dir,
    logging_steps=100,
    save_strategy='epoch',
    report_to="none",
    load_best_model_at_end=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_combined_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model()

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Bleu
1,0.2506,0.299894,26.63743
2,0.21,0.291919,27.257324
3,0.1896,0.291613,27.631387


model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

# 6. Load the saved model

In [4]:
def load_trained_model(model_path=model_path):
    print(model_path)
    tokenizer = MarianTokenizer.from_pretrained(model_path)
    model = MarianMTModel.from_pretrained(model_path)
    return tokenizer, model

# Load model and tokenizer
tokenizer, model = load_trained_model()

def count_parameters(model):
    return sum(p.numel() for p in model.parameters())

total_params = count_parameters(model)
print(f"total number of parameters: {total_params}") #total number of parameters in model

/content/drive/MyDrive/final_model




total number of parameters: 77943296


# 7. Testing

In [6]:
from datasets import Dataset

tatoeba_zh_path = '/content/drive/MyDrive/tatoeba.zh'
tatoeba_en_path = '/content/drive/MyDrive/tatoeba.en'
wmt_zh_path = '/content/drive/MyDrive/wmttest2022.zh'
wmt_en_path = '/content/drive/MyDrive/wmttest2022.AnnA.en'


with open(tatoeba_zh_path, 'r', encoding='utf-8') as f_zh, open(tatoeba_en_path, 'r', encoding='utf-8') as f_en:
    tatoeba_zh = f_zh.readlines()
    tatoeba_en = f_en.readlines()

with open(wmt_zh_path, 'r', encoding='utf-8') as f_zh, open(wmt_en_path, 'r', encoding='utf-8') as f_en:
    wmt_zh = f_zh.readlines()
    wmt_en = f_en.readlines()

tatoeba_data = [{'translation': {'zh': zh.strip(), 'en': en.strip()}} for zh, en in zip(tatoeba_zh, tatoeba_en)]
wmt_data = [{'translation': {'zh': zh.strip(), 'en': en.strip()}} for zh, en in zip(wmt_zh, wmt_en)]

tatoeba_dataset = Dataset.from_list(tatoeba_data)
wmt_dataset = Dataset.from_list(wmt_data)

def preprocess_test_data(examples):
    src_texts = [ex['zh'] for ex in examples['translation']]
    tgt_texts = [ex['en'] for ex in examples['translation']]
    return {'src_texts': src_texts, 'tgt_texts': tgt_texts}

tatoeba_dataset = tatoeba_dataset.map(preprocess_test_data, batched=True)
wmt_dataset = wmt_dataset.map(preprocess_test_data, batched=True)
tatoeba_test_texts = tatoeba_dataset['src_texts']
tatoeba_references = [[ref] for ref in tatoeba_dataset['tgt_texts']]
wmt_test_texts = wmt_dataset['src_texts']
wmt_references = [[ref] for ref in wmt_dataset['tgt_texts']]


Map:   0%|          | 0/469 [00:00<?, ? examples/s]

Map:   0%|          | 0/1875 [00:00<?, ? examples/s]

In [7]:
def generate_translations(tokenizer, model, texts, batch_size=16):
    translations = []
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    #device = torch.device('cpu')
    model.to(device)

    # Initialize tqdm progress bar
    with tqdm(total=len(texts), desc="Generating Translations", unit="text") as pbar:
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]
            # Tokenize the input texts
            inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True).to(device)
            # Generate translations
            with torch.no_grad():
                translated_tokens = model.generate(**inputs)
            # Decode the tokens to strings
            batch_translations = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
            translations.extend(batch_translations)

            # Update the progress bar by the batch size
            pbar.update(len(batch_texts))

    return translations

In [8]:
tatoeba_translated_texts = generate_translations(tokenizer, model, tatoeba_test_texts)

Generating Translations: 100%|██████████| 469/469 [00:13<00:00, 35.87text/s]


In [9]:
wmt_translated_texts = generate_translations(tokenizer, model, wmt_test_texts[0:335])

Generating Translations: 100%|██████████| 335/335 [00:33<00:00, 10.01text/s]


# 8. BLEU Score

In [10]:
## Calculate BLEU score
bleu = evaluate.load('sacrebleu')
tatoeba_results = bleu.compute(predictions = tatoeba_translated_texts, references = tatoeba_references)
wmt_results = bleu.compute(predictions = wmt_translated_texts, references = wmt_references[0:335])
print(f"BLEU score of tatoeba: {tatoeba_results['score']:.2f}")
print(f"BLEU score of wmt: {wmt_results['score']:.2f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

BLEU score of tatoeba: 35.77
BLEU score of wmt: 31.48


# 9. COMET score

In [11]:
from comet import download_model, load_from_checkpoint

# Download and load a COMET model
model_path = download_model("Unbabel/wmt22-comet-da")
comet_model = load_from_checkpoint(model_path)

# Prepare data for COMET
tatoeba_comet_data = [{
    'src': src,
    'mt': mt,
    'ref': ref[0]
} for src, mt, ref in zip(tatoeba_test_texts, tatoeba_translated_texts, tatoeba_references)]

wmt_comet_data = [{
    'src': src,
    'mt': mt,
    'ref': ref[0]
} for src, mt, ref in zip(wmt_test_texts, wmt_translated_texts, wmt_references)]

# Compute COMET scores
tatoeba_comet_scores = comet_model.predict(tatoeba_comet_data, batch_size=8, gpus=1 if torch.cuda.is_available() else 0)
wmt_comet_scores = comet_model.predict(wmt_comet_data, batch_size=8, gpus=1 if torch.cuda.is_available() else 0)

average_comet_score_tatoeba = sum(tatoeba_comet_scores['scores']) / len(tatoeba_comet_scores['scores'])
average_comet_score_wmt = sum(wmt_comet_scores['scores']) / len(wmt_comet_scores['scores'])

print(f"COMET score of tatoeba: {average_comet_score_tatoeba:.4f}")
print(f"COMET score of wmt: {average_comet_score_wmt:.4f}")

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.53k [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/9.69k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 59/59 [00:04<00:00, 11.81it/s]
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 42/42 [00:08<00:00,  4.80it/s]


COMET score of tatoeba: 0.8697
COMET score of wmt: 0.8072


# 10. Example

In [12]:
def translate_sentence(tokenizer, model, sentence):
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    # Tokenize the input text
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)
    # Generate translation
    with torch.no_grad():
        translated_tokens = model.generate(**inputs)
    # Decode the tokens to a string
    translation = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    return translation

# Example usage
chinese_sentence = "很高兴认识你"
english_translation = translate_sentence(tokenizer, model, chinese_sentence)
print(f"Chinese: {chinese_sentence}")
print(f"English Translation: {english_translation}")


Chinese: 很高兴认识你
English Translation: Nice to meet you.
