mBART50 - BLEU and COMET

Generate translation

In [None]:
import os
import json
from datasets import load_dataset
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import torch
from evaluate import load

# Test data: load_dataset('csv', ...) is read back through test_dataset['train'] further down
TEST_CSV_PATH = 'ko_zh_test_dataset_SWRC.csv'
test_dataset = load_dataset('csv', data_files=TEST_CSV_PATH)

# Tokenizer of the pretrained mBART-50 many-to-many model; it is shared by every checkpoint
PRETRAINED_TOKENIZER_NAME = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(PRETRAINED_TOKENIZER_NAME)

# Define the translation function
def generate_translation(examples, model, batch_size=16, max_length=200):
    """Translate the sentences in ``examples['source']`` into Chinese (``zh_CN``).

    Args:
        examples: Mapping with a ``'source'`` entry holding the sentences to translate.
        model: Object providing ``generate(...)`` and a ``device`` attribute.
        batch_size: Number of sentences passed to ``model.generate`` per call.
        max_length: Truncation limit applied when the inputs are tokenized.

    Returns:
        ``{"generated_text": [...]}`` with one decoded translation per input
        sentence, in input order. An empty input yields an empty list.
    """
    sentences = list(examples['source'])
    # Nothing to translate: return early instead of handing the tokenizer an empty batch.
    if not sentences:
        return {"generated_text": []}

    target_language_code = "zh_CN"
    # NOTE(review): the language tag and </s> are written into the text itself and
    # tokenizer.src_lang is not set in the visible code - confirm this matches the
    # preprocessing used when the checkpoints were fine-tuned before changing it.
    formatted_inputs = [f"ko_KR {sentence} </s>" for sentence in sentences]
    encoded_inputs = tokenizer(
        formatted_inputs,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )
    input_ids = encoded_inputs['input_ids']
    attention_mask = encoded_inputs['attention_mask']

    # The forced first decoder token is the same for every batch, so look it up once.
    forced_bos_token_id = tokenizer.lang_code_to_id[target_language_code]

    # Generate in slices of batch_size rows; the last slice may be shorter.
    all_generated_texts = []
    for start_idx in range(0, len(input_ids), batch_size):
        end_idx = start_idx + batch_size

        # Move only the current slice to the device the model lives on
        input_ids_batch = input_ids[start_idx:end_idx].to(model.device)
        attention_mask_batch = attention_mask[start_idx:end_idx].to(model.device)

        # Inference only: no gradients are needed
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=input_ids_batch,
                attention_mask=attention_mask_batch,
                forced_bos_token_id=forced_bos_token_id,
            )

        # Decode the generated ids to text, dropping special tokens
        all_generated_texts.extend(
            tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        )

    return {"generated_text": all_generated_texts}

# Specify the checkpoint to process
checkpoint_dir = '/home/u542596/experiments/bilingual_fine_tune/SWRC'
# os.listdir returns entries in arbitrary order; sort so runs are reproducible
checkpoints = sorted(name for name in os.listdir(checkpoint_dir) if name.startswith("checkpoint-"))

# GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

batch_size = 16

# Read the source column once instead of re-reading it from the dataset on every iteration
source_sentences = list(test_dataset['train']['source'])
num_sentences = len(source_sentences)

# Process each checkpoint: translate the whole test set and write one translation per line
for checkpoint_name in checkpoints:
    checkpoint_path = os.path.join(checkpoint_dir, checkpoint_name)
    model = MBartForConditionalGeneration.from_pretrained(checkpoint_path).to(device)

    output_file = f'generated_translation_{checkpoint_name}_SWRC.txt'
    with open(output_file, 'w', encoding='utf-8') as f:
        for start_idx in range(0, num_sentences, batch_size):
            # The final batch may hold fewer than batch_size sentences
            batch = source_sentences[start_idx:start_idx + batch_size]
            generated_translation = generate_translation({"source": batch}, model, batch_size=batch_size)
            for translation in generated_translation['generated_text']:
                f.write(translation + '\n')

            processed = start_idx + len(batch)
            # Report each time another multiple of 100 sentences has been passed.
            # Testing for an exact multiple of 100 at a batch boundary would only
            # fire when the count is also a multiple of batch_size.
            if processed // 100 > start_idx // 100:
                print(f"Processed {processed} sentences for {checkpoint_name}")

    print(f"Finished generating translations for {checkpoint_name}")

BLEU

In [None]:
# character-based segmentation (both reference and generated translation)
import pandas as pd

input_file = '/home/u542596/experiments/bilingual_fine_tune/BLEU_and_COMET/generated_translation_checkpoint_16030.txt'
output_file = '/home/u542596/experiments/bilingual_fine_tune/BLEU_and_COMET/generated_translation_checkpoint_16030_seg.txt'

# Load the translations: one list entry per line, line endings kept
with open(input_file, encoding='utf-8') as infile:
    sentences = list(infile)

# character segment function
def char_tokenize(text):
    """Strip surrounding whitespace from ``text`` and put one space between every remaining character."""
    stripped = text.strip()
    # A str is already an iterable of its characters, so it can be joined directly
    return ' '.join(stripped)

# character-based segment every sentence
char_segmented_sentences = [char_tokenize(sentence) for sentence in sentences]

# Save as plain text, one segmented sentence per line.
# A CSV writer is deliberately avoided here: CSV quoting wraps any line that
# contains a comma or a double quote in extra quote characters, which would
# change the text that the BLEU script later reads.
with open(output_file, 'w', encoding='utf-8') as outfile:
    outfile.writelines(f"{segmented}\n" for segmented in char_segmented_sentences)

In [None]:
# Download the BLEU script (compute-bleu.py) from the ymoslem/MT-Evaluation repository.
# The leading "!" runs wget as a shell command from the notebook.
!wget https://raw.githubusercontent.com/ymoslem/MT-Evaluation/main/BLEU/compute-bleu.py

In [None]:
# Install sacrebleu
# NOTE(review): a bare `pip install` in a notebook cell presumably relies on
# IPython automagic; `%pip install sacrebleu` is the explicit form - confirm.
pip install sacrebleu

In [None]:
# BLEU: score the character-segmented translations against the character-segmented reference
# (first argument: reference file, second argument: translation file).
# The leading "!" runs the command in a shell; without it the cell is parsed as Python and fails.
!python compute-bleu.py ko_zh_test_dataset_SWRC_seg.txt  generated_translation_checkpoint_16030_seg.txt

COMET

In [None]:
# Do not use the segmented translation
# COMET
# download_model / load_from_checkpoint are provided by the COMET package
from comet import download_model, load_from_checkpoint

model_path = download_model("Unbabel/wmt22-comet-da")
model = load_from_checkpoint(model_path)

# Read the source sentences, the machine translations and the references (one sentence per line)
# NOTE(review): the reference filename ends in ".txt.txt" - confirm that this is the real name on disk.
with open('korean_original_SWRC_test.txt', 'r', encoding='utf-8') as src_file, \
     open('generated_translation_checkpoint_16030.txt', 'r', encoding='utf-8') as mt_file, \
     open('chinese_original_SWRC_test.txt.txt', 'r', encoding='utf-8') as ref_file:

    src_lines = src_file.readlines()
    mt_lines = mt_file.readlines()
    ref_lines = ref_file.readlines()

# zip() stops at the shortest input, so a length mismatch would silently drop
# sentences and score only part of the test set. Fail loudly instead.
if not (len(src_lines) == len(mt_lines) == len(ref_lines)):
    raise ValueError(
        "Line counts differ: "
        f"src={len(src_lines)}, mt={len(mt_lines)}, ref={len(ref_lines)}"
    )

# Create data: one dict per sentence with the keys "src", "mt" and "ref"
data = [
    {
        "src": src.strip(),
        "mt": mt.strip(),
        "ref": ref.strip()
    }
    for src, mt, ref in zip(src_lines, mt_lines, ref_lines)
]

# Sentence-level COMET; use one GPU when available, otherwise run on CPU
num_gpus = 1 if torch.cuda.is_available() else 0
model_output = model.predict(data, batch_size=8, gpus=num_gpus)

# Set output dir
output_dir = 'comet'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "comet-score_generated_translation_checkpoint_16030.txt")

# Write the per-sentence scores (0-based index) followed by the system-level score
with open(output_file, 'w', encoding='utf-8') as f:
    for i, score in enumerate(model_output["scores"]):
        f.write(f"Sentence {i}: {score}\n")
    f.write(f"Overall COMET Score: {model_output['system_score']}\n")