In [21]:
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset
import torch
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import nltk
from sacrebleu import corpus_bleu
from nltk.translate.meteor_score import meteor_score
nltk.download('wordnet')  # Required for METEOR score
import random
import os

# Load the pre-trained model and tokenizer
model_name = "Helsinki-NLP/opus-mt-mul-en"
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
original_model.to(device)


# Function to preprocess test data
def preprocess_data(test_data, tokenizer):
    sources = []
    references = []
    for example in test_data:
        sources.append(">>jpn<< " + example['src'])
        references.append([example['trg']])  # Wrap in a list for sacrebleu compatibility
    
    return sources, references

# Function to generate translations using the model
def generate_translations(model, tokenizer, sources):
    translations = []
    for source in sources:
        inputs = tokenizer(source, return_tensors="pt", truncation=True, padding=True).to(device)

        torch.cuda.empty_cache()
        torch.cuda.synchronize()

        # outputs = model.generate(
        #     **inputs,
        #     max_length=512,
        #     no_repeat_ngram_size=3,
        #     repetition_penalty=0.6,
        #     num_beams=4,
        #     early_stopping=True,
        #     temperature=0    
        # )

        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            num_beams=4,
            early_stopping=True,
            temperature=0
        )
        
        translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translations.append(translation)
    return translations

def evaluate_model(model, tokenizer, test_data):
    # Preprocess data
    sources, references = preprocess_data(test_data, tokenizer)
    translations = generate_translations(model, tokenizer, sources)

    # BLEU score (using raw text)
    bleu_score = corpus_bleu(translations, references).score

    # Tokenize translations and references for METEOR
    tokenized_translations = [trans.split() for trans in translations]
    tokenized_references = [[ref.split() for ref in ref_list] for ref_list in references]

    # METEOR score
    meteor_scores = [
        max(meteor_score([ref], trans) for ref in ref_list)
        for ref_list, trans in zip(tokenized_references, tokenized_translations)
    ]
    avg_meteor_score = sum(meteor_scores) / len(meteor_scores)

    return bleu_score, avg_meteor_score



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Matt\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [22]:
data = load_dataset("NilanE/ParallelFiction-Ja_En-100k", split="train")

dataset = data.train_test_split(test_size=0.1, seed=42)
train_data = dataset['train']
test_data = dataset['test']

In [23]:
test_data = test_data.shuffle(seed=42).select(range(100))

original_model = original_model
bleu, meteor = evaluate_model(original_model, tokenizer, test_data)

print("Original Model")
print(f"BLEU Score: {bleu:.2f}")
print(f"METEOR Score: {meteor:.2f}")

finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("fine_tuned_model").to(device)
bleu, meteor = evaluate_model(finetuned_model, tokenizer, test_data)
print("Fine-tuned Model")
print(f"BLEU Score: {bleu:.2f}")
print(f"METEOR Score: {meteor:.2f}")




Original Model
BLEU Score: 0.00
METEOR Score: 0.01
Fine-tuned Model
BLEU Score: 52.82
METEOR Score: 0.13
