In [1]:
import json
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
from sacrebleu import corpus_bleu
from nltk.translate.meteor_score import meteor_score
nltk.download('wordnet')  # Required for METEOR score

# Load JSON configurations
CONFIG_FILE = "evaluationConfig.json"
EVAL_RESULTS_FILE = "evaluationResults.txt"

def load_configs(config_file):
    with open(config_file, "r") as f:
        configs = json.load(f)
    return configs

# Load the pre-trained model and tokenizer
def load_model(model_name):
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    return model, tokenizer, device

# Preprocess test data
def preprocess_data(test_data, tokenizer):
    sources = [">>jpn<< " + example['src'] for example in test_data]
    references = [[example['trg']] for example in test_data]
    return sources, references

# Generate translations using the model with dynamic parameters
def generate_translations(model, tokenizer, sources, config, device):
    translations = []
    for source in sources:
        inputs = tokenizer(source, return_tensors="pt", truncation=True, padding=True).to(device)

        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        
        # Ensure only provided parameters are passed
        generate_args = {key: value for key, value in config.items() if hasattr(model.config, key)}
        outputs = model.generate(**inputs, **generate_args)
        
        translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translations.append(translation)
    return translations

# Evaluate the model
def evaluate_model(model, tokenizer, test_data, config, device):
    sources, references = preprocess_data(test_data, tokenizer)
    translations = generate_translations(model, tokenizer, sources, config, device)

    bleu_score = corpus_bleu(translations, references).score
    
    tokenized_translations = [trans.split() for trans in translations]
    tokenized_references = [[ref.split() for ref in ref_list] for ref_list in references]


    meteor_scores = [
        max(meteor_score([ref], trans) for ref in ref_list)
        for ref_list, trans in zip(tokenized_references, tokenized_translations)
    ]
    avg_meteor_score = sum(meteor_scores) / len(meteor_scores)
    
    return bleu_score, avg_meteor_score


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Matt\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:

# Load configurations
configs = load_configs(CONFIG_FILE)

# Load dataset
data = load_dataset("NilanE/ParallelFiction-Ja_En-100k", split="train")
dataset = data.train_test_split(test_size=0.1, seed=42)
test_data = dataset['test'].shuffle(seed=42).select(range(100))


In [None]:
# Load models
print("Loading models...")
original_model, tokenizer, device = load_model("Helsinki-NLP/opus-mt-mul-en")
fine_tuned_model, _, _ = load_model("fine_tuned_model")

Loading models...




: 

In [None]:
# Run evaluation for each configuration
print("Evaluating configurations...")
with open(EVAL_RESULTS_FILE, "a", encoding="utf-8") as f:
    for i, config in enumerate(configs):
        # Prepare a string listing the parameters
        params_str = ", ".join(f"{k}={v}" for k, v in config.items())

        print(f"Evaluating Configuration {i + 1}...")
        print(params_str)

        # Evaluate original model
        bleu_orig, meteor_orig = evaluate_model(original_model, tokenizer, test_data, config, device)
        # Evaluate fine-tuned model
        bleu_finetuned, meteor_finetuned = evaluate_model(fine_tuned_model, tokenizer, test_data, config, device)

        # Write results in desired format
        f.write(f"{params_str}\n")
        f.write("    - Helsinki-NLP/opus-mt-mul-en\n")
        f.write(f"        - BLEU Score: {bleu_orig:.5f}\n")
        f.write(f"        - METEOR Score: {meteor_orig:.5f}\n\n")

        f.write("    - Fine-tuned Model\n")
        f.write(f"        - BLEU Score: {bleu_finetuned:.5f}\n")
        f.write(f"        - METEOR Score: {meteor_finetuned:.5f}\n\n")

        f.write("=" * 50 + "\n\n")

        # Optionally, also print to console
        print(f"Evaluating Configuration {i + 1}...")
        print(params_str)
        print("  - Helsinki-NLP/opus-mt-mul-en")
        print(f"    - BLEU Score: {bleu_orig:.5f}")
        print(f"    - METEOR Score: {meteor_orig:.5f}")
        print("  - Fine-tuned Model")
        print(f"    - BLEU Score: {bleu_finetuned:.5f}")
        print(f"    - METEOR Score: {meteor_finetuned:.5f}")
        print("=" * 50 + "\n")

Evaluating configurations...
Evaluating Configuration 1...
max_new_tokens=512, num_beams=2, do_sample=True, top_k=50, top_p=0.9, temperature=1.0
Evaluating Configuration 1...
max_new_tokens=512, num_beams=2, do_sample=True, top_k=50, top_p=0.9, temperature=1.0
  - Helsinki-NLP/opus-mt-mul-en
    - BLEU Score: 0.00000
    - METEOR Score: 0.00816
  - Fine-tuned Model
    - BLEU Score: 44.98160
    - METEOR Score: 0.11698

Evaluating Configuration 2...
max_new_tokens=128, num_beams=4, early_stopping=True
Evaluating Configuration 2...
max_new_tokens=128, num_beams=4, early_stopping=True
  - Helsinki-NLP/opus-mt-mul-en
    - BLEU Score: 0.00000
    - METEOR Score: 0.00473
  - Fine-tuned Model
    - BLEU Score: 20.90445
    - METEOR Score: 0.10118

Evaluating Configuration 3...
max_new_tokens=256, num_beams=4, no_repeat_ngram_size=3, repetition_penalty=1.2, length_penalty=1.0
