In [None]:
# Mount Google Drive at /content/drive; later cells read data and models from paths under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# File structure
'''mt5_HuggingFace/
├── clean/
│   ├── es-AR/
│   ├── es-CL/
│   └── ...
└── mt5/
    ├── mt5_finetune.ipynb
    └── my_saved_mt5_model/'''

'mt5_HuggingFace/\n├── clean/\n│   ├── es-AR/\n│   ├── es-CL/\n│   └── ...\n└── mt5/\n    ├── mt5_finetune.ipynb\n    └── my_saved_mt5_model/'

# Instructions to Run This Notebook (Using Pre-trained Model)

These instructions will guide you through running the notebook to use the *already saved* pre-trained MT5 model for translation, skipping the training steps to save time.

### 1. Data and Notebook Access
*   **Share the Saved Model:** Make sure you have access to the `mt5` folder, which contains both the saved model and this notebook; it is also the default folder the model is saved to.
*   **Original Data (Optional):** The original data folder (`clean`) is only needed if you intend to train the model. __In that case you also need to modify the data path.__

### 2. Everything Runs in Google Colab

### 3. Mount Google Drive

### 4. Verify Model Path
*   Ensure that the `base_save_dir` variable points to the folder that holds the saved models in Google Drive. Based on __my__ steps, this is `/content/drive/MyDrive/CS4120/mt5/my_saved_mt5_model`; each dialect's model lives in its own sub-folder of it (for example `es-CL`).

### 5. Install Required Libraries

### 6. Set Up GPU Runtime

### 7. Run Necessary Cells in Order
*   Since you're using a pre-trained model, you will skip the entire training process.
*   **Minimum cells to run:**
    *   **Mount Drive**
    *   **Load Model & Tokenizer:** This cell should look something like this in my path:
        ```python
        model_path = "/content/drive/MyDrive/CS4120/mt5/my_saved_mt5_model/es-CL"
        tokenizer = T5TokenizerFast.from_pretrained(model_path)
        model = MT5ForConditionalGeneration.from_pretrained(model_path)
        print(f"Model and tokenizer loaded from: {model_path}")
        ```
    *   **Define `translate_mt5` function**
    *   **Define decoding configs**
    *   **Run translation examples**


### 8. View Output
*   The translation outputs will be printed directly below the relevant cells.

## Overall Notebook Logic and Process Flow

This notebook fine-tunes a pre-trained mT5 model for English-to-Spanish machine translation, targeting the dialectal variations found in GNOME project data.

1.  **Data Loading and Preparation**:
    *   **Source Data**: The cleaned English–Spanish sentence pairs are loaded into Hugging Face `Dataset` objects.
    *   **Dataset Addition**: Multiple dialectal datasets can be loaded and concatenated into a single `all_pairs`.
    *   **Train/Validation Split**

2.  **Model and Tokenizer Initialization**:
    *   **Base Model**: A pre-trained `google/mt5-small` model and its corresponding `T5TokenizerFast` are loaded from Hugging Face Hub. mT5 is the multilingual variant of T5 (Text-to-Text Transfer Transformer), an encoder-decoder model suitable for translation tasks.
    *   **Task Prefix**: A `task_prefix` ("translate English to Spanish: ") is defined.
    *   **Tokenization**: A `preprocess_batch` function is defined to tokenize both the English source and Spanish target sentences. __It also adds the task prefix to the English input.__

3.  **Model Training**:
    *   **Data Collator**: `DataCollatorForSeq2Seq`
    *   **Training Arguments**: `Seq2SeqTrainingArguments`
    *   **Trainer Setup**: A `Seq2SeqTrainer`
    *   **Training Execution**: `trainer.train()`

4.  **Model saved to a specified directory on Google Drive**

5.  **Inference and Decoding Strategies**:
    *   **`translate_mt5` Function**: performs translations. It takes an English text, the model, and tokenizer, along with various decoding parameters.
        *   **Greedy Decoding**: Selects the most probable token at each step.
        *   **Beam Search**: Keeps track of multiple probable sequences to find a globally better translation.
        *   **Length Penalty**: Adjusts the likelihood of longer or shorter sequences.


云端硬盘挂载成功后，请提供您要加载的数据文件的完整路径（例如，`/content/drive/My Drive/your_folder/your_file.csv`），我将帮助您将其加载到 pandas DataFrame 中。

# Data Retrieval

In [None]:
import os
#from datasets import Dataset
from transformers import MT5ForConditionalGeneration, T5TokenizerFast
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import os
import shutil # Import shutil for directory deletion
import re
from datasets import Dataset
from transformers import (
    MT5ForConditionalGeneration,
    T5TokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)



In [None]:
# Root folder of the cleaned parallel corpus on Google Drive; change this to match your own Drive layout.
data_path = "/content/drive/MyDrive/CS4120/clean"
# Sanity check: list what is stored directly under the corpus root.
print("Folders:", os.listdir(data_path))

Folders: ['.DS_Store', 'es-CL', 'std_es', 'es-VE', 'es-HN', 'es-PR', 'es-UY', 'es-AR', 'es-CO', 'es-CR', 'es-PA', 'es-EC', 'es-PE', 'es-NI', 'es-DO', 'es-SV']


In [None]:
# Sentences whose cleaned form has MAX_LENGTH or more whitespace-separated tokens are filtered out
# by the data-loading cell.
MAX_LENGTH = 20

# The non-ASCII characters are written as \u escapes so the patterns cannot be corrupted by a
# re-encoding of this file:
#   \u00e1 \u00e9 \u00ed \u00f3 \u00fa \u00f1  -> lowercase a/e/i/o/u with acute accent, n with tilde
#   \u00c1 \u00c9 \u00cd \u00d3 \u00da \u00d1  -> the same letters in uppercase
#   \u00bf \u00a1                              -> inverted question mark / inverted exclamation mark
_PUNCTUATION_RE = re.compile(r"([.!?\u00bf\u00a1,])")
_DISALLOWED_RE = re.compile(
    r"[^a-zA-Z\u00e1\u00e9\u00ed\u00f3\u00fa\u00f1\u00c1\u00c9\u00cd\u00d3\u00da\u00d1.!?\u00bf\u00a1,]+"
)
_WHITESPACE_RE = re.compile(r"\s+")


def normalizeString(s: str) -> str:
    """
    Normalize one sentence for training/evaluation.

    Steps: lowercase and trim, insert a space before each punctuation mark
    (. ! ? , and the inverted ? / !), replace every run of characters that are
    neither letters (ASCII or Spanish accented) nor that punctuation with a
    single space, then collapse whitespace.
    The data cleaning process is intended to match the RNN notebook.

    Parameters
    ----------
    s: str - raw sentence

    Returns
    -------
    str - cleaned sentence; may be empty if nothing survives the filter
    """
    s = s.lower().strip()
    # Put a space in front of punctuation so each mark becomes its own token
    s = _PUNCTUATION_RE.sub(r" \1", s)
    # Keep only letters (including Spanish accents) and basic punctuation
    s = _DISALLOWED_RE.sub(" ", s)
    # Collapse multiple spaces
    return _WHITESPACE_RE.sub(" ", s).strip()

In [None]:
# Load and clean the parallel data of every dialect folder under `data_path`.
# Result: region_data[<folder name>] -> list of {"en": ..., "es": ...} dicts.
region_data = {}

if not os.path.isdir(data_path):
    # Make a wrong/missing Drive path visible instead of silently loading nothing
    print(f"Warning: data path '{data_path}' does not exist; no dialect data was loaded.")
else:
    for folder_name in sorted(os.listdir(data_path)):
        folder_full_path = os.path.join(data_path, folder_name)

        # Only process folders like es-AR, es-CO, ...
        # (anything else, e.g. `.DS_Store` or `std_es`, does not match and is skipped)
        if not (os.path.isdir(folder_full_path) and folder_name.startswith("es-")):
            continue

        path_en = os.path.join(folder_full_path, "all.en")
        path_es = os.path.join(folder_full_path, "all.es")
        if not (os.path.exists(path_en) and os.path.exists(path_es)):
            continue

        with open(path_en, "r", encoding="utf-8") as f:
            lines_en = f.read().strip().split("\n")

        with open(path_es, "r", encoding="utf-8") as f:
            lines_es = f.read().strip().split("\n")

        # The two files are aligned line by line; a different line count means the
        # pairing cannot be trusted, so the region is reported and left out.
        if len(lines_en) != len(lines_es):
            print(
                f"Warning: skipping {folder_name}: all.en has {len(lines_en)} lines "
                f"but all.es has {len(lines_es)} lines."
            )
            continue

        # cleaned + filtered en-es pairs for this dialect
        cleaned_pairs = []
        for en, es in zip(lines_en, lines_es):
            # both sides are normalized the same way
            clean_en = normalizeString(en)
            clean_es = normalizeString(es)

            # drop the pair if either side is empty after cleaning
            # (this also covers lines that were blank to begin with)
            if not clean_en or not clean_es:
                continue

            # length filter
            if len(clean_en.split()) < MAX_LENGTH and len(clean_es.split()) < MAX_LENGTH:
                cleaned_pairs.append({"en": clean_en, "es": clean_es})

        region_data[folder_name] = cleaned_pairs  # cleaned pairs per region

print("Loaded regions that meet the requirements:", list(region_data.keys()))

Loaded regions that meet the requirements: ['es-AR', 'es-CL', 'es-CO', 'es-CR', 'es-DO', 'es-EC', 'es-HN', 'es-NI', 'es-PA', 'es-PE', 'es-PR', 'es-SV', 'es-UY', 'es-VE']


In [None]:
# Translation is treated as a text-to-text problem: a conditional language model
# P({Spanish dialect} | {English}).
#   input:  en_sentence
#   output: es_sentence
# This cell names the multilingual translation checkpoint and loads the tokenizer used to prepare text.
# Training maximizes the log-likelihood of the target sequence tokens (token-level cross-entropy).
# Encoder: multiple layers of self-attention + feed-forward networks produce contextual embeddings
#   from model_inputs["input_ids"] and model_inputs["attention_mask"].
# Decoder: masked self-attention over previously generated target tokens, cross-attention over the
#   encoder outputs, then a final linear layer + softmax over the vocabulary to get P(y_t | y_{<t}, x);
#   it is supervised by model_inputs["labels"].
model_name = "google/mt5-small"

tokenizer = T5TokenizerFast.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# raw text -> token IDs using the tokenizer's subword (SentencePiece) vocabulary
# Rather than BoW or fixed-length vectors, the model sees a sequence of (subword) indices;
# the transformer turns them into contextual embeddings via self-attention.

max_source_length = 128
max_target_length = 128
task_prefix = "translate English to Spanish: "

def preprocess_batch(batch):
    """
    Tokenize one batch of examples for seq2seq training (used with `Dataset.map(batched=True)`).

    Parameters
    ----------
    batch: dict with the keys "input_text" (English sources) and "target_text" (Spanish targets),
           each a list of strings

    Returns
    -------
    Tokenizer output holding "input_ids" and "attention_mask" for the prefixed sources plus
    "labels" for the targets, all as un-padded Python lists.

    Sequences are truncated but deliberately NOT padded here. If labels are padded at this stage the
    padding positions carry the pad token id, are scored by the loss like real tokens, and the model
    is trained to emit padding. Leaving padding to `DataCollatorForSeq2Seq` gives per-batch padding
    and lets it fill label padding with its ignore index.
    """
    # Build input (source) text with the translation prefix
    inputs = [task_prefix + s for s in batch["input_text"]]
    targets = batch["target_text"]

    # Tokenize inputs (English to source IDs)
    model_inputs = tokenizer(
        inputs,
        max_length=max_source_length,
        truncation=True,
    )

    # Tokenize targets (Spanish/dialect to labels); `text_target` replaces the deprecated
    # `as_target_tokenizer()` context manager
    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        truncation=True,
    )

    # Attach labels to model inputs
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# For more thorough training:
#   MAX_EXAMPLES_PER_REGION = as large as possible
#   QUICK_DEBUG = False  # the number of epochs will be 3

# Train a separate MT5 model for each dialect/region in `region_data`
# Base directory where per-region models will be saved
base_save_dir = "/content/drive/MyDrive/CS4120/mt5/my_saved_mt5_model"
os.makedirs(base_save_dir, exist_ok=True)
MAX_EXAMPLES_PER_REGION = 1000

def build_region_dataset(region_name, seed=42):
    """
    Build the train/held-out HuggingFace Datasets for a single region only.
    Direction: English (input) -> Spanish/dialect (target).

    Parameters
    ----------
    region_name: str - key of `region_data`, e.g. "es-AR"
    seed: int - seed for both the subsampling shuffle and the train/held-out split. The split
          used to be unseeded, so every call produced a different partition and a later call
          (e.g. for evaluation) could hand back examples the model had been trained on.

    Returns
    -------
    (train_dataset, heldout_dataset) - a 90% / 10% split with the columns
    "input_text", "target_text" and "region".

    Raises
    ------
    KeyError   - no data was loaded for `region_name`
    ValueError - the region has fewer than two sentence pairs, so it cannot be split
    """
    if region_name not in region_data:
        raise KeyError(
            f"No data loaded for region '{region_name}'. Available regions: {sorted(region_data)}"
        )

    pairs = region_data[region_name]   # list of {"en": ..., "es": ...}
    if len(pairs) < 2:
        raise ValueError(
            f"Region '{region_name}' has {len(pairs)} sentence pair(s); at least 2 are needed for a split."
        )

    # Pair the source and the target
    region_examples = [
        {
            "input_text": p["en"],   # English source
            "target_text": p["es"],  # Spanish/dialect target
            "region": region_name,
        }
        for p in pairs
    ]

    dataset_region = Dataset.from_list(region_examples)
    # Subsample to at most MAX_EXAMPLES_PER_REGION to speed up training
    if len(dataset_region) > MAX_EXAMPLES_PER_REGION:
        dataset_region = dataset_region.shuffle(seed=seed).select(range(MAX_EXAMPLES_PER_REGION))

    # train/val split (seeded so that repeated calls return the same partition)
    dataset_region = dataset_region.train_test_split(test_size=0.1, shuffle=True, seed=seed)
    return dataset_region["train"], dataset_region["test"]



In [None]:
# NOTE: an API key used to be pasted into a comment at this spot. It has been removed from the
# source; treat that key as compromised, revoke it, and provide credentials through Colab secrets
# or environment variables instead of writing them into the notebook.

# Toggle for very quick debug runs vs. more serious training by changing the number of epochs
QUICK_DEBUG = False

# Train the model separately for each dialect, so each model is specialised for one region only.
# Change the following for loop to manually select the dialect(s) to train and save, e.g.
#   for region_name in ["es-CL", "es-AR"]:
#   for region_name in region_data.keys():  # full training across all loaded dialects
for region_name in ["es-AR"]:
    print(f"\n========== Training MT5 for region: {region_name} ==========")

    # Build dataset only for this dialect (with subsampling inside)
    try:
        train_ds, val_ds = build_region_dataset(region_name)
    except KeyError:
        # The region was requested but the data-loading cell did not load it
        print(f"Skipping {region_name}: no data was loaded for this region.")
        continue

    print(f"{region_name}: {len(train_ds)} train examples, {len(val_ds)} val examples")

    # Tokenize for this region
    train_tokenized = train_ds.map(
        preprocess_batch,
        batched=True,
        remove_columns=train_ds.column_names,
    )
    val_tokenized = val_ds.map(
        preprocess_batch,
        batched=True,
        remove_columns=val_ds.column_names,
    )

    # Fresh new MT5 model for this region (dialect)
    model = MT5ForConditionalGeneration.from_pretrained(model_name)

    # Data collator: batches the tokenized examples; any label positions it has to pad are filled
    # with its ignore index so they do not contribute to the loss
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    # Lighter training settings for debug runs
    num_epochs = 1 if QUICK_DEBUG else 3

    training_args = Seq2SeqTrainingArguments(
        output_dir=f"mt5-gnome-en-es-{region_name}",
        per_device_train_batch_size=4,  # batch size for training
        per_device_eval_batch_size=4,
        learning_rate=3e-4,
        num_train_epochs=num_epochs,
        logging_steps=50,  # log the training loss every 50 steps
        predict_with_generate=True,  # use generate() when predicting
        # Mixed precision is off (full precision training).
        # NOTE(review): mT5 is commonly reported to be numerically unstable in fp16 - confirm before enabling.
        fp16=False,
    )

    # The trainer runs the optimisation loop: it computes the cross-entropy loss on the labels
    # and updates the model by gradient descent.
    # NOTE(review): newer transformers releases deprecate the `tokenizer=` argument in favour of
    # `processing_class=` - confirm against the installed version before switching.
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=val_tokenized,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Train only on this dialect
    trainer.train()

    # Save this region's model into its own directory, overwriting any existing files
    region_save_dir = os.path.join(base_save_dir, region_name)
    os.makedirs(region_save_dir, exist_ok=True)
    trainer.save_model(region_save_dir)
    tokenizer.save_pretrained(region_save_dir)

    print(f"Saved MT5 model for {region_name} to {region_save_dir}")


es-AR: 900 train examples, 100 val examples


Map:   0%|          | 0/900 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

comet_ml is installed but the Comet API Key is not configured. Please set the `COMET_API_KEY` environment variable to enable Comet logging. Check out the documentation for other ways of configuring it: https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#set-the-api-key
  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Currently logged in as: [33mczzzttttt1[0m ([33mczzzttttt1-northeastern-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,32.0713
100,15.8428
150,10.7037
200,5.8521
250,2.918
300,2.1083
350,1.4707
400,1.1562
450,1.309
500,1.2302


Saved MT5 model for es-AR to /content/drive/MyDrive/CS4120/mt5/my_saved_mt5_model/es-AR


# **Start to run from the below if you DON'T want to retrain the model.**
You still need to run the cells below that import the libraries and define the helper functions and variables.

In [None]:
from transformers import MT5ForConditionalGeneration, T5TokenizerFast
import os
import torch

In [None]:
# Load the tokenizer and model from the saved directory
base_save_dir = "/content/drive/MyDrive/CS4120/mt5/my_saved_mt5_model"

# change to 'es-AR', 'es-PR', etc. when you want another dialect
region_name = "es-CL"
model_path = os.path.join(base_save_dir, region_name)

print(f"Attempting to load model from: {model_path}")
# Stop here with a clear message when the folder is missing or empty; otherwise the load below
# would be attempted anyway and fail with a less helpful error.
if not os.path.isdir(model_path):
    raise FileNotFoundError(f"Model path '{model_path}' does not exist.")

saved_files = os.listdir(model_path)
if not saved_files:
    raise FileNotFoundError(f"Model path '{model_path}' is empty.")

print("Directory contents:")
for item in saved_files:
    print("  -", item)

# Load tokenizer from the base MT5 model
base_model_name = "google/mt5-small"
tokenizer = T5TokenizerFast.from_pretrained(base_model_name)

# Load the fine-tuned weights for THIS dialect from the local folder
model = MT5ForConditionalGeneration.from_pretrained(model_path)

print(f"Model loaded successfully from: {model_path}")
print(f"Tokenizer loaded from base model: {base_model_name}")

Attempting to load model from: /content/drive/MyDrive/CS4120/mt5/my_saved_mt5_model/es-CL
Directory contents:
  - config.json
  - generation_config.json
  - model.safetensors
  - tokenizer_config.json
  - special_tokens_map.json
  - spiece.model
  - tokenizer.json
  - training_args.bin




Model loaded successfully from: /content/drive/MyDrive/CS4120/mt5/my_saved_mt5_model/es-CL
Tokenizer loaded from base model: google/mt5-small


In [None]:
# Decoding looks for y_hat = argmax P(y|x); an exact search is not feasible, so heuristics are used:
#   Greedy: at each step take the most probable next token.
#   Beam search: keep the top k partial sequences (beam size), expand each, keep the top k again.
#   Length penalties can be added to avoid favoring short sequences.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def translate_mt5(
    text_en, # The English text to be translated
    model,   # The MT5 model used for translation
    tokenizer, # The tokenizer corresponding to the MT5 model
    num_beams=1, # Number of beams for beam search. 1 means greedy decoding.
    do_sample=False, # Whether to use sampling; False for deterministic decoding (beam search/greedy)
    max_length=128, # Maximum length of the generated target sequence
    length_penalty=1, # Length penalty forwarded to generate()
    temperature=1, # Sampling temperature; only used when do_sample=True
    top_p=None, # Top-p (nucleus) sampling parameter; only used when do_sample=True
):
    """
    Translate one English sentence and return the decoded text of the first generated sequence.

    The task prefix is prepended to `text_en`, the result is tokenized (truncated to
    `max_source_length`) and passed to `model.generate` with the requested decoding settings.
    """
    # Prepare the input text with the task prefix
    input_text = task_prefix + text_en
    # Tokenize and move the tensors to the device of the model that was passed in, so the call
    # also works when that model does not live on the notebook-level `device`
    inputs = tokenizer(
        input_text,
        return_tensors="pt", # Return PyTorch tensors
        truncation=True,     # Truncate sequences longer than max_source_length
        max_length=max_source_length,
    ).to(model.device)

    # Generation arguments that apply to every decoding mode
    gen_kwargs = {
        "max_length": max_length,
        "num_beams": num_beams,
        "length_penalty": length_penalty,
        "do_sample": do_sample,
    }

    # Sampling-only knobs are forwarded only when sampling is enabled; they have no effect on
    # greedy/beam decoding and would only trigger generation-config warnings there
    if do_sample:
        gen_kwargs["temperature"] = temperature
        if top_p is not None:
            gen_kwargs["top_p"] = top_p

    # Generate the output sequence (translated text token IDs)
    output_ids = model.generate(**inputs, **gen_kwargs)
    # Decode the generated token IDs back into human-readable text, skipping special tokens
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
# Decoding set-ups to compare: greedy, beam search with 4 and 8 beams, and 4-beam search
# with a lower (0.6) and a higher (1.4) length penalty. "name" is the label for each set-up.
decoding_configs = [
    {"name": "greedy",        "num_beams": 1, "do_sample": False, "length_penalty": 1.0},
    {"name": "beam_4",        "num_beams": 4, "do_sample": False, "length_penalty": 1.0},
    {"name": "beam_8",        "num_beams": 8, "do_sample": False, "length_penalty": 1.0},
    {"name": "beam_4_lp_0.6", "num_beams": 4, "do_sample": False, "length_penalty": 0.6},
    {"name": "beam_4_lp_1.4", "num_beams": 4, "do_sample": False, "length_penalty": 1.4},
    # Optional:
    # {"name": "top_p_0.9", "num_beams": 1, "do_sample": True,  "top_p": 0.9, "temperature": 0.7},
]

In [None]:
# Define these variables again here, so that this section also works when you load an existing
# saved model instead of retraining.
max_source_length = 128
max_target_length = 128
task_prefix = "translate English to Spanish: "

In [None]:
# Sample English UI strings used to try out the translator
test_examples = [
    "Keyboard Accessibility Preferences",
    "Shows the status of keyboard accessibility features",
    "There was an error launching the help viewer.",
]

# Fallback value for every decoding option a config entry may leave out
_DECODING_DEFAULTS = {
    "num_beams": 1,
    "do_sample": False,
    "length_penalty": 1.0,
    "temperature": 1.0,
    "top_p": None,
}

# Translate every sample once per decoding configuration and print the results side by side
for sentence in test_examples:
    print(f"\nSOURCE: {sentence}")
    for config in decoding_configs:
        # Pick only the known options from the config, falling back to the defaults
        options = {
            key: config.get(key, fallback)
            for key, fallback in _DECODING_DEFAULTS.items()
        }
        translation = translate_mt5(
            text_en=sentence,
            model=model,
            tokenizer=tokenizer,
            **options,
        )
        print(f"[{config['name']}] {translation}")


SOURCE: Keyboard Accessibility Preferences
[greedy] ¬ø                                                                                                                              
[beam_4] tom ¬ø         .                                                                                                                   
[beam_8] tom ¬ø        .                                                                                                                    
[beam_4_lp_0.6] ¬ø .
[beam_4_lp_1.4] tom ¬ø         .                                                                                                                   

SOURCE: Shows the status of keyboard accessibility features
[greedy] ¬ø                                                                                                                              
[beam_4] tom ¬ø    .                                                                                                                        
[beam_8] tom ¬øa
[beam_4_lp

Output explanation:

1. Greedy: the model simply picks the word with the highest probability as the next word in the sequence. It can be suboptimal because a locally optimal choice at one step might lead to a globally bad translation later on.
2. Beam: Instead of just picking the single best word at each step, beam search keeps track of the num_beams (e.g., 4 or 8) most probable partial translations. Therefore, __it's less likely to get stuck in local optima. Increasing num_beams usually leads to better quality, up to a point.__
3. lp_#: This parameter is used with beam search to influence the length of the generated translation. Models sometimes have a bias towards generating shorter sequences; a higher value encourages the model to generate longer sequences. However, __if the outputs are the same, it means the length penalty doesn't alter the most probable sequence for this model__.

# Evaluation for 'BLEU', 'chrF', 'METEOR', 'COMET'

In [None]:
!pip install -q evaluate sacrebleu nltk comet-ml

In [None]:
!pip install -q evaluate sacrebleu nltk comet-ml unbabel-comet

In [None]:
import evaluate
import nltk
import os
import torch
from tqdm.auto import tqdm
from datasets import Dataset
from tqdm.auto import tqdm
from transformers import T5TokenizerFast, MT5ForConditionalGeneration
import nltk

In [None]:
from functools import lru_cache

# make sure required NLTK resources exist
nltk.download("wordnet")
nltk.download("omw-1.4")


@lru_cache(maxsize=None)
def _load_metric(name):
    """Load an `evaluate` metric once and reuse it, so repeated calls do not reload it."""
    return evaluate.load(name)


def compute_metrics(sources, references, predictions):
    """
    Compute BLEU, chrF, METEOR, and COMET, mirroring the RNN notebook.

    Parameters
    ----------
    sources: list[str]  - English source sentences
    references: list[str]  - Spanish reference translations
    predictions: list[str]  - Spanish MT5 predictions

    Returns
    -------
    dict with keys: 'BLEU', 'chrF', 'METEOR', 'COMET'

    Raises
    ------
    ValueError - the three lists are empty or differ in length
    """
    if not (len(sources) == len(references) == len(predictions)):
        raise ValueError(
            "sources, references and predictions must have the same length, got "
            f"{len(sources)}, {len(references)} and {len(predictions)}."
        )
    if not predictions:
        raise ValueError("Cannot compute metrics on an empty set of predictions.")

    results = {}

    # Cached loaders: the metrics (the COMET checkpoint in particular) are loaded on the
    # first call only instead of once per evaluated dialect
    metric_bleu = _load_metric("sacrebleu")
    metric_chrf = _load_metric("chrf")
    metric_meteor = _load_metric("meteor")
    metric_comet = _load_metric("comet")

    # BLEU and chrF take a list of references per prediction: list[list[str]]
    formatted_refs = [[r] for r in references]

    # BLEU
    bleu_res = metric_bleu.compute(
        predictions=predictions,
        references=formatted_refs
    )
    results["BLEU"] = bleu_res["score"]
    print(f"BLEU:   {results['BLEU']:.2f}")

    # chrF
    chrf_res = metric_chrf.compute(
        predictions=predictions,
        references=formatted_refs
    )
    results["chrF"] = chrf_res["score"]
    print(f"chrF:   {results['chrF']:.2f}")

    # METEOR
    meteor_res = metric_meteor.compute(
        predictions=predictions,
        references=references
    )
    results["METEOR"] = meteor_res["meteor"]
    print(f"METEOR: {results['METEOR']:.4f}")

    # COMET (additionally needs the source sentences)
    comet_res = metric_comet.compute(
        predictions=predictions,
        references=references,
        sources=sources
    )
    results["COMET"] = comet_res["mean_score"]
    print(f"COMET:  {results['COMET']:.4f}")

    return results

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# find the saved models for dialects
base_save_dir = "/content/drive/MyDrive/CS4120/mt5/my_saved_mt5_model"


def _looks_like_saved_model(files):
    """Return True when `files` (names in one folder) contain a model config and model weights."""
    has_config = any(f.startswith("config") and f.endswith(".json") for f in files)
    # Weight files are *.safetensors or pytorch_model*.bin. A bare `.bin` suffix check is not
    # enough: `training_args.bin` is written next to the model and holds no weights.
    has_weights = any(
        f.endswith(".safetensors")
        or (f.startswith("pytorch_model") and f.endswith(".bin"))
        for f in files
    )
    return has_config and has_weights


# Sorted so the evaluation order does not depend on the file system's listing order
all_subdirs = sorted(
    d for d in os.listdir(base_save_dir)
    if os.path.isdir(os.path.join(base_save_dir, d))
)

dialects_with_model = [
    d for d in all_subdirs
    if _looks_like_saved_model(os.listdir(os.path.join(base_save_dir, d)))
]

print("Dialects with saved MT5 model:")
for d in dialects_with_model:
    print("  -", d)

if not dialects_with_model:
    raise RuntimeError("No dialect subdirectories with saved MT5 models were found.")


Dialects with saved MT5 model:
  - es-CL
  - es-AR


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
results_by_region = {}
MAX_EVAL_EXAMPLES = 1 # change this number for more complete eval; scores on 1 example are only a smoke test

# The tokenizer is shared across dialects (base model), so it is loaded once, outside the loop.
# Alternative, if the tokenizer was saved per dialect, load it inside the loop instead:
#   tokenizer = T5TokenizerFast.from_pretrained(model_path)
base_model_name = "google/mt5-small"
tokenizer = T5TokenizerFast.from_pretrained(base_model_name)

for region_name in dialects_with_model:
    print(f"\n===============================")
    print(f"Evaluating dialect: {region_name}")
    print(f"===============================")

    model_path = os.path.join(base_save_dir, region_name)

    # Rebuild the region specific dataset and take its held-out split.
    # NOTE(review): this only matches the split held out during training if
    # build_region_dataset splits deterministically - confirm.
    try:
        _, test_ds = build_region_dataset(region_name)
    except KeyError:
        # A saved model exists for this folder name but no data was loaded for it
        print(f"Skipping {region_name}: no data was loaded for this region.")
        continue

    # Reload the fine-tuned model for THIS dialect
    model = MT5ForConditionalGeneration.from_pretrained(model_path)
    model.to(device)
    model.eval()

    # subsample test set for quicker evaluation
    if len(test_ds) > MAX_EVAL_EXAMPLES:
        test_ds = test_ds.shuffle(seed=123).select(range(MAX_EVAL_EXAMPLES))
        print(f"Subsampled test set to {len(test_ds)} examples for {region_name}.")
    else:
        print(f"Using all {len(test_ds)} test examples for {region_name}.")

    sources = []      # English source sentences
    references = []   # Spanish/dialect translations
    predictions = []  # MT5 translations

    # Run MT5 to translate EN to ES dialect, one example at a time
    for example in tqdm(test_ds, desc=f"{region_name} evaluation"):
        src_en = example["input_text"]
        tgt_es = example["target_text"]

        # Add the same prefix used during training
        input_text = task_prefix + src_en

        enc = tokenizer(
            input_text,
            max_length=max_source_length,
            truncation=True,
            return_tensors="pt",
        )

        input_ids = enc["input_ids"].to(device)
        attention_mask = enc["attention_mask"].to(device)

        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_target_length,
                num_beams=4,
                early_stopping=True,
            )

        pred_text = tokenizer.decode(
            generated_ids[0],
            skip_special_tokens=True
        ).strip()

        sources.append(src_en.strip())
        references.append(tgt_es.strip())
        predictions.append(pred_text)

    print(f"\nMetrics for {region_name}:")
    scores = compute_metrics(
        sources=sources,
        references=references,
        predictions=predictions,
    )
    results_by_region[region_name] = scores


Evaluating dialect: es-CL
Subsampled test set to 1 examples for es-CL.


es-CL evaluation:   0%|          | 0/1 [00:00<?, ?it/s]


Metrics for es-CL:


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.6.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/core/saving.py:197: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


BLEU:   0.00
chrF:   0.69
METEOR: 0.0000


INFO:pytorch_lightning.utilities.rank_zero:üí° Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


COMET:  0.3190

Evaluating dialect: es-AR




Subsampled test set to 1 examples for es-AR.


es-AR evaluation:   0%|          | 0/1 [00:00<?, ?it/s]


Metrics for es-AR:


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.6.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/core/saving.py:197: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
INFO:pytorch_lightning.utilities.rank_zero:üí° Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.


BLEU:   5.52
chrF:   3.93
METEOR: 0.0862


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


COMET:  0.2400


In [None]:
# Print one row of scores per evaluated dialect as a fixed-width table

print("\n----- MT5 Evaluation Summary (per dialect) -----")
column_titles = " ".join(
    [f"{'Region':<10}", f"{'BLEU':>7}", f"{'chrF':>7}", f"{'METEOR':>9}", f"{'COMET':>9}"]
)
print(column_titles)
print("-" * len(column_titles))
for dialect, metric_values in results_by_region.items():
    # A metric that is missing for a dialect is shown as 0.0
    bleu, chrf, meteor, comet = (
        metric_values.get(key, 0.0) for key in ("BLEU", "chrF", "METEOR", "COMET")
    )
    print(f"{dialect:<10} {bleu:7.2f} {chrf:7.2f} {meteor:9.4f} {comet:9.4f}")


===== MT5 Evaluation Summary (per dialect) =====
Region        BLEU    chrF    METEOR     COMET
----------------------------------------------
es-CL         0.00    0.69    0.0000    0.3190
es-AR         5.52    3.93    0.0862    0.2400


# End of code