In [None]:
import os
import torch
import glob
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig
from datasets import Dataset

# 1. File patterns
LV_PATTERN = "../txt/latvian_sentences_*.txt"
GLOSS_PATTERN = "../txt/lsl_glosses_*.txt"
PRUNED_MODEL_PATH = "../mt5-pruned"

# 2. Append all files to master list
lv_lines = []
gloss_lines = []
total_files = 0

for file_path in sorted(glob.glob(LV_PATTERN)):
    with open(file_path, "r", encoding="utf-8") as f:
        lv_lines.extend([line.strip() for line in f if line.strip()])
    total_files += 1

for file_path in sorted(glob.glob(GLOSS_PATTERN)):
    with open(file_path, "r", encoding="utf-8") as f:
        gloss_lines.extend([line.strip() for line in f if line.strip()])

# 3. Check validity
assert len(lv_lines) == len(gloss_lines), f"❌ Mismatch! LV lines: {len(lv_lines)}, Gloss lines: {len(gloss_lines)}"
print(f"✅ Successfully loaded {total_files} batches with {len(lv_lines)} total pairs.")

# 4. Create a dataset
data = {"lv": lv_lines, "gloss": gloss_lines}
raw_dataset = Dataset.from_dict(data)

# Split into Train (90%) and Test (10%) so we can verify learning
split_dataset = raw_dataset.train_test_split(test_size=0.1)
# print("Data split:", split_dataset)

# 5. Load model
tokenizer = AutoTokenizer.from_pretrained(PRUNED_MODEL_PATH)
config = AutoConfig.from_pretrained(PRUNED_MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_config(config)

state_dict = torch.load(os.path.join(PRUNED_MODEL_PATH, "pytorch_model.bin"))
model.load_state_dict(state_dict)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(f"Pruned Model loaded on: {device}")

  from .autonotebook import tqdm as notebook_tqdm


✅ Successfully loaded 2 batches with 272 total pairs.


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
The tokenizer you are loading from '../mt5-pruned' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


Pruned Model loaded on: cpu


In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# --- 1. SETUP ---
# Re-define preprocess to be sure
def preprocess_function(examples):
    inputs = examples["lv"]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["gloss"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

tokenized_datasets = split_dataset.map(preprocess_function, batched=True)

# --- 2. AGGRESSIVE TRAINING ARGUMENTS ---
args = Seq2SeqTrainingArguments(
    output_dir="./mt5-lsl-model",
    eval_strategy="no",             # Skip eval to speed it up
    save_strategy="no",             # Don't save checkpoints yet
    learning_rate=1e-3,             # MUCH HIGHER (was 2e-5)
    per_device_train_batch_size=8,
    num_train_epochs=50,            # LONGER (was 20)
    weight_decay=0.01,
    predict_with_generate=True,
    logging_steps=10,
    optim="adafactor",
    report_to="none"
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    # We train on the WHOLE dataset to force memorization for this test
    # (We are temporarily ignoring the test split to ensure it learns)
    eval_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Map: 100%|██████████| 244/244 [00:00<00:00, 13006.75 examples/s]
Map: 100%|██████████| 28/28 [00:00<00:00, 4584.83 examples/s]
  trainer = Seq2SeqTrainer(


# -----

In [None]:
import json
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# --- 1. LOAD THE MAP (The "Rosetta Stone") ---
# This file tells us: "Old ID 15020 is now New ID 5"
with open(os.path.join(PRUNED_MODEL_PATH, "vocab_map.json"), "r") as f:
    # The JSON is saved as { "new_id": old_id }, so we reverse it.
    new2old_map = json.load(f)
    old2new_map = {v: int(k) for k, v in new2old_map.items()}

# Find the "New" ID for the Unknown token (UNK)
# We need this for words like "translate" if they weren't in your pruning text
original_unk_id = tokenizer.unk_token_id
new_unk_id = old2new_map.get(original_unk_id, 0) # Default to 0 if weirdness happens

print(f"Loaded vocab map. Remapping {len(old2new_map)} tokens.")

# --- 2. DEFINE THE REMAPPING FUNCTION ---
def remap_tokens(token_ids):
    # Convert list of Old IDs to New IDs
    # If a token wasn't in our pruning list, turn it into UNK (new_unk_id)
    return [old2new_map.get(tid, new_unk_id) for tid in token_ids]

# --- 3. UPDATED PREPROCESS FUNCTION ---
def preprocess_function(examples):
    # A. Tokenize Inputs (Standard way - produces HUGE IDs)
    inputs = ["translate Latvian to Gloss: " + ex for ex in examples["lv"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    # B. Tokenize Targets (Standard way - produces HUGE IDs)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["gloss"], max_length=128, truncation=True)

    # C. MANUAL REMAPPING STEP (The Fix!)
    # We replace the huge IDs with the tiny mapped IDs
    model_inputs["input_ids"] = [remap_tokens(ids) for ids in model_inputs["input_ids"]]
    
    # We must also remap the labels!
    model_inputs["labels"] = [remap_tokens(ids) for ids in labels["input_ids"]]
    
    return model_inputs

# Apply the new function
tokenized_datasets = split_dataset.map(preprocess_function, batched=True)

# --- 4. TRAINING ARGUMENTS ---
args = Seq2SeqTrainingArguments(
    output_dir="./mt5-lsl-model",
    eval_strategy="no",             
    save_strategy="no",             
    learning_rate=1e-3,             
    per_device_train_batch_size=8,
    num_train_epochs=50,            
    weight_decay=0.01,
    predict_with_generate=True,
    logging_steps=10,
    optim="adafactor",
    report_to="none"
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Loaded vocab map. Remapping 608 tokens.


Map: 100%|██████████| 244/244 [00:00<00:00, 18051.15 examples/s]
Map: 100%|██████████| 28/28 [00:00<00:00, 6243.51 examples/s]
  trainer = Seq2SeqTrainer(


In [None]:
# --- 3. TRAIN ---
print("Starting training...")
trainer.train()

In [13]:
print("Saving model...")

save_dir = "E:/Documents/GitHub/LSL/mt5-lsl-model"

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print("Done.")


Saving model...
Done.


## Testing

In [12]:
import torch
import json
import os

# --- 1. SETUP MAPS (Crucial!) ---
# We need to load the map to convert between "Big Tokenizer" and "Small Model"
vocab_map_path = os.path.join(PRUNED_MODEL_PATH, "vocab_map.json")

with open(vocab_map_path, "r") as f:
    new2old_map = json.load(f)
    # We need both directions!
    old2new_map = {int(v): int(k) for k, v in new2old_map.items()} # Big -> Small
    new2old_map = {int(k): int(v) for k, v in new2old_map.items()} # Small -> Big

# Identify the UNK token ID in the new mapping
# If a word (like "translate") isn't in our map, we point it to the pruned UNK ID.
# Usually, UNK is ID 2 in standard T5, let's find where ID 2 went.
original_unk_id = tokenizer.unk_token_id
pruned_unk_id = old2new_map.get(original_unk_id, 0) # Fallback to 0 if not found

# --- 2. CUSTOM TRANSLATION FUNCTION ---
def predict_gloss(text):
    # A. Prepare Input: Simply use the raw text
    full_text = text
    
    # B. Tokenize (Get Big IDs)
    inputs = tokenizer(full_text, return_tensors="pt", truncation=True, max_length=128)
    input_ids = inputs.input_ids[0].tolist()

    # C. Remap Input (Big IDs -> Small IDs)
    pruned_input_ids = [old2new_map.get(tid, pruned_unk_id) for tid in input_ids]
    
    # Convert back to tensor and move to GPU
    input_tensor = torch.tensor([pruned_input_ids]).to(model.device)

    # D. Generate (Model produces Small IDs)
    with torch.no_grad():
        outputs = model.generate(input_tensor, max_new_tokens=128)
    
    # E. Remap Output (Small IDs -> Big IDs)
    output_ids = outputs[0].tolist()
    original_output_ids = [new2old_map.get(tid, tokenizer.unk_token_id) for tid in output_ids]

    # F. Decode
    result = tokenizer.decode(original_output_ids, skip_special_tokens=True)
    return result

# --- 3. RUN TESTS ---
print("\n--- RESULTS ---")

test_sentences = [
    lv_lines[10],
    "Vai tu esi labs cilvēks?",
    "Man ir ļoti lielas mājas.",
    "Sveika!",
    "Labdien, [NAME]!"
]

for text in test_sentences:
    gloss = predict_gloss(text)
    print(f"\nInput:  {text}")
    print(f"Result: {gloss}")


--- RESULTS ---

Input:  Mans vārds ir [NAME].
Result: mans vārds ir [NAME]

Input:  Vai tu esi labs cilvēks?
Result: vai tu esmu labs

Input:  Man ir ļoti lielas mājas.
Result: mans ir ļoti liels māja

Input:  Sveika!
Result: sveiks

Input:  Labdien, [NAME]!
Result: labdien [NAME]
