## Training

In [None]:
import glob
import inspect
import json
import os
import shutil
import time

import torch
from datasets import Dataset
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM, EarlyStoppingCallback, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# 1. Files
LV_PATTERN = "../txt/latvian_sentences_*.txt"
GLOSS_PATTERN = "../txt/lsl_glosses_*.txt"
PRUNED_MODEL_PATH = "../mt5-pruned"
SAVE_DIR = "../mt5-lsl-model"
SPLIT_SEED = 42  # fixed so the train/test split is the same on every run


def _read_nonempty_lines(file_path):
    """Return the stripped, non-blank lines of a UTF-8 text file."""
    with open(file_path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]


# 2. Append all files to master list
# Sentence i is paired with gloss i purely by position, so both globs must
# sort into the same batch order and each batch pair must have equal length.
lv_files = sorted(glob.glob(LV_PATTERN))
gloss_files = sorted(glob.glob(GLOSS_PATTERN))
assert len(lv_files) == len(gloss_files), (
    f"❌ Mismatch! Sentence files: {len(lv_files)}, gloss files: {len(gloss_files)}"
)

lv_lines = []
gloss_lines = []
total_files = 0

for lv_path, gloss_path in zip(lv_files, gloss_files):
    lv_batch = _read_nonempty_lines(lv_path)
    gloss_batch = _read_nonempty_lines(gloss_path)
    # A per-batch check catches misalignment that a total-only check misses
    # (e.g. one extra line in batch 1 and one missing line in batch 2).
    assert len(lv_batch) == len(gloss_batch), (
        f"❌ Mismatch in batch! {lv_path}: {len(lv_batch)} lines, "
        f"{gloss_path}: {len(gloss_batch)} lines"
    )
    lv_lines.extend(lv_batch)
    gloss_lines.extend(gloss_batch)
    total_files += 1

# 3. Check validity
assert lv_lines, f"❌ No sentence lines found for pattern {LV_PATTERN}"
assert len(lv_lines) == len(gloss_lines), f"❌ Mismatch! Sentence lines: {len(lv_lines)}, gloss lines: {len(gloss_lines)}"
print(f"✅ Loaded {total_files} batches with {len(lv_lines)} total pairs.")

# 4. Calculate stats to save later
# "Words" are whitespace-separated tokens; punctuation stays attached.
all_lv_text = " ".join(lv_lines)
all_gloss_text = " ".join(gloss_lines)
unique_lv_words = len(set(all_lv_text.split()))
unique_gloss_words = len(set(all_gloss_text.split()))

# 5. Create a dataset, Train (90%) and Test (10%) split
data = {"lv": lv_lines, "gloss": gloss_lines}
raw_dataset = Dataset.from_dict(data)
# Seeded so that reruns evaluate on the same held-out examples.
split_dataset = raw_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
print("Data split:", split_dataset.num_rows)

# 6. Load model
# Pick the device first: CUDA when torch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer and weights both come from the pruned checkpoint directory.
tokenizer = T5Tokenizer.from_pretrained(PRUNED_MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(PRUNED_MODEL_PATH)
model.to(device)

print(f"Pruned Model loaded on: {device}")

# 7. Vocabulary remap
# vocab_map.json is loaded as `new2old_map` (pruned id -> original tokenizer
# id, judging by the name and by how the Testing cell uses it). JSON object
# keys are always strings, so `new2old_map` keeps string keys here.
with open(os.path.join(PRUNED_MODEL_PATH, "vocab_map.json"), "r", encoding="utf-8") as f:
    new2old_map = json.load(f)

# Inverse direction (original id -> pruned id). Both sides are coerced to int
# so lookups with tokenizer ids (ints) succeed whether the file stores the
# original ids as numbers or as strings; the Testing cell builds its map the
# same way.
old2new_map = {int(v): int(k) for k, v in new2old_map.items()}

original_unk_id = tokenizer.unk_token_id
# Pruned id used for any token that is missing from the map (falls back to 0
# when the unknown token itself is not in the map).
new_unk_id = old2new_map.get(original_unk_id, 0)

def remap_tokens(token_ids):
    """Translate a sequence of token ids through ``old2new_map``.

    Args:
        token_ids: iterable of ids to look up in ``old2new_map``.

    Returns:
        A new list with one id per input id; ids that have no entry in the
        map are replaced by ``new_unk_id``.
    """
    remapped = []
    for original_id in token_ids:
        remapped.append(old2new_map.get(original_id, new_unk_id))
    return remapped

def preprocess_function(examples):
    """Tokenize a batch of sentence/gloss pairs and remap their ids.

    Args:
        examples: batch mapping with an ``"lv"`` column (source sentences)
            and a ``"gloss"`` column (target glosses).

    Returns:
        The tokenizer output for the sources, with ``input_ids`` passed
        through ``remap_tokens`` and a ``labels`` entry holding the remapped
        target ids. Sources and targets are each truncated to 128 tokens.
    """
    encoded_sources = tokenizer(examples["lv"], max_length=128, truncation=True)
    encoded_targets = tokenizer(
        text_target=examples["gloss"], max_length=128, truncation=True
    )

    # Both sides go through the same id translation.
    encoded_sources["input_ids"] = list(map(remap_tokens, encoded_sources["input_ids"]))
    encoded_sources["labels"] = list(map(remap_tokens, encoded_targets["input_ids"]))

    return encoded_sources

# Apply the preprocessing to both splits, one batch of examples per call.
tokenized_datasets = split_dataset.map(preprocess_function, batched=True)

# Reports the size of the id map, not the number of tokens seen in the data.
remapped_count = len(old2new_map)
print(f"Remapped {remapped_count} tokens.")

✅ Loaded 3 batches with 630 total pairs.
Data split: {'train': 567, 'test': 63}
Pruned Model loaded on: cpu


Map: 100%|██████████| 567/567 [00:00<00:00, 7927.74 examples/s]
Map: 100%|██████████| 63/63 [00:00<00:00, 5980.34 examples/s]

Remapped 1210 tokens.





In [None]:
# 8. Training parameters
args = Seq2SeqTrainingArguments(
    output_dir=SAVE_DIR,
    learning_rate=1e-3,
    num_train_epochs=50,  # upper bound; early stopping below usually ends sooner

    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy="epoch",
    save_strategy="epoch",

    per_device_train_batch_size=8,
    weight_decay=0.01,
    logging_steps=50,
    save_total_limit=1,

    # "Best" means lowest validation loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    predict_with_generate=True,
    optim="adafactor",
    report_to="none"
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Newer transformers releases take the tokenizer as `processing_class` and
# emit a deprecation warning for `tokenizer=`; older releases only know
# `tokenizer=`. Choose the keyword from the installed constructor signature so
# the cell runs without the warning on new versions and still works on old ones.
trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    **{tokenizer_kwarg: tokenizer},
)

  trainer = Seq2SeqTrainer(


In [48]:
# 9. Train
# Take the wall-clock timestamp immediately before training starts;
# `start_time` is read later when the training duration is computed.
start_time = time.time()
# `train_result` is kept because the metadata step reads its `global_step`
# and `training_loss`.
train_result = trainer.train()



Epoch,Training Loss,Validation Loss
1,8.151,1.525597
2,2.5038,0.654168
3,1.0966,0.526155
4,0.8345,0.419393
5,0.5673,0.410888
6,0.4589,0.364141
7,0.3937,0.320774
8,0.2608,0.304701
9,0.2705,0.381154
10,0.2161,0.348261


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


In [53]:
# 10. Save model
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

# The model was trained on remapped (pruned) ids while the tokenizer still
# produces original ids, so the saved directory is only usable together with
# the id map. Ship a copy next to the weights to make SAVE_DIR self-contained.
vocab_map_src = os.path.join(PRUNED_MODEL_PATH, "vocab_map.json")
vocab_map_dst = os.path.join(SAVE_DIR, "vocab_map.json")
if os.path.isfile(vocab_map_src) and os.path.abspath(vocab_map_src) != os.path.abspath(vocab_map_dst):
    shutil.copy2(vocab_map_src, vocab_map_dst)

print(f"Saved model to {SAVE_DIR}")

# 11. Cleanup
# Remove every "checkpoint-*" directory directly under SAVE_DIR. Deletion is
# best-effort: a failure is reported and the remaining directories are still
# processed.
print("Cleaning up temporary checkpoints...")
checkpoint_names = [
    name
    for name in os.listdir(SAVE_DIR)
    if os.path.isdir(os.path.join(SAVE_DIR, name)) and name.startswith("checkpoint-")
]
for name in checkpoint_names:
    try:
        shutil.rmtree(os.path.join(SAVE_DIR, name))
        print(f"Deleted: {name}")
    except Exception as e:
        print(f"Could not delete {name}: {e}")

# 12. Save metadata
# Prefer the runtime measured by the Trainer itself. A wall-clock difference
# against `start_time` (set in the training cell) also counts saving, cleanup
# and any idle time between running the two cells, so it is only a fallback.
training_duration = train_result.metrics.get("train_runtime")
if training_duration is None:
    training_duration = time.time() - start_time

total_steps = train_result.global_step

# Epochs actually run; early stopping can end training before
# `num_train_epochs` is reached.
epochs_completed = int(trainer.state.epoch) if trainer.state.epoch is not None else None

# Epoch whose evaluation produced the best (lowest) validation loss, taken
# from the evaluation records in the log history.
best_epoch = next(
    (
        entry.get("epoch")
        for entry in trainer.state.log_history
        if "eval_loss" in entry and entry["eval_loss"] == trainer.state.best_metric
    ),
    None,
)

# Checkpoint directories end in "-<step>"; store the step as an int so the
# field has the same type as the `total_steps` fallback.
best_checkpoint = trainer.state.best_model_checkpoint
if best_checkpoint:
    step_text = best_checkpoint.rsplit("-", 1)[-1]
    best_checkpoint_step = int(step_text) if step_text.isdigit() else step_text
else:
    best_checkpoint_step = total_steps

# Sample sentences are picked by position from the full corpus, so they may
# include sentences that were part of the training split.
target_indices = [10, 60, 100, 160, 200, 260, 300, 360, 400, 460, 500, 560, 600, 660, 700]
test_samples = [lv_lines[idx] for idx in target_indices if idx < len(lv_lines)]

# Pruned id -> original id with int keys and values. Built locally so decoding
# works whether `new2old_map` currently holds the raw JSON (string keys) or
# the int-keyed version that the Testing cell assigns to the same name.
pruned_to_original = {int(k): int(v) for k, v in new2old_map.items()}

sample_results = []
model.eval()

for text in test_samples:
    inputs = tokenizer(text, return_tensors="pt", max_length=128, truncation=True)
    mapped_ids = [remap_tokens(ids) for ids in inputs.input_ids.tolist()]
    input_tensor = torch.tensor(mapped_ids).to(model.device)

    with torch.no_grad():
        outputs = model.generate(input_tensor, max_new_tokens=128)

    # Translate generated pruned ids back to tokenizer ids before decoding.
    out_ids = outputs[0].tolist()
    orig_ids = [pruned_to_original.get(tid, tokenizer.unk_token_id) for tid in out_ids]
    decoded = tokenizer.decode(orig_ids, skip_special_tokens=True)

    sample_results.append({"input": text, "output": decoded})

metadata = {
    "dataset_stats": {
        "total_pairs": len(lv_lines),
        "total_files": total_files,
        "unique_sentence_words": unique_lv_words,
        "unique_gloss_words": unique_gloss_words,
        "tokens_remapped": len(old2new_map),
        "train_split": len(tokenized_datasets["train"]),
        "test_split": len(tokenized_datasets["test"])
    },
    "training_stats": {
        "duration_seconds": round(training_duration, 2),
        "epochs_planned": args.num_train_epochs,
        "epochs_completed": epochs_completed,
        "best_epoch": best_epoch,
        "batch_size": args.per_device_train_batch_size,
        "learning_rate": args.learning_rate,
        # `training_loss` as reported by the training result.
        "final_train_loss": train_result.training_loss,
        # Holds the trainer's best metric (eval_loss of the best checkpoint).
        "final_eval_loss": trainer.state.best_metric,
        "best_checkpoint_step": best_checkpoint_step
    },
    "test_samples": sample_results
}

# 13. Save to JSON
meta_path = os.path.join(SAVE_DIR, "model_metadata.json")
with open(meta_path, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=4, ensure_ascii=False)

print(f"✅ Metadata saved to: {meta_path}")


Saved model to ../mt5-lsl-model
Cleaning up temporary checkpoints...
✅ Metadata saved to: ../mt5-lsl-model\model_metadata.json


## Testing

In [None]:
import torch
import json
import os

# --- 1. SETUP MAPS (Crucial!) ---
# The tokenizer works with the full ("big") vocabulary and the model with the
# pruned ("small") one, so a lookup table is needed in each direction.
# Note: this rebinds `new2old_map` / `old2new_map`; from here on both have
# int keys and int values.
vocab_map_path = os.path.join(PRUNED_MODEL_PATH, "vocab_map.json")

with open(vocab_map_path, "r") as f:
    # JSON keys arrive as strings; convert every pair to ints once.
    id_pairs = [(int(small_id), int(big_id)) for small_id, big_id in json.load(f).items()]

old2new_map = {big_id: small_id for small_id, big_id in id_pairs}  # Big -> Small
new2old_map = {small_id: big_id for small_id, big_id in id_pairs}  # Small -> Big

original_unk_id = tokenizer.unk_token_id
# Pruned id used for tokens missing from the map; 0 if the unknown token
# itself has no entry.
pruned_unk_id = old2new_map.get(original_unk_id, 0)


# --- 2. CUSTOM TRANSLATION FUNCTION ---
def predict_gloss(text, max_length=32, num_beams=3):
    """Translate one Latvian sentence into a gloss string with the pruned model.

    Args:
        text: input sentence.
        max_length: value passed to ``model.generate`` as ``max_length``
            (default 32, the previously hard-coded value).
        num_beams: beam width passed to ``model.generate`` (default 3, the
            previously hard-coded value).

    Returns:
        The decoded output text with special tokens removed.
    """
    # Make sure dropout is disabled even if the model was last left in
    # training mode (e.g. when this runs right after the training cell).
    model.eval()

    # Tokenize input (truncated to 128 tokens, as in preprocessing)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    input_ids = inputs.input_ids[0].tolist()

    # Remap to pruned IDs; ids missing from the map become the pruned unk id
    pruned_input_ids = [old2new_map.get(tid, pruned_unk_id) for tid in input_ids]
    input_tensor = torch.tensor([pruned_input_ids]).to(model.device)

    # Generate. `no_repeat_ngram_size=2` can be added to the call below if
    # repeated n-grams become a problem.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_tensor,
            max_length=max_length,
            num_beams=num_beams,
            # Only meaningful for beam search; avoid passing it for greedy decoding.
            early_stopping=num_beams > 1,
        )

    # Remap output IDs back to full tokenizer
    output_ids = outputs[0].tolist()
    original_output_ids = [new2old_map.get(tid, tokenizer.unk_token_id) for tid in output_ids]

    # Decode
    return tokenizer.decode(original_output_ids, skip_special_tokens=True)


# --- 3. RUN TESTS ---
# Spot-check a handful of corpus sentences, chosen by position in `lv_lines`.
print("\n--- RESULTS ---")

sample_positions = (0, 20, 30, 40, 50)
test_sentences = [lv_lines[position] for position in sample_positions]

for text in test_sentences:
    gloss = predict_gloss(text)
    print(f"\nInput:  {text}")
    print(f"Result: {gloss}")


--- RESULTS ---

Input:  Čau!
Result: sveiks

Input:  Kāds ir jūsu vārds?
Result: kāds ir tavs vārds

Input:  Kāds ir tavs vārds?
Result: kāds ir tavs vārds

Input:  Priecājos ar Jums iepazīties!
Result: prieks ar tavs iepazīties

Input:  Man iet labi.
Result: mans iet labs
