In [14]:
import os
import json
import torch
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM

# 1. Setup
# NOTE(review): TRAINED_MODEL_PATH is an absolute, machine-specific path; it
# has to be adjusted before this notebook can run on another machine.
TRAINED_MODEL_PATH = "E:/Documents/GitHub/LSL/mt5-lsl-model"
PRUNED_MODEL_PATH = "../mt5-pruned"
LV_PATTERN = "../txt/latvian_sentences_*.txt"

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on: {device}")

# Tokenizer and model weights are both read from TRAINED_MODEL_PATH.
tokenizer = T5Tokenizer.from_pretrained(TRAINED_MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(TRAINED_MODEL_PATH)
model.to(device)
model.eval()  # inference only: switch off training-time behaviour
print("Model loaded.")

# 2. Load the Vocab Map (The "Rosetta Stone")
# The JSON object maps pruned ("small") token ids to original ("big") token
# ids. JSON object keys are always strings, so both sides are cast to int.
vocab_map_path = os.path.join(PRUNED_MODEL_PATH, "vocab_map.json")

# Explicit encoding so the read does not depend on the platform default.
with open(vocab_map_path, "r", encoding="utf-8") as f:
    raw_vocab_map = json.load(f)

old2new_map = {int(v): int(k) for k, v in raw_vocab_map.items()}  # Big ID -> Small ID
new2old_map = {int(k): int(v) for k, v in raw_vocab_map.items()}  # Small ID -> Big ID

# 3. Identify the UNK token ID in the new mapping
original_unk_id = tokenizer.unk_token_id
if original_unk_id in old2new_map:
    pruned_unk_id = old2new_map[original_unk_id]
else:
    # Same fallback value as before (0), but no longer silent: tokens missing
    # from the pruned vocabulary will be remapped to id 0.
    pruned_unk_id = 0
    print(
        f"Warning: UNK token id {original_unk_id} is not in the vocab map; "
        "falling back to pruned id 0."
    )

print("Maps loaded.")

def predict_gloss(text: str) -> str:
    """Run the model on one input string and return the decoded output text.

    The tokenizer produces ids in the original ("big") vocabulary. They are
    remapped to the pruned ("small") vocabulary before generation, and the
    generated ids are remapped back before decoding.

    Relies on the notebook-level ``tokenizer``, ``model``, ``device``,
    ``old2new_map``, ``new2old_map`` and ``pruned_unk_id``.

    Args:
        text: Input sentence. It is truncated to at most 128 tokens.

    Returns:
        The decoded generation with special tokens stripped.
    """
    # Looked up once instead of once per generated token.
    unk_id = tokenizer.unk_token_id

    # 1. Tokenize input using the standard tokenizer
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    input_ids = inputs.input_ids[0].tolist()

    # 2. Remap Big IDs -> Small Pruned IDs (ids absent from the map become UNK)
    pruned_input_ids = [old2new_map.get(tid, pruned_unk_id) for tid in input_ids]
    input_tensor = torch.tensor([pruned_input_ids]).to(device)

    # A single, unpadded sequence: every position is a real token. Passing the
    # mask explicitly keeps generate() from inferring one from the remapped
    # ids, where a fallback UNK id could coincide with the pad id.
    attention_mask = torch.ones_like(input_tensor)

    # 3. Generate (Inference)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_tensor,
            attention_mask=attention_mask,
            max_length=32,
            num_beams=3,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    # 4. Remap Output Small IDs -> Big IDs
    output_ids = outputs[0].tolist()
    original_output_ids = [new2old_map.get(tid, unk_id) for tid in output_ids]

    # 5. Decode back to text
    return tokenizer.decode(original_output_ids, skip_special_tokens=True)

Running on: cpu
Model loaded.
Maps loaded.


In [15]:
# Smoke test: push a handful of sample sentences through predict_gloss and
# print each input next to the text the model produced for it.
print("--- TESTING TRANSLATION ---")

# Samples include punctuation variants, the literal "[NAME]" placeholder and
# a bare pair of names.
test_sentences = [
    "Vai tu esi labs cilvēks?",
    "Man ir ļoti lielas mājas.",
    "Sveika!",
    "Labdien, [NAME]!",
    "[NAME] ir īss...",
    "Braucam pie zaļās gaismas.",
    "Viņš nopirka divus kreklus",
    "Jānis Edgars",
]

# map() is lazy, so each sentence is still translated immediately before its
# own pair of lines is printed.
for text, gloss in zip(test_sentences, map(predict_gloss, test_sentences)):
    print(f"\nInput:  {text}")
    print(f"Result: {gloss}")

--- TESTING TRANSLATION ---

Input:  Vai tu esi labs cilvēks?
Result: vai tu esi labs cilvēks

Input:  Man ir ļoti lielas mājas.
Result: mans ir liels māja

Input:  Sveika!
Result: vai Sveika

Input:  Labdien, [NAME]!
Result: [NAME]

Input:  [NAME] ir īss...
Result: [NAME] ir īss

Input:  Braucam pie zaļās gaismas.
Result: vai zaļās gaismas

Input:  Viņš nopirka divus kreklus
Result: vai nopirkt krekls krevs

Input:  Jānis Edgars
Result: mans
