In [1]:
import torch
import json
import os
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig

# --- CONFIGURATION ---
# Directory that holds vocab_map.json; LSLTranslator.__init__ reads the
# pruned-ID -> original-ID map from it.
PRUNED_MODEL_PATH = "../mt5-pruned"
# Location handed to from_pretrained() for the tokenizer, the config and the
# model weights.
TRAINED_MODEL_PATH = "../mt5-lsl-model"

# --- 1. HELPER FUNCTIONS (Logic) ---

def expand_number(number_str):
    """Break a number into its non-zero place values, space-separated.

    Examples: '24' -> '20 4', '1990' -> '1000 900 90', '105' -> '100 5'.

    Every non-digit character is dropped before expansion. Input that
    contains no digits at all is handed back unchanged; input made only of
    zeros yields '0'.
    """
    digits = re.sub(r"\D", "", str(number_str))
    if digits == "":
        return number_str

    # The leftmost digit carries the highest power of ten: 10 ** (width - 1).
    width = len(digits)
    terms = [
        str(int(ch) * 10 ** (width - pos - 1))
        for pos, ch in enumerate(digits)
        if ch != '0'
    ]

    # An empty term list means every digit was a zero ("0", "00", ...).
    return " ".join(terms) if terms else "0"

def fingerspell(name_str):
    """Spell a name out character by character: 'Jānis' -> 'j ā n i s'.

    The name is lower-cased and a single space is inserted between
    consecutive characters.
    """
    lowered = name_str.lower()
    return " ".join(lowered)

# --- 2. THE TRANSLATOR CLASS ---

class LSLTranslator:
    """Runs a vocabulary-pruned seq2seq model behind the original tokenizer.

    The tokenizer emits IDs in its own ("big") vocabulary, while the model is
    configured with ``vocab_size = len(vocab_map)`` ("small") IDs.
    ``vocab_map.json`` stores the small -> big mapping; this class converts
    IDs in both directions around ``generate``.
    """

    def __init__(self, pruned_path, trained_path):
        """Load the ID map, the tokenizer and the model.

        Args:
            pruned_path: Directory containing ``vocab_map.json``, a JSON
                object whose keys are small IDs and whose values are big IDs.
            trained_path: Location passed to ``from_pretrained`` for the
                tokenizer, the config and the model weights.

        Raises:
            OSError: If ``vocab_map.json`` cannot be opened.
        """
        print("Loading system...")

        # A. Load the map (small -> big) and derive its inverse (big -> small).
        map_path = os.path.join(pruned_path, "vocab_map.json")
        with open(map_path, "r", encoding="utf-8") as f:
            new2old = json.load(f)
        # JSON object keys are always strings, so cast both sides to int.
        self.new2old_map = {int(k): int(v) for k, v in new2old.items()}  # Small -> Big
        self.old2new_map = {v: k for k, v in self.new2old_map.items()}  # Big -> Small

        # B. Load the tokenizer (still works in big IDs).
        self.tokenizer = AutoTokenizer.from_pretrained(trained_path)

        # C. Load the model with the config's vocab size set to the map size.
        config = AutoConfig.from_pretrained(trained_path)
        config.vocab_size = len(self.new2old_map)

        self.model = AutoModelForSeq2SeqLM.from_pretrained(trained_path, config=config)

        # Move to GPU if available; inference only, so switch to eval mode.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        self.model.eval()

        # D. UNK IDs in both spaces. If the tokenizer's UNK ID is not in the
        # map, small ID 0 is used as the stand-in.
        self.original_unk_id = self.tokenizer.unk_token_id
        self.pruned_unk_id = self.old2new_map.get(self.original_unk_id, 0)

    def predict(self, text):
        """Run raw text through the model, remapping IDs on the way in and out.

        Example:
            Input:  "translate Latvian to Gloss: [NAME] nopirka [NUM] kreklus."
            Output: "[NAME] pirkt [NUM] krekls"

        Args:
            text: The string to tokenize (truncated to 128 tokens).

        Returns:
            The decoded model output with special tokens skipped.
        """
        # 1. Tokenize (big IDs).
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
        input_ids = inputs.input_ids[0].tolist()

        # 2. Remap big -> small; anything outside the map becomes the small UNK.
        pruned_ids = [self.old2new_map.get(tid, self.pruned_unk_id) for tid in input_ids]
        input_tensor = torch.tensor([pruned_ids], device=self.device)

        # Forward the tokenizer's attention mask explicitly. Without it,
        # generate() infers a mask from pad-token occurrences in the input,
        # so a real token whose remapped ID happens to equal the model's pad
        # ID would be silently ignored by the encoder.
        generate_kwargs = {"max_new_tokens": 128}
        attention_mask = getattr(inputs, "attention_mask", None)
        if attention_mask is not None:
            generate_kwargs["attention_mask"] = attention_mask.to(self.device)

        # 3. Generate (the model works in small IDs).
        with torch.no_grad():
            outputs = self.model.generate(input_tensor, **generate_kwargs)

        # 4. Remap small -> big; unknown small IDs become the tokenizer's UNK.
        output_ids = outputs[0].tolist()
        original_ids = [self.new2old_map.get(tid, self.original_unk_id) for tid in output_ids]

        # 5. Decode with the original tokenizer.
        return self.tokenizer.decode(original_ids, skip_special_tokens=True)

# --- 3. THE "WRAPPER" LOGIC ---

def run_full_translation(translator, user_input, known_names=None):
    """Translate one sentence, shielding a number and a name from the model.

    Pipeline:
      A. Replace the first standalone number with "[NUM]" and the first
         known name found with "[NAME]", remembering their expanded forms.
      B. Send the placeholder sentence to ``translator.predict``.
      C. Substitute the remembered forms back into the model output.

    Progress is printed at each stage.

    Args:
        translator: Object exposing ``predict(text) -> str``.
        user_input: The raw sentence.
        known_names: Optional iterable of names to look for, tried in order.
            Defaults to the built-in demo list.

    Returns:
        The model output with "[NAME]" / "[NUM]" replaced by the
        fingerspelled name and the expanded number.
    """
    print(f"\nUser Input: '{user_input}'")

    # --- STEP A: PRE-PROCESSING (Extract & Replace) ---
    processed_input = user_input
    variables = {}

    # 1. Handle numbers. Only the first standalone number is handled; a full
    #    app would loop with unique placeholders such as [NUM1], [NUM2].
    number_match = re.search(r'\b\d+\b', processed_input)
    if number_match:
        num_val = number_match.group()
        # Splice at the match position. A plain str.replace(num_val, ...)
        # would hit the first *substring* occurrence, e.g. the "24" inside
        # "A24", rather than the standalone number that was matched.
        processed_input = (
            processed_input[:number_match.start()]
            + "[NUM]"
            + processed_input[number_match.end():]
        )
        variables["[NUM]"] = expand_number(num_val)  # Store "20 4"

    # 2. Handle names (heuristic for the demo; production code should use
    #    an NER model or a proper name list).
    if known_names is None:
        known_names = ["Jānis", "Anna", "Pēteris", "Oto"]

    # NOTE(review): this is a case-sensitive substring test, so a name also
    # matches inside a longer word. Confirm that is the intended behaviour.
    found_name = next((name for name in known_names if name in processed_input), None)

    if found_name:
        processed_input = processed_input.replace(found_name, "[NAME]", 1)
        variables["[NAME]"] = fingerspell(found_name)  # Store "j ā n i s"

    print(f"Sent to Model: '{processed_input}'")

    # --- STEP B: MODEL INFERENCE ---
    # No task prefix is prepended. If the model relies on one (for example
    # "translate Latvian to Gloss: "), add it to the text passed below.
    gloss_output = translator.predict(processed_input)
    print(f"Model Raw Output: '{gloss_output}'")

    # --- STEP C: POST-PROCESSING (Inject Back) ---
    # Every occurrence of a placeholder in the output is replaced; a
    # placeholder with no stored value is left as-is.
    final_output = gloss_output
    for placeholder in ("[NAME]", "[NUM]"):
        if placeholder in variables:
            final_output = final_output.replace(placeholder, variables[placeholder])

    print(f"Final Result: '{final_output}'")
    return final_output

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- 4. EXECUTION ---
# Demo entry point: builds the translator once and pushes one sample sentence
# through the pre-process -> model -> post-process pipeline.
if __name__ == "__main__":
    # Initialize: reads vocab_map.json from PRUNED_MODEL_PATH and loads the
    # tokenizer, config and model from TRAINED_MODEL_PATH.
    system = LSLTranslator(PRUNED_MODEL_PATH, TRAINED_MODEL_PATH)
    
    # Test Case: one name from the known-name list ("Jānis") and one number ("24").
    text = "Jānis nopirka 24 kreklus."
    run_full_translation(system, text)

Loading system...


The tokenizer you are loading from '../mt5-lsl-model' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.



User Input: 'Jānis nopirka 24 kreklus.'
Sent to Model: '[NAME] nopirka [NUM] kreklus.'
Model Raw Output: '[NAME] krelēt [NAME] krelēt'
Final Result: 'j ā n i s krelēt j ā n i s krelēt'
