In [None]:
# Download mt5-small from Hugging Face

# from huggingface_hub import snapshot_download

# snapshot_download(
#     repo_id="google/mt5-small",
#     local_dir="E:/Documents/GitHub/LSL/mt5-small",
#     local_dir_use_symlinks=False
# )

  from .autonotebook import tqdm as notebook_tqdm
Fetching 20 files: 100%|██████████| 20/20 [07:12<00:00, 21.61s/it]


'E:\\Documents\\GitHub\\LSL\\mt5-small'

In [1]:
import torch
import json
import glob
import os
import shutil
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig

# --- PATH CONFIGURATION (WINDOWS) ---
BASE_DIR = r"E:\DOCUMENTS\GITHUB\LSL"
ORIGINAL_MODEL_PATH = os.path.join(BASE_DIR, "MT5-SMALL")
OUTPUT_DIR = os.path.join(BASE_DIR, "MT5-PRUNED")

# Extra text to ensure common words are kept (Optional - currently empty)
EXTRA_LATVIAN_TEXT = """
"""

def prune_and_save():
    print(f"Loading model from {ORIGINAL_MODEL_PATH}...")
    # Load from your local folder
    tokenizer = AutoTokenizer.from_pretrained(ORIGINAL_MODEL_PATH)
    model = AutoModelForSeq2SeqLM.from_pretrained(ORIGINAL_MODEL_PATH)

    print("Scanning text data to find used tokens...")
    used_tokens = set()
    
    # 1. Keep special T5 tokens (padding, end-of-sentence, etc.)
    # T5 uses the first ~100 IDs for special sentinels
    for i in range(256): 
        used_tokens.add(i)
        
    # 2. Scan your .txt files
    # This looks for ALL .txt files in your LSL folder
    text_files = glob.glob(os.path.join(BASE_DIR, "*.txt"))
    
    content = []
    for file_path in text_files:
        print(f"   Reading: {os.path.basename(file_path)}")
        with open(file_path, "r", encoding="utf-8") as f:
            content.extend(f.readlines())
    
    # 3. Add extra text
    content.append(EXTRA_LATVIAN_TEXT)
    
    # 4. Tokenize
    print(f"   Processing {len(content)} lines of text...")
    with tokenizer.as_target_tokenizer():
        for line in content:
            if not line.strip(): continue
            ids = tokenizer(line.strip())['input_ids']
            used_tokens.update(ids)

    print(f"   Found {len(used_tokens)} unique tokens needed (out of {model.config.vocab_size}).")
    
    # --- PRUNING ---
    print("Creating new embeddings...")
    new_vocab_size = len(used_tokens)
    
    # Map Old ID -> New ID
    sorted_used_ids = sorted(list(used_tokens))
    old2new = {old_id: new_id for new_id, old_id in enumerate(sorted_used_ids)}
    new2old = {new_id: old_id for new_id, old_id in enumerate(sorted_used_ids)}
    
    # Resize layers
    new_embeddings = torch.nn.Embedding(new_vocab_size, model.config.d_model)
    new_lm_head = torch.nn.Linear(model.config.d_model, new_vocab_size, bias=False)
    
    print("Copying weights...")
    with torch.no_grad():
        for old_id, new_id in old2new.items():
            new_embeddings.weight[new_id] = model.shared.weight[old_id]
            new_lm_head.weight[new_id] = model.lm_head.weight[old_id]
            
    # Apply changes
    model.shared = new_embeddings
    model.encoder.embed_tokens = new_embeddings
    model.decoder.embed_tokens = new_embeddings
    model.lm_head = new_lm_head
    model.config.vocab_size = new_vocab_size
    
    # --- SAVING ---
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    print(f"Saving new model to {OUTPUT_DIR}...")
    
    # 1. Save the PyTorch Model (The small file)
    torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, "pytorch_model.bin"))
    
    # 2. Save the Config
    model.config.save_pretrained(OUTPUT_DIR)
    
    # 3. Save the Vocabulary Map (Crucial for decoding later)
    with open(os.path.join(OUTPUT_DIR, "vocab_map.json"), "w") as f:
        json.dump(new2old, f)

    # 4. Copy Tokenizer Files (CRITICAL STEP)
    # We need these to process text before mapping IDs
    files_to_copy = ["spiece.model", "tokenizer_config.json", "special_tokens_map.json", "tokenizer.json"]
    for filename in files_to_copy:
        src = os.path.join(ORIGINAL_MODEL_PATH, filename)
        if os.path.exists(src):
            shutil.copy(src, os.path.join(OUTPUT_DIR, filename))
            print(f"   Copied {filename}")

    print("\n--- DONE! ---")
    print(f"You can now delete the folder: {ORIGINAL_MODEL_PATH}")

if __name__ == "__main__":
    prune_and_save()

  from .autonotebook import tqdm as notebook_tqdm


Loading model from E:\DOCUMENTS\GITHUB\LSL\MT5-SMALL...


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
The tokenizer you are loading from 'E:\DOCUMENTS\GITHUB\LSL\MT5-SMALL' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


Scanning text data to find used tokens...
   Reading: latvian_sentences_1.txt
   Reading: latvian_sentences_2.txt
   Reading: lsl_glosses_1.txt
   Reading: lsl_glosses_2.txt
   Processing 549 lines of text...
   Found 608 unique tokens needed (out of 250112).
Creating new embeddings...
Copying weights...
Saving new model to E:\DOCUMENTS\GITHUB\LSL\MT5-PRUNED...




   Copied spiece.model
   Copied tokenizer_config.json
   Copied special_tokens_map.json

--- DONE! ---
You can now delete the folder: E:\DOCUMENTS\GITHUB\LSL\MT5-SMALL
