In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
from datasets import load_dataset

dataset = load_dataset("lang-uk/malyuk", split="train", streaming=True)
# dataset = dataset[:300000}

text = []
for i, row in enumerate(dataset['text']):
    if i > 100000:
        break
    text.append(row)

# text

README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/237 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/237 [00:00<?, ?it/s]

In [3]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Training base vocab

In [4]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from transformers import PreTrainedTokenizerFast

# text = court[0].astype(str).tolist()
tok = Tokenizer(BPE(unk_token="<unk>"))
tok.pre_tokenizer = Whitespace()

trainer = BpeTrainer(
    vocab_size=40000,
    min_frequency=2,
    special_tokens=["<s>", "</s>", "<pad>", "<unk>", "<mask>"]
)

tok.train_from_iterator(text, trainer=trainer)

tokA = PreTrainedTokenizerFast(
    tokenizer_object=tok,
    bos_token="<s>", eos_token="</s>",
    unk_token="<unk>", sep_token="</s>",
    cls_token="<s>", pad_token="<pad>", mask_token="<mask>",
)

tokA.save_pretrained("tokA")






('tokA/tokenizer_config.json',
 'tokA/special_tokens_map.json',
 'tokA/tokenizer.json')

In [5]:
from transformers import AutoTokenizer

tokA = AutoTokenizer.from_pretrained("/kaggle/working/tokA", use_fast=True)
vocab = tokA.get_vocab()
tokens = [t for t, _id in sorted(vocab.items(), key=lambda x: x[1])]
tokens[:10]

['<s>', '</s>', '<pad>', '<unk>', '<mask>', '!', '"', '#', '$', '%']

Due to problen with "from zett.utils import get_surface_form_matrix" we tokk function "get_surface_form_matrix" from Zett repo in GitHub.

In [6]:
CHARS_TO_BYTES = {
    "Ā": 0,
    "ā": 1,
    "Ă": 2,
    "ă": 3,
    "Ą": 4,
    "ą": 5,
    "Ć": 6,
    "ć": 7,
    "Ĉ": 8,
    "ĉ": 9,
    "Ċ": 10,
    "ċ": 11,
    "Č": 12,
    "č": 13,
    "Ď": 14,
    "ď": 15,
    "Đ": 16,
    "đ": 17,
    "Ē": 18,
    "ē": 19,
    "Ĕ": 20,
    "ĕ": 21,
    "Ė": 22,
    "ė": 23,
    "Ę": 24,
    "ę": 25,
    "Ě": 26,
    "ě": 27,
    "Ĝ": 28,
    "ĝ": 29,
    "Ğ": 30,
    "ğ": 31,
    "Ġ": 32,
    "!": 33,
    '"': 34,
    "#": 35,
    "$": 36,
    "%": 37,
    "&": 38,
    "'": 39,
    "(": 40,
    ")": 41,
    "*": 42,
    "+": 43,
    ",": 44,
    "-": 45,
    ".": 46,
    "/": 47,
    "0": 48,
    "1": 49,
    "2": 50,
    "3": 51,
    "4": 52,
    "5": 53,
    "6": 54,
    "7": 55,
    "8": 56,
    "9": 57,
    ":": 58,
    ";": 59,
    "<": 60,
    "=": 61,
    ">": 62,
    "?": 63,
    "@": 64,
    "A": 65,
    "B": 66,
    "C": 67,
    "D": 68,
    "E": 69,
    "F": 70,
    "G": 71,
    "H": 72,
    "I": 73,
    "J": 74,
    "K": 75,
    "L": 76,
    "M": 77,
    "N": 78,
    "O": 79,
    "P": 80,
    "Q": 81,
    "R": 82,
    "S": 83,
    "T": 84,
    "U": 85,
    "V": 86,
    "W": 87,
    "X": 88,
    "Y": 89,
    "Z": 90,
    "[": 91,
    "\\": 92,
    "]": 93,
    "^": 94,
    "_": 95,
    "`": 96,
    "a": 97,
    "b": 98,
    "c": 99,
    "d": 100,
    "e": 101,
    "f": 102,
    "g": 103,
    "h": 104,
    "i": 105,
    "j": 106,
    "k": 107,
    "l": 108,
    "m": 109,
    "n": 110,
    "o": 111,
    "p": 112,
    "q": 113,
    "r": 114,
    "s": 115,
    "t": 116,
    "u": 117,
    "v": 118,
    "w": 119,
    "x": 120,
    "y": 121,
    "z": 122,
    "{": 123,
    "|": 124,
    "}": 125,
    "~": 126,
    "ġ": 127,
    "Ģ": 128,
    "ģ": 129,
    "Ĥ": 130,
    "ĥ": 131,
    "Ħ": 132,
    "ħ": 133,
    "Ĩ": 134,
    "ĩ": 135,
    "Ī": 136,
    "ī": 137,
    "Ĭ": 138,
    "ĭ": 139,
    "Į": 140,
    "į": 141,
    "İ": 142,
    "ı": 143,
    "Ĳ": 144,
    "ĳ": 145,
    "Ĵ": 146,
    "ĵ": 147,
    "Ķ": 148,
    "ķ": 149,
    "ĸ": 150,
    "Ĺ": 151,
    "ĺ": 152,
    "Ļ": 153,
    "ļ": 154,
    "Ľ": 155,
    "ľ": 156,
    "Ŀ": 157,
    "ŀ": 158,
    "Ł": 159,
    "ł": 160,
    "¡": 161,
    "¢": 162,
    "£": 163,
    "¤": 164,
    "¥": 165,
    "¦": 166,
    "§": 167,
    "¨": 168,
    "©": 169,
    "ª": 170,
    "«": 171,
    "¬": 172,
    "Ń": 173,
    "®": 174,
    "¯": 175,
    "°": 176,
    "±": 177,
    "²": 178,
    "³": 179,
    "´": 180,
    "µ": 181,
    "¶": 182,
    "·": 183,
    "¸": 184,
    "¹": 185,
    "º": 186,
    "»": 187,
    "¼": 188,
    "½": 189,
    "¾": 190,
    "¿": 191,
    "À": 192,
    "Á": 193,
    "Â": 194,
    "Ã": 195,
    "Ä": 196,
    "Å": 197,
    "Æ": 198,
    "Ç": 199,
    "È": 200,
    "É": 201,
    "Ê": 202,
    "Ë": 203,
    "Ì": 204,
    "Í": 205,
    "Î": 206,
    "Ï": 207,
    "Ð": 208,
    "Ñ": 209,
    "Ò": 210,
    "Ó": 211,
    "Ô": 212,
    "Õ": 213,
    "Ö": 214,
    "×": 215,
    "Ø": 216,
    "Ù": 217,
    "Ú": 218,
    "Û": 219,
    "Ü": 220,
    "Ý": 221,
    "Þ": 222,
    "ß": 223,
    "à": 224,
    "á": 225,
    "â": 226,
    "ã": 227,
    "ä": 228,
    "å": 229,
    "æ": 230,
    "ç": 231,
    "è": 232,
    "é": 233,
    "ê": 234,
    "ë": 235,
    "ì": 236,
    "í": 237,
    "î": 238,
    "ï": 239,
    "ð": 240,
    "ñ": 241,
    "ò": 242,
    "ó": 243,
    "ô": 244,
    "õ": 245,
    "ö": 246,
    "÷": 247,
    "ø": 248,
    "ù": 249,
    "ú": 250,
    "û": 251,
    "ü": 252,
    "ý": 253,
    "þ": 254,
    "ÿ": 255,
}
BYTES_TO_CHARS = {v: k for k, v in CHARS_TO_BYTES.items()}

In [7]:
MAX_CHARS_PER_TOKEN = 16

def unicode_to_zett_bytechars(token: str, maxlen: int = MAX_CHARS_PER_TOKEN) -> str:
    b = token.encode("utf-8", errors="replace")[:maxlen]
    return "".join(BYTES_TO_CHARS[byte] for byte in b)

In [8]:
from tqdm.auto import tqdm
from transformers import ByT5Tokenizer

def get_surface_form_matrix(
    tokenizer_or_tokens, maxlen, tokenizer_to_use=None, padding=0, verbose=False
):
    # tokens are expected to be byte encoded
    if isinstance(tokenizer_or_tokens, list):
        tokens = tokenizer_or_tokens
    else:
        tokenizer = tokenizer_or_tokens
        tokens = tokenizer.convert_ids_to_tokens(range(len(tokenizer)))

    vocab_size = len(tokens)
    surface_form_matrix = np.full(
        (vocab_size + padding, maxlen),
        tokenizer_to_use.pad_token_id if tokenizer_to_use is not None else 0,
        dtype=np.int32,
    )

    n_truncated = 0

    for i, token in tqdm(enumerate(tokens), total=vocab_size, disable=not verbose):
        if token in tokenizer_to_use.all_special_tokens:
            surface_form_matrix[i, 0] = tokenizer_to_use.convert_tokens_to_ids(token)
            continue

        token = unicode_to_zett_bytechars(token, maxlen=maxlen)
        token_bytes = bytes([CHARS_TO_BYTES[c] for c in token])

        if isinstance(tokenizer_to_use, ByT5Tokenizer):
            ids = tokenizer_to_use.convert_tokens_to_ids([chr(i) for i in token_bytes])
        else:
            # assume hn tokenizer uses byte pretokenization
            ids = [x.id for x in tokenizer_to_use._tokenizer.model.tokenize(token)]

        if len(ids) > maxlen:
            ids = ids[:maxlen]
            n_truncated += 1

        surface_form_matrix[i, : len(ids)] = ids

    return surface_form_matrix, n_truncated

In [9]:
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoModelForMaskedLM
# from zett.utils import get_surface_form_matrix

base_model = AutoModelForMaskedLM.from_pretrained("FacebookAI/xlm-roberta-base").to(device).eval()
hypernet = AutoModel.from_pretrained("benjamin/zett-hypernetwork-xlm-roberta-base", trust_remote_code=True).to(device).eval()

hn_tokenizer = AutoTokenizer.from_pretrained("benjamin/zett-hypernetwork-xlm-roberta-base")

source_embeddings = base_model.get_input_embeddings().weight.detach()

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

2026-02-02 15:00:56.797196: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770044457.016295      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770044457.074298      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770044457.587679      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770044457.587741      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770044457.587765      55 computation_placer.cc:177] computation placer alr

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at FacebookAI/xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


config.json: 0.00B [00:00, ?B/s]

configuration_hypernet.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/benjamin/zett-hypernetwork-xlm-roberta-base:
- configuration_hypernet.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hypernet.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/benjamin/zett-hypernetwork-xlm-roberta-base:
- modeling_hypernet.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/82.5M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/18.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [10]:
pred_in_chunks = []

for i in range(0, len(tokens), 512):
    batch_tokens = tokens[i:i+512]

    target_surface_forms, _ = get_surface_form_matrix(
        batch_tokens, # byte representation of the tokens to predict
        maxlen=hypernet.config.hn_surface_maxlen,
        tokenizer_to_use=hn_tokenizer,
    )

    lang_index = torch.tensor([17], device=device, dtype=torch.long)
    with torch.no_grad():
        # the last output is the predicted bias in case the model uses a bias (e.g. XLM-R)
        predicted_input_embeddings, predicted_output_embeddings, _ = hypernet(
            torch.from_numpy(target_surface_forms).to(device),
            source_embeddings=source_embeddings,
            lang_index=lang_index
        )

    pred_in_chunks.append(predicted_input_embeddings.detach().cpu())

predicted_input_embeddings = torch.cat(pred_in_chunks, dim=0) # shap: [40000, 768]

# Embeddings

In [None]:
from transformers import AutoModelForSequenceClassification

zett_clf = AutoModelForSequenceClassification.from_pretrained(
    "FacebookAI/xlm-roberta-base",
    num_labels=7,
    problem_type="multi_label_classification"
).to(device)

new_emb = torch.nn.Embedding(
    predicted_input_embeddings.size(0),
    predicted_input_embeddings.size(1),
    padding_idx=tokA.pad_token_id
).to(device)

new_emb.weight.data[:] = predicted_input_embeddings.to(device)

zett_clf.roberta.embeddings.word_embeddings = new_emb

zett_clf.config.vocab_size = tokA.vocab_size
zett_clf.config.pad_token_id = tokA.pad_token_id
zett_clf.config.bos_token_id = tokA.bos_token_id
zett_clf.config.eos_token_id = tokA.eos_token_id
zett_clf.config.unk_token_id = tokA.unk_token_id
zett_clf.config.mask_token_id = tokA.mask_token_id

# After Zero-Shot Tokenizer Transfer

In [None]:
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

train_after = train_ds.map(lambda batch: tokA(batch["text"], truncation=True), batched=True)
val_after = val_ds.map(lambda batch: tokA(batch["text"], truncation=True), batched=True)

collator_after = DataCollatorWithPadding(tokenizer=tokA)

In [None]:
args_after = TrainingArguments(
    output_dir="/kaggle/working/emotions",
    eval_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5, # 0.0001
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=20,
    weight_decay=0.005,
    report_to="none",
    seed=32,
    fp16=torch.cuda.is_available(),
)

trainer_after = Trainer(
    model=zett_clf,
    args=args_after,
    train_dataset=train_after,
    eval_dataset=val_after,
    tokenizer=tokA,
    data_collator=collator_after,
    compute_metrics=compute_metrics,
)

In [None]:
print("After Zett")
trainer_after.train()
after_eval = trainer_after.evaluate()
after_eval

---