In [1]:
import torch
from usta_model import UstaModel
from usta_tokenizer import UstaTokenizer

u_tokenizer = UstaTokenizer("tokenizer.json")

prompt = "the capital of united"

tokens = u_tokenizer.encode(prompt)
tokens

tensor([ 0, 61,  1, 61,  2, 61,  3])

In [2]:
torch.manual_seed(1)
u_model = UstaModel(vocab_size=len(u_tokenizer.vocab), embedding_dim=4, num_heads=2, context_length=32, num_layers=3)

out = u_model(tokens)
out.shape

torch.Size([7, 64])

In [3]:
u_model

UstaModel(
  (embedding): UstaEmbedding(
    (embedding): Embedding(64, 4)
  )
  (layers): Sequential(
    (0): UstaDecoderBlock(
      (self_attention): UstaMultiHeadAttention(
        (multi_head_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=4, out_features=4, bias=True)
        )
        (projection): Linear(in_features=4, out_features=4, bias=True)
      )
      (norm1): UstaLayerNorm()
      (mlp): UstaMLP(
        (gate_proj): Linear(in_features=4, out_features=4, bias=True)
        (up_proj): Linear(in_features=4, out_features=4, bias=True)
        (down_proj): Linear(in_features=4, out_features=4, bias=True)
        (gelu): GELU()
      )
      (norm2): UstaLayerNorm()
    )
    (1): UstaDecoderBlock(
      (self_attention): UstaMultiHeadAttention(
        (multi_head_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=4, out_features=4, bias=True)
        )
        (projection): L

In [4]:
import torch

probs = torch.softmax(out[-1], dim=-1)
max_prob, max_index = torch.max(probs, dim=-1)
max_prob, max_index, probs

(tensor(0.0312, grad_fn=<MaxBackward0>),
 tensor(55),
 tensor([0.0281, 0.0205, 0.0108, 0.0083, 0.0149, 0.0074, 0.0086, 0.0084, 0.0206,
         0.0135, 0.0196, 0.0203, 0.0188, 0.0105, 0.0195, 0.0125, 0.0145, 0.0164,
         0.0175, 0.0076, 0.0211, 0.0132, 0.0168, 0.0211, 0.0089, 0.0240, 0.0198,
         0.0214, 0.0103, 0.0137, 0.0134, 0.0191, 0.0115, 0.0074, 0.0104, 0.0190,
         0.0158, 0.0125, 0.0139, 0.0231, 0.0146, 0.0106, 0.0155, 0.0167, 0.0165,
         0.0098, 0.0075, 0.0132, 0.0129, 0.0264, 0.0228, 0.0072, 0.0232, 0.0115,
         0.0164, 0.0312, 0.0187, 0.0173, 0.0179, 0.0136, 0.0259, 0.0074, 0.0151,
         0.0132], grad_fn=<SoftmaxBackward0>))

In [5]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

q_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
q_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B")

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
q_tokens = q_tokenizer.encode(prompt)
q_tokens

[1782, 6722, 315, 28192]

In [7]:
q_model.generate(torch.tensor([q_tokens]))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


tensor([[ 1782,  6722,   315, 28192,  5302,   315,   374,   642,   374,   198,
           785,  6722,   315,   279,  3639,  4180,   315, 22235,   374,  1112,
            30,  6771,   752,  1744]])

In [8]:
# input = [1782, 6722, 315, 28192]
# output = [38297, 315, 279, 5302]
# expected = [6722, 315, 28192, 5302]

In [9]:
q_out = q_model(torch.tensor([q_tokens]))
q_out

CausalLMOutputWithPast(loss=None, logits=tensor([[[ 4.3589,  4.4961,  3.3736,  ...,  0.9272,  0.9272,  0.9272],
         [ 6.2842,  9.0974,  2.1320,  ..., -2.9427, -2.9427, -2.9427],
         [ 4.8185,  6.7840,  1.7485,  ..., -2.8132, -2.8133, -2.8133],
         [ 5.2263,  7.3146,  3.2717,  ..., -3.0105, -3.0105, -3.0105]]],
       grad_fn=<UnsafeViewBackward0>), past_key_values=DynamicCache(layers=[<transformers.cache_utils.DynamicLayer object at 0x137e83290>, <transformers.cache_utils.DynamicLayer object at 0x137e82e10>, <transformers.cache_utils.DynamicLayer object at 0x137e820f0>, <transformers.cache_utils.DynamicLayer object at 0x137e83f50>, <transformers.cache_utils.DynamicLayer object at 0x1449a2f30>, <transformers.cache_utils.DynamicLayer object at 0x137ec8c50>, <transformers.cache_utils.DynamicLayer object at 0x132c4dd90>, <transformers.cache_utils.DynamicLayer object at 0x132c4df70>, <transformers.cache_utils.DynamicLayer object at 0x132c4e690>, <transformers.cache_utils.Dyna

In [10]:
q_out.logits.shape

torch.Size([1, 4, 151936])

In [11]:
q_out.logits[0, 0, :].shape

torch.Size([151936])

In [12]:
probs = torch.softmax(q_out.logits[0, 2, :], dim=-1)
max_prob, max_index = torch.max(probs, dim=-1)
max_prob, max_index, probs

(tensor(0.3644, grad_fn=<MaxBackward0>),
 tensor(279),
 tensor([7.1116e-06, 5.0768e-05, 3.3015e-07,  ..., 3.4478e-09, 3.4478e-09,
         3.4478e-09], grad_fn=<SoftmaxBackward0>))

In [13]:
q_tokenizer.decode([max_index])

' the'

In [14]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

g_tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
g_model = AutoModelForCausalLM.from_pretrained("google/gemma-3-1b-it")

In [15]:
g_tokens = g_tokenizer.encode("the capital of united")
# input = [2, 1437, 5279, 529, 26974]
# output = [107, 2148, 138, 236743, 1786]
# expected = [1437, 5279, 529, 26974, 5022]
g_tokens

[2, 1437, 5279, 529, 26974]

In [16]:
g_tokenizer.encode(" states")

[2, 5022]

In [17]:
g_tokenizer.decode([156702])

'క్ష్'

In [18]:
g_model.generate(torch.tensor([g_tokens]), max_new_tokens=1)

tensor([[    2,  1437,  5279,   529, 26974,  5022]])

In [19]:
g_out = g_model(torch.tensor([g_tokens]))
g_out.logits.shape

torch.Size([1, 5, 262144])

In [20]:
probs = torch.softmax(g_out.logits[0, 4, :], dim=-1)
max_prob, max_index = torch.max(probs, dim=-1)
max_prob, max_index, probs

(tensor(0.6543, grad_fn=<MaxBackward0>),
 tensor(5022),
 tensor([1.0801e-16, 1.2916e-08, 5.7721e-10,  ..., 8.2236e-18, 8.6557e-18,
         9.9664e-18], grad_fn=<SoftmaxBackward0>))