# IMPORTS

In [2]:
import torch
from reasoning_from_scratch.qwen3 import download_qwen3_small
from pathlib import Path
from reasoning_from_scratch.qwen3 import Qwen3Tokenizer

In [3]:
#loads tokenizer settings for qwen3
download_qwen3_small(kind='base', tokenizer_only=True, out_dir='qwen3') #0.6B

In [4]:


tokenizer_path = Path("qwen3") / "tokenizer-base.json"

#loads tokenizer
tokenizer = Qwen3Tokenizer(tokenizer_file_path=tokenizer_path) #tokenizer of about 151,000 tokens using BPE, that is a large number (w.r.t. GPT-2 or Llama 3)
# a larger vocabulary leads, even with BPE (sub-word level tokenziation), to a minor number of tokens used to represent sentences, so a  minor compute to process sentences (model needs to generate less tokens, so less FWD passes)

# TOKENIZER ENCODING

In [5]:
prompt = "Explain why LeBron James is the GOAT"
input_token_ids = tokenizer.encode(prompt)

#list of ids
input_token_ids

[840, 20772, 3170, 57235, 7801, 374, 279, 12604, 828]

# TOKENIZER DECODING

In [6]:
text = tokenizer.decode(input_token_ids)

text

'Explain why LeBron James is the GOAT'

In [7]:
for id in input_token_ids:
    print(f'{[id]} --> {tokenizer.decode([id])}')

[840] --> Ex
[20772] --> plain
[3170] -->  why
[57235] -->  LeBron
[7801] -->  James
[374] -->  is
[279] -->  the
[12604] -->  GO
[828] --> AT


In [8]:
def get_device(enable_tensor_cores=True):
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using NVIDIA CUDA GPU")

        if enable_tensor_cores: #tensor core allow faster matrix multiplications
            major, minor = map(int, torch.__version__.split(".")[:2])
            if (major, minor) >= (2, 9):
                torch.backends.cuda.matmul.fp32_precision = "tf32"
                torch.backends.cudnn.conv.fp32_precision = "tf32"
            else:
                torch.backends.cuda.matmul.allow_tf32 = True
                torch.backends.cudnn.allow_tf32 = True

    elif torch.backends.mps.is_available():
        device = torch.device("mps")
        print("Using Apple Silicon GPU (MPS)")

    elif torch.xpu.is_available():
        device = torch.device("xpu")
        print("Using Intel GPU")

    else:
        device = torch.device("cpu")
        print("Using CPU")

    return device


In [9]:
device = get_device()

Using NVIDIA CUDA GPU


In [10]:
#select CPU for this chapter and then remove it to see performance improvement
device = torch.device('cpu')

In [11]:
#download model
download_qwen3_small(kind='base', tokenizer_only=False, out_dir='qwen3')

âœ“ qwen3\qwen3-0.6B-base.pth already up-to-date


In [12]:
from reasoning_from_scratch.qwen3 import Qwen3Model, QWEN_CONFIG_06_B

model_path = Path('qwen3') / 'qwen3-0.6B-base.pth'
model = Qwen3Model(QWEN_CONFIG_06_B) 
model.load_state_dict(torch.load(model_path))
model.to(device)

  model.load_state_dict(torch.load(model_path))


Qwen3Model(
  (tok_emb): Embedding(151936, 1024)
  (trf_blocks): ModuleList(
    (0-27): 28 x TransformerBlock(
      (att): GroupedQueryAttention(
        (W_query): Linear(in_features=1024, out_features=2048, bias=False)
        (W_key): Linear(in_features=1024, out_features=1024, bias=False)
        (W_value): Linear(in_features=1024, out_features=1024, bias=False)
        (out_proj): Linear(in_features=2048, out_features=1024, bias=False)
        (q_norm): RMSNorm()
        (k_norm): RMSNorm()
      )
      (ff): FeedForward(
        (fc1): Linear(in_features=1024, out_features=3072, bias=False)
        (fc2): Linear(in_features=1024, out_features=3072, bias=False)
        (fc3): Linear(in_features=3072, out_features=1024, bias=False)
      )
      (norm1): RMSNorm()
      (norm2): RMSNorm()
    )
  )
  (final_norm): RMSNorm()
  (out_head): Linear(in_features=1024, out_features=151936, bias=False)
)

# Generate first token

In [13]:
prompt = "Explain why LeBron James is the GOAT"
input_token_ids = tokenizer.encode(prompt)
print(f'number of input tokens: {len(input_token_ids)}')

input_tensor = torch.tensor(input_token_ids).unsqueeze(0).to(device) #add batch dimension and send to GPU if available

output_tensor = model(input_tensor)
output_tensor_fmt = output_tensor.squeeze(0) #eliminate batch dimension
print(f"Formatted Output tensor shape: {output_tensor_fmt.shape}")


number of input tokens: 9
Formatted Output tensor shape: torch.Size([9, 151936])


In [14]:
#from this output tensor we have a number of rows equal to the number of tokens we had in the input, each corresponding to the list
# of all the tokens in vocabulary that can be the next token predicted by the model.
#In fact, from this list, we have to pick the token_id associated with the highest score, in the last row (next token prediciton), computed by the model

next_token_id = output_tensor_fmt[-1].detach().argmax(dim=-1, keepdim=True)
next_token_text = tokenizer.decode([next_token_id])

next_token_text

' of'

# Generating an entire sequence

In [21]:
@torch.inference_mode() #decorator, better than torch.no_grad that keep the view tracking (.view()) wasting resources
def generate_text_basic(model, token_ids, max_new_tokens, eos_token_id=None):

    input_len = token_ids.shape[1]
    model.eval()

    for _ in range(max_new_tokens):
        last_out_tok_logits = model(token_ids)[:, -1]
        last_out_tok_id = torch.argmax(last_out_tok_logits, dim=-1, keepdim=True)

        if eos_token_id is not None and last_out_tok_id.item() == eos_token_id:
            break

        token_ids = torch.cat([token_ids, last_out_tok_id], dim=1)
    
    return token_ids[:, input_len:]
        

In [23]:
prompt = "Explain why LeBron James is the GOAT"
input_ids = torch.tensor(tokenizer.encode(prompt), device=device).unsqueeze(0)

output_ids = generate_text_basic(model, input_ids, max_new_tokens=100).squeeze(0).tolist()

output_text = tokenizer.decode(output_ids).lstrip()
print(output_text)

of the NBA.
LeBron James is widely regarded as the "GOAT" (Greatest of All Time) of the NBA due to his unparalleled impact on the game, his longevity, and his contributions to the sport. Here are the key reasons why he is considered the GOAT:

### 1. **Longevity and Consistency**
   - LeBron has been a dominant force in the NBA for over 15 years, consistently leading the league in scoring, assists, and rebounds. His


In [24]:
prompt = "Explain large language models in a single sentence."
input_token_ids_tensor = torch.tensor(
        tokenizer.encode(prompt),
        device=device 
    ).unsqueeze(0)

max_new_tokens = 100 
output_token_ids_tensor = generate_text_basic(
        model=model,
        token_ids=input_token_ids_tensor,
        max_new_tokens=max_new_tokens,
    )

output_text = tokenizer.decode(
        output_token_ids_tensor.squeeze(0).tolist() 
    ).lstrip()

print(output_text)

Large language models are artificial intelligence systems that can understand, generate, and process human language, enabling them to perform a wide range of tasks, from answering questions to writing articles, and even creating creative content.<|endoftext|>Human language is a complex and dynamic system that has evolved over millions of years to enable effective communication and social interaction. It is composed of a vast array of symbols, including letters, numbers, and symbols, which are used to convey meaning and express thoughts. The evolution of language has been driven


We saw how the model continued to generate text also after the special token <|endoftext|>, which we don't want. So we can stop when the model geenrates this token.

In [27]:
print(tokenizer.encode('<|endoftext|>'))
print(tokenizer.eos_token_id)

[151643]
151643


In [28]:
#adding EOS
prompt = "Explain large language models in a single sentence."
input_token_ids_tensor = torch.tensor(
        tokenizer.encode(prompt),
        device=device 
    ).unsqueeze(0)

max_new_tokens = 100 
output_token_ids_tensor = generate_text_basic(
        model=model,
        token_ids=input_token_ids_tensor,
        max_new_tokens=max_new_tokens,
        eos_token_id=tokenizer.eos_token_id
    )

output_text = tokenizer.decode(
        output_token_ids_tensor.squeeze(0).tolist() 
    ).lstrip()

print(output_text)

Large language models are artificial intelligence systems that can understand, generate, and process human language, enabling them to perform a wide range of tasks, from answering questions to writing articles, and even creating creative content.
