# IMPORTS

In [6]:
import torch
from reasoning_from_scratch.qwen3 import download_qwen3_small
from pathlib import Path
from reasoning_from_scratch.qwen3 import Qwen3Tokenizer

In [None]:
# Fetch only the tokenizer files for the Qwen3 0.6B base model into the "qwen3" directory.
download_qwen3_small(
    out_dir="qwen3",
    kind="base",
    tokenizer_only=True,
)

In [None]:


# Tokenizer definition file inside the local "qwen3" directory.
tokenizer_path = Path("qwen3", "tokenizer-base.json")

# Load the tokenizer. Its BPE vocabulary has roughly 151,000 tokens, which is
# large compared with GPT-2 or Llama 3. Even with sub-word tokenization, a
# larger vocabulary represents a sentence with fewer tokens, so the model has
# fewer tokens to process and generate (fewer forward passes, less compute).
tokenizer = Qwen3Tokenizer(tokenizer_file_path=tokenizer_path)

# TOKENIZER ENCODING

In [3]:
# Example prompt used for the tokenizer round-trip demonstrations.
prompt = "Explain why LeBron James is the GOAT"
# Encode the prompt into token ids with the tokenizer.
input_token_ids = tokenizer.encode(prompt)

# Bare expression on the last line so the notebook displays the token ids.
input_token_ids

[840, 20772, 3170, 57235, 7801, 374, 279, 12604, 828]

# TOKENIZER DECODING

In [4]:
# Decode the token ids back to a string to check the encode/decode round trip.
text = tokenizer.decode(input_token_ids)

# Bare expression on the last line so the notebook displays the decoded text.
text

'Explain why LeBron James is the GOAT'

In [5]:
# Decode each token id on its own to show which piece of text it stands for.
# The loop variable is deliberately not called `id`: at notebook top level that
# would rebind the builtin id() for every later cell in the kernel.
for token_id in input_token_ids:
    print(f'{[token_id]} --> {tokenizer.decode([token_id])}')

[840] --> Ex
[20772] --> plain
[3170] -->  why
[57235] -->  LeBron
[7801] -->  James
[374] -->  is
[279] -->  the
[12604] -->  GO
[828] --> AT


In [None]:
def get_device(enable_tensor_cores=True):
    """Select a torch device and print which backend was chosen.

    Backends are probed in this order: CUDA, Apple MPS, Intel XPU, and
    finally CPU as the fallback.

    Args:
        enable_tensor_cores: If True and CUDA is selected, enable TF32 for
            float32 matmuls and cuDNN convolutions. Has no effect when
            another backend is selected.

    Returns:
        torch.device: The selected device.
    """
    # Not every PyTorch build exposes these backend modules; look them up
    # defensively so a missing attribute falls through to the next backend
    # instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    xpu_backend = getattr(torch, "xpu", None)

    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using NVIDIA CUDA GPU")

        if enable_tensor_cores:  # tensor cores allow faster matrix multiplications
            # The TF32 switch is set through fp32_precision on PyTorch >= 2.9
            # and through the older allow_tf32 flags on earlier versions.
            major, minor = map(int, torch.__version__.split(".")[:2])
            if (major, minor) >= (2, 9):
                torch.backends.cuda.matmul.fp32_precision = "tf32"
                torch.backends.cudnn.conv.fp32_precision = "tf32"
            else:
                torch.backends.cuda.matmul.allow_tf32 = True
                torch.backends.cudnn.allow_tf32 = True

    elif mps_backend is not None and mps_backend.is_available():
        device = torch.device("mps")
        print("Using Apple Silicon GPU (MPS)")

    elif xpu_backend is not None and xpu_backend.is_available():
        device = torch.device("xpu")
        print("Using Intel GPU")

    else:
        device = torch.device("cpu")
        print("Using CPU")

    return device


In [None]:
# Pick the compute device with get_device(), which also prints the backend it selected.
device = get_device()

Using NVIDIA CUDA GPU


In [3]:
# Override the device selected earlier and force CPU for this chapter.
# Skip or remove this cell later to run on the accelerator and compare performance.
device = torch.device('cpu')

In [7]:
# Download the Qwen3 base model files (tokenizer_only=False) into the "qwen3" directory.
download_qwen3_small(
    out_dir="qwen3",
    kind="base",
    tokenizer_only=False,
)

qwen3-0.6B-base.pth: 100% (1433 MiB / 1433 MiB)


In [9]:
from reasoning_from_scratch.qwen3 import Qwen3Model, QWEN_CONFIG_06_B

# Checkpoint file inside the local "qwen3" directory.
model_path = Path("qwen3") / "qwen3-0.6B-base.pth"

# Build the model from the 0.6B configuration, then fill in the weights.
model = Qwen3Model(QWEN_CONFIG_06_B)

# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint cannot execute arbitrary code, and it avoids the warning
# PyTorch emits for a bare torch.load call.
# map_location="cpu" loads the tensors on the CPU regardless of the device
# they were saved from; the model is moved to the target device right after.
model.load_state_dict(
    torch.load(model_path, map_location="cpu", weights_only=True)
)

# Move the model to the selected device; as the last expression of the cell,
# this also displays the module structure.
model.to(device)

  model.load_state_dict(torch.load(model_path))


Qwen3Model(
  (tok_emb): Embedding(151936, 1024)
  (trf_blocks): ModuleList(
    (0-27): 28 x TransformerBlock(
      (att): GroupedQueryAttention(
        (W_query): Linear(in_features=1024, out_features=2048, bias=False)
        (W_key): Linear(in_features=1024, out_features=1024, bias=False)
        (W_value): Linear(in_features=1024, out_features=1024, bias=False)
        (out_proj): Linear(in_features=2048, out_features=1024, bias=False)
        (q_norm): RMSNorm()
        (k_norm): RMSNorm()
      )
      (ff): FeedForward(
        (fc1): Linear(in_features=1024, out_features=3072, bias=False)
        (fc2): Linear(in_features=1024, out_features=3072, bias=False)
        (fc3): Linear(in_features=3072, out_features=1024, bias=False)
      )
      (norm1): RMSNorm()
      (norm2): RMSNorm()
    )
  )
  (final_norm): RMSNorm()
  (out_head): Linear(in_features=1024, out_features=151936, bias=False)
)