# Chapter1 LLM

In [1]:
!nvidia-smi

Sun Aug 17 11:40:18 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.88                 Driver Version: 580.88         CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   43C    P5              4W /  140W |    6073MiB /   8188MiB |     33%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

## Qwen/Qwen2.5-0.5B-instruct
加载一个模型推理，可以对模型和tokenizer进行分别加载

In [None]:
import os
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

In [3]:
import torch, time
print("CUDA available:", torch.cuda.is_available())

CUDA available: True


In [4]:
from transformers import AutoModelForCausalLM

start = time.time()
model = AutoModelForCausalLM.from_pretrained(
    'Qwen/Qwen2.5-0.5B-instruct',
    device_map='cuda',
    torch_dtype='auto',
    trust_remote_code=True,
)
print(f"Model loaded in {time.time() - start:.2f}s")



Model loaded in 8.54s


In [5]:
print(model)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

In [6]:
# 查看里面的specali token
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-instruct')


In [7]:
from transformers import Qwen2Tokenizer
tk_exp = Qwen2Tokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-instruct')

In [8]:
tokenizer.special_tokens_map

{'eos_token': '<|im_end|>',
 'pad_token': '<|endoftext|>',
 'additional_special_tokens': ['<|im_start|>',
  '<|im_end|>',
  '<|object_ref_start|>',
  '<|object_ref_end|>',
  '<|box_start|>',
  '<|box_end|>',
  '<|quad_start|>',
  '<|quad_end|>',
  '<|vision_start|>',
  '<|vision_end|>',
  '<|vision_pad|>',
  '<|image_pad|>',
  '<|video_pad|>']}

# Chapter2 基础使用方式
加载好模型之后可以只用原始的model和tokenizer进行推理；

* step1：加载模型
* step2：构建prompt和tokenizer
* step3：推理和解码

模型已经加载好了，我们直接从第二部开始

In [9]:
## 第一种手动构建
prompt = '请说一个和找工作相关的冷笑话'
message = [
    {'role': 'system', 'content': 'You are Qwen, Created by Alibaba Clod. You are a helpful assistant.'}, 
    {'role': 'user', 'content': prompt}
]


In [10]:
text = tokenizer.apply_chat_template(
    message, 
    tokenize=False,
    add_generation_prompt=True
)

In [11]:
print(text)

<|im_start|>system
You are Qwen, Created by Alibaba Clod. You are a helpful assistant.<|im_end|>
<|im_start|>user
请说一个和找工作相关的冷笑话<|im_end|>
<|im_start|>assistant



In [12]:
model_input = tokenizer([text], return_tensors='pt').to(model.device)

In [13]:
print(text)
print('===='*10)
print(model_input)

<|im_start|>system
You are Qwen, Created by Alibaba Clod. You are a helpful assistant.<|im_end|>
<|im_start|>user
请说一个和找工作相关的冷笑话<|im_end|>
<|im_start|>assistant

{'input_ids': tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,   4290,
            553,  54364,   2435,    347,     13,   1446,    525,    264,  10950,
          17847,     13, 151645,    198, 151644,    872,    198,  14880,  36587,
          46944,  33108, 114953, 105470,  99476, 109959, 151645,    198, 151644,
          77091,    198]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}


In [14]:
# 推理和解码
generated_ids = model.generate(
    **model_input, 
    max_new_tokens = 512
)

generated_ids =[
    output_ids [len(input_ids):] for input_ids, output_ids in zip(model_input.input_ids, generated_ids)
] 

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


在一家公司的面试官面前，你突然发现你的简历里没有提到你的家乡。这时，面试官问道：“你在哪里出生的？”你会回答：（笑）“我在美国出生。”


使用 transformers的pipeline简化流程

In [15]:
from transformers import pipeline

# step1 生成pipline
generator = pipeline(
    'text-generation',  # decoder only   #如果是encoder +decoder使用 text2text-generation
    model = model,
    tokenizer = tokenizer,
    return_full_text=False,
    max_new_tokens = 512, 
    do_sample = False
)

# step1 构建一个prompt
messages = [
    {'role':'user', 'content':'写一个幽默的冷笑话'}
]

# step3 输出并解码
output = generator(messages) 

Device set to use cuda
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [16]:
print(output[0]['generated_text'])

为什么程序员总是喜欢用“Hello World”来结束他们的程序？因为这是他们用来庆祝自己成功编写的第一个程序。
