In [3]:
## 设置hf-mirror镜像站，用于下载Qwen/Qwen2.5-0.5B模型
import os

os.environ["HF_ENDPOINT"] = "https://hf-mirror.com/"

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import torch

In [5]:
model_name = "Qwen/Qwen2.5-0.5B"
device = "cuda:0"

config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name)
if config.pad_token_id is None:
    config.pad_token_id = (
        config.eos_token_id
    )  # 避免提示：Setting pad_token_id to eos_token_id:None for open-end generation.
model = AutoModelForCausalLM.from_config(config=config).to(device)  # 用from_config方法
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
config

Qwen2Config {
  "_name_or_path": "Qwen/Qwen2.5-0.5B",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "max_position_embeddings": 32768,
  "max_window_layers": 24,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "pad_token_id": 151643,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.1",
  "use_cache": true,
  "use_mrope": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}

In [5]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896, padding_idx=151643)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (n

In [6]:
model.config

Qwen2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "Qwen/Qwen2.5-0.5B",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "max_position_embeddings": 32768,
  "max_window_layers": 24,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "pad_token_id": 151643,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.1",
  "use_cache": true,
  "use_mrope": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}

In [7]:
# 可以看出现在模型是一个完全初始化的状态
prompt = "我今天心情很"
model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
print(model_inputs)  # input_ids和attention_mask
with torch.inference_mode():
    model_output_ids = model.generate(**model_inputs, max_new_tokens=64)
generated_ids = [
    output_ids[len(input_ids) :]
    for input_ids, output_ids in zip(model_inputs.input_ids, model_output_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

{'input_ids': tensor([[ 35946, 100644, 104753,  99165]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1]], device='cuda:0')}
�� Wifi niche dalla dalla dalla Wifi Wifiレーション_readySansHoTypeError_ipHoHoHoḈḈ敦 chiến廓的那一幾個Ḉ敦影业影业影业影业疑问York skills	insert	insert	insert幾個幾個幾個幾個幾個幾個幾個幾個幾個幾個幾個幾個幾個幾個幾個幾個幾個幾個幾個幾個幾個幾個Binder$output$output$output$output


In [8]:
model_forward_output = model.forward(**model_inputs)
print(model_forward_output.loss)
print(model_forward_output.logits)
print(model_forward_output.logits.shape)

None
tensor([[[ 0.1496, -0.1355,  0.6572,  ...,  0.2197,  0.0963,  0.9431],
         [-0.4937,  0.3464,  0.6783,  ..., -0.3669, -0.0607,  1.2321],
         [-0.3737,  0.4515,  0.2831,  ..., -0.3511,  0.0225,  1.4325],
         [ 0.0517,  0.3105, -0.0512,  ..., -0.5628, -0.4646,  1.2509]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>)
torch.Size([1, 4, 151936])


In [1]:
from utils import load_jsonl

data = load_jsonl(path="../data/webText2019zh_1k.jsonl")
train_texts = data[:900]
eval_texts = data[900:]

In [6]:
from torch.utils.data import Dataset


class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]["text"]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # 获取 input_ids 和 attention_mask
        input_ids = encoding["input_ids"].squeeze()  # 去掉批量维度
        attention_mask = encoding["attention_mask"].squeeze()

        return {"input_ids": input_ids, "attention_mask": attention_mask}


train_dataset = TextDataset(train_texts, tokenizer, max_length=512)
eval_dataset = TextDataset(eval_texts, tokenizer, max_length=512)

In [9]:
print(train_dataset[0]['input_ids'])
print(tokenizer.decode(train_dataset[0]['input_ids']))
print(train_dataset[0]['attention_mask'])

tensor([104719,  80268,  13343,  80268,  48692,  80268,  41406,   9370, 100714,
         99800,  94432,  99729, 110878,   3837,  28291,  99601,  45629, 104511,
         99425, 100228,  99894,  60726, 121292,  33447, 110798,  99486,  56006,
        107514, 101494,  99898, 106678,   8997, 101895,  80268,  13343,  80268,
         48692,  80268,  41406,   9370, 100714,  99800,  99519,  80443, 100386,
        100162, 110596, 113757,   8997,  31207,  26232,   9370,  52183, 114615,
          3837, 113717,  14880, 113145,  97084, 101893,  99800, 101454,   3837,
         30709,  99952,  16530,  99813, 104305,   8997, 109619,  43268, 104305,
         99466, 108845, 100651,   6313,    198, 101038,  99466,  99934, 105593,
        105098, 100313,   9370, 102078,  99800,  99898,   3837,  99491, 103198,
          3837, 112496, 121258, 100655,  52510, 103615, 112622, 109971,  77959,
        101833, 121493, 104008,   3837, 104029, 101467, 116128,   8997,  28072,
        105073,   9370, 113575,  20412, 

In [11]:
batch_size = 4
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=batch_size
)
eval_dataloader = torch.utils.data.DataLoader(
    dataset=eval_dataset, batch_size=batch_size
)

In [12]:
next(train_dataloader.__iter__())

{'input_ids': tensor([[104719,  80268,  13343,  ..., 151643, 151643, 151643],
         [100007, 103964,     51,  ..., 151643, 151643, 151643],
         [100344, 102073, 104467,  ..., 151643, 151643, 151643],
         [118919,  99503, 100251,  ..., 151643, 151643, 151643]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [13]:
from tqdm import tqdm
def train(model, train_dataloader, loss_func, optimizer, device):
    model.train()
    losses = []
    progress_bar = tqdm(train_dataloader, desc="Training", total=len(train_dataloader))
    for step,batch in enumerate(progress_bar):
        inputs_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        text = inputs_ids.to(device)
        attention_mask = attention_mask.to(device)
        text_input = text[:, :-1]  # 模型的输入是text的前n-1个token
        logits = model.forward(
            input_ids=text_input, attention_mask=attention_mask
        ).logits
        text_expected = text[:, 1:]  # 模型的期望输出是text的第2个token到第n个token
        text_expected = text_expected.reshape(-1)
        logits = logits.view(-1, logits.shape[-1])
        loss = loss_func(logits, text_expected)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        losses.append(loss.item())
        progress_bar.set_postfix(loss=loss.item(), refresh=True)
    return sum(losses) / (len(losses) * batch_size)


@torch.inference_mode()  # 验证的时候关闭梯度计算
def eval(model, eval_dataloader, loss_func, device):
    model.eval()
    losses = []
    progress_bar = tqdm(eval_dataloader, desc="Evaluating", total=len(eval_dataloader))
    for step,batch in enumerate(progress_bar):
        inputs_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        attention_mask = attention_mask.to(device)
        text = inputs_ids.to(device)
        text_input = text[:, :-1]  # 模型的输入是text的前n-1个token
        logits = model.forward(
            input_ids=text_input, attention_mask=attention_mask
        ).logits
        text_expected = text[:, 1:]  # 模型的期望输出是text的第2个token到第n个token
        text_expected = text_expected.reshape(-1)
        logits = logits.view(-1, logits.shape[-1])
        loss = loss_func(logits, text_expected)
        progress_bar.set_postfix(loss=loss.item(), refresh=True)
        losses.append(loss.item())
    return sum(losses) / (len(losses) * batch_size)

In [14]:
loss_func = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(params=model.parameters())

In [None]:
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train_loss = train(
        model=model,
        train_dataloader=train_dataloader,
        loss_func=loss_func,
        optimizer=optimizer,
        device=device,
    )
    eval_loss = eval(
        model=model, eval_dataloader=eval_dataloader, loss_func=loss_func, device=device
    )
    print(f"Epoch:{epoch + 1}, train_loss:{train_loss}, eval_loss:{eval_loss}")


Epoch 1/2


Training: 100%|██████████| 225/225 [01:55<00:00,  1.94it/s, loss=8.1] 
Evaluating: 100%|██████████| 25/25 [00:04<00:00,  5.08it/s, loss=8.01]


Epoch:0, train_loss:2.1377231385972766, eval_loss:2.0396485424041746
Epoch 2/2


Training: 100%|██████████| 225/225 [02:11<00:00,  1.72it/s, loss=7.38]
Evaluating: 100%|██████████| 25/25 [00:05<00:00,  4.72it/s, loss=7.94]

Epoch:1, train_loss:1.8994581349690756, eval_loss:2.072402739524841



