In [2]:
import os
import json
import argparse
from pathlib import Path
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import List, Dict, Tuple


  from .autonotebook import tqdm as notebook_tqdm


# Step 1: `apply_chat_template` example

In [3]:
def load_prompts(jsonl_path):
    """Read a JSONL file and return its records without their 'completion' field.

    Args:
        jsonl_path: Path to a UTF-8 encoded file with one JSON object per line.
            Lines that are empty or whitespace-only are skipped.

    Returns:
        A list with one parsed object per non-blank line, in file order. The
        'completion' key is removed from each object when present.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(jsonl_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                record = json.loads(stripped)
                # Only the prompt side is wanted; discard the reference answer.
                record.pop('completion', None)
                records.append(record)
    return records

In [4]:
# Load the test-split records; load_prompts strips each record's 'completion' field.
prompts = load_prompts('test_conversational.jsonl')

In [5]:
# Keep the first record as a small worked example for the following cells.
prompt_example = prompts[0]

In [6]:
# Show the raw record (a dict with a 'prompt' list of chat messages) before templating.
print("Prompt Example:\n", prompt_example)

Prompt Example:
 {'prompt': [{'role': 'user', 'content': 'SUBREDDIT: r/relationships\nTITLE: Me [19 F] with my friend [19 M], not sure if I may have messed things up already.\nPOST: Hello hello everybody. I hope this isn\'t too trivial of a question to ask on here, but I\'ve been feeling a bit out of my depth when it comes to this situation (I\'ve had only one relationship before, and for many reasons, it was out of the ordinary).\n\nOkay! So, a couple of weeks ago, I started talking to this guy on Facebook, through a student group that we were both part of. I thought he was sort of cute, so I sent him a PM just to talk, etc, etc. We\'re both transfer students at the same school, so I knew that we could eventually meet in person once we both moved on-campus. So, we did, and we hung out maybe twice, just as friends.\n\nOkay. So, everything is going pretty well. We talk over Facebook and Snapchat, whatever. So, Saturday night, I was just hanging out with people and kind of being bored, w

In [54]:
# Load the fast tokenizer that ships with the Qwen3-1.7B checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-1.7B",
                                          trust_remote_code=True,
                                          use_fast=True
                                          )
# Pad on the left so that, in a padded batch, every prompt ends at the final
# position (the batches built later in this notebook are fed to a causal LM's
# generate()).
tokenizer.padding_side = "left"

In [8]:
# Render the chat messages into one prompt string. With tokenize=False the
# call returns text only, so no truncation option is passed here (truncation
# is a tokenization-time setting and expects a real boolean, not a string).
prompt_apply_template = tokenizer.apply_chat_template(
    prompt_example['prompt'],
    add_generation_prompt=True,  # append the assistant header so the model answers next
    tokenize=False,
)
# Tokenize the rendered text in a separate step to inspect the resulting
# input_ids / attention_mask.
prompt_tokenized = tokenizer(
    prompt_apply_template,
)
print("Prompt with Chat Template:\n", prompt_apply_template)
print("Prompt tokenized:\n ", prompt_tokenized)

Prompt with Chat Template:
 <|im_start|>user
SUBREDDIT: r/relationships
TITLE: Me [19 F] with my friend [19 M], not sure if I may have messed things up already.
POST: Hello hello everybody. I hope this isn't too trivial of a question to ask on here, but I've been feeling a bit out of my depth when it comes to this situation (I've had only one relationship before, and for many reasons, it was out of the ordinary).

Okay! So, a couple of weeks ago, I started talking to this guy on Facebook, through a student group that we were both part of. I thought he was sort of cute, so I sent him a PM just to talk, etc, etc. We're both transfer students at the same school, so I knew that we could eventually meet in person once we both moved on-campus. So, we did, and we hung out maybe twice, just as friends.

Okay. So, everything is going pretty well. We talk over Facebook and Snapchat, whatever. So, Saturday night, I was just hanging out with people and kind of being bored, when I got a Snapchat fr

# Step 2: Process the dataset

In [9]:
from datasets import load_dataset
# Relative paths to the train / test splits in conversational JSONL format.
TRAIN_DATA = "../../research/alignment/data/TLDR/sft/train_conversational.jsonl"
TEST_DATA = "../../research/alignment/data/TLDR/sft/test_conversational.jsonl"
# Read both files with the generic "json" builder; the keys of `data_files`
# become the split names of the returned DatasetDict.
dataset = load_dataset(
    "json",
    data_files={
        "train": TRAIN_DATA,
        "test": TEST_DATA
    }
)

In [10]:
# Only the prompts are needed from here on, so drop the "completion" column
# from every split.
# NOTE(review): this rebinds `dataset`, so re-running the cell presumably fails
# once the column is already gone - confirm before relying on re-execution.
dataset = dataset.remove_columns("completion")
print(f"dataset: {dataset}")

dataset: DatasetDict({
    train: Dataset({
        features: ['prompt'],
        num_rows: 116722
    })
    test: Dataset({
        features: ['prompt'],
        num_rows: 6553
    })
})


In [11]:
def process_chat(item):
    """Render a record's chat messages with the chat template and tokenize them.

    Args:
        item: Mapping with a "prompt" entry holding the chat messages to render.

    Returns:
        The tokenizer output for the rendered prompt. Sequences are truncated
        to `tokenizer.model_max_length` and are not padded; padding is left to
        whatever collates the batches later.
    """
    # Text only at this stage; the generation prompt is appended so the
    # rendered string ends where the assistant's reply should begin.
    chat_text = tokenizer.apply_chat_template(
        item["prompt"],
        add_generation_prompt=True,
        tokenize=False,
    )
    return tokenizer(
        chat_text,
        padding=False,
        truncation=True,
        max_length=tokenizer.model_max_length,
    )

In [48]:
# Tokenize every split in parallel worker processes and drop the raw "prompt"
# column, leaving only the tokenizer outputs.
# `batched=True` is the flag that enables batch mode (the previous
# `batch_size=True` passed a bool to an integer option that is ignored unless
# batching is on). process_chat works on a batch as well as on a single row,
# because both apply_chat_template and the tokenizer accept a list of inputs.
dataset = dataset.map(
    process_chat,
    batched=True,
    num_proc=36,
    remove_columns=["prompt"],
)
print(f"dataset: {dataset}")

Map (num_proc=36): 100%|██████████| 116722/116722 [00:05<00:00, 19929.01 examples/s]
Map (num_proc=36): 100%|██████████| 6553/6553 [00:02<00:00, 2657.61 examples/s]

dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 116722
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 6553
    })
})





In [49]:
# Slicing a split returns a dict of columns, each a list with one entry per row.
batch_inputs = dataset["train"][0:10]
print(f"batch_inputs: {batch_inputs}")
# Token id lists for the ten rows; process_chat tokenizes with padding=False,
# so these are not padded to a common length.
input_ids = batch_inputs["input_ids"]

batch_inputs: {'input_ids': [[151644, 872, 198, 29038, 787, 4103, 952, 25, 435, 14, 85824, 198, 50328, 25, 358, 320, 69, 14, 17, 17, 8, 614, 311, 7071, 700, 421, 358, 1366, 311, 2058, 1414, 1493, 7571, 476, 537, 323, 1035, 12213, 311, 5112, 67092, 198, 2946, 25, 2806, 2704, 421, 419, 17180, 1588, 714, 432, 594, 5802, 264, 1430, 13, 4710, 3707, 26485, 510, 4498, 358, 320, 69, 14, 17, 17, 8, 3937, 1526, 847, 1156, 1931, 84498, 220, 17, 1635, 4134, 1576, 566, 4362, 3550, 1283, 264, 1042, 315, 4924, 926, 437, 220, 432, 88489, 752, 803, 1091, 358, 3381, 13, 1084, 572, 264, 27102, 882, 304, 847, 2272, 4152, 311, 5382, 448, 847, 6554, 323, 5499, 3432, 279, 6012, 311, 3931, 1059, 700, 315, 847, 2272, 13, 358, 646, 16698, 1576, 315, 432, 572, 458, 14269, 35750, 323, 419, 7412, 572, 15175, 323, 3207, 944, 1414, 1246, 311, 3484, 448, 752, 13, 1205, 9482, 553, 1435, 30426, 369, 264, 2254, 476, 773, 1283, 2087, 311, 264, 18780, 448, 847, 4780, 13, 3197, 358, 1744, 1182, 358, 6426, 566, 1101, 9482, 

In [57]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# Batch the tokenized training prompts. The collator pads each batch to its
# longest sequence; the padding side follows tokenizer.padding_side, which
# was set to "left" when the tokenizer was loaded.
prompts_dataloader = DataLoader(
    dataset["train"],
    batch_size=4,
    collate_fn=DataCollatorWithPadding(tokenizer=tokenizer, padding='longest'),
)

# Pull one batch to inspect the padded tensors and their attention mask.
batch = next(iter(prompts_dataloader))
print(f"Batch input_ids shape: {batch['input_ids'].shape}")
print(f"Batch input_ids: {batch['input_ids']}")
print(f"Batch attention_mask shape: {batch['attention_mask'].shape}")
print(f"Batch attention_mask: {batch['attention_mask']}")

Batch input_ids shape: torch.Size([4, 436])
Batch input_ids: tensor([[151644,    872,    198,  ..., 151644,  77091,    198],
        [151643, 151643, 151643,  ..., 151644,  77091,    198],
        [151643, 151643, 151643,  ..., 151644,  77091,    198],
        [151643, 151643, 151643,  ..., 151644,  77091,    198]])
Batch attention_mask shape: torch.Size([4, 436])
Batch attention_mask: tensor([[1, 1, 1,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]])


In [64]:
# Load the causal LM from the same checkpoint as the tokenizer;
# device_map="auto" leaves device placement to the library.
# NOTE(review): no dtype is requested here, so the library default applies -
# confirm that this is the intended precision.
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-1.7B",
    trust_remote_code=True,
    device_map="auto",
)
# Switch to evaluation mode; as the last expression of the cell, the returned
# module is also displayed.
model.eval()

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.22it/s]


Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 2048)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=2048, out_features=6144, bias=False)
          (up_proj): Linear(in_features=2048, out_features=6144, bias=False)
          (down_proj): Linear(in_features=6144, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen3RMSNorm((2048,), eps=1e-06)
        (post_attention_layer

In [65]:
# Generate continuations for the padded batch inspected above. Inputs are
# moved to the model's device first; the attention mask tells the model to
# ignore the left-side padding tokens.
with torch.no_grad():
    outputs = model.generate(
        input_ids=batch['input_ids'].to(model.device),
        attention_mask=batch['attention_mask'].to(model.device),
        max_new_tokens=1024,
        # temperature / top_p are only honoured when sampling is enabled, so
        # request it explicitly rather than relying on the checkpoint's
        # default generation config.
        do_sample=True,
        temperature=0.1,
        top_p=0.9,
    )

KeyboardInterrupt: 

## 方法2: 使用 vLLM (最快，适合大规模生成)

vLLM 提供了更高的吞吐量，特别适合大规模生成任务。

In [None]:
# Use vLLM (install it first: pip install vllm)
from vllm import LLM, SamplingParams

# Initialize the vLLM engine, sharding the model across every visible GPU.
# NOTE(review): tensor_parallel_size is taken straight from the GPU count;
# presumably it must be at least 1 and compatible with the model's attention
# head count - confirm on machines with an unusual number of GPUs.
llm = LLM(
    model="Qwen/Qwen3-1.7B",
    trust_remote_code=True,
    tensor_parallel_size=torch.cuda.device_count(),  # use all GPUs
    dtype="bfloat16",
)

# Sampling settings for generation.
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.9,
    max_tokens=1024,
)

print(f"Using {torch.cuda.device_count()} GPUs for vLLM")

In [None]:
# Batch generation with vLLM
def generate_with_vllm(examples):
    """Generate one completion per example in a batch using vLLM.

    The batch may carry its prompts in either of two forms:

    * "prompt": chat messages per example, rendered to text with the
      tokenizer's chat template (generation prompt appended);
    * "input_ids": already tokenized prompts, passed to vLLM as token ids.
      This is the form `dataset` has after the tokenization step earlier in
      this notebook, which removes the "prompt" column.

    Args:
        examples: Mapping of column name to a list of per-example values, as
            supplied by `Dataset.map(..., batched=True)`.

    Returns:
        A dict `{"completion": [...]}` holding the first generated text for
        each example, in input order.

    Raises:
        KeyError: If the batch has neither a "prompt" nor an "input_ids" column.
    """
    if "prompt" in examples:
        # Render raw chat messages into prompt strings.
        vllm_inputs = [
            tokenizer.apply_chat_template(
                prompt_messages,
                tokenize=False,
                add_generation_prompt=True,
            )
            for prompt_messages in examples["prompt"]
        ]
    elif "input_ids" in examples:
        # Prompts were tokenized upstream; hand the token ids over unchanged.
        vllm_inputs = [
            {"prompt_token_ids": list(token_ids)}
            for token_ids in examples["input_ids"]
        ]
    else:
        raise KeyError("prompt")

    # vLLM schedules the requests internally, so the whole batch is submitted at once.
    outputs = llm.generate(vllm_inputs, sampling_params)

    # Keep the first (and, with these sampling settings, only) candidate per prompt.
    completions = [output.outputs[0].text for output in outputs]

    return {"completion": completions}

# Run generation over the dataset
vllm_results = dataset.map(
    generate_with_vllm,
    batched=True,
    batch_size=64,  # vLLM can handle larger batches
)

print(f"vLLM generation complete!")