In [1]:
import json
from modelscope import snapshot_download
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from torch.utils.data import Dataset,DataLoader
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

labels=['涉政','色情','暴力','正常']
json_data = []
with open('./dataset/data.jsonl','r',encoding='utf-8') as f:
    for data in f.readlines():
        data_dict = json.loads(data)
        json_data.append(data_dict)
with open('./dataset.jsonl', 'w', encoding='utf-8') as f:
    for data in json_data:
        data['input'] = data['text']
        data['output'] = labels[data['label']]
        del data['text']       # 正确删除 key
        del data['label']
        f.write(json.dumps(data, ensure_ascii=False) + '\n')

In [4]:
json_datasets = []
with open('./dataset.jsonl', 'r', encoding='utf-8') as f:
    for data in f.readlines():
        json_datasets.append(json.loads(data))
len(json_datasets)

3763

In [5]:
#模型下载
from modelscope import snapshot_download
model_dir = snapshot_download('Qwen/Qwen3-4B',local_dir='./qwen')
model = AutoModelForCausalLM.from_pretrained('./qwen')
tokenizer = AutoTokenizer.from_pretrained('./qwen')

Downloading Model from https://www.modelscope.cn to directory: /boot/qwen


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.47s/it]


In [6]:
def token(dataset,max_length):
    new_dataset = []
    flags=['涉政','色情','暴力','正常']
    content = """
        请根据下列用户输入的文本:{text}，从候选标签:{labels}中选择最合适的一个标签作为输出。
        请严格只输出最可能的一个标签，不要输出解释或任何额外内容。
        """
    for data in dataset:
        q = data['input']
        a = data['output']
        
        prompt = [{"role":"user","content":content.format(text=q,labels=flags)}]
        # 使用 tokenizer 处理文本
        q = tokenizer.apply_chat_template(prompt, tokenize=False,add_generation_prompt=True)
       
        # print(len(q))
        q_input_ids = tokenizer.encode(q,add_special_tokens=False)
        a_input_ids = tokenizer.encode(a,add_special_tokens=False)
        input_ids = q_input_ids+a_input_ids
        # print(len(input_ids))
        attention_mask = [1] * len(input_ids)
        labels= len(q_input_ids)*[-100] + a_input_ids
       
        if len(input_ids)<max_length:
            padding_length = max_length - len(input_ids) 
            input_ids = [tokenizer.pad_token_id] * padding_length + input_ids
            labels = [-100]*padding_length + labels
            attention_mask = [0]*padding_length + attention_mask
        else:
            input_ids = input_ids[:max_length]
            labels = labels[:max_length]
            attention_mask = attention_mask[:max_length]
        new_dataset.append( {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
        })
    return new_dataset

In [7]:
datasets = token(dataset=json_datasets,max_length=200)

In [8]:

labels=['涉政','色情','暴力','正常']
from torch.utils.data import Dataset
class MyDataset(Dataset):
    def __init__(self, datasets):
        super().__init__()
        self.datasets = datasets  # 保存数据
    def __len__(self):
        return len(self.datasets)
    def __getitem__(self, idx):
        data =  self.datasets[idx]
        return {
        "input_ids": torch.tensor(data['input_ids'], dtype=torch.long),
        "attention_mask": torch.tensor(data['attention_mask'], dtype=torch.long),
        "labels": torch.tensor(data['labels'], dtype=torch.long),
        }
  

In [9]:
dataset  =MyDataset(datasets=datasets)

In [10]:
from peft import get_peft_model, LoraConfig, TaskType
# LoRA 配置示例
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn", "q_proj","v_proj"],  # GPT2 相关模块示例
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 2,949,120 || all params: 4,025,417,216 || trainable%: 0.0733


In [14]:
from torch.optim import AdamW
from tqdm import tqdm
train_loader = DataLoader(dataset, batch_size=2, shuffle=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.train()

optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(1):
    loop = tqdm(train_loader, leave=True)
    total_loss = 0
    for batch in loop:
        # print(batch)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        # print(labels)
        # break
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        # 检查 loss 是否为 NaN
        if torch.isnan(loss):
            print("⚠️ NaN loss detected. Skipping batch.")
            continue
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())
    
    print(f"Epoch {epoch} average loss: {total_loss / len(train_loader)}")
output_dir = './trained'
# 保存微调好的模型
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Epoch 0: 100%|██████████| 1882/1882 [07:33<00:00,  4.15it/s, loss=0.000339]


Epoch 0 average loss: 0.17393784378823082


('./trained/tokenizer_config.json',
 './trained/special_tokens_map.json',
 './trained/chat_template.jinja',
 './trained/vocab.json',
 './trained/merges.txt',
 './trained/added_tokens.json',
 './trained/tokenizer.json')

In [4]:
# 加载微调好的模型进行推理
output_dir = './trained'
device = "cuda" if torch.cuda.is_available() else "cpu"
# 加载模型和 tokenizer
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(output_dir)
model.to(device)  # 加载到 GPU 或 CPU
model.eval()
# 输入文本
text = "她的阴蒂又大又敏感，一碰就高潮"
flags=['涉政','色情','暴力','正常']
input_text = """你是一位专业的文本分类专家。请根据下列用户输入的文本，从候选标签中选择最合适的一个标签作为输出。
        文本：{text}
        候选标签：{labels}
        请严格只输出最可能的一个标签，不要输出解释或任何额外内容。
"""
prompt = input_text.format(text=text,labels=flags)
content = [{"role":"user","content":prompt}]
q =tokenizer.apply_chat_template(content,tokenize=False,add_generation_prompt=True)
# 编码输入
inputs = tokenizer(q, return_tensors="pt").to(device)

# 文本生成
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,            # 控制生成长度
        do_sample=False,               # 使用采样以增加多样性（也可设为 False 走贪心）
        top_k=50,                     # top-k 采样
        top_p=0.95,                   # nucleus sampling
        temperature=0.8,              # 控制生成多样性
        pad_token_id=tokenizer.eos_token_id  # 避免 padding 报错
    )

# 解码输出
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("生成结果：", generated_text)

Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.25s/it]
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


生成结果： user
你是一位专业的文本分类专家。请根据下列用户输入的文本，从候选标签中选择最合适的一个标签作为输出。
        文本：她的阴蒂又大又敏感，一碰就高潮
        候选标签：['涉政', '色情', '暴力', '正常']
        请严格只输出最可能的一个标签，不要输出解释或任何额外内容。

assistant
色情
</think>

色情


In [5]:
output_dir = './qwen'
device = "cuda" if torch.cuda.is_available() else "cpu"
# 加载模型和 tokenizer
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(output_dir)
model.to(device)  # 加载到 GPU 或 CPU
model.eval()
# 输入文本
text = "她的阴蒂又大又敏感，一碰就高潮"
flags=['涉政','色情','暴力','正常']
input_text = """你是一位专业的文本分类专家。请根据下列用户输入的文本，从候选标签中选择最合适的一个标签作为输出。
        文本：{text}
        候选标签：{labels}
        请严格只输出最可能的一个标签，不要输出解释或任何额外内容。
"""
prompt = input_text.format(text=text,labels=flags)
content = [{"role":"user","content":prompt}]
q =tokenizer.apply_chat_template(content,tokenize=False,add_generation_prompt=True)
# 编码输入
inputs = tokenizer(q, return_tensors="pt").to(device)

# 文本生成
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,            # 控制生成长度
        do_sample=False,               # 使用采样以增加多样性（也可设为 False 走贪心）
        top_k=50,                     # top-k 采样
        top_p=0.95,                   # nucleus sampling
        temperature=0.8,              # 控制生成多样性
        pad_token_id=tokenizer.eos_token_id  # 避免 padding 报错
    )

# 解码输出
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("生成结果：", generated_text)

Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.36s/it]
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


生成结果： user
你是一位专业的文本分类专家。请根据下列用户输入的文本，从候选标签中选择最合适的一个标签作为输出。
        文本：她的阴蒂又大又敏感，一碰就高潮
        候选标签：['涉政', '色情', '暴力', '正常']
        请严格只输出最可能的一个标签，不要输出解释或任何额外内容。

assistant
<think>
好的，我现在需要处理用户的这个文本分类请求。首先，用户给出的文本是：“她的阴蒂又大又敏感，一碰就高潮”。候选标签有四个：涉政、色情、暴力、正常。我需要
