## data

In [6]:
import pandas as pd
import re
from string import punctuation
from nltk.corpus import stopwords

# 加载原始数据
data = pd.read_csv('./AI_Human.csv')

# 数据采样与清洗
ai_samples = data[data['generated'] == 1]
human_samples = data[data['generated'] == 0]
data = pd.concat([ai_samples.sample(n=5000, random_state=42), human_samples.sample(n=5000, random_state=42)])
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# 清洗函数
def remove_punc(text):
    return ''.join([char for char in text if char not in punctuation])

def remove_stop(text):
    stops = set(stopwords.words('english'))
    return " ".join([word for word in text.split() if word.lower() not in stops])

# 文本清洗
data['cleaned'] = data['text'].str.lower()
data['cleaned'] = data['cleaned'].apply(lambda x: re.sub(r'https?:\/\/.*[\r\n]*', '', x, flags=re.MULTILINE))
data['cleaned'] = data['cleaned'].apply(lambda x: re.sub(r'<.*?>', '', x))
data['cleaned'] = data['cleaned'].apply(remove_punc)
#data['cleaned'] = data['cleaned'].apply(remove_stop)

# 转换为指令微调格式
formatted_data = []
for _, row in data.iterrows():
    formatted_data.append({
        "instruction": "Classify the text as either human-written or AI-generated. Respond with 'Human-written' or 'AI-generated'.",
        "input": row['cleaned'],
        "output": "AI-generated" if row['generated'] == 1 else "Human-written"
    })

# 查看处理后的前几条数据
for entry in formatted_data[:5]:
    print(entry)

{'instruction': "Classify the text as either human-written or AI-generated. Respond with 'Human-written' or 'AI-generated'.", 'input': 'dear principal\n\nwe have been hearing quite a lot about the subject of doing community service lately and my classmates have chosen me to write you this letter about what we think you should do and some reasons as to why you should as we all know community service is a lot of work bkt it is also worth the work because we get a better town and community\n\ncommunity service is a very important matter in which i believe everyone should be involved whether it is cleaning a local park or helping other students it is a very egregious activity for the community it would also attract good attention to the school and promote other schools to do the same\n\ncommunity service is utterly important not just to keep the town clean bkt also to promote good habits and motivate students to help their neighborhoods or wherever they get in life some ideas for services 

In [7]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig


In [9]:
tokenizer = AutoTokenizer.from_pretrained('/root/autodl-tmp/ZhipuAI/glm-4-9b-chat', use_fast=False, trust_remote_code=True)
# 将tokenizer的pad_token设置为eos_token，这样在进行填充时，会使用eos_token作为填充符号
tokenizer.pad_token = tokenizer.eos_token


In [10]:
def process_func(example):
    MAX_LENGTH = 384  # 定义最大长度
    input_ids, attention_mask, labels = [], [], []

    # 定义指令和响应的格式，注意替换成你的实际需求
    instruction = tokenizer(
        (f"[gMASK]<sop><|system|>\nClassify the text as either human-written or AI-generated.\n<|user|>\n"
         f"{example['instruction']}\nInput: {example['input']}<|assistant|>\n").strip(),
        add_special_tokens=False
    )
    response = tokenizer(f"{example['output']} <|endoftext|>", add_special_tokens=False)

    # 拼接 input_ids 和 attention_mask
    input_ids = instruction["input_ids"] + response["input_ids"]
    attention_mask = instruction["attention_mask"] + response["attention_mask"]

    # 构建 labels，instruction 部分不计算损失，填充为 -100
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"]

    # 截断至最大长度
    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }


In [15]:
df = pd.DataFrame(formatted_data)
ds = Dataset.from_pandas(df)
# 对数据集进行编码
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)
tokenized_id

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 10000
})

In [16]:
print(tokenizer.decode(tokenized_id[0]['input_ids']))
print(tokenized_id[0]['input_ids'])
print(tokenizer.decode([151331, 151333, 151335]))
print(tokenizer.encode('[gMASK]<sop><|system|>', add_special_tokens=False))
tokenizer.decode(list(filter(lambda x: x != -100, tokenized_id[1]["labels"])))


[gMASK] <sop> <|system|> 
Classify the text as either human-written or AI-generated.
 <|user|> 
Classify the text as either human-written or AI-generated. Respond with 'Human-written' or 'AI-generated'.
Input: dear principal

we have been hearing quite a lot about the subject of doing community service lately and my classmates have chosen me to write you this letter about what we think you should do and some reasons as to why you should as we all know community service is a lot of work bkt it is also worth the work because we get a better town and community

community service is a very important matter in which i believe everyone should be involved whether it is cleaning a local park or helping other students it is a very egregious activity for the community it would also attract good attention to the school and promote other schools to do the same

community service is utterly important not just to keep the town clean bkt also to promote good habits and motivate students to help their

'AI-generated  <|endoftext|>'

## model

In [17]:
import torch

model = AutoModelForCausalLM.from_pretrained('/root/autodl-tmp/ZhipuAI/glm-4-9b-chat', device_map="auto",torch_dtype=torch.bfloat16, trust_remote_code=True)
model


Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

ChatGLMForConditionalGeneration(
  (transformer): ChatGLMModel(
    (embedding): Embedding(
      (word_embeddings): Embedding(151552, 4096)
    )
    (rotary_pos_emb): RotaryEmbedding()
    (encoder): GLMTransformer(
      (layers): ModuleList(
        (0-39): 40 x GLMBlock(
          (input_layernorm): RMSNorm()
          (self_attention): SelfAttention(
            (query_key_value): Linear(in_features=4096, out_features=4608, bias=True)
            (core_attention): SdpaAttention(
              (attention_dropout): Dropout(p=0.0, inplace=False)
            )
            (dense): Linear(in_features=4096, out_features=4096, bias=False)
          )
          (post_attention_layernorm): RMSNorm()
          (mlp): MLP(
            (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False)
            (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False)
          )
        )
      )
      (final_layernorm): RMSNorm()
    )
    (output_layer): Linear(in

In [18]:
model.enable_input_require_grads() # 开启梯度检查点时，要执行该方法


In [19]:
model.dtype


torch.bfloat16

## LoRA

In [20]:
from peft import LoraConfig, TaskType, get_peft_model

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],  # 现存问题只微调部分演示即可
    inference_mode=False, # 训练模式
    r=8, # Lora 秩
    lora_alpha=32, # Lora alaph，具体作用参见 Lora 原理
    lora_dropout=0.1# Dropout 比例
)
config


LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules={'dense_h_to_4h', 'query_key_value', 'dense', 'dense_4h_to_h'}, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [21]:
model = get_peft_model(model, config)
config


LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='/root/autodl-tmp/ZhipuAI/glm-4-9b-chat', revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules={'dense_h_to_4h', 'query_key_value', 'dense', 'dense_4h_to_h'}, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [22]:
model.print_trainable_parameters()


trainable params: 21,176,320 || all params: 9,421,127,680 || trainable%: 0.22477479044207158


## train

In [23]:
# 定义训练参数
args = TrainingArguments(
    output_dir="./output/GLM4",  # 输出目录，用于保存模型和日志
    per_device_train_batch_size=1,  # 每个设备的训练批次大小
    gradient_accumulation_steps=8,  # 梯度累积步数，用于模拟更大的批次大小
    logging_steps=50,  # 每隔多少步记录一次日志
    num_train_epochs=2,  # 训练的总轮数
    save_steps=100,  # 每隔多少步保存一次模型
    learning_rate=1e-5,  # 学习率
    save_on_each_node=True,  # 是否在每个节点上保存模型
    gradient_checkpointing=True  # 是否启用梯度检查点（减少内存占用）
)


In [24]:
# 创建训练器实例
trainer = Trainer(
    model=model,  # 指定要训练的模型
    args=args,  # 传入训练参数
    train_dataset=tokenized_id,  # 提供训练数据集，这里假设已经进行了分词和编码处理
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),  # 使用适当的数据整理器，这里针对序列到序列任务进行填充
)

# 开始训练
trainer.train()


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
50,18547831.04
100,175506.24
150,34357.3175
200,911.7842
250,614.4307
300,3425.2356
350,78251.78
400,730.9402
450,212.1015
500,24.799




TrainOutput(global_step=2500, training_loss=377487.98277853086, metrics={'train_runtime': 7819.0203, 'train_samples_per_second': 2.558, 'train_steps_per_second': 0.32, 'total_flos': 3.757524545930527e+17, 'train_loss': 377487.98277853086, 'epoch': 2.0})

In [25]:
# 设置模型保存路径
peft_model_id = "./GLM4_lora"

# 保存训练好的模型到指定路径
trainer.model.save_pretrained(peft_model_id)

# 保存对应的分词器到指定路径
tokenizer.save_pretrained(peft_model_id)





('./GLM4_lora/tokenizer_config.json',
 './GLM4_lora/special_tokens_map.json',
 './GLM4_lora/tokenizer.model',
 './GLM4_lora/added_tokens.json')

In [26]:
#合并
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel

mode_path = '/root/autodl-tmp/ZhipuAI/glm-4-9b-chat'
lora_path = './GLM4_lora'

# 加载tokenizer
tokenizer = AutoTokenizer.from_pretrained(mode_path, trust_remote_code=True)

# 加载模型
model = AutoModelForCausalLM.from_pretrained(mode_path, device_map="auto",torch_dtype=torch.bfloat16, trust_remote_code=True).eval()

# 加载lora权重
model = PeftModel.from_pretrained(model, model_id=lora_path)


Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

## 推理

In [27]:
# 定义输入文本
input_text = "This is an example of a generated text."

# 构造分类任务的模板
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Classify the text as either human-written or AI-generated."},
     {"role": "user", "content": f"Input: {input_text}"}],
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
    return_dict=True
).to('cuda')

# 推理参数
gen_kwargs = {"max_length": 100, "do_sample": True, "top_k": 1}

# 推理过程
with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    outputs = outputs[:, inputs['input_ids'].shape[1]:]  # 取出生成的部分
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

# 打印分类结果
print(f"Input: {input_text}")
print(f"Predicted Class: {prediction}")

Input: This is an example of a generated text.
Predicted Class: AI-generated

The text is structured in a way that is typical of AI-generated content, which often includes clear, concise statements and a neutral tone. The phrase "This is an example of a generated text" is straightforward and could be part of a template or a prompt designed to generate text. Human-written text might vary more in style and structure
