In [1]:
# 导入必要的库
import os
import pandas as pd
import json
from transformers import BertTokenizer, BertForSequenceClassification, DPRContextEncoder, DPRConfig, Trainer, TrainingArguments, AutoTokenizer
from datasets import Dataset
import torch
import codecs
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# 设置数据文件夹路径
src_data_folder = './data'

# 定义数据读取函数
def get_data(file_path):
    """读取数据"""
    text = []
    target = []
    with codecs.open(file_path, 'r', encoding='utf-8') as fin:
        for line in fin:
            tmp_line = json.loads(line)
            text.append(tmp_line["fact"].strip())
            target.append(tmp_line["meta"]["accusation"][0])
    return pd.DataFrame({'text': text, 'target': target})

# 读取数据
print('读取数据...')
train_df = get_data(os.path.join(src_data_folder, 'data_train.json'))
valid_df = get_data(os.path.join(src_data_folder, 'data_valid.json'))
test_df = get_data(os.path.join(src_data_folder, 'data_test.json'))

# 创建Hugging Face Dataset对象
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)
test_dataset = Dataset.from_pandas(test_df)

读取数据...


In [3]:
# 加载知识库
final_test_df = get_data(os.path.join(src_data_folder, 'final_test.json'))
knowledge_base = Dataset.from_pandas(final_test_df)
knowledge_base

Dataset({
    features: ['text', 'target'],
    num_rows: 35922
})

In [4]:
len(knowledge_base)

35922

In [5]:
# 设置 HF_HOME 环境变量,指定 Hugging Face 库使用的缓存目录
os.environ['HF_HOME'] = "/home/sunmr/.cache/huggingface/hub"
model_dir = "/home/sunmr/.cache/huggingface/hub/models--hfl--chinese-roberta-wwm-ext-large"

# 加载分类模型
model = BertForSequenceClassification.from_pretrained(model_dir, num_labels=len(set(train_df['target'])), local_files_only=True)

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /home/sunmr/.cache/huggingface/hub/models--hfl--chinese-roberta-wwm-ext-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# 初始化检索模型
os.environ['HF_HOME'] = "/home/sunmr/.cache/huggingface/hub"
model_name = "/home/sunmr/.cache/huggingface/hub/models--facebook--dpr-ctx_encoder-single-nq-base"

# 减少批量大小
batch_size = 32  # 降低批量大小以减少内存使用

tokenizer = BertTokenizer.from_pretrained(model_dir, local_files_only=True)
retriever_model = DPRContextEncoder.from_pretrained(model_name, local_files_only=True)

Some weights of the model checkpoint at /home/sunmr/.cache/huggingface/hub/models--facebook--dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# 分批处理数据
def encode_knowledge(knowledge_base, model, tokenizer, max_length=512, batch_size=32):
    all_embeddings = []
    for i in range(0, len(knowledge_base), batch_size):
        batch = knowledge_base.select(range(i, min(i + batch_size, len(knowledge_base))))
        encoded_knowledge = tokenizer(batch['text'], padding=True, truncation=True, max_length=max_length,
                                      return_tensors='pt')

        with torch.no_grad():
            knowledge_outputs = model(**encoded_knowledge, return_dict=True)  # 添加 return_dict=True
            
            # 从模型输出中提取 CLS 令牌的隐藏状态
            cls_embeddings = knowledge_outputs.pooler_output  # 使用 pooler_output
            all_embeddings.append(cls_embeddings)

        # 清空缓存
        torch.cuda.empty_cache()

        # 删除不再需要的变量
        del knowledge_outputs, encoded_knowledge
        torch.cuda.empty_cache()
    # 合并所有批次的嵌入
    all_embeddings = torch.cat(all_embeddings, dim=0)
    return all_embeddings

In [8]:
import torch.nn.functional as F
from transformers import BertModel

# 创建一个单独的 BertModel 实例
bert_model = BertModel.from_pretrained(model_dir, local_files_only=True)

def get_bert_model_outputs(input_ids, attention_mask):
    """使用 BertModel 获取隐藏状态"""
    with torch.no_grad():
        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
    return outputs

def calculate_similarity(encoded_texts, encoded_knowledge):
    # 使用模型获取文本的向量表示
    text_outputs = get_bert_model_outputs(encoded_texts['input_ids'], encoded_texts['attention_mask'])

    # 直接使用 encoded_knowledge 张量
    knowledge_embeddings = encoded_knowledge

    # 获取 CLS 令牌的隐藏状态
    text_embeddings = text_outputs.last_hidden_state[:, 0, :]

    # 计算相似度
    similarity_scores = torch.mm(text_embeddings, knowledge_embeddings.t())
    return similarity_scores

In [10]:
# 编码知识库
encoded_knowledge = encode_knowledge(knowledge_base, retriever_model, tokenizer, max_length=512, batch_size=32)  # 使用 retriever_model

In [12]:
# 编码文本
def encode_texts_with_retrieval(texts, encoded_knowledge, model, tokenizer, max_length):
    encoded_texts = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    
    for i, text in enumerate(texts):
        text_input_ids = encoded_texts['input_ids'][i]
        
        # 获取知识相关的输入 ID
        knowledge_input_ids = encoded_knowledge[i][:max_length - len(text_input_ids)]  # 调整长度
        
        # 检查长度是否合适
        if len(text_input_ids) + len(knowledge_input_ids) > max_length:
            # 这里可以进行裁剪或填充
            # 例如裁剪到 max_length
            knowledge_input_ids = knowledge_input_ids[:max_length - len(text_input_ids)]
            
        # 拼接
        encoded_texts['input_ids'][i] = torch.cat([text_input_ids, knowledge_input_ids], dim=-1)
        
    return encoded_texts

In [13]:
# 使用tokenizer编码数据集
max_length = 512
train_dataset = train_dataset.map(lambda examples: encode_texts_with_retrieval(examples['text'], encoded_knowledge, model, tokenizer, max_length=max_length), batched=True)
valid_dataset = valid_dataset.map(lambda examples: encode_texts_with_retrieval(examples['text'], encoded_knowledge, model, tokenizer, max_length=max_length), batched=True)
test_dataset = test_dataset.map(lambda examples: encode_texts_with_retrieval(examples['text'], encoded_knowledge, model, tokenizer, max_length=max_length), batched=True)

Map:   0%|          | 0/154592 [00:00<?, ? examples/s]

Map:   0%|          | 0/17131 [00:00<?, ? examples/s]

Map:   0%|          | 0/32508 [00:00<?, ? examples/s]

In [14]:
print(train_dataset)
print(valid_dataset)
print(test_dataset)

Dataset({
    features: ['text', 'target', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 154592
})
Dataset({
    features: ['text', 'target', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 17131
})
Dataset({
    features: ['text', 'target', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 32508
})


In [15]:
# 初始化LabelEncoder
label_encoder = LabelEncoder()

# 拟合LabelEncoder并转换标签
train_labels_encoded = label_encoder.fit_transform(train_df['target'])
valid_labels_encoded = label_encoder.transform(valid_df['target'])
test_labels_encoded = label_encoder.transform(test_df['target'])

print(train_labels_encoded)
print(valid_labels_encoded)
print(test_labels_encoded)

[ 95  95  95 ... 165 169 184]
[108 108 108 ...  10  20 154]
[108 108  70 ... 102 102  80]


In [16]:
# 定义处理批次的函数
def process_batch(batch, indices):
    batch_labels = torch.tensor(train_labels_encoded[indices])
    return {'labels': batch_labels}

# 将标签转换为 torch.Tensor 类型，并添加到数据集中
train_dataset = train_dataset.map(process_batch, batched=True, batch_size=512, with_indices=True)
valid_dataset = valid_dataset.map(process_batch, batched=True, batch_size=512, with_indices=True)
test_dataset = test_dataset.map(process_batch, batched=True, batch_size=512, with_indices=True)

print(train_dataset)
print(valid_dataset)
print(test_dataset)

Map:   0%|          | 0/154592 [00:00<?, ? examples/s]

Map:   0%|          | 0/17131 [00:00<?, ? examples/s]

Map:   0%|          | 0/32508 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'target', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 154592
})
Dataset({
    features: ['text', 'target', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 17131
})
Dataset({
    features: ['text', 'target', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 32508
})


In [17]:
# 定义训练参数
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,  # 增加训练轮数
    per_device_train_batch_size=16,  # 增加训练批大小
    per_device_eval_batch_size=16,
    warmup_steps=2000,  # 减少预热步数
    weight_decay=0.001,
    learning_rate=5e-5,  # 添加学习率
    logging_dir='./logs',
    logging_steps=200,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    fp16=True,  # 启用混合精度训练
    gradient_accumulation_steps=2,  # 梯度累积
    # early_stopping_patience=3,  # 早停策略
    # seed=42,  # 设置随机种子
)



In [18]:
# 定义Trainer
# 使用 Trainer 类来管理模型训练过程
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [19]:
# 在模型训练之前确保参数的连续性
for name, param in model.named_parameters():
    if not param.is_contiguous():
        print(f"Making {name} contiguous.")
        param.data = param.data.contiguous()

Making bert.encoder.layer.0.attention.self.query.weight contiguous.
Making bert.encoder.layer.0.attention.self.key.weight contiguous.
Making bert.encoder.layer.0.attention.self.value.weight contiguous.
Making bert.encoder.layer.0.attention.output.dense.weight contiguous.
Making bert.encoder.layer.0.intermediate.dense.weight contiguous.
Making bert.encoder.layer.0.output.dense.weight contiguous.
Making bert.encoder.layer.1.attention.self.query.weight contiguous.
Making bert.encoder.layer.1.attention.self.key.weight contiguous.
Making bert.encoder.layer.1.attention.self.value.weight contiguous.
Making bert.encoder.layer.1.attention.output.dense.weight contiguous.
Making bert.encoder.layer.1.intermediate.dense.weight contiguous.
Making bert.encoder.layer.1.output.dense.weight contiguous.
Making bert.encoder.layer.2.attention.self.query.weight contiguous.
Making bert.encoder.layer.2.attention.self.key.weight contiguous.
Making bert.encoder.layer.2.attention.self.value.weight contiguous.
Ma

In [20]:
# 训练模型，调用 trainer.train() 开始训练
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.6062,8.637261
2,0.4963,9.272371
3,0.4241,9.478762
4,0.3496,9.87066
5,0.2739,10.282928
6,0.2094,10.812604
7,0.1636,11.018623
8,0.094,11.745259
9,0.0615,12.113572
10,0.0428,12.467425


TrainOutput(global_step=48310, training_loss=0.32033424975585306, metrics={'train_runtime': 62081.7434, 'train_samples_per_second': 24.901, 'train_steps_per_second': 0.778, 'total_flos': 1.4416308172249498e+18, 'train_loss': 0.32033424975585306, 'epoch': 10.0})

In [21]:
# 评估模型，使用 trainer.evaluate() 在验证集上评估模型性能
eval_results = trainer.evaluate()
print(f"验证集评估结果: {eval_results}")

验证集评估结果: {'eval_loss': 12.467425346374512, 'eval_runtime': 200.9035, 'eval_samples_per_second': 85.27, 'eval_steps_per_second': 5.331, 'epoch': 10.0}


In [22]:
# 进行预测
predictions = trainer.predict(test_dataset)
predicted_labels_encoded = predictions.predictions.argmax(-1)

# 使用 LabelEncoder 将整数编码转换回原始标签
predicted_labels = label_encoder.inverse_transform(predicted_labels_encoded)

# 打印预测结果
for i, label in enumerate(predicted_labels):
    print(f"预测指控：{label}")
    print(f"实际指控：{test_df['target'].iloc[i]}")
    print("------")

预测指控：信用卡诈骗
实际指控：盗窃
------
预测指控：盗窃
实际指控：盗窃
------
预测指控：强奸
实际指控：强奸
------
预测指控：盗窃
实际指控：盗窃
------
预测指控：故意伤害
实际指控：故意伤害
------
预测指控：贪污
实际指控：故意伤害
------
预测指控：故意伤害
实际指控：故意伤害
------
预测指控：盗窃
实际指控：盗窃
------
预测指控：故意伤害
实际指控：故意伤害
------
预测指控：盗窃
实际指控：盗窃
------
预测指控：盗窃
实际指控：盗窃
------
预测指控：盗窃
实际指控：盗窃
------
预测指控：合同诈骗
实际指控：诈骗
------
预测指控：盗窃
实际指控：盗窃
------
预测指控：盗窃
实际指控：盗窃
------
预测指控：故意伤害
实际指控：故意伤害
------
预测指控：盗窃
实际指控：盗窃
------
预测指控：故意伤害
实际指控：故意伤害
------
预测指控：盗窃
实际指控：盗窃
------
预测指控：诈骗
实际指控：诈骗
------
预测指控：抢劫
实际指控：盗窃
------
预测指控：盗窃
实际指控：盗窃
------
预测指控：盗窃
实际指控：盗窃
------
预测指控：强奸
实际指控：盗窃
------
预测指控：盗窃
实际指控：盗窃
------
预测指控：盗窃
实际指控：盗窃
------
预测指控：盗窃
实际指控：盗窃
------
预测指控：抢劫
实际指控：盗窃
------
预测指控：盗窃
实际指控：盗窃
------
预测指控：盗窃
实际指控：盗窃
------
预测指控：盗窃
实际指控：盗窃
------
预测指控：盗窃
实际指控：盗窃
------
预测指控：诈骗
实际指控：诈骗
------
预测指控：非法侵入住宅
实际指控：盗窃
------
预测指控：盗窃
实际指控：盗窃
------
预测指控：故意伤害
实际指控：故意伤害
------
预测指控：故意伤害
实际指控：故意伤害
------
预测指控：诈骗
实际指控：诈骗
------
预测指控：诈骗
实际指控：诈骗
------
预测指控：故意伤害
实际指控：故意伤害
------
预测指控：故意伤害
实际指控：故意伤害
------
预测指控：抢夺
实际

In [23]:
# 使用 sklearn.metrics 中的 accuracy_score 和 f1_score 来计算准确率和F1分数
from sklearn.metrics import accuracy_score, f1_score

# 准备实际标签
true_labels = test_df['target'].values

# 计算准确率
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"预测准确率：{accuracy}")

# 计算 Macro F1
macro_f1 = f1_score(true_labels, predicted_labels, average='macro')
print(f"Macro F1: {macro_f1}")

# 计算 Micro F1
micro_f1 = f1_score(true_labels, predicted_labels, average='micro')
print(f"Micro F1: {micro_f1}")

预测准确率：0.8871662360034454
Macro F1: 0.8271501624283938
Micro F1: 0.8871662360034454
