In [44]:
from datasets import Dataset, concatenate_datasets
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments, AdamW
import json
import torch

In [2]:
highlight_path = "HighlightData.json"
HighlightData = {}
with open(highlight_path, encoding="utf-8") as f:
    HighlightData = json.load(f)

In [19]:
len(HighlightData["context"])

239

In [21]:
len(HighlightData["highlights"])

239

In [5]:
contexts = HighlightData['context']
highlights = HighlightData['highlights']

In [6]:
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext-large')

In [7]:
# sentences = contexts[0][0][0].split('，')
# highlight = highlights[0][0][0]
# sentence_highlights = [highlight[start:start+len(sentence)] for start, sentence in enumerate(sentences)]
# input_ids = []
# label_ids = []
# for i, sentence in enumerate(sentences):
#     encoded = tokenizer.encode_plus(
#         sentence,
#         add_special_tokens=True,
#         truncation=True,
#         max_length=16,
#         padding='max_length'
#     )
#     input_ids.append(encoded['input_ids'])
#     # 如果句子应被突出显示，则在 [CLS] 位置标记为 1，否则标记为 0
#     cls_label = 1 if highlight[i] == 1 else 0
#     label_ids.append(cls_label)
# sentences,sentence_highlights,input_ids,label_ids

In [8]:
def preprocess_function(contexts, highlights):
    inputs = []
    labels = []
    attention = []

    for doc_contexts,doc_highlights in zip(contexts,highlights):
        for doc_context, doc_highlight in zip(doc_contexts, doc_highlights):
            for context, highlight in zip(doc_context, doc_highlight):
                sentences = context.split('，')  # 按逗号分割句子
                sentence_highlights = [highlight[start:start+len(sentence)] for start, sentence in enumerate(sentences)]

                for i, (sentence, sentence_highlight) in enumerate(zip(sentences, sentence_highlights)):
                    encoded = tokenizer.encode_plus(
                        sentence,
                        add_special_tokens=True,
                        truncation=True,
                        max_length=512,
                        padding='max_length'
                    )
                    
                    # 如果句子应被突出显示，则在 [CLS] 位置标记为 1，否则标记为 0
                    cls_label = 1 if any(sentence_highlight) else 0

                    inputs.append(encoded['input_ids'])
                    labels.append(cls_label)
                    attention.append(encoded['attention_mask'])

    # 转换为 tensor
    inputs = torch.tensor(inputs, dtype=torch.long)
    labels = torch.tensor(labels, dtype=torch.long)
    attention = torch.tensor(attention, dtype=torch.long)
    return {'input_ids': inputs, 'attention': attention, 'labels': labels}

In [9]:
dataset = Dataset.from_dict(preprocess_function(contexts,highlights))

In [39]:
dataset

Dataset({
    features: ['input_ids', 'attention', 'labels'],
    num_rows: 221571
})

In [31]:
labels_cal = dataset['labels']
pos = 0
neg = 0
for i in labels_cal:
    if i == 0:
        neg = neg + 1
    else:
        pos = pos + 1

pos,neg*0.4

(32597, 75589.6)

In [29]:
# 分离正例和负例
positive_examples = dataset.filter(lambda example: example['labels'] == 1)
negative_examples = dataset.filter(lambda example: example['labels'] == 0)

Filter: 100%|██████████| 221571/221571 [00:48<00:00, 4601.45 examples/s]
Filter: 100%|██████████| 221571/221571 [00:47<00:00, 4682.22 examples/s]


In [32]:
# 计算40%负例的数量
negative_sample_size = int(0.4 * len(negative_examples))

# 随机抽样
negative_sampled = negative_examples.shuffle(seed=42).select(range(negative_sample_size))

In [36]:
# 合并正例和抽样后的负例
balanced_dataset = concatenate_datasets([positive_examples, negative_sampled])

# 随机打乱数据集
# balanced_dataset = balanced_dataset.shuffle(seed=42)

In [38]:
balanced_dataset

Dataset({
    features: ['input_ids', 'attention', 'labels'],
    num_rows: 108186
})

In [48]:
model = BertForSequenceClassification.from_pretrained("hfl/chinese-roberta-wwm-ext-large", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-roberta-wwm-ext-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
# 划分数据集
train_dataset, test_dataset = balanced_dataset.train_test_split(test_size=0.2).values()

In [42]:
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

In [43]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

In [54]:
# 定义训练参数
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_eval_batch_size=16,
)

# 创建Trainer对象
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    optimizers=(AdamW(model.parameters(), lr=5e-5), None)  # 自定义优化器和调度器
)

RuntimeError: MPS backend out of memory (MPS allocated: 17.91 GB, other allocations: 256.67 MB, max allowed: 18.13 GB). Tried to allocate 4.00 KB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [50]:
trainer.train()

RuntimeError: MPS backend out of memory (MPS allocated: 17.58 GB, other allocations: 514.67 MB, max allowed: 18.13 GB). Tried to allocate 128.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
# 评估模型
results = trainer.evaluate()
print(results)

In [None]:
# 预测
predictions = trainer.predict(test_dataset)
print(predictions)

In [34]:
train_dataset['input_ids'][0]

[101,
 4197,
 1400,
 6821,
 702,
 7027,
 7481,
 4638,
 833,
 2372,
 3341,
 6821,
 702,
 6432,
 3209,
 7564,
 3844,
 4638,
 671,
 763,
 671,
 763,
 671,
 702,
 4801,
 2595,
 4638,
 671,
 763,
 6206,
 3724,
 511,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,

In [17]:
# 设置优化器和学习率调度器
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
criterion = torch.nn.CrossEntropyLoss().to('cuda')  # 将损失函数移动到GPU

In [19]:
# 训练数据加载器
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [36]:
for batch_idx, batch in enumerate(train_loader):
    print(batch_idx,len(batch['input_ids']))

0 512
1 512
2 512
3 512
4 512
5 512
6 512
7 512
8 512
9 512
10 512
11 512
12 512
13 512
14 512
15 512
16 512
17 512
18 512
19 512
20 512
21 512
22 512
23 512
24 512
25 512
26 512
27 512
28 512
29 512
30 512
31 512
32 512
33 512
34 512
35 512
36 512
37 512
38 512
39 512
40 512
41 512
42 512
43 512
44 512
45 512
46 512
47 512
48 512
49 512
50 512
51 512
52 512
53 512
54 512
55 512
56 512
57 512
58 512
59 512
60 512
61 512
62 512
63 512
64 512
65 512
66 512
67 512
68 512
69 512
70 512
71 512
72 512
73 512
74 512
75 512
76 512
77 512
78 512
79 512
80 512
81 512
82 512
83 512
84 512
85 512
86 512
87 512
88 512
89 512
90 512
91 512
92 512
93 512
94 512
95 512
96 512
97 512
98 512
99 512
100 512
101 512
102 512
103 512
104 512
105 512
106 512
107 512
108 512
109 512
110 512
111 512
112 512
113 512
114 512
115 512
116 512
117 512
118 512
119 512
120 512
121 512
122 512
123 512
124 512
125 512
126 512
127 512
128 512
129 512
130 512
131 512
132 512
133 512
134 512
135 512
136 512
137 512
138 51

In [26]:
# 模型训练
model.train()
for epoch in range(1):
    for batch_idx, batch in enumerate(train_loader):
        # 假设batch是一个列表，并且列表中的元素顺序是 [input_ids, attention_mask, labels]
        inputs, attention_mask, labels = batch
        
        # 执行模型的前向传播
        outputs = model(
            input_ids=inputs,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        
        # 执行反向传播和优化器步骤
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # 打印日志信息
        if batch_idx % 10 == 0:
            print(f"Epoch {epoch}, Batch {batch_idx}, Loss {loss.item()}")

AttributeError: 'str' object has no attribute 'size'