In [8]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import LongformerTokenizer, LongformerForSequenceClassification
from torch.optim import AdamW
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [9]:
# File names and the folder that contains them
file_names = ['TPL.txt', 'OKE.txt', 'CHK.txt', 'BKR.txt']
file_folder = 'experiment'
texts = []
labels = ['Negligible', 'Medium', 'High', 'Low']  # example labels, one per entry of file_names

# file_names and labels are parallel lists: texts[i] is later paired with
# labels[i]. Fail early on a length mismatch instead of mis-pairing documents
# and labels further down.
if len(file_names) != len(labels):
    raise ValueError(
        f"Expected one label per file, got {len(file_names)} files and {len(labels)} labels"
    )

# Read the full content of every file
for file_name in file_names:
    file_path = os.path.join(file_folder, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        texts.append(content)

# Initialise the Longformer tokenizer and the sequence-classification model.
# The number of output classes is derived from the distinct labels rather
# than hard-coded, so it stays in sync when the label list changes.
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerForSequenceClassification.from_pretrained(
    'allenai/longformer-base-4096', num_labels=len(set(labels))
)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', '

In [10]:
# Custom dataset class
class TextDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Sequence of raw text strings.
        labels: Sequence of labels indexed in parallel with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', truncation=True,
            padding='max_length', max_length=max_length)``; its result must
            provide ``'input_ids'`` and ``'attention_mask'`` entries.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length=4096):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for item ``idx``."""
        text = self.texts[idx]
        label = self.labels[idx]
        encoded_input = self.tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=self.max_length)
        # Drop only the leading batch dimension added for the single text.
        # A bare squeeze() would also remove the sequence dimension whenever
        # it has size 1 (e.g. max_length == 1), yielding 0-d tensors.
        input_ids = encoded_input['input_ids'].squeeze(0)
        attention_mask = encoded_input['attention_mask'].squeeze(0)
        return input_ids, attention_mask, label
 
# Encode the string labels as integer class ids
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Build the dataset and the data loader
dataset = TextDataset(texts, encoded_labels, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Train the model
model.train()
for epoch in range(6):  # train for 6 epochs
    total_loss = 0
    for batch in dataloader:
        input_ids, attention_mask, labels_batch = batch
        labels_batch = labels_batch.to(torch.long)  # cast the labels to long integers
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels_batch)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    # Mean loss over the batches of this epoch
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(dataloader)}")


# Evaluate the model
# NOTE(review): this iterates the same dataloader that was used for training,
# so the report below measures fit on the training texts, not generalisation.
model.eval()
preds = []
true_labels = []
with torch.no_grad():
    for batch in dataloader:
        input_ids, attention_mask, labels_batch = batch
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(labels_batch.tolist())

# Print the classification report.
# LabelEncoder assigns ids 0..n_classes-1 in the order of classes_, so passing
# those ids explicitly keeps the rows aligned with target_names even when some
# class does not occur in true_labels/preds (otherwise the call raises).
class_ids = list(range(len(label_encoder.classes_)))
print(classification_report(true_labels, preds, labels=class_ids, target_names=label_encoder.classes_))

Epoch 1, Loss: 1.447771966457367
Epoch 2, Loss: 1.3757325410842896
Epoch 3, Loss: 1.146630346775055
Epoch 4, Loss: 1.1591147780418396
Epoch 5, Loss: 0.9996614456176758
Epoch 6, Loss: 0.8636733889579773
              precision    recall  f1-score   support

        High       1.00      1.00      1.00         1
         Low       1.00      1.00      1.00         1
      Medium       1.00      1.00      1.00         1
  Negligible       1.00      1.00      1.00         1

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4



In [11]:
save_directory = './saved_model_v1'

# exist_ok=True creates the directory only when it is missing, replacing the
# separate exists()/makedirs() pair, which could fail if the directory
# appeared between the check and the creation.
os.makedirs(save_directory, exist_ok=True)

# Write the fine-tuned model and the tokenizer files into the same directory
# so both can be reloaded from one path.
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('./saved_model_v1/tokenizer_config.json',
 './saved_model_v1/special_tokens_map.json',
 './saved_model_v1/vocab.json',
 './saved_model_v1/merges.txt',
 './saved_model_v1/added_tokens.json')

In [17]:
# Reload the fine-tuned classifier and its tokenizer from the save directory
save_directory = './saved_model_v1'
loaded_model = LongformerForSequenceClassification.from_pretrained(save_directory)
loaded_tokenizer = LongformerTokenizer.from_pretrained(save_directory)

def predict(texts):
    """Predict a label for each input text with the reloaded model.

    Args:
        texts: List of text strings; it is passed unchanged to
            ``loaded_tokenizer``, which pads/truncates each text to 4096 tokens.

    Returns:
        The result of ``label_encoder.inverse_transform`` applied to the
        arg-max class id of each text, i.e. the decoded label per text.
    """
    inputs = loaded_tokenizer(texts, return_tensors='pt', truncation=True, padding='max_length', max_length=4096)
    # Inference only: disable autograd so no computation graph is recorded
    # for the forward pass.
    with torch.no_grad():
        outputs = loaded_model(**inputs)
    predictions = outputs.logits.argmax(dim=-1)
    # .cpu() is a no-op on CPU tensors and makes .numpy() valid if the model
    # has been moved to another device.
    predicted_labels = label_encoder.inverse_transform(predictions.cpu().numpy())
    return predicted_labels

# Prediction example: classify one document read from disk
with open('experiment/OVV.txt', 'r', encoding='utf-8') as file:
    unseen_document = file.read()

new_texts = [unseen_document]  # wrap the single text in a list
predictions = predict(new_texts)
print(predictions)

['High']
