In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm

# 读取文件
file_path = '1.csv'  # 请将路径替换为您的文件路径
data = pd.read_csv(file_path, encoding='gbk')

# 确定标签数量
num_labels = len(data['label'].unique())

# 加载BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# 数据预处理函数
def preprocess(data, tokenizer, max_length=128):
    encodings = tokenizer(
        data['question'].tolist(),
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )
    labels = torch.tensor(data['label'].tolist())
    return encodings, labels

# 将数据划分为训练集和验证集
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# 预处理训练和验证数据
train_encodings, train_labels = preprocess(train_data, tokenizer)
val_encodings, val_labels = preprocess(val_data, tokenizer)

# 创建Dataset对象
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

# 数据加载器
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# 加载BERT模型
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=num_labels)

# 选择优化器和损失函数
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

# 将模型设置为训练模式
model.train()

# 训练循环
num_epochs = 10
for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    epoch_loss = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].squeeze(1)
        attention_mask = batch['attention_mask'].squeeze(1)
        labels = batch['labels']

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    
    avg_loss = epoch_loss / len(train_loader)
    print(f'Average training loss: {avg_loss}')

# 评估模型
model.eval()
val_loss = 0
correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].squeeze(1)
        attention_mask = batch['attention_mask'].squeeze(1)
        labels = batch['labels']

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        val_loss += loss.item()

        _, predicted = torch.max(outputs.logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

avg_val_loss = val_loss / len(val_loader)
accuracy = correct / total

print(f'Validation Loss: {avg_val_loss}')
print(f'Validation Accuracy: {accuracy}')

# 保存模型权重
model_save_path = 'bert_model_weights.pth'
torch.save(model.state_dict(), model_save_path)
print(f'Model weights saved to {model_save_path}')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10


100%|██████████| 163/163 [11:51<00:00,  4.36s/it]


Average training loss: 0.7127260398691052
Epoch 2/10


100%|██████████| 163/163 [11:59<00:00,  4.42s/it]


Average training loss: 0.0767094462687161
Epoch 3/10


100%|██████████| 163/163 [11:42<00:00,  4.31s/it]


Average training loss: 0.028270753643309776
Epoch 4/10


100%|██████████| 163/163 [11:24<00:00,  4.20s/it]


Average training loss: 0.01557112114641107
Epoch 5/10


100%|██████████| 163/163 [11:24<00:00,  4.20s/it]


Average training loss: 0.10099800203018393
Epoch 6/10


100%|██████████| 163/163 [11:27<00:00,  4.22s/it]


Average training loss: 0.04084696591706044
Epoch 7/10


100%|██████████| 163/163 [11:39<00:00,  4.29s/it]


Average training loss: 0.00813510620600881
Epoch 8/10


100%|██████████| 163/163 [11:48<00:00,  4.34s/it]


Average training loss: 0.00668761181713659
Epoch 9/10


 60%|█████▉    | 97/163 [07:08<05:21,  4.87s/it]

In [24]:
model_save_path = 'bert_model_weights.pth'
torch.save(model.state_dict(), model_save_path)
print(f'Model weights saved to {model_save_path}')

Model weights saved to bert_model_weights.pth


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
# 读取文件
file_path = '1.csv'  # 请将路径替换为您的文件路径
data = pd.read_csv(file_path, encoding='gbk')

# 确定标签数量
num_labels = len(data['label'].unique())
# 加载tokenizer和模型
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=num_labels)  # 确保num_labels定义正确

# 加载保存的模型权重
model_path = 'bert_model_weights.pth'  # 请将路径替换为保存模型权重的路径
try:
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
    model.eval()
    print(f'Model weights loaded from {model_path}')
except Exception as e:
    print(f"Error loading model weights: {e}")

# 定义预测函数
def predict(sentence):
    # 预处理输入句子
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128)
    
    # 进行预测
    with torch.no_grad():
        outputs = model(**inputs)
    
    # 获取预测结果
    logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=1).item()
    
    return predicted_class_id

# 测试预测函数
sentence = "请问你知道李世民的封号吗？"
predicted_class = predict(sentence)
print(f"预测类别: {predicted_class}")


In [7]:
import pandas as pd
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
from sklearn.model_selection import train_test_split
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm
import numpy as np

# 读取文件
file_path = 'sentences.csv'  # 请将路径替换为您的文件路径
data = pd.read_csv(file_path, encoding='gbk')

# 假设标签是以逗号分隔的字符串，如 'label1,label2'
# 将标签转换为多热编码
all_labels = set()

def process_labels(label):
    if isinstance(label, int):
        label = str(label)
    return label.split(',')

data['label'] = data['label'].apply(process_labels)
for labels in data['label']:
    all_labels.update(labels)
all_labels = list(all_labels)
label_map = {label: i for i, label in enumerate(all_labels)}

data['label'] = data['label'].apply(lambda x: [label_map[label] for label in x])
data['label'] = data['label'].apply(lambda x: np.eye(len(all_labels))[x].sum(axis=0))

# 加载BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# 数据预处理函数
def preprocess(data, tokenizer, max_length=128):
    encodings = tokenizer(
        data['question'].tolist(),
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )
    labels = torch.tensor(data['label'].tolist(), dtype=torch.float32)
    return encodings, labels

# 将数据划分为训练集和验证集
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# 预处理训练和验证数据
train_encodings, train_labels = preprocess(train_data, tokenizer)
val_encodings, val_labels = preprocess(val_data, tokenizer)

# 创建Dataset对象
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

# 数据加载器
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# 加载BERT模型并添加一个全连接层
class BertForMultiLabelClassification(nn.Module):
    def __init__(self, model_name, num_labels):
        super(BertForMultiLabelClassification, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
    
    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits, labels)
        
        return logits if loss is None else (loss, logits)

model = BertForMultiLabelClassification('bert-base-chinese', num_labels=len(all_labels))

# 选择优化器
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# 将模型设置为训练模式
model.train()

# 训练循环
num_epochs = 3
for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    epoch_loss = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].squeeze(1)
        attention_mask = batch['attention_mask'].squeeze(1)
        labels = batch['labels']

        loss, outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    
    avg_loss = epoch_loss / len(train_loader)
    print(f'Average training loss: {avg_loss}')

# 评估模型
model.eval()
val_loss = 0
total = 0
correct_predictions = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].squeeze(1)
        attention_mask = batch['attention_mask'].squeeze(1)
        labels = batch['labels']

        loss, outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        val_loss += loss.item()

        predictions = torch.sigmoid(outputs) > 0.5
        correct_predictions += (predictions == labels).sum().item()
        total += labels.numel()

avg_val_loss = val_loss / len(val_loader)
accuracy = correct_predictions / total

print(f'Validation Loss: {avg_val_loss}')
print(f'Validation Accuracy: {accuracy}')

# 保存模型权重
model_save_path = 'bert_model_weights_multi.pth'
torch.save(model.state_dict(), model_save_path)
print(f'Model weights saved to {model_save_path}')

# 输出多分类数组
test_data = ["你知道李鸿章的官职和出生日期吗？"]  # 替换为您的测试数据
test_encodings = tokenizer(test_data, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
test_input_ids = test_encodings['input_ids']
test_attention_mask = test_encodings['attention_mask']

model.eval()
with torch.no_grad():
    test_outputs = model(test_input_ids, test_attention_mask)
    test_predictions = torch.sigmoid(test_outputs) > 0.5
    print("Test Predictions:")
    for i, prediction in enumerate(test_predictions):
        labels = [all_labels[j] for j, val in enumerate(prediction) if val]
        print(f"Sentence: {test_data[i]}")
        print(f"Predicted Labels: {labels}")


Epoch 1/3


100%|██████████| 85/85 [04:23<00:00,  3.10s/it]


Average training loss: 0.22188058434163824
Epoch 2/3


100%|██████████| 85/85 [04:23<00:00,  3.10s/it]


Average training loss: 0.04733705812079065
Epoch 3/3


100%|██████████| 85/85 [04:22<00:00,  3.08s/it]


Average training loss: 0.026171437979621046
Validation Loss: 0.020316480912945488
Validation Accuracy: 0.9985294117647059
Model weights saved to bert_model_weights_multi.pth
Test Predictions:
Sentence: 你知道李鸿章的官职和出生日期吗？
Predicted Labels: ['1']


In [25]:
model.load_state_dict(torch.load('bert_model_weights_multi.pth'))
test_data = ["你知道李鸿章的官职和出生日期吗？"]  # 替换为您的测试数据
test_encodings = tokenizer(test_data, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
test_input_ids = test_encodings['input_ids']
test_attention_mask = test_encodings['attention_mask']

model.eval()
with torch.no_grad():
    test_outputs = model(test_input_ids, test_attention_mask)
    test_predictions = torch.sigmoid(test_outputs) > 0.5
    print("Test Predictions:")
    for i, prediction in enumerate(test_predictions):
        labels = [all_labels[j] for j, val in enumerate(prediction) if val]
        print(f"Sentence: {test_data[i]}")
        print(f"Predicted Labels: {labels}")


RuntimeError: Error(s) in loading state_dict for BertForSequenceClassification:
	size mismatch for classifier.weight: copying a param with shape torch.Size([15, 768]) from checkpoint, the shape in current model is torch.Size([16, 768]).
	size mismatch for classifier.bias: copying a param with shape torch.Size([15]) from checkpoint, the shape in current model is torch.Size([16]).

In [9]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# 读取上传的CSV文件
file_path = '1.csv'
data = pd.read_csv(file_path, encoding='gbk')

# 查看数据
print(data.head())

# 假设我们有两个标签：籍贯(0)和出生年份(1)
num_labels = len(data['label'].unique())

# 创建多标签向量
def create_multi_label(row):
    multi_label = [0] * num_labels
    multi_label[row['label']] = 1
    return multi_label

data['multi_label'] = data.apply(create_multi_label, axis=1)

# 准备数据集
class MultiLabelDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx]

        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

        labels = torch.tensor(labels, dtype=torch.float)

        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = labels

        return item

# 加载BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# 创建数据集
texts = data['question'].tolist()
labels = data['label'].tolist()
max_len = 128

dataset = MultiLabelDataset(texts, labels, tokenizer, max_len)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

# 训练模型
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=num_labels)
optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(3):
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = torch.nn.BCEWithLogitsLoss()(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch}, Loss: {loss.item()}")

# 保存模型
model_save_path = 'bert_multi_label_model.pth'
torch.save(model.state_dict(), model_save_path)
print(f'Model weights saved to {model_save_path}')


     question  label
0  请问a的籍贯是哪里？      0
1   你知道a的籍贯吗？      0
2      a是哪的人？      0
3      a是哪里人？      0
4    a的家乡在哪里？      0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 15]))

In [11]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# 读取上传的CSV文件
file_path = '1.csv'
data = pd.read_csv(file_path, encoding='gbk')

# 查看数据
print(data.head())

# 假设我们有两个标签：籍贯(0)和出生年份(1)
num_labels = len(data['label'].unique())

# 创建多标签向量
def create_multi_label(row):
    multi_label = [0] * num_labels
    multi_label[row['label']] = 1
    return multi_label

data['multi_label'] = data.apply(create_multi_label, axis=1)

# 准备数据集
class MultiLabelDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx]

        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

        labels = torch.tensor(labels, dtype=torch.float)

        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = labels

        return item

# 加载BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# 创建数据集
texts = data['question'].tolist()  # 确保使用正确的列名
labels = data['multi_label'].tolist()  # 确保使用多标签列
max_len = 128

dataset = MultiLabelDataset(texts, labels, tokenizer, max_len)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

# 训练模型
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=num_labels)
optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(3):
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = torch.nn.BCEWithLogitsLoss()(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch}, Loss: {loss.item()}")

# 保存模型
model_save_path = 'bert_multi_label_model.pth'
torch.save(model.state_dict(), model_save_path)
print(f'Model weights saved to {model_save_path}')


     question  label
0  请问a的籍贯是哪里？      0
1   你知道a的籍贯吗？      0
2      a是哪的人？      0
3      a是哪里人？      0
4    a的家乡在哪里？      0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0, Loss: 0.7667551040649414
Epoch 0, Loss: 0.6823835372924805
Epoch 0, Loss: 0.616031289100647
Epoch 0, Loss: 0.5834007859230042
Epoch 0, Loss: 0.5292381644248962
Epoch 0, Loss: 0.4958725869655609
Epoch 0, Loss: 0.4733896255493164
Epoch 0, Loss: 0.4600932002067566
Epoch 0, Loss: 0.4359320104122162
Epoch 0, Loss: 0.4236968159675598
Epoch 0, Loss: 0.4078444242477417
Epoch 0, Loss: 0.38233062624931335
Epoch 0, Loss: 0.3815326690673828
Epoch 0, Loss: 0.37401801347732544
Epoch 0, Loss: 0.37017694115638733
Epoch 0, Loss: 0.35244953632354736
Epoch 0, Loss: 0.34605684876441956
Epoch 0, Loss: 0.3288227319717407
Epoch 0, Loss: 0.3219880759716034
Epoch 0, Loss: 0.3169013559818268
Epoch 0, Loss: 0.30686473846435547
Epoch 0, Loss: 0.30512863397598267
Epoch 0, Loss: 0.29622724652290344
Epoch 0, Loss: 0.279869019985199
Epoch 0, Loss: 0.301534503698349
Epoch 0, Loss: 0.28427669405937195
Epoch 0, Loss: 0.27181658148765564
Epoch 0, Loss: 0.273240864276886
Epoch 0, Loss: 0.2680374085903168
Epoch 0,

In [19]:
# 加载模型并进行推理
model.eval()
with torch.no_grad():
    text = "a的别名是什么？"
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.sigmoid(logits)
    predicted_labels = (probabilities > 0.5).int()
    print(predicted_labels)


tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]], dtype=torch.int32)


In [ ]:
import spacy
import pandas as pd

# 加载预训练的SpaCy模型
nlp = spacy.load('zh_core_web_sm')

# 示例问题列表
questions = [
    "李鸿章的出生日期和官职是什么？",
    "请问李鸿章的出生日期和官职是什么？",
    "李鸿章的生日和职位是什么？",
    "你知道李鸿章的出生日期和官职吗？",
    "李鸿章的生日和他的职位是什么？",
    # 继续添加其他问题
]

# 定义一个函数来识别人名
def extract_name(question):
    doc = nlp(question)
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            return ent.text
    return None

# 创建一个包含识别结果的DataFrame
data = {'问题编号': range(1, len(questions) + 1), '问题': questions, '识别出的人名': [extract_name(q) for q in questions]}
df = pd.DataFrame(data)

# 保存为CSV文件
df.to_csv('questions_with_names.csv', index=False, encoding='utf-8')

print(df)
