In [1]:
#数据集的处理
import os
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
class ChineseSegmentationDataset(Dataset):
    def __init__(self, data_path, max_len=50):
        self.data_path = data_path
        self.max_len = max_len
        self.sentences, self.labels = self.load_data()

    def load_data(self):
        sentences = []
        labels = []
        with open(self.data_path, 'r', encoding='utf-8') as f:
            for line in f:
                words = line.strip().split()
                sentence = []
                label = []
                for word in words:
                    if len(word) == 1:
                        sentence.append(word)
                        label.append('S')
                    else:
                        sentence.append(word[0])
                        label.append('B')
                        for char in word[1:-1]:
                            sentence.append(char)
                            label.append('M')
                        sentence.append(word[-1])
                        label.append('E')
                sentences.append(sentence)
                labels.append(label)
        return sentences, labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx][:self.max_len]
        label = self.labels[idx][:self.max_len]
        return sentence, label

# 示例数据集路径
data_path = '/home/shaoxiong/exa/人工智能大作业/dataset/icwb2-data/training/msr_training.utf8'
dataset = ChineseSegmentationDataset(data_path)

In [2]:
class RNNSegmentationModel(nn.Module):
    def __init__(self, vocab_size, tag_size, embedding_dim=128, hidden_dim=256, num_layers=2):
        super(RNNSegmentationModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, tag_size)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        tag_space = self.fc(lstm_out)
        return tag_space

# 构建词汇表和标签表
vocab = set()
tags = set()
for sentence in dataset.sentences:
    vocab.update(sentence)
for label in dataset.labels:
    tags.update(label)
    
vocab_size = len(vocab)
tag_size = len(tags)

# 构建字符到索引的映射
char_to_idx = {char: idx for idx, char in enumerate(vocab)}
tag_to_idx = {tag: idx for idx, tag in enumerate(tags)}

In [9]:
char_to_idx['蹶']

KeyError: '蹶'

In [3]:
# 将数据转换为索引
def encode_data(sentences, labels):
    encoded_sentences = []
    encoded_labels = []
    for sentence, label in zip(sentences, labels):
        encoded_sentence = [char_to_idx[char] for char in sentence]
        encoded_label = [tag_to_idx[tag] for tag in label]
        encoded_sentences.append(encoded_sentence)
        encoded_labels.append(encoded_label)
    return encoded_sentences, encoded_labels

encoded_sentences, encoded_labels = encode_data(dataset.sentences, dataset.labels)

In [4]:
# 将数据转换为Tensor
from torch.nn.utils.rnn import pad_sequence
encoded_sentences = [torch.tensor(seq, dtype=torch.long) for seq in encoded_sentences]
encoded_labels = [torch.tensor(seq, dtype=torch.long) for seq in encoded_labels]

padded_sentences = pad_sequence(encoded_sentences, batch_first=True, padding_value=0)
padded_labels = pad_sequence(encoded_labels, batch_first=True, padding_value=0)

print(padded_sentences.shape)
print(padded_labels.shape)

# 构建DataLoader
train_data = torch.utils.data.TensorDataset(padded_sentences, padded_labels)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

# 初始化模型、损失函数和优化器
model = RNNSegmentationModel(vocab_size, tag_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

torch.Size([86924, 581])
torch.Size([86924, 581])


  _torch_pytree._register_pytree_node(


In [6]:
def train_model(model, train_loader, criterion, optimizer, epochs=1, device='cuda'):
    model.to(device)
    model.train()
    print('start training')
    for epoch in range(epochs):
        total_loss = 0
        print(f'Starting epoch {epoch+1}/{epochs}')
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            if batch_idx % 100 == 0:
                print(f'Starting batch {batch_idx+1}/{len(train_loader)}')
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.view(-1, outputs.size(-1)), targets.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}')

# 假设 model, train_loader, criterion 和 optimizer 已经定义
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_model(model, train_loader, criterion, optimizer, epochs=1, device=device)

start training
Starting epoch 1/1
Starting batch 1/2717
Starting batch 101/2717
Starting batch 201/2717
Starting batch 301/2717
Starting batch 401/2717
Starting batch 501/2717
Starting batch 601/2717
Starting batch 701/2717
Starting batch 801/2717
Starting batch 901/2717
Starting batch 1001/2717
Starting batch 1101/2717
Starting batch 1201/2717
Starting batch 1301/2717
Starting batch 1401/2717
Starting batch 1501/2717
Starting batch 1601/2717
Starting batch 1701/2717
Starting batch 1801/2717
Starting batch 1901/2717
Starting batch 2001/2717
Starting batch 2101/2717
Starting batch 2201/2717
Starting batch 2301/2717
Starting batch 2401/2717
Starting batch 2501/2717
Starting batch 2601/2717
Starting batch 2701/2717
Epoch 1/1, Loss: 0.024512715492518204


In [7]:
def evaluate_model(model, test_data_path):
    model.eval()
    test_dataset = ChineseSegmentationDataset(test_data_path)
    encoded_test_sentences, _ = encode_data(test_dataset.sentences, test_dataset.labels)
    encoded_test_sentences = torch.tensor(encoded_test_sentences, dtype=torch.long)

    with torch.no_grad():
        outputs = model(encoded_test_sentences)
        predicted_tags = torch.argmax(outputs, dim=2)

    # 将预测的标签转换回字符
    idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}
    predicted_tags = [[idx_to_tag[idx.item()] for idx in tag_seq] for tag_seq in predicted_tags]

    # 计算准确率
    correct = 0
    total = 0
    for pred, true in zip(predicted_tags, test_dataset.labels):
        for p, t in zip(pred, true):
            if p == t:
                correct += 1
            total += 1

    accuracy = correct / total
    print(f'Accuracy: {accuracy * 100:.2f}%')

In [None]:
model.eval()
outputs = model(padded_sentences[0].to('cuda'))
predicted_tags = torch.argmax(outputs)


tensor([[ 5.4841, -0.4276, -5.2244, -4.0390],
        [ 0.1195,  6.1962, -4.8442, -1.8282],
        [-0.5346, -4.9230,  5.4511, -0.1871],
        ...,
        [10.2005, -5.1622, -5.9810, -6.4846],
        [ 9.4669, -4.2138, -6.1826, -5.8788],
        [ 7.7843, -2.4542, -6.2304, -4.7637]], device='cuda:0',
       grad_fn=<AddmmBackward0>)