Skip to content

Latest commit

 

History

History
281 lines (243 loc) · 9.85 KB

9.2-文本分类实战.md

File metadata and controls

281 lines (243 loc) · 9.85 KB

此处还用到torchtext,针对NLP的文本预处理功能模块。

1、读取数据 数据源:斯坦福的IMDb数据集(Stanford’s Large Movie Review Dataset)

def read_imdb(folder='train', data_root="./dataset/aclImdb_v1/aclImdb"):
    '''
    Load every review of one IMDb split into memory.

    @params:
        folder: name of the split directory below data_root (e.g. 'train' or 'test')
        data_root: directory that contains the split directories
    @return: shuffled list of [review text, label] pairs; the text is lower-cased
             with newlines removed, the label is 1 for 'pos' and 0 for 'neg'
    '''
    samples = []
    for sentiment, target in (('pos', 1), ('neg', 0)):
        review_dir = os.path.join(data_root, folder, sentiment)
        for file_name in tqdm(os.listdir(review_dir)):
            review_path = os.path.join(review_dir, file_name)
            # Read raw bytes and decode explicitly so the result does not
            # depend on the platform's default text encoding.
            with open(review_path, 'rb') as handle:
                raw_bytes = handle.read()
            text = raw_bytes.decode('utf-8').replace('\n', '').lower()
            samples.append([text, target])
    random.shuffle(samples)
    return samples

# Location of the extracted IMDb archive on the host machine.
DATA_ROOT = "/home/kesci/input/IMDB2578/aclImdb_v1/"
data_root = os.path.join(DATA_ROOT, "aclImdb")
train_data = read_imdb('train', data_root)
test_data = read_imdb('test', data_root)

# Print the label and the first 50 characters of the first five training samples.
for text, label in train_data[:5]:
    print(label, '\t', text[:50])

2、数据预处理

def get_tokenized_imdb(data):
    '''
    Split every review into lower-cased, space-delimited tokens.

    @params:
        data: list of samples; each element is a [text string, 0/1 label] pair
    @return: list of token lists, one per review, in the same order as data
    '''
    # str.lower() returns a new string, so its result must be the value that is
    # stored. Splitting on a single space (not on arbitrary whitespace) keeps
    # the existing tokenization, including the empty tokens that consecutive
    # spaces produce.
    return [[tok.lower() for tok in review.split(' ')] for review, _ in data]

def get_vocab_imdb(data):
    '''
    Build the vocabulary of a dataset from its token frequencies.

    @params:
        data: list of samples; each element is a [text string, 0/1 label] pair
    @return: Vocab instance (freqs, stoi, itos) created from the token counts
             with min_freq=5
    '''
    token_counts = collections.Counter()
    for tokens in get_tokenized_imdb(data):
        token_counts.update(tokens)
    return Vocab.Vocab(token_counts, min_freq=5)

工程知识点:

  • torchtext.vocab.Vocab() 构建当前 corpus 的词汇表
  • collections.Counter() 文本计数器

词典和词语的索引创建好后,就可以将数据集的文本从字符串的形式转换为单词下标序列的形式,以待之后的使用

def preprocess_imdb(data, vocab, max_l=500):
    '''
    Convert raw samples into fixed-length index sequences and a label tensor.

    @params:
        data: list of samples; each element is a [text string, 0/1 label] pair
        vocab: vocabulary built on the training set; only vocab.stoi is used
        max_l: length every review is truncated or padded to (default 500)
    @return:
        features: word indices, integer tensor of shape (n, max_l)
        labels: sentiment labels, integer tensor of shape (n,)
    '''
    def pad(x):
        # Longer sequences are cut to max_l; shorter ones are filled with index 0.
        # For long sequences the multiplier is negative, so nothing is appended.
        # NOTE(review): which token index 0 stands for depends on the vocab - confirm.
        return x[:max_l] + [0] * (max_l - len(x))

    tokenized_data = get_tokenized_imdb(data)
    # vocab.stoi maps a token to its index (vocab.itos is the reverse mapping).
    features = torch.tensor([pad([vocab.stoi[word] for word in words])
                             for words in tokenized_data])
    labels = torch.tensor([score for _, score in data])
    return features, labels

工程知识点:

vocab.stoi(string to index):由词查找对应的下标
vocab.itos(index to string):由下标查找对应的词


创建数据迭代器

利用 torch.utils.data.TensorDataset,可以创建 PyTorch 格式的数据集,从而创建数据迭代器。 torch.utils.data.DataLoader 将数据进行一定批量大小的输出

查看预处理后的直观效果

# Build the vocabulary from the training split only and encode both splits
# with it, so that train and test share the same word indices.
vocab = get_vocab_imdb(train_data)
train_set = Data.TensorDataset(*preprocess_imdb(train_data, vocab))
test_set = Data.TensorDataset(*preprocess_imdb(test_data, vocab))

batch_size = 64
# Only the training batches are shuffled; the test order stays fixed.
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

# Inspect the shape of a single batch and the number of batches per epoch.
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
print('#batches:', len(train_iter))
  • Out:
X torch.Size([64, 500]) y torch.Size([64])
#batches: 391

3、搭建BiRNN

class BiRNN(nn.Module):
    '''
    Sentiment classifier: embedding layer -> bidirectional LSTM -> linear layer.
    '''

    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        '''
        @params:
            vocab: vocabulary built on the dataset; only its size is used
            embed_size: dimension of the word embeddings
            num_hiddens: dimension of the LSTM hidden state
            num_layers: number of stacked LSTM layers
        '''
        super().__init__()
        vocab_size = len(vocab)
        self.embedding = nn.Embedding(vocab_size, embed_size)

        # bidirectional=True turns the LSTM into a bidirectional recurrent network.
        lstm_options = dict(
            input_size=embed_size,
            hidden_size=num_hiddens,
            num_layers=num_layers,
            bidirectional=True,
        )
        self.encoder = nn.LSTM(**lstm_options)
        # The encoder outputs at the first and the last time step (2*h each)
        # are concatenated and fed to the fully connected layer.
        self.decoder = nn.Linear(num_hiddens * 4, 2)

    def forward(self, inputs):
        '''
        @params:
            inputs: word index sequences, integer tensor of shape (batch_size, seq_len)
        @return:
            outs: sentiment scores, tensor of shape (batch_size, 2)
        '''
        # The LSTM expects the sequence dimension first, so swap the two axes.
        time_major = inputs.permute(1, 0)
        embeddings = self.embedding(time_major)  # (seq_len, batch_size, d)
        # nn.LSTM returns outputs, (h, c); only the per-step outputs are needed.
        outputs, _ = self.encoder(embeddings)  # (seq_len, batch_size, 2*h)
        first_step, last_step = outputs[0], outputs[-1]
        encoding = torch.cat((first_step, last_step), dim=-1)  # (batch_size, 4*h)
        return self.decoder(encoding)  # (batch_size, 2)

工程知识点:

torch.Tensor.transpose() 一次只能交换张量的两个维度;torch.Tensor.permute() 可以同时重排张量的多个维度;torch.cat() 接收由张量组成的元组(或列表),返回沿指定维度拼接后的张量。

4、导入预训练词向量

加载

# Directory handed to GloVe as its cache location.
cache_dir = "/home/kesci/input/GloVe6B5429"
# Request the '6B' GloVe vectors with dimension 100.
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=cache_dir)

导入

def load_pretrained_embedding(words, pretrained_vocab):
    '''
    Look up pretrained vectors for a list of words.

    @params:
        words: words to load vectors for, given as an itos (index to string) list
        pretrained_vocab: pretrained vectors; its stoi mapping and vectors
                          attribute are read
    @return:
        embed: tensor of shape (len(words), vector dimension); rows of words
               missing from pretrained_vocab stay all-zero
    '''
    # The vector dimension is taken from the first pretrained vector.
    embed_dim = pretrained_vocab.vectors[0].shape[0]
    embed = torch.zeros(len(words), embed_dim)
    oov_count = 0  # number of out-of-vocabulary words
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
        except KeyError:
            # Word has no pretrained vector: keep its zero row and count it.
            oov_count += 1
        else:
            embed[i, :] = pretrained_vocab.vectors[idx]

    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed
    
# Overwrite the embedding weights with the pretrained vectors of the dataset vocabulary.
# NOTE(review): `net` and `vocab` must already exist at this point; their
# construction is not part of this snippet - confirm against the notebook.
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.embedding.weight.requires_grad = False # pretrained weights are loaded as-is, so they are not updated
  • embedding.weight.data.copy_(embed) 需要在神经网络模型的Embedding层中明确地传递嵌入矩阵的初始权重

5、训练模型 训练时所需要的评估函数evaluate_accuracy

def evaluate_accuracy(data_iter, net, device=None):
    '''
    Compute the classification accuracy of net over a data iterator.

    @params:
        data_iter: iterable of (X, y) batches
        net: a torch.nn.Module, or a plain callable; a callable whose code
             declares an `is_training` variable is called with is_training=False
        device: device the batches are moved to when net is a Module; defaults
                to the device of the module's first parameter
    @return: fraction of samples whose arg-max prediction equals the label
    @raises: ZeroDivisionError if data_iter yields no samples
    '''
    is_module = isinstance(net, torch.nn.Module)
    accepts_flag = False
    was_training = False
    if is_module:
        if device is None:
            # A module without parameters has no device to infer; the batches
            # are then used on whatever device they already live on.
            first_param = next(net.parameters(), None)
            if first_param is not None:
                device = first_param.device
        # Switch to evaluation mode once and remember the previous mode so it
        # can be restored afterwards instead of always forcing train mode.
        was_training = net.training
        net.eval()
    else:
        code = getattr(net, '__code__', None)
        accepts_flag = code is not None and 'is_training' in code.co_varnames

    acc_sum, n = 0.0, 0
    try:
        with torch.no_grad():
            for X, y in data_iter:
                if is_module:
                    if device is not None:
                        X, y = X.to(device), y.to(device)
                    y_hat = net(X)
                elif accepts_flag:
                    y_hat = net(X, is_training=False)
                else:
                    y_hat = net(X)
                acc_sum += (y_hat.argmax(dim=1) == y).float().sum().item()
                n += y.shape[0]
    finally:
        if is_module:
            net.train(was_training)
    return acc_sum / n

训练函数

def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    '''
    Train net and print loss and accuracy after every epoch.

    @params:
        train_iter: iterable of (X, y) training batches
        test_iter: iterable of (X, y) batches used for the test accuracy
        net: model to train; it is moved to device first
        loss: callable computing the loss from (predictions, labels)
        optimizer: optimizer that updates the model parameters
        device: device the model and the batches are moved to
        num_epochs: number of passes over train_iter
    '''
    net = net.to(device)
    print("training on ", device)
    for epoch in range(num_epochs):
        # All running statistics, including the batch counter, restart every
        # epoch so the printed loss is the mean batch loss of this epoch only.
        train_l_sum, train_acc_sum, n, batch_count = 0.0, 0.0, 0, 0
        start = time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))

查看效果:

# Hyper-parameters of the training run.
lr = 0.01
num_epochs = 5
# Only parameters that still require gradients are handed to the optimizer.
optimizer = torch.optim.Adam(
    [param for param in net.parameters() if param.requires_grad],
    lr=lr,
)
loss = nn.CrossEntropyLoss()

# NOTE(review): `device` must be defined before this point - confirm against the notebook.
train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)
  • Out:
training on  cpu
epoch 1, loss 0.5791, train acc 0.679, test acc 0.816, time 1976.7 sec
epoch 2, loss 0.1988, train acc 0.825, test acc 0.840, time 1276.6 sec
epoch 3, loss 0.1171, train acc 0.848, test acc 0.845, time 1213.0 sec
epoch 4, loss 0.0794, train acc 0.867, test acc 0.848, time 1200.9 sec

6、预测函数

def predict_sentiment(net, vocab, sentence):
    '''
    Classify the sentiment of a single tokenized sentence.

    @params:
        net: trained model
        vocab: vocabulary built on the dataset; its stoi mapping converts the
               words into the index sequence fed to the model
        sentence: text to analyse, given as a sequence of words
    @return: 'positive' if the predicted class is 1, otherwise 'negative'
    '''
    # Run on whatever device the model's first parameter lives on.
    model_device = list(net.parameters())[0].device
    word_ids = [vocab.stoi[token] for token in sentence]
    # The model expects a batch, so add a leading batch dimension of size 1.
    batch = torch.tensor(word_ids, device=model_device).view(1, -1)
    predicted = net(batch).argmax(dim=1)
    if predicted.item() == 1:
        return 'positive'
    return 'negative'

# Classify a short, already tokenized example sentence with the model.
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great'])