In [1]:
!pip install torchtext
import collections
import os
import random
import time
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
import csv
data = []
with open('/home/kesci/input/Comments9120/train_shuffle.txt') as f:
    text = f.read().split("\n")
    for t in text:
        t_ = t.split("\t")
        data.append(t_)
labels_data = []
reviews_data = []
for d in data:
    # for label, review in d:
    label = d[0]
    review = d[-1]
    labels_data.append(label)
    reviews_data.append(review)
data = (reviews_data[:-1], labels_data[:-1])  # 最后一个为空
data_test = (reviews_data[:10], labels_data[:10])
data_test

In [3]:
def get_tokenized_imdb(data):
    '''
    @params:
        data: 数据的列表，列表中的每个元素为 [文本字符串，0/1标签] 二元组
    @return: 切分词后的文本的列表，列表中的每个元素为切分后的词序列
    '''
    review_list = []
    for reviews in data[0]:
        tok_list = []
        for tok in reviews:
            tok_list.append(tok)
        review_list.append(tok_list) 
    return review_list

def get_vocab_imdb(data):
    '''
    @params:
        data: 同上
    @return: 数据集上的词典，Vocab 的实例（freqs, stoi, itos）
    '''
    tokenized_data = get_tokenized_imdb(data)
    counter = collections.Counter([tok for token in tokenized_data for tok in token])
    return Vocab.Vocab(counter, min_freq=1)

# vocab = get_tokenized_imdb(data)
vocab = get_vocab_imdb(data)
len(vocab)
# vo_test = Vocab.Vocab(collections.Counter([tok for token in get_tokenized_imdb(data_test) for tok in token]), min_freq=1)
# len(vo_test)

2509

In [4]:
def preprocess_imdb(data, vocab):
    '''
    @params:
        data: 同上，原始的读入数据
        vocab: 训练集上生成的词典
    @return:
        features: 单词下标序列，形状为 (n, max_l) 的整数张量
        labels: 情感标签，形状为 (n,) 的0/1整数张量
    '''
    max_l = 20  # 将每条评论通过截断或者补0，使得长度变成20---因最大长度即为20

    def pad(x):
        if len(x) > max_l:
            return x[:max_l]
        else:
            return x + [0] * (max_l - len(x))
    tokenized_data = get_tokenized_imdb(data)
    # TEXT.vocab.stoi 对应词寻找下标； itos 对应下标寻找词 
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) 
                            for words in tokenized_data])
    labels = torch.tensor([int(sco) for _, score in [data] for sco in score ])
    return features, labels

In [5]:
# preprocess_imdb(data_test, vocab)
train_fea, train_lab = preprocess_imdb(data, vocab)
# get_tokenized_imdb(data)
train_fea, train_lab

(tensor([[ 102,   19,   28,  ...,    0,    0,    0],
         [ 500,  176,  434,  ...,    0,    0,    0],
         [ 248,  492,  124,  ...,    0,    0,    0],
         ...,
         [  64,   25,   46,  ...,    0,    0,    0],
         [  35,   45,  144,  ...,    0,    0,    0],
         [  85, 1438,   78,  ...,    0,    0,    0]]),
 tensor([0, 0, 0,  ..., 0, 1, 0]))

In [6]:
train_set = Data.TensorDataset(train_fea, train_lab)
train_set

<torch.utils.data.dataset.TensorDataset at 0x7f87454035f8>

In [8]:
class BiRNN(nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        '''
        @params:
            vocab: 在数据集上创建的词典，用于获取词典大小
            embed_size: 嵌入维度大小
            num_hiddens: 隐藏状态维度大小
            num_layers: 隐藏层个数
        '''
        super(BiRNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        
        # encoder-decoder framework
        # bidirectional设为True即得到双向循环神经网络
        self.encoder = nn.LSTM(input_size=embed_size, 
                                hidden_size=num_hiddens, 
                                num_layers=num_layers,
                                bidirectional=True)
        self.decoder = nn.Linear(4*num_hiddens, 2) # 初始时间步和最终时间步的隐藏状态作为全连接层输入
    
    def forward(self, inputs):
        '''
        @params:
            inputs: 词语下标序列，形状为 (batch_size, seq_len) 的整数张量
        @return:
            outs: 对文本情感的预测，形状为 (batch_size, 2) 的张量
        '''
        # 因为LSTM需要将序列长度(seq_len)作为第一维，所以需要将输入转置
        embeddings = self.embedding(inputs.permute(1, 0)) # (seq_len, batch_size, d)
        # rnn.LSTM 返回输出、隐藏状态和记忆单元，格式如 outputs, (h, c)
        outputs, _ = self.encoder(embeddings) # (seq_len, batch_size, 2*h)
        encoding = torch.cat((outputs[0], outputs[-1]), -1) # (batch_size, 4*h)
        outs = self.decoder(encoding) # (batch_size, 2)
        return outs


# for X, y in train_iter:
#     print('X', X.shape, 'y', y.shape)
#     break
# print('#batches:', len(train_iter))
vocab = get_vocab_imdb(data)
batch_size = 64
train_fea, train_lab = preprocess_imdb(data, vocab)
train_set = Data.TensorDataset(train_fea, train_lab)
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
embed_size, num_hiddens, num_layers = 100, 100, 4
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)

In [9]:
cache_dir = "/home/kesci/work/Comments9120"
glove_vocab = Vocab.Vectors(name='wiki_100.utf8', cache=cache_dir)

In [10]:
def load_pretrained_embedding(words, pretrained_vocab):
    '''
    @params:
        words: 需要加载词向量的词语列表，以 itos (index to string) 的词典形式给出
        pretrained_vocab: 预训练词向量
    @return:
        embed: 加载到的词向量
    '''
    embed= torch.zeros(len(words), 100)  # 初始化语料对应词向量
    oov_count = 0  # 计总袋外词数量
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count += 1
    
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed
    
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.embedding.weight.requires_grad = False # 直接加载预训练好的, 所以不需要更新它

There are 4 oov words.


In [11]:
def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        device = list(net.parameters())[0].device 
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval()
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train()
            else:
                if('is_training' in net.__code__.co_varnames):
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item() 
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item() 
            n += y.shape[0]
    return acc_sum / n

def evaluate_test_set(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        device = list(net.parameters())[0].device 
    # acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, _ in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval()
                test_result = net(X.to(device))
                net.train()
                print(test_result)
            # else:
            #     if('is_training' in net.__code__.co_varnames):
            #         acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item() 
            #     else:
            #         acc_sum += (net(X).argmax(dim=1) == y).float().sum().item() 
            # n += y.shape[0]
    # return acc_sum / n
    
def train(train_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y) 
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        # test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, time.time() - start))

In [25]:
# vocab = get_vocab_imdb(data)  # 获得语料词典库

# train_fea, train_lab = preprocess_imdb(data, vocab)
# train_set = Data.TensorDataset(train_fea, train_lab)
# batch_size = 64
# train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
embed_size, num_hiddens, num_layers = 100, 100, 2
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)

lr, num_epochs = 0.001, 10
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()

train(train_iter, net, loss, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.3690, train acc 0.839, time 3.2 sec
epoch 2, loss 0.1273, train acc 0.900, time 3.0 sec
epoch 3, loss 0.0718, train acc 0.918, time 3.0 sec
epoch 4, loss 0.0463, train acc 0.928, time 3.1 sec
epoch 5, loss 0.0315, train acc 0.940, time 3.1 sec
epoch 6, loss 0.0224, train acc 0.948, time 3.1 sec
epoch 7, loss 0.0155, train acc 0.958, time 3.0 sec
epoch 8, loss 0.0111, train acc 0.966, time 3.1 sec
epoch 9, loss 0.0077, train acc 0.974, time 3.0 sec
epoch 10, loss 0.0067, train acc 0.974, time 3.0 sec


In [13]:
test_data_list = []
with open('/home/kesci/input/Comments9120/test_handout.txt') as f:
    test_data = f.read().split("\n")
    test_data_list.append(test_data)
    test_data_list.append(['0'] * len(test_data))
    # print(test_data_list)

In [26]:
test_fea, test_lab = preprocess_imdb(test_data_list, vocab)
test_set = Data.TensorDataset(test_fea, test_lab)
test_iter = Data.DataLoader(test_set, batch_size)
test_set

<torch.utils.data.dataset.TensorDataset at 0x7f86d0055710>

In [27]:
def evaluate_test_set(data_iter, net, device=None):
    # net = net.to(device)
    if device is None and isinstance(net, torch.nn.Module):
        device = list(net.parameters())[0].device 
    # device = list(net.parameters())[0].device 
    # acc_sum, n = 0.0, 0
    result = []
    with torch.no_grad():
        for X, _ in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval()
                test_result = net(X.to(device)).softmax(dim=1)
                result.append(test_result)
        return result
                
re = evaluate_test_set(test_iter, net, device)

In [29]:
score_l = []
for r in re:
    r_view = r.cpu().numpy()
    for score in r_view:
        score_l.append(score[-1])

import pandas as pd
d_test = pd.read_csv("/home/kesci/input/Comments9120/test_handout.txt", header=None)
d_test.insert(0,"Prediction", score_l[:-1])
output = d_test.drop(columns=0,axis=1)  # 移除评论内容一列
output.to_csv("result.csv")