In [22]:
import torch
from torch import nn
import pandas as pd
import os


DATA_ROOT_PATH = "/home/XuHaoshuai/Project/HumanIE-IPM-experiment-2.0/lstm"
relation = 'couple'


training_set = pd.read_csv(os.path.join(DATA_ROOT_PATH, 'training_set', relation + '.csv'))
test_set = pd.read_csv(os.path.join(DATA_ROOT_PATH, 'test_set', relation + '.csv'))

In [23]:
import collections
from torchtext.vocab import vocab


def get_vocab(sent_list):
    tokenized_data = [[word for word in sent.split(' ') if word != ''] for sent in sent_list]
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return vocab(counter, min_freq=1)

train_vocab = get_vocab(training_set['processed_sent'].tolist())
test_vocab = get_vocab(test_set['processed_sent'].tolist())
'# words in vocab:', len(test_vocab)

('# words in vocab:', 20826)

In [26]:
def preprocess(sent_list, label_list, vocab):
    max_l = 50  # 将每条评论通过截断或者补0，使得长度变成500

    def pad(x):
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))

    tokenized_data = [[word for word in sent.split(' ') if word != ''] for sent in sent_list]
    features = torch.tensor([pad([vocab.get_stoi()[word] for word in words]) for words in tokenized_data])
    labels = torch.tensor(label_list)
    return features, labels

In [27]:
import torch.utils.data as Data
import numpy as np

batch_size = 50
train_set = Data.TensorDataset(*preprocess(training_set['processed_sent'].tolist(), training_set['human'].tolist(), train_vocab))
test_set = Data.TensorDataset(*preprocess(test_set['processed_sent'].tolist(), test_set['label'].tolist() ,test_vocab))
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

In [None]:
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
'#batches:', len(train_iter)

X torch.Size([50, 200]) y torch.Size([50])


('#batches:', 20)

In [29]:
from gensim.models import Word2Vec

class BiRNN(nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        super(BiRNN, self).__init__()
        # embedding
        wvmodel = Word2Vec.load("/home/XuHaoshuai/Project/HumanIE-IPM-experiment-2.0/word2vec/word2vec.model")
        vocab_size = len(wvmodel.wv)
        vector_size = wvmodel.vector_size
        weight = torch.randn(vocab_size, vector_size)
        words = wvmodel.wv.index_to_key
        word_to_idx = {word: i for i, word in enumerate(words)}
        idx_to_word = {i: word for i, word in enumerate(words)}
        for i in range(len(wvmodel.wv.index_to_key)):
            try:
                index = word_to_idx[wvmodel.wv.index_to_key[i]]
            except:
                continue
        vector=wvmodel.wv.get_vector(idx_to_word[word_to_idx[wvmodel.wv.index_to_key[i]]])
        weight[index, :] = torch.from_numpy(vector)
        self.embedding = nn.Embedding.from_pretrained(weight)
        
        # bidirectional设为True即得到双向循环神经网络
        self.encoder = nn.LSTM(input_size=embed_size, 
                                hidden_size=num_hiddens, 
                                num_layers=num_layers,
                                bidirectional=True)
        # 初始时间步和最终时间步的隐藏状态作为全连接层输入
        self.decoder = nn.Linear(4*num_hiddens, 2)

    def forward(self, inputs):
        # inputs的形状是(批量大小, 词数)，因为LSTM需要将序列长度(seq_len)作为第一维，所以将输入转置后
        # 再提取词特征，输出形状为(词数, 批量大小, 词向量维度)
        embeddings = self.embedding(inputs.permute(1, 0))
        # rnn.LSTM只传入输入embeddings，因此只返回最后一层的隐藏层在各时间步的隐藏状态。
        # outputs形状是(词数, 批量大小, 2 * 隐藏单元个数)
        outputs, _ = self.encoder(embeddings) # output, (h, c)
        # 连结初始时间步和最终时间步的隐藏状态作为全连接层输入。它的形状为
        # (批量大小, 4 * 隐藏单元个数)。
        encoding = torch.cat((outputs[0], outputs[-1]), -1)
        outs = self.decoder(encoding)
        return outs

In [30]:
embed_size, num_hiddens, num_layers = 100, 100, 2
net = BiRNN(train_vocab, embed_size, num_hiddens, num_layers)

In [31]:
import os
import time

def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        # 如果没指定device就使用net的device
        device = list(net.parameters())[0].device 
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval() # 评估模式, 这会关闭dropout
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train() # 改回训练模式
            else: # 自定义的模型, 3.13节之后不会用到, 不考虑GPU
                if('is_training' in net.__code__.co_varnames): # 如果有is_training这个参数
                    # 将is_training设置成False
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item() 
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item() 
            n += y.shape[0]
    return acc_sum / n

# Train the model
def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y) 
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')
lr, num_epochs = 0.01, 10
# 要过滤掉不计算梯度的embedding参数
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

training on  cpu
epoch 1, loss 0.7194, train acc 0.534, test acc 0.500, time 5.7 sec
epoch 2, loss 0.3004, train acc 0.675, test acc 0.443, time 5.4 sec
epoch 3, loss 0.1041, train acc 0.878, test acc 0.479, time 5.4 sec
epoch 4, loss 0.0308, train acc 0.955, test acc 0.475, time 5.4 sec
epoch 5, loss 0.0106, train acc 0.981, test acc 0.482, time 5.4 sec
epoch 6, loss 0.0062, train acc 0.992, test acc 0.487, time 5.4 sec
epoch 7, loss 0.0015, train acc 0.998, test acc 0.493, time 5.4 sec
epoch 8, loss 0.0018, train acc 0.996, test acc 0.491, time 5.3 sec
epoch 9, loss 0.0016, train acc 0.996, test acc 0.528, time 5.3 sec
epoch 10, loss 0.0011, train acc 0.998, test acc 0.521, time 5.4 sec


In [32]:
torch.save(net, os.path.join(DATA_ROOT_PATH, 'saved_model', relation + '.model'))

In [33]:
torch.load("/home/XuHaoshuai/Project/HumanIE-IPM-experiment-2.0/lstm/saved_model/couple.model")

BiRNN(
  (embedding): Embedding(55923, 100)
  (encoder): LSTM(100, 100, num_layers=2, bidirectional=True)
  (decoder): Linear(in_features=400, out_features=2, bias=True)
)

In [None]:
# def predict_sentiment(net, vocab, sentence):
#     """sentence是词语的列表"""
#     device = list(net.parameters())[0].device
#     sentence = torch.tensor([vocab.get_stoi()[word] for word in sentence], device=device)
#     label = torch.argmax(net(sentence.view((1, -1))), dim=1)
#     return 1 if label.item() == 1 else -1

# predict_sentiment(net, train_vocab, ['词语','解释','指','人物二','从弟','人物一'])