In [1]:
import numpy as np
import pandas as pd

In [2]:
def read_data(path, cols):
    data = pd.read_csv(path, sep='\t', usecols = cols)
    text = []
    label = []
    
    for i in data.values:
        text.append(i[0])
        label.append(i[1])
        
    return text, label

In [3]:
text, label = read_data('./data/train.tsv', ['Phrase', 'Sentiment'])
text = [i.split(' ') for i in text]

In [4]:
from torchtext import data
from torchtext import vocab
import torch

In [5]:
TEXT = data.Field(sequential = True, fix_length = 36)

In [6]:
cache = '.vector_cache'
vectors = vocab.Vectors(name='./glove/glove.6B.300d.txt', cache = cache)
TEXT.build_vocab(text, vectors = vectors, unk_init = torch.nn.init.xavier_uniform)

In [7]:
process_text = TEXT.process(text)
process_text = process_text.permute(1, 0)

In [8]:
process_text[0]

tensor([   45,   316,     5, 16579,  6249,     2,  6882,    12,    72,    11,
           56,    16,     2,  4669,    11,   174,    56,    16,     2, 12561,
            3,    73,     5,    86,   644, 11258,    22,   738,     5,    86,
         2011,     7,    63,     5,     4,    47])

In [9]:
input_ids = process_text

In [10]:
random_order = list(range(len(input_ids)))
np.random.seed(2020)   # 固定种子
np.random.shuffle(random_order)

# 4:1 划分训练集和测试集
input_ids_train = np.array([input_ids[i].numpy() for i in random_order[:int(len(input_ids)*0.85)]])
y_train = np.array([label[i] for i in random_order[:int(len(input_ids) * 0.85)]])

print(input_ids_train.shape, y_train.shape)

input_ids_test = np.array([input_ids[i].numpy() for i in random_order[int(len(input_ids)*0.85):]])
y_test = np.array([label[i] for i in random_order[int(len(input_ids)*0.85):]])

print(input_ids_test.shape, y_test.shape)

(132651, 36) (132651,)
(23409, 36) (23409,)


In [11]:
import torch
import torch.nn as nn
from torch.utils.data import *
from sklearn.metrics import accuracy_score

In [12]:
BATCH_SIZE = 128

train_data = TensorDataset(torch.LongTensor(input_ids_train), 
                           torch.LongTensor(y_train))

train_loader = DataLoader(train_data, batch_size = BATCH_SIZE)

test_data = TensorDataset(torch.LongTensor(input_ids_test), 
                          torch.LongTensor(y_test))

test_loader = DataLoader(test_data, batch_size = BATCH_SIZE)

In [13]:
class LSTMModel(nn.Module):
    def __init__(self):
        super(LSTMModel, self).__init__()
        
        self.embedding = nn.Embedding(len(TEXT.vocab), 300)
        # 权重在词汇表vocab的vectors属性中
        self.weight_matrix = TEXT.vocab.vectors
        # 指定嵌入矩阵的初始权重
        self.embedding.weight.data.copy_(self.weight_matrix)
        self.embedding.weight.requires_grad = True     
        
        self.lstm = nn.LSTM(300, 128, bidirectional = True, num_layers = 3, batch_first = True, dropout = 0.5)
        self.fc = nn.Linear(256, 128)
        self.fc1 = nn.Linear(128, 5)
        self.act = nn.ReLU()

    def forward(self, x):
        
        out = self.embedding(x)
        out, hidden = self.lstm(out)
        out = torch.cat((hidden[0][-2], hidden[0][-1]), dim = -1) # 拼接前向与后向向量
        out = self.act(self.fc(out))
        out = self.fc1(out)
        return out

In [14]:
DEVICE = torch.device("cuda")
lstm_model = LSTMModel().to(DEVICE)

In [15]:
NUM_EPOCHS = 3

criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(lstm_model.parameters(), lr = 0.001)

In [16]:
for epoch in range(NUM_EPOCHS):
    lstm_model.train()
    sta_loss = []
    for batch_idx, data in enumerate(train_loader):
        text, label = data[0].to(DEVICE), data[1].to(DEVICE)
        pred_label = lstm_model(text)
        optimizer.zero_grad()
        loss = criterion(pred_label, label)
        loss.backward()
        optimizer.step()
        sta_loss.append(loss.item())

    with torch.no_grad():
        lstm_model.eval()
        pred_test = []
        true_test = []
            
        for batch_idx, data in enumerate(test_loader):
            text, label = data[0].to(DEVICE), data[1].to(DEVICE)
            pred_label = lstm_model(text)
            pred_test.extend(pred_label.argmax(dim=1).cpu().numpy())
            true_test.extend(label.cpu().numpy())
        
        print('第{}轮, average loss: {:.4}; 准确率为: {:.2}%'.format(epoch+1, sum(sta_loss)/len(sta_loss)*100, accuracy_score(pred_test, true_test)))

第1轮, average loss: 0.9209; 准确率为: 0.66%
第2轮, average loss: 0.7412; 准确率为: 0.67%
第3轮, average loss: 0.6641; 准确率为: 0.67%


In [17]:
data = pd.read_csv('./data/test.tsv', sep='\t', usecols = ['Phrase'])
submit_text = []
for i in data.values:
    submit_text.append(i[0])
    
submit_text = [i.split(' ') for i in submit_text]

In [18]:
process_submit_text = TEXT.process(submit_text)
process_submit_text = process_submit_text.permute(1, 0)

In [19]:
submit_data = TensorDataset(torch.LongTensor(np.array(process_submit_text)))

submit_loader = DataLoader(submit_data, batch_size = BATCH_SIZE)

In [20]:
with torch.no_grad():
    lstm_model.eval()
    pred_submit = []
        
    for batch_idx, data in enumerate(submit_loader):
        x = data[0].to(DEVICE)
        pred_label = lstm_model(x)
        pred_submit.extend(pred_label.argmax(dim=1).cpu().numpy())

In [21]:
res = []
for i in range(156061, 222353):
    res.append([i, pred_submit[i-156061]])

In [22]:
submit = pd.DataFrame(res, columns = ['PhraseId', 'Sentiment'])
submit.to_csv('./submit/lstm_submit.csv', index = 0)