In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
def read_data(path, cols):
    data = pd.read_csv(path, sep='\t', usecols = cols)
    text = []
    label = []
    
    for i in data.values:
        text.append(i[0])
        label.append(i[1])
        
    return text, label

In [3]:
text, label = read_data('./data/train.tsv', ['Phrase', 'Sentiment'])
text = [i.split(' ') for i in text]

In [4]:
def text_to_bow(text):
    temp = []
    for i in text:
        temp += i     
    temp = list(set(temp))
    temp = Counter(temp)
    dic = {}
    for index, word in enumerate(temp.keys()):
        dic[word] = index
        
    bow_text = []
    for i in text:
        temp = np.zeros(10000)
        for j in i:
            try:
                temp[dic[j]] += 1
            except:
                None
        bow_text.append(temp)
    return bow_text, dic

In [5]:
bow_text, dic = text_to_bow(text)

In [6]:
label = [np.array(i) for i in label]

In [7]:
random_order = list(range(len(bow_text)))
np.random.seed(2020)   # 固定种子
np.random.shuffle(random_order)

# 4:1 划分训练集和测试集
bow_text_train = np.array([bow_text[i] for i in random_order[:int(len(bow_text)*0.85)]])
label_train = np.array([label[i] for i in random_order[:int(len(bow_text) * 0.85)]])

print(bow_text_train.shape, label_train.shape)

bow_text_valid = np.array([bow_text[i] for i in random_order[int(len(bow_text)*0.85):int(len(bow_text)*0.9)]])
label_valid = np.array([label[i] for i in random_order[int(len(bow_text)*0.85):int(len(bow_text) * 0.9)]])

print(bow_text_valid.shape, label_valid.shape)

bow_text_test = np.array([bow_text[i] for i in random_order[int(len(bow_text)*0.9):]])
label_test = np.array([label[i] for i in random_order[int(len(bow_text) * 0.9):]])

print(bow_text_test.shape, label_test.shape)

(132651, 10000) (132651,)
(7803, 10000) (7803,)
(15606, 10000) (15606,)


In [8]:
import torch
import torch.nn as nn
from torch.utils.data import *
from sklearn.metrics import accuracy_score

In [9]:
BATCH_SIZE = 128

train_data = TensorDataset(torch.LongTensor(bow_text_train), 
                           torch.LongTensor(label_train))

train_loader = DataLoader(train_data, batch_size = BATCH_SIZE)

valid_data = TensorDataset(torch.LongTensor(bow_text_valid), 
                          torch.LongTensor(label_valid))

valid_loader = DataLoader(valid_data, batch_size = BATCH_SIZE)

test_data = TensorDataset(torch.LongTensor(bow_text_test), 
                          torch.LongTensor(label_test))

test_loader = DataLoader(test_data, batch_size = BATCH_SIZE)

In [10]:
class LRModel(nn.Module):
    def __init__(self):
        super(LRModel, self).__init__()
        
        self.hidden = nn.Linear(10000, 256)
        self.out = nn.Linear(256, 5)
        self.act = nn.ReLU()
        
    def forward(self, x):
        x = self.act(self.hidden(x))
        x = self.out(x)
        return x

In [11]:
DEVICE = torch.device("cuda")
lr_model = LRModel().to(DEVICE)

In [12]:
NUM_EPOCHS = 20

criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(lr_model.parameters(), lr = 0.001)

In [13]:
for epoch in range(NUM_EPOCHS):
    for batch_idx, data in enumerate(train_loader):
        text, label = data[0].to(DEVICE), data[1].to(DEVICE)
        pred_label = lr_model(text.float())
        optimizer.zero_grad()
        loss = criterion(pred_label, label)
        loss.backward()
        optimizer.step()
    
        if(batch_idx + 1) % 500 == 0:
            print('Train Epoch: {} [{}/{} ({:.2f}%)]\tLoss: {:.6f}'.format(epoch+1, (batch_idx+1) * len(text),
                                                                           len(train_loader.dataset),
                                                                           100 * batch_idx / len(train_loader), 
                                                                           loss.item()))
    with torch.no_grad():
        pred_test = []
        true_test = []
        
        for batch_idx, data in enumerate(valid_loader):
            text, label = data[0].to(DEVICE), data[1].to(DEVICE)
            pred_label = lr_model(text.float())
            pred_test.extend(pred_label.argmax(dim=1).cpu().numpy())
            true_test.extend(label.cpu().numpy())
            
        for batch_idx, data in enumerate(test_loader):
            text, label = data[0].to(DEVICE), data[1].to(DEVICE)
            pred_label = lr_model(text.float())
            pred_test.extend(pred_label.argmax(dim=1).cpu().numpy())
            true_test.extend(label.cpu().numpy())
        
        print('该轮准确率为: {}'.format(accuracy_score(pred_test, true_test)))

该轮准确率为: 0.6183519159297706
该轮准确率为: 0.6246742705796916
该轮准确率为: 0.6234781494297065
该轮准确率为: 0.6210431885172369
该轮准确率为: 0.6196334743047546
该轮准确率为: 0.6180956042547738
该轮准确率为: 0.6163441411422957
该轮准确率为: 0.6149344269298134
该轮准确率为: 0.6124567474048442


KeyboardInterrupt: 

In [15]:
data = pd.read_csv('./data/test.tsv', sep='\t', usecols = ['Phrase'])
submit_text = []
for i in data.values:
    submit_text.append(i[0])
    
submit_text = [i.split(' ') for i in submit_text]

In [16]:
bow_submit_text = []
for i in submit_text:
    temp = np.zeros(10000)
    for j in i:
        try:
            temp[dic[j]] += 1
        except:
            None
    bow_submit_text.append(temp)

In [17]:
submit_data = TensorDataset(torch.Tensor(np.array(bow_submit_text)))

submit_loader = DataLoader(submit_data, batch_size = BATCH_SIZE)

In [18]:
with torch.no_grad():
    pred_submit = []
        
    for batch_idx, data in enumerate(submit_loader):
        x = data[0].to(DEVICE)
        pred_label = lr_model(x.float())
        pred_submit.extend(pred_label.argmax(dim=1).cpu().numpy())

In [19]:
res = []
for i in range(156061, 222353):
    res.append([i, pred_submit[i-156061]])

In [22]:
submit = pd.DataFrame(res, columns = ['PhraseId', 'Sentiment'])
submit.to_csv('./submit/lr_submit.csv', index = 0)