In [0]:
import pandas as pd
import numpy as np
import pickle

In [0]:
import csv
def read_data(dataPath, is_train = True):
    with open(dataPath, mode='r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        data = list(reader)
        data = np.asarray(data)
        if is_train:
          content_data = {'content':[], 'label':[]}
        else:
          content_data = {'content':[], 'id':[]}
        for line in data:
            content_data['content'].append(list(line['微博中文内容']))
            if is_train:
              content_data['label'].append(line['情感倾向'])
            else:
              content_data['id'].append(line['微博id'])
        
        return content_data
    

In [0]:
content_train_data = read_data('drive/My Drive/NLP/sentiment_compete/data/train_weibo_clean.csv')
content_test_data = read_data('drive/My Drive/NLP/sentiment_compete/data/test_weibo_clean.csv', False)

In [0]:
import pickle
import numpy as np

UNKNOWN = '<UNK>'
PADDING = '<PAD>'

def read_pretrain_embedding(emb_path):
    pre_train_emd = {}
    with open(emb_path, mode='r', encoding='utf-8') as f:
        word_nums, dim = f.readline().split()
        word_nums = int(word_nums)
        dim = int(dim)
        for line in f:
            tokens = line.strip().split(' ')
            if len(tokens) == dim + 1:
                pre_train_emd[tokens[0]] = list(map(lambda x: float(x), tokens[1:]))
    return pre_train_emd

In [0]:
pre_train_emd = read_pretrain_embedding('data/sgns.weibo.char')

FileNotFoundError: ignored

In [0]:
def build_word_emb(content_train_data, pre_train_emd, emd_dim, content_test_data = None):
    """
        return word_emb type-list [n_vocab * dim]
               word2Index type-dict {word: index}
    """
    word_emb = [np.random.uniform(-0.1, 0.1, emd_dim) for _ in range(2)]
    word2Index = {PADDING: 0, UNKNOWN: 1}
    for content in content_train_data['content']:
        for i in range(len(content)):
            if content[i] not in word2Index:
                if content[i] in pre_train_emd: # in vocab
                    word2Index[content[i]] = len(word2Index)
                    word_emb.append(pre_train_emd[content[i]])
                else: # not in vocab
                    content[i] = UNKNOWN
            content[i] = word2Index[content[i]]
            
    if content_test_data is not None:
        for content in content_test_data['content']:
            for i in range(len(content)):
                if content[i] not in word2Index:
                    if content[i] in pre_train_emd: # in vocab
                        word2Index[content[i]] = len(word2Index)
                        word_emb.append(pre_train_emd[content[i]])
                    else: # not in vocab
                        content[i] = UNKNOWN
                content[i] = word2Index[content[i]]
    word_emb = np.asarray(word_emb, dtype='float32')
    return word_emb, word2Index

In [0]:
def save_word_emb(word_emb, word2Index):
    with open('data/word_emb', 'wb') as f:
        pickle.dump((word_emb, word2Index), f)
    print('saved.')

def load_word_emb(path):
    with open(path, 'rb') as f:
        word_emb, word2Index = pickle.load(f)
        return word_emb, word2Index

def save_doc_index(path, content_train_data, content_test_data):
    with open(path, 'wb') as f:
        pickle.dump((content_train_data, content_test_data), f)
    print('saved.')

def load_doc_index(path):
    with open(path, 'rb') as f:
        content_train_data, content_test_data = pickle.load(f)
        return word_emb, word2Index

In [0]:
word_emb, word2Index = build_word_emb(content_train_data, pre_train_emd, 300, content_test_data)

NameError: ignored

In [0]:
word_emb, word2Index = load_word_emb('drive/My Drive/NLP/sentiment_compete/data/word_emb')

In [0]:
def pad_content(content_data, seq_len, word2Index):
    pad_seq = np.zeros((len(content_data), seq_len), dtype=int)
    for i, row in enumerate(content_data):
        doc_len = seq_len
        if doc_len > len(row):
            doc_len = len(row)
        
        for index in range(len(row)):
          if row[index] in word2Index:
             row[index] = word2Index[row[index]]
          else:
             row[index] = word2Index[UNKNOWN]
        pad_seq[i, :doc_len] = row[:doc_len]
    return pad_seq

In [0]:
seq_len = 140

train_content = pad_content(content_train_data['content'], seq_len, word2Index)
test_content = pad_content(content_test_data['content'], seq_len, word2Index)

In [0]:
test_data = list(zip(content_test_data['id'] ,test_content))

In [0]:
train_content.shape

(99560, 140)

In [0]:
content_train_data['content'] = train_content

In [0]:
def split_data(content_train_data):
    np.random.seed(123)
    #np.random.shuffle(content_train_data['content'])
    #np.random.shuffle(content_train_data['label'])
    length = len(content_train_data['content'])
    train_x = content_train_data['content'][:int(0.9 * length)]
    train_y = content_train_data['label'][:int(0.9 * length)]
    valid_x = content_train_data['content'][int(0.9 * length):]
    valid_y = content_train_data['label'][int(0.9 * length):]
    train_y = np.asarray(train_y, dtype='int')
    valid_y = np.asarray(valid_y, dtype='int')
    return train_x, train_y, valid_x, valid_y

In [0]:
train_x, train_y, valid_x, valid_y = split_data(content_train_data)

In [0]:
train_y = train_y + 1 # 0 neg 1 mid 2 positive
valid_y = valid_y + 1

In [0]:
valid_x.shape

(9956, 140)

In [0]:
train_y.shape

(89604,)

In [0]:
len(train_y[train_y == 2])

22305

In [0]:
batch_size = 32

import torch
from torch.utils.data import TensorDataset, DataLoader

train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(valid_x), torch.from_numpy(valid_y))

In [0]:
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)

In [0]:
# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()
 
if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

Training on GPU.


In [0]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assuming that we are on a CUDA machine, this should print a CUDA device:

print(device)

cuda:0


In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class STModel(nn.Module):
    
    def __init__(self, emd_dim, vocab_size, classes_num, hidden_num, layer_num, embeddings = None, padding_idx = 0,dropout=0.5):
        super(STModel, self).__init__()
        self.hidden_num = hidden_num
        self.layer_num = layer_num
        self.dropout = dropout
        self.vocab_size = vocab_size
        self.emd_dim = emd_dim
        self.encoder = nn.Embedding(vocab_size, emd_dim, padding_idx= padding_idx)
        if embeddings is not None:
            self.encoder.weight.data.copy_(embeddings)
        #self.encoder = nn.Embedding(vocab_size, emd_dim, padding_idx= padding_idx, _weight = embeddings)
        self.drop = nn.Dropout(dropout)
        self.lstm = nn.LSTM(emd_dim, hidden_num, layer_num, dropout= dropout,
                            bidirectional= False,batch_first = True)
        self.fc = nn.Linear(hidden_num, classes_num)
        self.init_weights()
        self.sigm = nn.Sigmoid()
        
        
    def forward(self, x, hidden, print_flag = False):
        batch_size = x.size(0)
        if(print_flag):
          print(batch_size)
        emb = self.drop(self.encoder(x))
        if(print_flag):
          print('emb', emb.size())
        if(print_flag):
          print('hidden', hidden)
        lstm_out, hidden = self.lstm(emb, hidden)
        if(print_flag):
          print('lstm_out', lstm_out.size())
        lstm_out = self.drop(lstm_out) #lstm_out torch.Size([16, 140, 100]) batch seq hidden 
        fc_input = lstm_out[:,-1,:]
        if(print_flag):
          print('fc_input', fc_input.size())
        fc_out = self.fc(fc_input)
        #if(print_flag):
       #   print('fc_out', fc_out.size())
       # sig_out = self.sigm(fc_out)
       # if(print_flag):
       #   print('sig_out', sig_out.size())
       # pred = sig_out.view(batch_size, -1)
       # if(print_flag):
        #  print('pred', pred.size())
       # return pred
        return fc_out
    
    def init_weights(self):
        initrange = 0.1
        #self.encoder.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        self.fc.weight.data.uniform_(-initrange, initrange)
        
    def init_hidden(self, batch_size):
        weight = next(self.parameters())
        if(train_on_gpu):
            return (weight.new_zeros(self.layer_num, batch_size, self.hidden_num).cuda(),
                    weight.new_zeros(self.layer_num, batch_size, self.hidden_num).cuda())
        else:
            return (weight.new_zeros(self.layer_num, batch_size, self.hidden_num),
                    weight.new_zeros(self.layer_num, batch_size, self.hidden_num))

In [0]:
emd_dim = 300
vocab_size = word_emb.shape[0]
classes_num = 3
hidden_num = 100
layer_num = 2
dropout = 0.3
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#embeddings = torch.tensor(pickle.load(pkl), dtype=torch.float).to(device)
word_emb = torch.tensor(word_emb, dtype=torch.float,requires_grad=False).to(device)
model = STModel(emd_dim, vocab_size, classes_num, hidden_num, layer_num, embeddings = word_emb, dropout = 0.3)

# 新段落

In [0]:
model.to(device)

STModel(
  (encoder): Embedding(5224, 300, padding_idx=0)
  (drop): Dropout(p=0.3, inplace=False)
  (lstm): LSTM(300, 100, num_layers=2, batch_first=True, dropout=0.3)
  (fc): Linear(in_features=100, out_features=3, bias=True)
  (sigm): Sigmoid()
)

In [0]:
import torch.optim as optim

SAVE_PATH = 'STMmodel.pt'
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
batch_size = 32
clip = 5
epochs = 100
model.train()

STModel(
  (encoder): Embedding(5224, 300, padding_idx=0)
  (drop): Dropout(p=0.3, inplace=False)
  (lstm): LSTM(300, 100, num_layers=2, batch_first=True, dropout=0.3)
  (fc): Linear(in_features=100, out_features=3, bias=True)
  (sigm): Sigmoid()
)

In [0]:
for epoch in range(epochs):
    running_loss = 0.0
    batch_count = 0
    for i, train_data in enumerate(train_loader):
        #print('batch_index:', i)
        train_input, train_label = train_data
        if(train_on_gpu):
            train_input, train_label = train_input.to(device), train_label.to(device)
        # forward
        hidden = model.init_hidden(train_label.size(0)) # 有问题
        model.zero_grad()
        outputs = model(train_input, hidden)
        loss = criterion(outputs, train_label)
        # backward
        loss.backward()
        # optimize
        optimizer.step()
        running_loss += loss.item()
        if (i+1) % 1000 == 0:    # print every 2000 mini-batches
            #val_h = model.init_hidden(batch_size)
            evaluate_model(valid_loader, model)
            model.train()
            print('[%d, %5d] loss: %.3f' %(epoch + 1, i + 1, running_loss / 1000))
            running_loss = 0.0
            #torch.save(net.state_dict(), PATH)
            #with open(SAVE_PATH, 'wb') as f:
            #    torch.save(model, f)
print('finished training')

Accuracy of the network: 53 %
[1,  1000] loss: 0.959
Accuracy of the network: 53 %
[1,  2000] loss: 0.947
Accuracy of the network: 53 %
[2,  1000] loss: 0.946
Accuracy of the network: 53 %
[2,  2000] loss: 0.956
Accuracy of the network: 53 %
[3,  1000] loss: 0.945
Accuracy of the network: 55 %
[3,  2000] loss: 0.928
Accuracy of the network: 54 %
[4,  1000] loss: 0.896
Accuracy of the network: 53 %
[4,  2000] loss: 0.884
Accuracy of the network: 55 %
[5,  1000] loss: 0.894
Accuracy of the network: 57 %
[5,  2000] loss: 0.896
Accuracy of the network: 61 %
[6,  1000] loss: 0.883
Accuracy of the network: 61 %
[6,  2000] loss: 0.840
Accuracy of the network: 64 %
[7,  1000] loss: 0.796
Accuracy of the network: 62 %
[7,  2000] loss: 0.786
Accuracy of the network: 63 %
[8,  1000] loss: 0.794
Accuracy of the network: 64 %
[8,  2000] loss: 0.806
Accuracy of the network: 66 %
[9,  1000] loss: 0.733
Accuracy of the network: 65 %
[9,  2000] loss: 0.718
Accuracy of the network: 66 %
[10,  1000] loss

In [0]:
def evaluate_model(data_loader, model):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in data_loader:
            train_input, labels = data
            hidden = model.init_hidden(labels.size(0)) # 有问题
            train_input, labels = train_input.to(device), labels.to(device)
            outputs = model(train_input, hidden)
            #print(outputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            #print(predicted)
            #print(labels)
            correct += (predicted == labels).sum().item()

    print('Accuracy of the network: %d %%' % (
    100 * correct / total))

In [0]:
evaluate_model(valid_loader, model)

tensor([[0.0126, 0.9907, 0.1492],
        [0.0222, 0.9832, 0.1690],
        [0.0104, 0.9882, 0.1813],
        [0.0204, 0.9837, 0.2170],
        [0.0223, 0.9821, 0.2191],
        [0.0115, 0.9894, 0.1580],
        [0.0345, 0.9769, 0.1816],
        [0.0101, 0.9908, 0.1735],
        [0.0426, 0.9707, 0.2295],
        [0.0254, 0.9756, 0.2186],
        [0.0269, 0.9760, 0.2469],
        [0.0114, 0.9908, 0.1560],
        [0.0180, 0.9867, 0.1876],
        [0.0283, 0.9797, 0.2110],
        [0.0168, 0.9852, 0.1662],
        [0.0106, 0.9917, 0.1666],
        [0.0426, 0.9690, 0.2498],
        [0.0395, 0.9727, 0.2406],
        [0.0284, 0.9831, 0.2018],
        [0.0138, 0.9869, 0.1707],
        [0.0198, 0.9801, 0.1919],
        [0.0158, 0.9876, 0.2261],
        [0.0148, 0.9835, 0.1564],
        [0.0108, 0.9900, 0.1616],
        [0.0341, 0.9766, 0.2088],
        [0.0279, 0.9751, 0.2225],
        [0.0131, 0.9880, 0.2014],
        [0.0208, 0.9846, 0.2076],
        [0.0128, 0.9906, 0.1734],
        [0.020

In [0]:
def predict_model(model, data_loader):
    model.eval()
    pred = []
    ids = []
    batch_num = len(test_data) // batch_size
    if(len(test_data) % batch_size != 0):
      batch_num += 1
    with torch.no_grad():
        for i in range(batch_num):
            if i == (batch_num - 1):
                batchs = test_data[i * batch_size: ]
            else:
                batchs = test_data[i * batch_size: (i + 1)* batch_size]
            current_size = len(batchs)
            hidden = model.init_hidden(current_size) # 有问题
            text_ids = [_[0] for _ in batchs]
            text = torch.LongTensor([_[1] for _ in batchs]).to(device)
            outputs = model(text, hidden)
            _, predicted = torch.max(outputs.data, 1)
            predict_val = list(predicted.cpu().numpy() - 1)
            ids = ids + text_ids
            pred = pred + predict_val
    result = pd.DataFrame({
        'id': ids,
        'y':pred
    })
    result.to_csv('reuslt.csv', index=False)

In [0]:
test_data = TensorDataset(torch.from_numpy(test_content))

In [0]:
test_data = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [0]:
predict_model(model,test_data)

In [0]:
len(test_data)

10000