获取数据函数
--

In [1]:
def get_data(path):
    data = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append((line[-2], line[: -4]))
    return iter(data)

In [2]:
raw_data = get_data('comment.txt')

In [3]:
next(raw_data)

('0', '才用就发现相机打开迟钝，半天反应不过来，有时候还会卡出去，他们又不给解决方案。')

构建文本分词器（分词的同时去除无意义的停用词）
--

In [4]:
def get_stopwords(path):
    stop_words = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            stop_words.append(line.strip('\n'))
    return stop_words

In [5]:
stop_words = get_stopwords('stopword.txt')

In [6]:
import jieba
def tokenizer(sentence):
    words = jieba.lcut(sentence)
    for word in words:
        if word in stop_words:
            words.remove(word)
    return words

In [7]:
tokenizer('这是一个停用词测试，会不会删除呢？')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\lenovo\AppData\Local\Temp\jieba.cache
Loading model cost 0.847 seconds.
Prefix dict has been built successfully.


['这是', '停', '用词', '测试', '会', '删除', '？']

根据分词结果构建词典，形成（词：索引）的映射
--

In [8]:
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(data):
    for label, comment in data:
        yield tokenizer(comment)

vocab = build_vocab_from_iterator(yield_tokens(raw_data), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])
# 用<unk>填充那些不在词表中的词。

In [9]:
len(vocab)

45707

In [10]:
vocab(tokenizer('这是词典索引测试'))

[276, 0, 41407, 843]

### 保存词典

In [15]:
import pickle
output = open('emb/vocab.pkl', 'wb')
pickle.dump(vocab, output)
output.close()

数据处理管道，将文本映射成词的索引的集合，将标签转换成整数
--

In [11]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x)

In [12]:
text_pipeline('这是词典索引测试')

[276, 0, 41407, 843]

In [13]:
label_pipeline('1')

1

批处理函数
--

In [14]:
import torch
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 检测是否有安装了cuda的显卡可用

def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64) # 评论转换成索引张量
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
        # 所谓偏移量可以理解为该文本的长度
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)    

train_iter = get_data('comment.txt')
dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)
# shuffle=True将打乱数据集，batch_size使得加载器每次读取该数量的条目

In [15]:
torch.cuda.is_available()

True

使用LSTM模型
---------------------

长短期记忆网络——通常被称为 LSTM，是一种特殊的 RNN，能够学习长期依赖性。由 Hochreiter 和 Schmidhuber（1997）提出。

![jupyter](https://n.sinaimg.cn/spider202044/731/w1040h491/20200404/0bc7-irtymmw0458671.png)


In [16]:
from torch import nn

class LSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.lstm = nn.LSTM(embed_dim, hidden_size, num_layers, bidirectional=False)
        # num_layers定义了LSTM网络的层数，bidirectional表示是单向LSTM还是双向LSTM（BiLSTM）
        self.fc = nn.Linear(hidden_size, num_class)
        self.init_weights()
    
    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
    
    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets).unsqueeze(1)

        output, (hn, cn) = self.lstm(embedded)
        return self.fc(output)

固定随机数种子
--

In [17]:
def seed_torch(seed=6):

    #random.seed(seed)
    # os.environ['PYTHONHASHSEED'] = str(seed)
    #np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_torch()

启动一个实例
--

In [18]:
train_iter = get_data('comment.txt')
num_class = len(set([label for (label, text) in train_iter]))
vocab_size = len(vocab)
emsize = 64
hidden_size = 100
num_layers = 1
LSTM_model = LSTM(vocab_size, emsize, num_class, hidden_size, num_layers).to(device)

定义训练和评估函数
--

In [19]:
import time

def train(dataloader):
    LSTM_model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = LSTM_model(text, offsets).squeeze()
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(LSTM_model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    LSTM_model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = LSTM_model(text, offsets).squeeze()
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

拆分数据集并运行模型，训练集，验证集，测试集8：1：1
--

In [20]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

# Hyperparameters超参数
EPOCHS = 10 # epoch
LR = 10  # learning rate学习率
BATCH_SIZE = 64 # batch size for training
  
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(LSTM_model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

data_iter = get_data('comment.txt')
dataset = to_map_style_dataset(data_iter)

num_train = int(len(dataset) * 0.8)
split_train_, split_temp_ = random_split(dataset, [num_train, len(dataset) - num_train])

num_valid = int(len(split_temp_) * 0.5)
split_valid, split_test = random_split(split_temp_, [num_valid, len(split_temp_) - num_valid])

# print(num_train, num_valid, len(dataset) - num_train - num_valid)

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(split_test, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

| epoch   1 |   500/ 1264 batches | accuracy    0.854
| epoch   1 |  1000/ 1264 batches | accuracy    0.918
-----------------------------------------------------------
| end of epoch   1 | time: 89.96s | valid accuracy    0.928 
-----------------------------------------------------------
| epoch   2 |   500/ 1264 batches | accuracy    0.932
| epoch   2 |  1000/ 1264 batches | accuracy    0.935
-----------------------------------------------------------
| end of epoch   2 | time: 75.06s | valid accuracy    0.929 
-----------------------------------------------------------
| epoch   3 |   500/ 1264 batches | accuracy    0.942
| epoch   3 |  1000/ 1264 batches | accuracy    0.943
-----------------------------------------------------------
| end of epoch   3 | time: 74.91s | valid accuracy    0.928 
-----------------------------------------------------------
| epoch   4 |   500/ 1264 batches | accuracy    0.956
| epoch   4 |  1000/ 1264 batches | accuracy    0.955
-------------------------

### 测试集测试

In [21]:
print('Checking the results of test dataset.')
accu_test = evaluate(test_dataloader)
print('test accuracy {:8.3f}'.format(accu_test))

Checking the results of test dataset.
test accuracy    0.940


### 测试一条评论

In [22]:
commment_label = {0: "Bad comment",
                 1: "Good comment"}

def predict(text, text_pipeline):
    with torch.no_grad():
        text = torch.tensor(text_pipeline(text))
        output = LSTM_model(text.to(device), torch.tensor([0]).to(device)).squeeze()
    return output.argmax().item()

ex_text_str = "华为浏览器，点了半天没反应，多屏选择，有一个页面删除不掉！后悔买了"
print(f"This is a {commment_label[predict(ex_text_str, text_pipeline)]}")

This is a Bad comment


### 保存模型参数

In [181]:
# 保存模型
torch.save(LSTM_model.state_dict(), 'models/lstm_model.pth')

预测结果还行，下面对爬取的商品评论进行预测并将结果写入文件
--

In [23]:
# 加载词典
import pickle
f = open('emb/vocab.pkl', 'rb')
vocab = pickle.load(f)

In [25]:
import torch
# 加载模型

# 先实例化一个模型
train_iter = get_data('comment.txt')
num_class = len(set([label for (label, text) in train_iter]))
vocab_size = 45707
emsize = 64
hidden_size = 100
num_layers = 1
LSTM_model = LSTM(vocab_size, emsize, num_class, hidden_size, num_layers)
# 加载状态字典
LSTM_model.load_state_dict(torch.load('models/lstm_model.pth'))
LSTM_model.to(device)

LSTM(
  (embedding): EmbeddingBag(45707, 64, mode=mean)
  (lstm): LSTM(64, 100)
  (fc): Linear(in_features=100, out_features=2, bias=True)
)

In [26]:
def get_comment_label(comments, text_pipeline):
    labels = []
    with torch.no_grad():
        for comment in comments:
            comment = torch.LongTensor(text_pipeline(comment))
            output = LSTM_model(comment.to(device), torch.tensor([0]).to(device)).squeeze()
            label = output.argmax().item()
            labels.append(label)
    return labels
ex_text_str = '质量特别差，朋友门千万不要买，我说的是真心话，系统特别卡，\
                看视频都卡的不要不要的了，是水货不是正品，还不让退货也不让换呢，千万不要买。。'

In [27]:
text_pipeline = lambda x: vocab(tokenizer(x))
commment_label = {0: "Bad comment",
                 1: "Good comment"}
label = get_comment_label([ex_text_str], text_pipeline)[0]
print(f"This is a {commment_label[label]}")

This is a Bad comment


In [28]:
import pandas as pd

In [30]:
df = pd.read_csv('data/100012779151.csv')
df['sentiment'] = get_comment_label(list(df['content']), text_pipeline)

In [32]:
df.to_csv('data/1.csv')

In [33]:
products_id = ['100012779151', '100027683422', '100021725644', '10030539565679', '100024403104', '100021318642']

In [34]:
def write_label(product_id):
    df = pd.read_csv(f'data/{product_id}.csv')
    df['sentiment'] = get_comment_label(list(df['content']), text_pipeline)
    df.to_csv(f'data/{product_id}.csv')
    print(f'{product_id} done')

In [35]:
for product_id in products_id:
    write_label(product_id)

100012779151 done
100027683422 done
100021725644 done
10030539565679 done
100024403104 done
100021318642 done
