# 一维卷积 - 中文评价情绪预测（One-dimensional Convolution - Chinese Evaluation Emotion Prediction）

### 数据集: 某外卖平台收集的用户评价，正向 4000 条，负向 约 8000 条

## 字段说明

| 字段 | 说明 |
| ---- | ---- |
| label | 1 表示正向评论，0 表示负向评论 |
| review | 评论内容 |

安装 jieba 和 pandas

     pip install jieba

In [2]:
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import jieba
from torch.utils.data import DataLoader

In [3]:
torch.__version__

'1.9.0'

In [4]:
torchtext.__version__

'0.10.0'

In [5]:
data = pd.read_csv('waimai_10k.csv')

In [6]:
data.head()

Unnamed: 0,label,review
0,1,很快，好吃，味道足，量大
1,1,没有送水没有送水没有送水
2,1,非常快，态度好。
3,1,方便，快捷，味道可口，快递给力
4,1,菜味道很棒！送餐很及时！


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11987 entries, 0 to 11986
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   11987 non-null  int64 
 1   review  11987 non-null  object
dtypes: int64(1), object(1)
memory usage: 187.4+ KB


In [8]:
data.label.value_counts()

0    7987
1    4000
Name: label, dtype: int64

In [9]:
jieba.lcut('这是日月光华在网易云课堂的课程')

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\guanghua\AppData\Local\Temp\jieba.cache
Loading model cost 0.749 seconds.
Prefix dict has been built successfully.


['这是', '日月', '光华', '在', '网易', '云', '课堂', '的', '课程']

In [12]:
def pre_text(text):
    text = text.replace('，', '').replace('！', '')
    return jieba.lcut(text)

In [13]:
data['review'] = data.review.apply(pre_text)

In [14]:
data['review'] 

0                                      [很快, 好吃, 味道, 足量, 大]
1                                 [没有, 送水, 没有, 送水, 没有, 送水]
2                                        [非常, 快, 态度, 好, 。]
3                                 [方便快捷, 味道, 可口, 快, 递给, 力]
4                                   [菜, 味道, 很棒, 送餐, 很, 及时]
                               ...                        
11982                   [以前, 几乎, 天天, 吃, 现在, 调料, 什么, 都, 不放]
11983    [昨天, 订, 凉皮, 两份, 什么, 调料, 都, 没有, 放, 就, 放, 了, 点, ...
11984                                  [凉皮, 太辣, ,, 吃不下, 都]
11985                                [本来, 迟到, 了, 还, 自己, 点]
11986    [肉夹馍, 不错, 羊肉, 泡馍, 酱肉, 包, 很, 一般, 。, 凉面, 没, 想象, ...
Name: review, Length: 11987, dtype: object

In [15]:
# 导入创建词表工具(日月： 1， 光华： 2， 吃饭： 3， 调料： 4)
from torchtext.vocab import build_vocab_from_iterator

In [17]:
def yield_tokens(data):
    for text in data:
        yield text

In [19]:
vocab = build_vocab_from_iterator(yield_tokens(data['review']),
                                  specials=['<pad>', '<unk>'],
                                  min_freq=2)

In [20]:
vocab.set_default_index(vocab['<unk>'])

In [22]:
vocab['调料']

965

In [26]:
vocab(['很快', '好吃', '味道', '足量', '大'])

[56, 15, 14, 5229, 114]

In [27]:
vocab['<unk>']

1

In [28]:
vocab['山峰']

1

In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11987 entries, 0 to 11986
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   11987 non-null  int64 
 1   review  11987 non-null  object
dtypes: int64(1), object(1)
memory usage: 187.4+ KB


In [32]:
i = int(len(data)*0.8)

In [33]:
train_data = data.sample(i)

In [34]:
train_data.head()

Unnamed: 0,label,review
7619,0,"[送货员, 板, 着, 大脸, 一句, 话, 都, 没有, 楞楞, 的, 炒饭, 又, 黑,..."
10915,0,"[垃圾, 订, 了, 将近, 三个, 小时, 还, 没, 送到, 电话, 打, 不通, 。]"
11571,0,"[没有, 年前, 分量, 多, 了]"
8675,0,"[量, 给, 的, 太少, 了, 以前, 都, 还, 凑活, 今天, 的, 菜, 只有, 小..."
3456,1,"[送错, 了, ,, 很, 积极, 的, 更换, 了, 好评]"


In [35]:
len(train_data)

9589

In [41]:
test_data = data.iloc[data.index[~data.index.isin(train_data.index)]]

In [42]:
test_data.head()      # vx  : louhh01

Unnamed: 0,label,review
2,1,"[非常, 快, 态度, 好, 。]"
9,1,"[最后, 五分钟, 订, 的, 卖家, 特别, 好, 接单, 了, 谢谢, 。]"
22,1,"[很, 方便, 很快, 就, 送到, 了, 。, 棒]"
33,1,"[在, 这种, 天气, 里, 感谢, 送, 餐员, 的, 辛苦, 服务, ,, 谢谢, 啦]"
45,1,"[点, 了, 太, 多次, 了, 味道, 很香]"


In [43]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [44]:
device

device(type='cuda')

In [46]:
train_data.values

array([[0,
        list(['送货员', '板', '着', '大脸', '一句', '话', '都', '没有', '楞楞', '的', '炒饭', '又', '黑', '又', '咸粥', '也', '黑', '了', '吧唧', '的'])],
       [0,
        list(['垃圾', '订', '了', '将近', '三个', '小时', '还', '没', '送到', '电话', '打', '不通', '。'])],
       [0, list(['没有', '年前', '分量', '多', '了'])],
       ...,
       [0,
        list(['送药', '的', '人', '态度', '很差', '还', '让', '我', '去', '门口', '取', '根本', '不', '送进来', '还', '不带', '零钱', '问', '他', '怎么', '处理', '（', '找不开', '我', '零钱', '）', '他', '说', '那', '你', '退货', '吧', '这', '送', '东西', '的', '是', '有', '毛病', '吗', '态度', '太次', '了'])],
       [0,
        list(['肥肉', '太多', '吃', '的', '时候', '还', '把', '嘴唇', '拿', '破', '了'])],
       [0, list(['这次', '虾饺', '里面', '居然', '全是', '淀粉', '都', '没有', '虾'])]],
      dtype=object)

In [48]:
train_data.values[0]

array([0,
       list(['送货员', '板', '着', '大脸', '一句', '话', '都', '没有', '楞楞', '的', '炒饭', '又', '黑', '又', '咸粥', '也', '黑', '了', '吧唧', '的'])],
      dtype=object)

In [51]:
# 文本的批处理函数
def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        label_list.append(_label)
        precessed_text = torch.tensor(vocab(_text), dtype=torch.int64)
        text_list.append(precessed_text)
    label_list = torch.tensor(label_list)
    text_list = torch.nn.utils.rnn.pad_sequence(text_list, batch_first=True)
    return label_list.to(device), text_list.to(device)

In [55]:
# dataset   __getitem__; __len__()
train_dl = DataLoader(train_data.values, batch_size=64,
                      collate_fn=collate_batch,
                      shuffle=True)

In [56]:
test_dl = DataLoader(test_data.values, batch_size=64,
                      collate_fn=collate_batch)

In [57]:
label_batch, text_batch = next(iter(train_dl))

In [58]:
label_batch

tensor([0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0], device='cuda:0')

In [59]:
text_batch         # louhh01 

tensor([[  16,   48,   45,  ...,    0,    0,    0],
        [  18,    3,   13,  ...,    0,    0,    0],
        [1293,  512,   36,  ...,    0,    0,    0],
        ...,
        [ 157,    3,  153,  ...,    0,    0,    0],
        [ 420,   69,    3,  ...,    0,    0,    0],
        [  32,  568,    2,  ...,    0,    0,    0]], device='cuda:0')

In [None]:
# 一维卷积模型：
   1. embdeing
   2. conv1d + maxpool
   3. conv1d
   4. nn.AdaptiveAvgPool1d
   5. view() 
   6. Linear()

In [63]:
vocab_size = len(vocab)
embeding_dim = 100

In [None]:
# 输入 text shape： batch, lengh
# 经过embeding后：  batch, lengh， feathues=100

In [88]:
class CONV1D_Net(nn.Module):
    def __init__(self, vocab_size, embeding_dim):
        super(CONV1D_Net, self).__init__()
        self.em = nn.Embedding(vocab_size, embeding_dim)
        self.conv1 = nn.Conv1d(embeding_dim, 64, kernel_size=7)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(64, 128, kernel_size=7)
        self.avgpool = nn.AdaptiveAvgPool1d(output_size=5)  # batch*128*5
        self.fc1 = nn.Linear(128*5, 64)
        self.fc2 = nn.Linear(64, 2)
    def forward(self, x):
        x = self.em(x)
        x = x.permute(0, 2, 1)
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.avgpool(x)
        x = F.dropout(F.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

In [89]:
model = CONV1D_Net(vocab_size, embeding_dim).to(device)

In [90]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [92]:
def train(dataloader):
    total_acc, total_count, total_loss, = 0, 0, 0
    model.train()
    for label, text in dataloader:
        predicted_label = model(text)
        loss = loss_fn(predicted_label, label)
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        with torch.no_grad():
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            total_loss += loss.item()*label.size(0)
    return total_loss/total_count, total_acc/total_count

In [93]:
def test(dataloader):
    model.eval()
    total_acc, total_count, total_loss, = 0, 0, 0

    with torch.no_grad():
        for label, text in dataloader:
            predicted_label = model(text)
            loss = loss_fn(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            total_loss += loss.item()*label.size(0)
    return total_loss/total_count, total_acc/total_count

In [94]:
def fit(epochs, train_dl, test_dl):
    train_loss = []
    train_acc = []
    test_loss = []
    test_acc = []

    for epoch in range(epochs):
        epoch_loss, epoch_acc = train(train_dl)
        epoch_test_loss, epoch_test_acc = test(test_dl)
        train_loss.append(epoch_loss)
        train_acc.append(epoch_acc)
        test_loss.append(epoch_test_loss)
        test_acc.append(epoch_test_acc)
        template = ("epoch:{:2d}, train_loss: {:.5f}, train_acc: {:.1f}% ," 
                    "test_loss: {:.5f}, test_acc: {:.1f}%")
        print(template.format(
              epoch, epoch_loss, epoch_acc*100, epoch_test_loss, epoch_test_acc*100))
    print("Done!")
    
    return train_loss, test_loss, train_acc, test_acc

In [95]:
EPOCHS = 25

In [96]:
train_loss, test_loss, train_acc, test_acc = fit(EPOCHS, 
                                                 train_dl, 
                                                 test_dl)

  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


RuntimeError: CUDA error: CUBLAS_STATUS_INVALID_VALUE when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`