### 导入相关包

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
import pickle
import pkuseg
from tqdm import tqdm
from gensim.models import KeyedVectors
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

### 数据读取
1. 使用上周保存的已经分好训练集测试集的数据
2. 并同样进行分词操作，对所有训练数据准备好一个词库
3. 准备词库中单词和序号之间的双向索引，并在其中加入`<unk>`标记（序号为0），代表未出现在词库中的词语。

In [None]:
# Load the pre-split datasets from the local pickle file.
# NOTE: pickle.load can execute arbitrary code from a malicious file, so
# only load files that come from a trusted source.
with open('./datasets.pickle', 'rb') as f:
    datasets = pickle.load(f)
# Display the training split as the notebook cell output.
datasets['train']

In [None]:
# Stop-word list, one entry per line. The line terminator is stripped so the
# entries can actually match tokens (readlines() would keep the trailing
# '\n'), and a set gives O(1) membership tests.
with open('./stopwords.txt', 'r', encoding='utf-8') as f:
    _STOP_WORDS = {line.rstrip('\r\n') for line in f}
_STOP_WORDS.discard('')  # blank lines are not stop words

# Shared pkuseg segmenter used by tokenize_words().
segmentor = pkuseg.pkuseg()


def remove_stopwords(words):
    """Return the tokens of *words* that are not in the stop-word list."""
    return [word for word in words if word not in _STOP_WORDS]


def tokenize_words(line, filter_stopwords=True):
    """Segment *line* into words with the module-level ``segmentor``.

    Args:
        line: The raw sentence to segment.
        filter_stopwords: When true, drop tokens found in ``_STOP_WORDS``.

    Returns:
        The list of tokens.
    """
    words = segmentor.cut(line)
    if filter_stopwords:
        words = remove_stopwords(words)
    return words


# Build the vocabulary from the training split only. Sorting makes the
# word -> index assignment reproducible between runs (plain set iteration
# order depends on string hashing).
vocabulary = set()
for data in datasets['train']:
    vocabulary.update(tokenize_words(data[1]))
words = sorted(vocabulary)
len(words)

In [None]:
# Vocabulary lookups in both directions. Real words are numbered from 1;
# index 0 is reserved for the '<unk>' placeholder.
word2idx = {token: position for position, token in enumerate(words, start=1)}
word2idx['<unk>'] = 0
idx2word = dict(enumerate(words, start=1))
idx2word[0] = '<unk>'
word2idx

### 词向量
1. 使用gensim读取词向量,(单词总数352217, 维度300)
2. 从上面的词向量中缩小范围，只留下我们要用到的词语的词向量
3. 定义编码函数，将句子转换为对应的单词序号序列（有了序号才能在词向量矩阵中查到单词对应的向量）。同时要解决句子长度不一致的问题：每个句子分词之后长度不同，设`max_len=64`，超过64就只取前64个词，不足64就在末尾补0。
4. 将数据集转化为这种类型
5. 下载词向量保存在当前文件夹下，我的词向量为`sgns.wiki.word`

In [None]:
wvmodel = KeyedVectors.load_word2vec_format('sgns.wiki.word', binary=False, encoding='utf-8')

In [None]:
# Embedding table: row i holds the pretrained vector of the word whose index
# is i. There is one extra row because index 0 is reserved for '<unk>'.
# Words without a pretrained vector keep an all-zero row.
weight = torch.zeros(len(words) + 1, wvmodel.vector_size)
# Iterate over our own (much smaller) vocabulary and probe the pretrained
# model, instead of scanning every pretrained word. This also avoids the
# `index2word` attribute, which was renamed in gensim 4.
for word, index in word2idx.items():
    if word in wvmodel:
        weight[index, :] = torch.from_numpy(wvmodel.get_vector(word))

In [None]:
def encode_sample(tokenize_sample, max_len=64, pad=0, vocab=None):
    """Convert a tokenized sentence into a fixed-length list of word indices.

    Args:
        tokenize_sample: Iterable of tokens for one sentence.
        max_len: Length of the returned list. Longer inputs are truncated,
            shorter ones are right-padded.
        pad: Index used for padding.
        vocab: Mapping from token to index. Defaults to the module-level
            ``word2idx`` so existing callers keep working.

    Returns:
        A list of ``max_len`` indices. Tokens missing from ``vocab`` are
        encoded as 0.
    """
    if vocab is None:
        vocab = word2idx

    features = [vocab.get(token, 0) for token in tokenize_sample]

    if len(features) >= max_len:
        return features[:max_len]
    features.extend([pad] * (max_len - len(features)))
    return features

In [None]:
# Each sample is indexed as (label, text): element 1 is tokenized and encoded
# into a fixed-length index sequence, element 0 is cast to an integer label.
train_features = [
    encode_sample(tokenize_words(sample[1])) for sample in datasets['train']
]
train_labels = [int(sample[0]) for sample in datasets['train']]

dev_features = [
    encode_sample(tokenize_words(sample[1])) for sample in datasets['dev']
]
dev_labels = [int(sample[0]) for sample in datasets['dev']]

test_features = [
    encode_sample(tokenize_words(sample[1])) for sample in datasets['test']
]
test_labels = [int(sample[0]) for sample in datasets['test']]
# dev_features

### 定义TextCNN
对输入x,首先经过word_embedding，将其变成词向量对应的矩阵，再通过三个不同卷积核大小的卷积层得到不同的特征，将得到的特征拼接起来，最后通过一层全连接层。

In [None]:
class TextCNN(nn.Module):
    """Text classifier: frozen embeddings, three convolutions, one linear layer.

    The input index tensor is embedded, passed through three single-channel
    convolutions spanning 3, 4 and 5 consecutive tokens, max-pooled over the
    sequence axis, concatenated into three features and classified.

    Args:
        embed_size: Width of one embedding vector.
        seq_len: Number of tokens per input sequence (fixes the pooling size).
        num_labels: Number of output classes.
        weight: Pretrained embedding matrix passed to
            ``nn.Embedding.from_pretrained``.
    """

    def __init__(self, embed_size, seq_len, num_labels, weight):
        super().__init__()
        self.num_labels = num_labels

        # Pretrained embedding table; it is not updated during training.
        self.embedding = nn.Embedding.from_pretrained(weight)
        self.embedding.weight.requires_grad = False

        # Each kernel covers the full embedding width, so the convolution
        # output has width 1 and height seq_len - k + 1.
        self.conv1 = nn.Conv2d(1, 1, kernel_size=(3, embed_size))
        self.conv2 = nn.Conv2d(1, 1, kernel_size=(4, embed_size))
        self.conv3 = nn.Conv2d(1, 1, kernel_size=(5, embed_size))

        # Pool over the whole remaining sequence axis: one value per branch.
        self.pool1 = nn.MaxPool2d(kernel_size=(seq_len - 2, 1))
        self.pool2 = nn.MaxPool2d(kernel_size=(seq_len - 3, 1))
        self.pool3 = nn.MaxPool2d(kernel_size=(seq_len - 4, 1))

        # Three branches with one channel each give three input features.
        self.linear = nn.Linear(3, num_labels)

    def forward(self, x):
        """Return class scores of shape (batch, num_labels) for index tensor x."""
        batch = x.size(0)
        # (batch, seq_len) -> (batch, seq_len, embed) -> (batch, 1, seq_len, embed)
        embedded = self.embedding(x).unsqueeze(1)

        branches = (
            (self.conv1, self.pool1),
            (self.conv2, self.pool2),
            (self.conv3, self.pool3),
        )
        # Each pooled branch has shape (batch, 1, 1, 1).
        pooled = [pool(F.relu(conv(embedded))) for conv, pool in branches]

        merged = torch.cat(pooled, dim=-1).view(batch, 1, -1)  # (batch, 1, 3)
        scores = self.linear(merged)  # (batch, 1, num_labels)
        return scores.view(-1, self.num_labels)
        

### 定义超参数，网络初始化，及数据预准备

In [None]:
# Training hyperparameters.
num_epochs = 10
embed_size = 300  # width of one word vector; must match the embedding matrix
seq_len = 64      # tokens per encoded sentence; must match encode_sample's max_len
num_labels = 2
batch_size = 64
lr = 0.001
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Build the classifier on top of the pretrained embedding matrix and move it
# to the selected device.
net = TextCNN(
    embed_size=embed_size,
    seq_len=seq_len,
    num_labels=num_labels,
    weight=weight,
).to(device)

# Cross-entropy loss over the class scores, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=lr)

In [None]:
# Pair each encoded feature tensor with its label tensor.
train_set = TensorDataset(
    torch.tensor(train_features),
    torch.tensor(train_labels),
)
val_set = TensorDataset(
    torch.tensor(dev_features),
    torch.tensor(dev_labels),
)
test_set = TensorDataset(
    torch.tensor(test_features),
    torch.tensor(test_labels),
)

# Only the training batches are shuffled; validation and test keep their order.
train_iter = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)
val_iter = DataLoader(dataset=val_set, batch_size=batch_size, shuffle=False)
test_iter = DataLoader(dataset=test_set, batch_size=batch_size, shuffle=False)

### 训练

In [None]:
# losses = []
# Train for num_epochs epochs; after each epoch evaluate on the validation
# set and print the per-batch averages of loss and accuracy.
for epoch in range(num_epochs):
    train_loss, val_loss = 0.0, 0.0
    train_acc, val_acc = 0.0, 0.0
    m, n = 0, 0  # number of training / validation batches in this epoch

    net.train()
    iter_bar = tqdm(train_iter, desc='Iter Train')
    for features, labels in iter_bar:
        m += 1
        features = features.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        output = net(features)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        predictions = torch.argmax(output.detach().cpu(), dim=1)
        train_acc += accuracy_score(labels.cpu(), predictions)
        # .item() extracts a plain float, so the running sum does not keep
        # every batch's autograd graph alive.
        train_loss += loss.item()

    # Validation: switch to eval mode and disable gradient tracking.
    net.eval()
    with torch.no_grad():
        for val_features, val_labels in val_iter:
            n += 1
            val_features = val_features.to(device)
            val_labels = val_labels.to(device)

            output = net(val_features)
            loss = criterion(output, val_labels)
            predictions = torch.argmax(output.cpu(), dim=1)
            val_acc += accuracy_score(val_labels.cpu(), predictions)
            val_loss += loss.item()

    # max(..., 1) keeps the report well-defined when a loader is empty.
    print('Epoch: {}, train_loss: {}, train_acc: {}, val_loss: {}, val_acc: {}'.format(
        epoch, train_loss / max(m, 1), train_acc / max(m, 1),
        val_loss / max(n, 1), val_acc / max(n, 1)))
### torch.save(net.state_dict(), save_path)

### 测试

In [None]:
# Evaluate on the test set. Counts are accumulated over ALL batches and the
# metrics are computed once at the end, with class 1 as the positive class.
net.eval()
test_correct = 0  # samples predicted correctly
test_total = 0    # samples seen
true_pos = 0      # predicted 1 and labelled 1
pred_pos = 0      # predicted 1
actual_pos = 0    # labelled 1

with torch.no_grad():
    for test_features, test_labels in test_iter:
        test_features = test_features.to(device)
        test_labels = test_labels.to(device)

        output = net(test_features)
        out_index = torch.argmax(output.cpu(), dim=1)
        label = test_labels.cpu()

        test_correct += (out_index == label).sum().item()
        test_total += label.numel()
        # Parenthesise the mask before summing: `a & b.sum()` would sum
        # only `b` and then AND it with `a`.
        true_pos += ((out_index == 1) & (label == 1)).sum().item()
        pred_pos += (out_index == 1).sum().item()
        actual_pos += (label == 1).sum().item()

# Guard every ratio against an empty denominator.
test_acc = test_correct / test_total if test_total else 0.0
pre = true_pos / pred_pos if pred_pos else 0.0         # precision = TP / predicted positives
recall = true_pos / actual_pos if actual_pos else 0.0  # recall = TP / actual positives
F1 = 2 * recall * pre / (recall + pre) if (recall + pre) else 0.0

print('Test acc is {}'.format(test_acc))
print('Test pre is {}'.format(pre))
print('Test recall is {}'.format(recall))
print('Test f1 is {}'.format(F1))

In [None]:
# Predicted class index per sample for the most recent `output` batch.
out_index = output.cpu().data.argmax(dim=1)
out_index

In [None]:
# CPU copy of the most recent batch of test labels, for inspection.
label = test_labels.to('cpu')
label

In [None]:
# Element-wise mask: True where the prediction equals the label.
torch.eq(label.long(), out_index)

In [None]:
# True-positive mask: samples labelled 1 that were also predicted as 1.
x = (label == 1) & (out_index == 1)