# TextCNN文本分类 代码复现

## CNN 理解
>来自知乎等网络平台以及个人整理归纳

### CNN 出现原因
如果用全连接神经网络处理大尺寸图像具有三个明显的缺点：

（1）首先将图像展开为向量会丢失空间信息；   

（2）其次参数过多效率低下，训练困难；

（3）同时大量的参数也很快会导致网络过拟合。

### CNN 构成

卷积层：提取图像特征

池化层：降维，防止过拟合

全连接层：输出结果

多次 卷积+激活（不改变形状）+池化，最后进行全连接

### CNN 特点

1. 稀疏交互

在卷积神经网络中，卷积核尺度远小于输入的维度，这样每个输出神经元仅与前一层特定局部区域内的神经元存在连接权重（即产生交互），我们称这种特性为稀疏交互。

稀疏交互的物理意义：通常图像、文本、语音等现实世界中的数据都具有局部的特征结构， 我们可以先学习局部的特征， 再将局部的特征组合起来形成更复杂和抽象的特征。

2. 参数共享

参数共享是指在同一个模型的不同模块中使用相同的参数。卷积运算中的参数共享让网络只需要学一个参数集合，而不是对于每一位置都需要学习一个单独的参数集合。

参数共享的物理意义：使得卷积层具有平移等变性。

3. 等变表示

假如图像中有一只猫，那么无论它出现在图像中的任何位置，我们都应该将它识别为猫，也就是说神经网络的输出对于平移变换来说应当是等变的。特别地，当函数f(x)与g(x)满足f(g(x))=g(f(x))时，我们称f(x)关于变换g具有等变性。在猫的图片上先进行卷积，再向右平移l像素的输出，与先将图片向右平移l像素再进行卷积操作的输出结果是相等的。

### CNN 结构图片示意

![avatar](https://pic3.zhimg.com/80/v2-2ea1f0b8b166f31273b26bca3ba8e8b2_1440w.jpg)


## 代码部分

In [3]:
import os
import numpy as np
import torch
import torch.nn as nn
from  torch.utils.data import Dataset,DataLoader
from tqdm import tqdm

读取数据

In [9]:
def read_data(train_or_test,num=None):
    """Load a ``text<TAB>label`` dataset from ``./data/<train_or_test>.txt``.

    Each non-empty line is split into a text and a label on its LAST tab,
    so a tab that appears inside the text no longer aborts loading.

    Args:
        train_or_test: Base name of the file inside ``./data`` (no ``.txt``).
        num: Optional cap on the number of samples returned; ``None``
            returns every sample.

    Returns:
        A ``(texts, labels)`` tuple of parallel lists of strings.

    Raises:
        ValueError: If a non-empty line contains no tab at all.
    """
    path = os.path.join(".", "data", train_or_test + ".txt")
    with open(path, encoding="utf-8") as f:
        all_data = f.read().split("\n")

    texts = []
    labels = []
    for data in all_data:
        if data:  # skip blank lines (e.g. the trailing newline)
            # The label is always the last field; split from the right so
            # tabs inside the text are kept as part of the text.
            t, l = data.rsplit("\t", 1)
            texts.append(t)
            labels.append(l)

    if num is None:
        return texts, labels
    return texts[:num], labels[:num]

构建语料库

In [10]:
def built_corpus(train_texts,embedding_num):
    """Build a character-level vocabulary and a matching embedding layer.

    Args:
        train_texts: Iterable of texts; each text is iterated element by
            element (character by character for ``str`` inputs).
        embedding_num: Dimensionality of each embedding vector.

    Returns:
        A ``(word_2_index, embedding)`` tuple: the dict maps every token to
        an integer id (``"<PAD>"`` is 0, ``"<UNK>"`` is 1, new tokens get
        ids in order of first appearance), and ``embedding`` is an
        ``nn.Embedding`` with one row per vocabulary entry.
    """
    vocab = {"<PAD>": 0, "<UNK>": 1}
    for sentence in train_texts:
        for token in sentence:
            # Only unseen tokens receive a fresh id (the current vocab size).
            if token not in vocab:
                vocab[token] = len(vocab)

    embedding_layer = nn.Embedding(len(vocab), embedding_num)
    return vocab, embedding_layer

构建数据集

In [11]:
class TextDataset(Dataset):
    """Dataset that turns raw texts into fixed-length index tensors.

    Each item is a ``(text_idx, label)`` pair where ``text_idx`` is an
    integer tensor of shape ``(1, max_len)`` and ``label`` is an ``int``.
    """

    def __init__(self,all_text,all_label,word_2_index,max_len):
        """Store the samples, the vocabulary and the target length.

        Args:
            all_text: Sequence of texts.
            all_label: Sequence of labels convertible with ``int()``.
            word_2_index: Mapping from token to integer id.
            max_len: Length every text is truncated or padded to.
        """
        self.all_text = all_text
        self.all_label = all_label
        self.word_2_index = word_2_index
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.all_text)

    def __getitem__(self,index):
        """Return the padded index tensor and integer label for one sample."""
        clipped = self.all_text[index][:self.max_len]
        label = int(self.all_label[index])

        # Tokens missing from the vocabulary fall back to id 1.
        ids = []
        for token in clipped:
            ids.append(self.word_2_index.get(token, 1))

        # Right-pad with id 0 up to max_len.
        missing = self.max_len - len(ids)
        ids.extend([0] * missing)

        # Add a leading axis of size 1 in front of the sequence axis.
        sample = torch.tensor(ids).unsqueeze(dim=0)
        return sample, label

构建模块

In [12]:
class Block(nn.Module):
    """One TextCNN branch: convolution -> ReLU -> max-over-time pooling.

    The convolution kernel spans ``kernel_s`` consecutive tokens and the
    full embedding width, so each output channel yields one value per
    window position. The pooling window covers all
    ``max_len - kernel_s + 1`` positions, leaving one value per channel.

    Args:
        kernel_s: Number of consecutive tokens covered by the kernel.
        embeddin_num: Embedding dimensionality (kernel width).
        max_len: Sequence length of the inputs this block will receive.
        hidden_num: Number of convolution output channels.
    """

    def __init__(self,kernel_s,embeddin_num,max_len,hidden_num):
        super().__init__()
        # Input layout: (batch, in_channel=1, max_len, embeddin_num).
        self.cnn = nn.Conv2d(in_channels=1,out_channels=hidden_num,kernel_size=(kernel_s,embeddin_num))
        self.act = nn.ReLU()
        # The kernel equals the number of valid window positions, so
        # pooling collapses the whole sequence axis to length 1.
        self.mxp = nn.MaxPool1d(kernel_size=(max_len-kernel_s+1))

    def forward(self,batch_emb):
        """Map ``(batch, 1, max_len, embeddin_num)`` to ``(batch, hidden_num)``."""
        # Call submodules via __call__ (not .forward) so registered hooks run.
        c = self.cnn(batch_emb)   # (batch, hidden_num, max_len-kernel_s+1, 1)
        a = self.act(c)
        a = a.squeeze(dim=-1)     # drop the collapsed embedding axis
        m = self.mxp(a)           # (batch, hidden_num, 1)
        m = m.squeeze(dim=-1)     # (batch, hidden_num)
        return m

模块组成模型

In [13]:
class TextCNNModel(nn.Module):
    """TextCNN classifier built from four parallel convolution blocks.

    The blocks use kernel heights 2, 3, 4 and 5 over the embedded input;
    their pooled outputs are concatenated and fed to a linear classifier.

    Args:
        emb_matrix: Embedding module; its ``weight.shape[1]`` gives the
            embedding dimensionality.
        max_len: Sequence length of the inputs.
        class_num: Number of output classes.
        hidden_num: Number of output channels per block.
    """

    def __init__(self,emb_matrix,max_len,class_num,hidden_num):
        super().__init__()
        self.emb_num = emb_matrix.weight.shape[1]

        self.block1 = Block(2,self.emb_num,max_len,hidden_num)
        self.block2 = Block(3,self.emb_num,max_len,hidden_num)
        self.block3 = Block(4,self.emb_num,max_len,hidden_num)
        self.block4 = Block(5, self.emb_num, max_len, hidden_num)

        self.emb_matrix = emb_matrix

        # Fully connected layer acting as the classifier over the
        # concatenated features of the four blocks.
        self.classfier = nn.Linear(hidden_num*4,class_num)
        self.loss_fun = nn.CrossEntropyLoss()

    def forward(self,batch_idx,batch_label=None):
        """Return the loss when labels are given, otherwise predicted classes.

        Args:
            batch_idx: Integer index tensor; with the ``TextDataset`` in this
                file it has shape ``(batch, 1, max_len)``.
            batch_label: Optional tensor of class indices.

        Returns:
            The cross-entropy loss if ``batch_label`` is not ``None``;
            otherwise the argmax class index for each sample.
        """
        # Embedding lookup appends the embedding axis:
        # (batch, 1, max_len) -> (batch, 1, max_len, emb_num).
        batch_emb = self.emb_matrix(batch_idx)

        # Call submodules via __call__ (not .forward) so registered hooks run.
        b1_result = self.block1(batch_emb)
        b2_result = self.block2(batch_emb)
        b3_result = self.block3(batch_emb)
        b4_result = self.block4(batch_emb)

        # Concatenate along the channel axis: (batch, 4 * hidden_num).
        feature = torch.cat([b1_result,b2_result,b3_result,b4_result],dim=1)

        # `pre` holds one raw, unnormalised score (logit) per class; it is
        # not a probability. CrossEntropyLoss consumes these scores directly,
        # and the predicted class is the index of the largest score.
        pre = self.classfier(feature)

        # With labels return the loss; without labels return predictions.
        if batch_label is not None:
            loss = self.loss_fun(pre,batch_label)
            return loss
        else:
            return torch.argmax(pre,dim=-1)
        

In [14]:
# Load the training and validation splits from ./data.
train_text,train_label = read_data("train")
validation_text,validation_label =  read_data("dev")

# Hyperparameters.
embedding = 50      # embedding dimensionality
max_len= 20         # every text is truncated / padded to this length
batch_size = 200
epoch = 1000
lr = 0.001
hidden_num = 2      # convolution output channels per block
class_num = len(set(train_label))
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# The vocabulary is built from the training texts only.
word_2_index,words_embedding = built_corpus(train_text,embedding)

train_dataset = TextDataset(train_text,train_label,word_2_index,max_len)
# Reshuffle every epoch so batches are not always served in file order.
train_loader = DataLoader(train_dataset,batch_size,shuffle=True)

validation_dataset = TextDataset(validation_text,validation_label,word_2_index,max_len)
validation_loader = DataLoader(validation_dataset,batch_size,shuffle=False)


model = TextCNNModel(words_embedding,max_len,class_num,hidden_num).to(device)
opt = torch.optim.AdamW(model.parameters(),lr=lr)


for e in range(epoch):
    print(f"epoc {e}")

    model.train()
    for batch_idx,batch_label in train_loader:
        batch_idx = batch_idx.to(device)
        batch_label = batch_label.to(device)
        # Call the module itself (not .forward) so registered hooks run.
        loss = model(batch_idx,batch_label)
        loss.backward()
        opt.step()
        opt.zero_grad()

    # Reports the loss of the last training batch of this epoch.
    print(f"loss:{loss:.3f}")

    model.eval()
    right_num = 0
    # No gradients are needed for evaluation; skip building autograd graphs.
    with torch.no_grad():
        for batch_idx,batch_label in validation_loader:
            batch_idx = batch_idx.to(device)
            batch_label = batch_label.to(device)
            pre = model(batch_idx)
            right_num += int(torch.sum(pre==batch_label))

    print(f"acc = {right_num/len(validation_text)*100:.2f}%")
    print()

epoc 0
loss:1.232
acc = 65.05%

epoc 1
loss:1.026
acc = 71.29%

epoc 2
loss:0.915
acc = 74.22%

epoc 3
loss:0.839
acc = 75.92%

epoc 4
loss:0.779
acc = 77.27%

epoc 5
loss:0.739
acc = 78.09%

epoc 6
loss:0.706
acc = 78.50%

epoc 7
loss:0.679
acc = 79.06%

epoc 8
loss:0.657
acc = 79.43%

epoc 9
loss:0.638
acc = 79.74%

epoc 10
loss:0.619
acc = 80.10%

epoc 11
loss:0.597
acc = 80.45%

epoc 12
loss:0.576
acc = 80.58%

epoc 13
loss:0.562
acc = 80.58%

epoc 14
loss:0.553
acc = 80.61%

epoc 15
loss:0.547
acc = 80.75%

epoc 16
loss:0.530
acc = 80.84%

epoc 17
loss:0.521
acc = 80.95%

epoc 18
loss:0.509
acc = 81.11%

epoc 19
loss:0.501
acc = 81.12%

epoc 20
loss:0.491
acc = 81.12%

epoc 21
loss:0.483
acc = 81.19%

epoc 22
loss:0.471
acc = 81.37%

epoc 23
loss:0.461
acc = 81.44%

epoc 24
loss:0.451
acc = 81.57%

epoc 25
loss:0.443
acc = 81.53%

epoc 26
loss:0.434
acc = 81.44%

epoc 27
loss:0.427
acc = 81.37%

epoc 28
loss:0.418
acc = 81.47%

epoc 29
loss:0.413
acc = 81.40%

epoc 30
loss:0.405
a

结果评价与思考：

1. 验证集准确率在75%-80%振荡

2. epoch 没有必要设为 1000，设为 300 即可；也可以引入早停机制（early stopping）

对参数的一些尝试：

1. embedding为5，max_len为7，与原论文相同，收敛速度慢

2. 将 batch_size 分别设为 1 和 20，收敛速度慢，单次 epoch 时间减短

3. Block为3，收敛速度慢，效果相对4个Block要差

希望进行尝试：

1. Block是不是越多越好（梯度爆炸，梯度消失），Block数量是否可以作为参数进行优化而不是超参数 (AutoML)

2. 改为 Conv1d 卷积