In [43]:
import numpy as ny
import torch
import pandas as pd
import random
import torch.nn as nn
import torch.nn.functional as F


path = 'weibo_senti_100k.csv'
outputs = 'weibo_senti_100k_random.csv'

In [12]:

def shuffles(path, outputs):
    contents = []
    f1 = open(path, 'r', encoding='utf-8') 
    for line in f1.readlines():
        contents.append(line)
    
    random.shuffle(contents)
    
    f2 = open(outputs, 'w', encoding='utf-8')
    for content in contents:
        f2.write(content)
    
    f1.close()
    f2.close()


shuffles(path,outputs)

In [14]:
data = pd.read_csv(outputs, header=None, sep=',',encoding='utf-8')
data.shape

(119989, 2)

In [32]:
import jieba 
def cut(sentence):
    return [token for token in jieba.lcut(sentence)]


In [36]:
import torchtext
TEXT = torchtext.data.Field(sequential=True,lower=True,tokenize=cut)

LABEL = torchtext.data.LabelField(sequential=False, dtype=torch.int64)


In [38]:
train_dataset,dev_dataset,test_dataset = torchtext.data.TabularDataset.splits(
      path='/Users/jimmy/Desktop/微博/sentiment-weibo/', 
      format='csv',   
      skip_header=False,  
      train='train.csv',  
      validation='vali.csv',
      test='test.csv',    
      fields=[('label',LABEL),('content',TEXT)] 
  )


Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/pt/17k8cfss7jj45568t8jjk4g40000gn/T/jieba.cache
Loading model cost 0.852 seconds.
Prefix dict has been built successfully.


In [39]:
pretrained_name = 'sgns.sogounews.bigram-char' 
pretrained_path = '/Users/jimmy/Desktop/微博/sentiment-weibo/' 
vectors = torchtext.vocab.Vectors(name=pretrained_name, cache=pretrained_path)


  0%|          | 0/365113 [00:00<?, ?it/s]Skipping token b'365113' with 1-dimensional vector [b'300']; likely a header
100%|█████████▉| 364860/365113 [00:50<00:00, 11840.86it/s]

In [40]:
TEXT.build_vocab(train_dataset, dev_dataset,test_dataset,
                 vectors=vectors)
LABEL.build_vocab(train_dataset, dev_dataset,test_dataset)


In [41]:
len(TEXT.vocab)

197937

In [42]:
train_iter, dev_iter,test_iter = torchtext.data.BucketIterator.splits(
        (train_dataset, dev_dataset,test_dataset), 
        batch_sizes=(128, 128,128), 
        sort_key=lambda x: len(x.content) 
        )


In [44]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TextCNN(nn.Module):
    def __init__(self, 
                 class_num, # 最后输出的种类数 
                 filter_sizes, # 卷积核的长也就是滑动窗口的长 
                 filter_num,   # 卷积核的数量 
                 vocabulary_size, # 词表的大小
                 embedding_dimension, # 词向量的维度
                 vectors, # 词向量
                 dropout): 
        super(TextCNN, self).__init__() 

        chanel_num = 1  # 通道数，也就是一篇文章一个样本只相当于一个feature map

        self.embedding = nn.Embedding(vocabulary_size, embedding_dimension) # 嵌入层 
        self.embedding = self.embedding.from_pretrained(vectors) #嵌入层加载预训练词向量

        self.convs = nn.ModuleList(
            [nn.Conv2d(chanel_num, filter_num, (fsz, embedding_dimension)) for fsz in filter_sizes])  # 卷积层
        self.dropout = nn.Dropout(dropout) # dropout
        self.fc = nn.Linear(len(filter_sizes) * filter_num, class_num) #全连接层

    def forward(self, x):
        # x维度[句子长度,一个batch中所包含的样本数] 例:[3451,128]
        x = self.embedding(x) # #经过嵌入层之后x的维度，[句子长度,一个batch中所包含的样本数,词向量维度] 例：[3451,128,300]
        x = x.permute(1,0,2) # permute函数将样本数和句子长度换一下位置，[一个batch中所包含的样本数,句子长度,词向量维度] 例：[128,3451,300]
        x = x.unsqueeze(1) # # conv2d需要输入的是一个四维数据，所以新增一维feature map数 unsqueeze(1)表示在第一维处新增一维，[一个batch中所包含的样本数,一个样本中的feature map数，句子长度,词向量维度] 例：[128,1,3451,300]
        x = [conv(x) for conv in self.convs] # 与卷积核进行卷积，输出是[一个batch中所包含的样本数,卷积核数，句子长度-卷积核size+1,1]维数据,因为有[3,4,5]三张size类型的卷积核所以用列表表达式 例：[[128,16,3459,1],[128,16,3458,1],[128,16,3457,1]]
        x = [sub_x.squeeze(3) for sub_x in x]#squeeze(3)判断第三维是否是1，如果是则压缩，如不是则保持原样 例：[[128,16,3459],[128,16,3458],[128,16,3457]]
        x = [F.relu(sub_x) for sub_x in x] # ReLU激活函数激活，不改变x维度 
        x = [F.max_pool1d(sub_x,sub_x.size(2)) for sub_x in x] # 池化层，根据之前说的原理，max_pool1d要取出每一个滑动窗口生成的矩阵的最大值，因此在第二维上取最大值 例：[[128,16,1],[128,16,1],[128,16,1]]
        x = [sub_x.squeeze(2) for sub_x in x] # 判断第二维是否为1，若是则压缩 例：[[128,16],[128,16],[128,16]]
        x = torch.cat(x, 1) # 进行拼接，例：[128,48]
        x = self.dropout(x) # 去除掉一些神经元防止过拟合，注意dropout之后x的维度依旧是[128,48]，并不是说我dropout的概率是0.5，去除了一半的神经元维度就变成了[128,24]，而是把x中的一些神经元的数据根据概率全部变成了0，维度依旧是[128,48]
        logits = self.fc(x) # 全接连层 例：输入x是[128,48] 输出logits是[128,10]
        return logits


In [45]:
class_num = 2 # 类别数目
filter_size = [3,4,5]  # 卷积核种类数 
filter_num=16   # 卷积核数量
vocab_size = len(TEXT.vocab) # 词表大小
embedding_dim = TEXT.vocab.vectors.size()[-1] 
vectors = TEXT.vocab.vectors 
dropout=0.5 
learning_rate = 0.001  
epochs = 5   
save_dir = './/TextCNN/model' # 模型保存路径
steps_show = 10   # 每10步查看一次训练集loss和mini batch里的准确率
steps_eval = 100  # 每100步测试一下验证集的准确率
early_stopping = 1000  # 若发现当前验证集的准确率在1000步训练之后不再提高 一直小于best_acc,则提前停止训练

textcnn_model = TextCNN(class_num=class_num,
        filter_sizes=filter_size,
        filter_num=filter_num,
        vocabulary_size=vocab_size,
        embedding_dimension=embedding_dim,
        vectors=vectors,
        dropout=dropout)


In [48]:
def train(train_iter, dev_iter, model):

    if torch.cuda.is_available(): 
        model.cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 
    steps = 0
    best_acc = 0
    last_step = 0
    model.train()
    for epoch in range(1, epochs + 1): 
        for batch in train_iter:
            feature, target = batch.content, batch.label
            if torch.cuda.is_available(): 
                feature,target = feature.cuda(),target.cuda() 
            optimizer.zero_grad()
            logits = model(feature)
            loss = F.cross_entropy(logits, target) # 交叉熵损失函数
            loss.backward() 
            optimizer.step() 
            steps += 1 
            if steps % steps_show == 0: 
                corrects = (torch.max(logits, 1)[1].view(target.size()).data == target.data).sum() # logits是[128,10],torch.max(logits, 1)也就是选出第一维中概率最大的值，输出为[128,1],torch.max(logits, 1)[1]相当于把每一个样本的预测输出取出来，然后通过view(target.size())平铺成和target一样的size (128,),然后把与target中相同的求和，统计预测正确的数量
                train_acc = 100.0 * corrects / batch.batch_size # 计算每个mini batch中的准确率
                print('steps:{} - loss: {:.6f}  acc:{:.4f}'.format(
                  steps,
                  loss.item(),
                  train_acc))
                
            if steps % steps_eval == 0: # 每训练100步进行一次验证
                dev_acc = dev_eval(dev_iter,model)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    last_step = steps
                    print('Saving best model, acc: {:.4f}%\n'.format(best_acc))
                    save(model,save_dir, steps)
                else:
                    if steps - last_step >= early_stopping:
                        print('\n提前停止于 {} steps, acc: {:.4f}%'.format(last_step, best_acc))
                        raise KeyboardInterrupt


In [49]:
def dev_eval(dev_iter,model):
    model.eval()
    corrects, avg_loss = 0, 0
    for batch in dev_iter:
        feature, target = batch.content, batch.label
        if torch.cuda.is_available():
            feature, target = feature.cuda(), target.cuda()
        logits = model(feature)
        loss = F.cross_entropy(logits, target)
        avg_loss += loss.item()
        corrects += (torch.max(logits, 1)
                    [1].view(target.size()).data == target.data).sum()
    size = len(dev_iter.dataset)
    avg_loss /= size
    accuracy = 100.0 * corrects / size
    print('\nEvaluation - loss: {:.6f}  acc: {:.4f}%({}/{}) \n'.format(avg_loss,
                                                                      accuracy,
                                                                      corrects,
                                                                      size))
    return accuracy


In [50]:
def save(model, save_dir, steps):
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    save_path = 'bestmodel_steps{}.pt'.format(steps)
    save_bestmodel_path = os.path.join(save_dir, save_path)
    torch.save(model.state_dict(), save_bestmodel_path)


In [51]:
train(train_iter,dev_iter,textcnn_model)

steps:10 - loss: 0.486474  acc:78.9062
steps:20 - loss: 0.328642  acc:89.0625
steps:30 - loss: 0.201734  acc:94.5312
steps:40 - loss: 0.093193  acc:97.6562
steps:50 - loss: 0.069663  acc:98.4375
steps:60 - loss: 0.137595  acc:96.0938
steps:70 - loss: 0.105220  acc:96.8750
steps:80 - loss: 0.042571  acc:99.2188
steps:90 - loss: 0.059783  acc:96.8750
steps:100 - loss: 0.076821  acc:99.2188

Evaluation - loss: 0.000658  acc: 97.8998%(23494/23998) 

Saving best model, acc: 97.8998%



NameError: name 'os' is not defined