In [2]:
from io import open
import glob
import unicodedata
import string
import os
import pandas as pd
from sklearn.utils import shuffle
from torchtext import data
from tqdm import tqdm
from torch.nn import init
from torchtext.vocab import GloVe, Vectors
from torchtext.data import Iterator, BucketIterator

#所有英文字母加上五个标点符号(包含一个空格)
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)
# 将unicode转为ASCII
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )
#build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []
# read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding = 'utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

file_path = r'C:\Users\lenovo\Desktop\Josie\自学\Pytorch_名字分类\data\data\names'
#数据整合成dataframe
total_data = pd.DataFrame(columns = ('content', 'category'))
for root, dirs, files in os.walk(file_path):
    for idx, file in enumerate(files):
        category = file.split('/')[-1].split('.')[0]
        all_categories.append(category)
        lines = readLines(os.path.join(root, file))
        for line in lines:
            single_name = {'content':line, 'category':int(idx)}
            total_data = total_data.append(single_name, ignore_index = True)
#找到最长名字，做pad用(fix_length = 最长长度)
max_length = 0
for i in total_data['content']:
    if len(i) > max_length:
        max_length = len(i)
print('名字中最长长度为%d'%max_length)
#解决loss值不变的方法一：打乱数据集(total_data)
total_data = shuffle(total_data)
tokenize = lambda x: x.split()
#data.Field:定义样本的处理操作
TEXT = data.Field(sequential = True, tokenize = tokenize, lower = True, fix_length = 19)
LABEL = data.Field(sequential = False, use_vocab = False)
#将原始的corpus转换成data.Example实例(主要为data.Example.fromlist方法)
#都是train数据，就不用区分是train还是test了
def get_dataset(content_info, category_info, text_field, label_field):
    fields = [('content', text_field),
               ('category', label_field)]
    examples = []
    for text, label in zip(content_info, category_info):
        examples.append(data.Example.fromlist([text, label], fields))
    return examples, fields
train_examples, train_fields = get_dataset(total_data['content'], total_data['category'],
                                          TEXT, LABEL)
#使用torchtext.data.Dataset来构建数据集
train = data.Dataset(train_examples, train_fields)
vectors = Vectors(name = r'C:\Users\lenovo\.vector_cache\glove.6B\glove.6B.300d.txt')
TEXT.build_vocab(train, vectors = vectors)
weight_matrix = TEXT.vocab.vectors
#解决loss值不变的方法二：减小batch_size
train_iter = BucketIterator(train, batch_size = 64, device = -1,
                            sort_key = lambda x: len(x.content),
                            sort = False,sort_within_batch = False, repeat = False)
print(train_iter)

名字中最长长度为19


The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


<torchtext.data.iterator.BucketIterator object at 0x00000202D06715C0>


In [3]:
#模型一 textCNN
#CNN优化方法： 1）运用unsequeeze将数据从3维变为4维，进入conv2d（而不是conv1d),之后
                 #再用sequeeze变回想要的size，比直接用conv1d效果更好
               #2) 所有卷积层的in_channels大小设置一样，out_channels(及kernel个数)也
               #设置一样
import torch
import torch.nn as nn
import torch.nn.functional as F
import sys
class textCNN(nn.Module):
    def __init__(self):
        super(textCNN, self).__init__()
        self.embed_num = 20074
        self.embed_dim = 300
        self.class_num = 18
        #由于从conv1d变为了conv2d，输入通道数是第二维度（1）
        self.in_channels = [1, 1, 1]
        #这里的out_channels就是kernel_num,即卷积核的个数
        #输入输出通道数设置相同大小效果更好
        self.out_channels = [16, 16, 16]
        self.kernel_sizes = [3, 3, 3]
        self.stride = 1
        self.padding = 0
        self.dropout = 0.5
        self.word_embeddings = nn.Embedding(len(TEXT.vocab), self.embed_dim)
        #         #指定预训练的词向量(即embedding层的权重)（Glove)
        weight_matrix = TEXT.vocab.vectors
        self.word_embeddings.weight.data.copy_(weight_matrix)
        #反向时不计算embeddin层的梯度(不更新embedding层的权重)，提升模型训练时间，对应优化器有需要调整的地方
        self.word_embeddings.weight.requires_grad = False
        #进入CNN层
#       [64, 1, 19, 300]
        self.conv1 = nn.Conv2d(self.in_channels[0], self.out_channels[0],
                               (self.kernel_sizes[0], self.embed_dim), self.stride,
                               self.padding)
        self.conv2 = nn.Conv2d(self.in_channels[1], self.out_channels[1],
                              (self.kernel_sizes[1], self.embed_dim), self.stride,
                              self.padding)
#         self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv3 = nn.Conv2d(self.in_channels[2], self.out_channels[2],
                              (self.kernel_sizes[2], self.embed_dim), self.stride,
                              self.padding)
        self.dropout = nn.Dropout(self.dropout)
        #全连接Linear层做线性变换
        self.label = nn.Linear(self.out_channels[2] * 3, self.class_num)
    #CNN模块写到一个函数里面
    def conv_block(self, input, conv_layer):
        conv_out = conv_layer(input)# conv_out.size() = (batch_size, out_channels, embed_dim(updated), 1)
        #变为(batch_size, out_channels,embed_dim(updated))
        #squeeze只能删除维数为1的维度
        activation = F.relu(conv_out.squeeze(3))
        max_out = F.max_pool1d(activation, activation.size()[2]).squeeze(2)
        return max_out
#         max_out = F.max_pool1d(activation, activation.size()[2]).squeeze(2)# maxpool_out.size() = (batch_size, out_channels)
#         return max_out
    def forward(self, input_sentences):
        input = self.word_embeddings(input_sentences)
        input = input.transpose(0, 1).contiguous()
        input = input.unsqueeze(1)
        #input.size() = (batch_size, 1, num_seq, embedding_dim)

        #公式(num_seq - kernel.size + 2padding)/stride + 1
        #卷积层，里面包含卷积，激活，最大池化
        max_out1 = self.conv_block(input, self.conv1)
        max_out2 = self.conv_block(input, self.conv2)
        max_out3 = self.conv_block(input, self.conv3)
        #拼接(整合维度)
        all_out = torch.cat((max_out1, max_out2,max_out3),1)
#         #dropout层
        fc_in = self.dropout(all_out)
        logits = self.label(fc_in)
#         logits.size() = [batch_size, class_num]
        return logits
        
        

In [4]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
from torch.nn import functional as F
from torch.optim.lr_scheduler import StepLR
#梯度裁剪来防止梯度爆炸
def clip_gradient(model, clip_value):
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        #nn.utils.clip_grad_norm(model.parameters(), 10)
        p.grad.data.clamp_(-clip_value, clip_value)
        
def main():
    model = textCNN()
    model.train()
    total_epoches_loss = 0
    total_epoches_acc = 0
#     optimizer = optim.Adam(model.parameters(), lr = 1e-3)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr = 1e-3)
    #传入优化器让学习器受其管理，当连续500次没有减少Loss时就减低lr(乘以0.9)
    scheduler = StepLR(optimizer, step_size = 500, gamma = 0.9)
    #pytorch中处理多分类用CrossEntropyLoss时，标签需从0开始
    loss_function = nn.CrossEntropyLoss()
    epoches = 100
    Loss_list = []
    Accuracy_list = []
    for i in range(epoches):
        each_batch_loss = 0
        each_batch_acc = 0
        for epoch, batch in enumerate(train_iter):
            optimizer.zero_grad()
            predicted = model(batch.content)
            loss = loss_function(predicted, batch.category)
            print(predicted.size(), batch.category.size())
            num_corrects = (torch.max(predicted, 1)[1].view(batch.category.size()).data
                           == batch.category.data).float().sum()
            acc = 100.0 * num_corrects/len(batch)
            loss.backward()
            #nn.utils.clip_grad_norm(model.parameters(), 10)
            clip_gradient(model, 1e-1)
            optimizer.step()
            scheduler.step()
            each_batch_loss += loss.item()
            each_batch_acc += acc.item()
            Loss_list.append(each_batch_loss)
            Accuracy_list.append(each_batch_acc)
        total_epoches_loss += each_batch_loss
        total_epoches_acc += each_batch_acc
        print('第%d个epoch的loss值为%f'%(i+1, (each_batch_loss/len(train_iter))))
        print('第%d个epoch的准确率为%f'%(i+1, (each_batch_acc/len(train_iter)/100.0)))

if __name__ == '__main__':
    main()     

torch.Size([64, 18]) torch.Size([64])
torch.Size([64, 18]) torch.Size([64])
torch.Size([64, 18]) torch.Size([64])
torch.Size([64, 18]) torch.Size([64])
torch.Size([64, 18]) torch.Size([64])
torch.Size([64, 18]) torch.Size([64])
torch.Size([64, 18]) torch.Size([64])
torch.Size([64, 18]) torch.Size([64])


KeyboardInterrupt: 

In [None]:
#模型二
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
from torch.nn import functional as F
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.batch_size = 64
        self.output_size = 18
        self.in_channels = 1
        self.out_channels = 1
        self.kernel_heights = [3, 3, 3]
        self.stride = 0
        self.padding = 2
        self.vocab_size = 19
        self.embedding_length = 300
        
        self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_length)
#         self.word_embeddings.weight = nn.Parameter(weights, requires_grad = False)
        self.conv1 = nn.Conv2d(self.in_channels, self.out_channels,
                                                           (self.kernel_heights[0], 
                                                           self.embedding_length),
                                                           self.stride, self.padding)
        self.conv2 = nn.Conv2d(self.in_channels, self.out_channels, 
                                                          (self.kernel_heights[1],
                                                          self.embedding_length),
                                                          self.stride, self.padding)
        self.conv3 = nn.Conv2d(self.in_channels, self.out_channels, 
                                                         (self.kernel_heights[2],
                                                          self.embedding_length),
                                                          self.stride, self.padding)
        self.dropout = nn.Dropout(p=0.2)
        self.label = nn.Linear(len(self.kernel_heights)*self.out_channels, 
                               self.output_size)
    def conv_block(self, input, conv_layer):
        conv_out = conv_layer(input)
        activation = F.relu(conv_out.squeeze(3))
        max_out = F.max_pool1d(activation, activation.size()[2]).squeeze(2)
        return max_out
    def forward(self, input_sentences, batch_size = None):
        print(input_sentences.size())
        input = self.word_embeddings(input_sentences)
        
        input = input.unsqueeze(1)
        max_out1 = self.conv_block(input, self.conv1)
        max_out2 = self.conv_block(input, self.conv2)
        max_out3 = self.conv_block(input, self.conv3)
        
        all_out = torch.cat((max_out1, max_out2, max_out3), 1)
        fc_in = self.dropout(all_out)
        logits = self.label(fc_in)
        return logits
def main():
    model = CNN()
    model.train()
    total_epoches_loss = 0
    total_epoches_acc = 0
#     optimizer = optim.Adam(model.parameters(), lr = 1e-3)
    optimizer = optim.Adam(model.parameters(), lr = 1e-3, weight_decay=1e-5)
    #pytorch中处理多分类用CrossEntropyLoss时，标签需从0开始
    loss_function = nn.CrossEntropyLoss()
    epoches = 5
    Loss_list = []
    Accuracy_list = []
    for i in range(epoches):
        each_batch_loss = 0
        each_batch_acc = 0
        for epoch, batch in enumerate(train_iter):
            optimizer.zero_grad()
            predicted = model(batch.content)
            loss = loss_function(predicted, batch.category)
            num_corrects = (torch.max(predicted, 1)[1].view(batch.category.size()).data
                           == batch.category.data).float().sum()
            acc = 100.0 * num_corrects/len(batch)
            loss.backward()
            clip_gradient(model, 1e-1)
            optimizer.step()
            each_batch_loss += loss.item()
            each_batch_acc += acc.item()
            Loss_list.append(each_batch_loss)
            Accuracy_list.append(each_batch_acc)
        total_epoches_loss += each_batch_loss
        total_epoches_acc += each_batch_acc
        print('第%d个epoch的loss值为%f'%(i+1, (each_batch_loss/len(train_iter))))
        print('第%d个epoch的准确率为%f'%(i+1, (each_batch_acc/len(train_iter)/100.0)))
        
if __name__ == '__main__':
    main()         