In [1]:
from io import open
import glob
import unicodedata
import string

#所有英文字母加上五个标点符号(包含一个空格)
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)
# 将unicode转为ASCII
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )
#build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []
# read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding = 'utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

In [2]:
import os
import pandas as pd
file_path = r'C:\Users\lenovo\Desktop\Josie\自学\Pytorch_名字分类\data\data\names'
#数据整合成dataframe
total_data = pd.DataFrame(columns = ('content', 'category'))
for root, dirs, files in os.walk(file_path):
    for idx, file in enumerate(files):
        category = file.split('/')[-1].split('.')[0]
        all_categories.append(category)
        lines = readLines(os.path.join(root, file))
        for line in lines:
            single_name = {'content':line, 'category':int(idx)}
            total_data = total_data.append(single_name, ignore_index = True)
print(total_data.shape)
print(total_data.head(10))
#total_data.shape():[20074, 2]

(20074, 2)
    content category
0    Khoury        0
1     Nahas        0
2     Daher        0
3    Gerges        0
4    Nazari        0
5   Maalouf        0
6    Gerges        0
7    Naifeh        0
8  Guirguis        0
9      Baba        0


In [3]:
#找到最长名字，做pad用(fix_length = 最长长度)
max_length = 0
for i in total_data['content']:
    if len(i) > max_length:
        max_length = len(i)
print(max_length)
print(total_data['category'].unique())

19
[0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17]


In [4]:
#解决loss值不变的方法一：打乱数据集(total_data)
from sklearn.utils import shuffle
total_data = shuffle(total_data)
total_data = total_data.sample(20000)

In [5]:
#根据'文字补齐和预处理学习(代码已调对)'来调整代码
from torchtext import data
from torchtext.vocab import Vectors
from tqdm import tqdm
from torch.nn import init
tokenize = lambda x: x.split()
#data.Field:定义样本的处理操作
TEXT = data.Field(sequential = True, tokenize = tokenize, lower = True, fix_length = 19)
LABEL = data.Field(sequential = False, use_vocab = False)

In [6]:
#将原始的corpus转换成data.Example实例(主要为data.Example.fromlist方法)
#都是train数据，就不用区分是train还是test了
def get_dataset(content_info, category_info, text_field, label_field):
    fields = [('content', text_field),
               ('category', label_field)]
    examples = []
    for text, label in zip(content_info, category_info):
        examples.append(data.Example.fromlist([text, label], fields))
    return examples, fields
train_examples, train_fields = get_dataset(total_data['content'], total_data['category'],
                                          TEXT, LABEL)
#使用torchtext.data.Dataset来构建数据集
train = data.Dataset(train_examples, train_fields)

In [7]:
from torchtext.vocab import GloVe, Vectors
from torchtext import data
vectors = Vectors(name = r'C:\Users\lenovo\.vector_cache\glove.6B\glove.6B.300d.txt')
TEXT.build_vocab(train, vectors = vectors)
weight_matrix = TEXT.vocab.vectors

In [8]:
from torchtext.data import Iterator, BucketIterator
#解决loss值不变的方法二：减小batch_size
train_iter = BucketIterator(train, batch_size = 64, device = -1,
                            sort_key = lambda x: len(x.content),
                            sort = False,sort_within_batch = False, repeat = False)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [None]:
#模型一
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import time 
class SimpleLSTMBaseline(nn.Module):
    def __init__(self, hidden_dim = 128, emb_dim = 300, num_linear = 1):
        super().__init__()
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers = 1)
        self.linear_layers = []
        # 中间fc层
        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
            self.linear_layers = nn.ModuleList(self.linear_layers)
        # 输出层
        self.predictor = nn.Linear(hidden_dim, 18)
        # 使用归一化加快运算速度
        self.bn = nn.BatchNorm1d(hidden_dim)
    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]  # 选择最后一个output
        for layer in self.linear_layers:
            feature = self.bn(feature)
            feature = layer(feature)
        preds = self.predictor(feature)
        return preds
def clip_gradient(model, clip_value):
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        p.grad.data.clamp_(-clip_value, clip_value)
def main():
    nh = 500
    total_epoch_loss = 0
    total_epoch_acc = 0
    model = SimpleLSTMBaseline(nh, emb_dim = 300)
    model.train()
#     optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr = 0.001)
    optimizer = optim.Adam(model.parameters(), lr = 0.01, weight_decay=1e-5)
    #pytorch中处理多分类用CrossEntropyLoss时，标签需从0开始
    loss_function = nn.CrossEntropyLoss()
    for epoch, batch in enumerate(train_iter):
        optimizer.zero_grad()
        start = time.time()
        predicted = model(batch.content)
        loss = loss_function(predicted, batch.category)
        num_corrects = (torch.max(predicted, 1)[1].view(batch.category.size()).data
                       == batch.category.data).float().sum()
        acc = 100.0 * num_corrects/len(batch)
        loss.backward()
        clip_gradient(model, 1e-1)
        optimizer.step()
        total_epoch_loss += loss.item()
        total_epoch_acc += acc.item()
    print('loss值为%f'%(total_epoch_loss/len(train_iter)))
    print('准确率为%f'%(total_epoch_acc/len(train_iter)))
if __name__ == '__main__':
    main()

KeyboardInterrupt: 

In [57]:
#模型二
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time
class LSTM(nn.Module):
    def __init__(self):
        super(LSTM, self).__init__()
        self.word_embeddings = nn.Embedding(len(TEXT.vocab), 300)
        self.lstm = nn.LSTM(input_size = 300, hidden_size = 128, num_layers = 1, batch_first = True)
        #input layer:[fix_length, batch_size, embedding_size]
        #hidden layer:[fix_length, batch_size, hidden_size]
        #nn.Linear(hidden_size, output_category)
        self.decoder = nn.Linear(128, 18)
    
    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out = self.lstm(embeds)[0]
#         print(lstm_out.shape)
        final = lstm_out[-1]
#         final = F.relu(self.decoder(final))
        y = self.decoder(final)
        return y
def clip_gradient(model, clip_value):
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        p.grad.data.clamp_(-clip_value, clip_value)
        
def main():
    model = LSTM()
    model.train()
    total_epoch_loss = 0
    total_epoch_acc = 0
#     optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr = 0.001)
    optimizer = optim.Adam(model.parameters(), lr = 0.01, weight_decay=1e-5)
    #pytorch中处理多分类用CrossEntropyLoss时，标签需从0开始
    loss_function = nn.CrossEntropyLoss()
    for epoch, batch in enumerate(train_iter):
        optimizer.zero_grad()
        start = time.time()
        predicted = model(batch.content)
        loss = loss_function(predicted, batch.category)
        num_corrects = (torch.max(predicted, 1)[1].view(batch.category.size()).data
                       == batch.category.data).float().sum()
        acc = 100.0 * num_corrects/len(batch)
        loss.backward()
        clip_gradient(model, 1e-1)
        optimizer.step()
        total_epoch_loss += loss.item()
        total_epoch_acc += acc.item()
    print('平均loss值为%f'%(total_epoch_loss/len(train_iter)))
    print('平均准确率为%f'%(total_epoch_acc/len(train_iter)))
if __name__ == '__main__':
    main()

loss值为1.899778
准确率为45.781806


In [11]:
print(len(TEXT.vocab))

17357


In [11]:
#模型三：格式标准，易读，运算速度快(准确率稳定在85.4%左右)
#调参过程中重点调的内容有hidden_size, learning_rate, batch_size
#添加：1）embedding_dropout，fc_dropout，relu层  没用
#添加；2)softmax层: crossentropy方法里封装了softmax，无需另外加softmax层，这样会压缩数据
#       造成误差
#添加: 3)weight_decay:权重衰减(L2正则化)来防止过拟合 
#epoch尽量大(几百个),可以充分训练
#模型变好的方法: 1)随机初始化词向量变为加载预训练好的词向量(在模型里面)(即embedding层的权重)，
                #反向时embedding层的权重不更新，加快运算速度
#还可以优化的地方:现在是取最后一个字母进入linear层，后可以加入pooling层，计算所有字母的
#embeding均值或者最大值，进入linear层，充分利用每个词
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
from torch.nn import functional as F
import numpy as np
from torch.optim.lr_scheduler import StepLR

class LSTMClassifier(nn.Module):
    def __init__(self):
        super(LSTMClassifier, self).__init__()
        self.batch_size = 64
        self.hidden_size = 128
        self.vocab_size = len(TEXT.vocab)
        self.embedding_length = 300
        self.embedding_dropout = 0.2
        self.fc_dropout = 0.1
        #bidirectional没用上，会报错
        self.output_size = 18
        self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_length)
#         #指定预训练的词向量(即embedding层的权重)（Glove)
        weight_matrix = TEXT.vocab.vectors
        self.word_embeddings.weight.data.copy_(weight_matrix)
        #反向时不计算embeddin层的梯度(不更新embedding层的权重)，提升模型训练时间，对应优化器有需要调整的地方
        self.word_embeddings.weight.requires_grad = False
#         self.word_embeddings.weight = nn.Parameter(word_embeddings, require_grad = True)
        self.lstm = nn.LSTM(self.embedding_length, self.hidden_size)
#         self.embed_dropout = nn.Dropout(self.embedding_dropout)
#         self.fc_dropout = nn.Dropout(self.fc_dropout)
#         self.relu = nn.ReLU()
        self.label = nn.Linear(self.hidden_size, self.output_size)
        
    def forward(self, input_sentence):
        input = self.word_embeddings(input_sentence)
        if input.shape[1] == self.batch_size:
            h_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size))
            c_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size))
        else:
            h_0 = Variable(torch.zeros(1, input.shape[1], self.hidden_size))
            c_0 = Variable(torch.zeros(1, input.shape[1], self.hidden_size))        
        output, (final_hidden_state, final_cell_state) = self.lstm(input,(h_0, c_0))
        print(output.size())
        print(final_hidden_state.size())
        print(final_cell_state.size())
        print("----")
        final_output = self.label(final_hidden_state[-1])
#         final_output = self.relu(final_output)
#         final_output = self.fc_dropout(final_output)
#         print(final_output)
        return final_output
#梯度裁剪来防止梯度爆炸
def clip_gradient(model, clip_value):
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        #nn.utils.clip_grad_norm(model.parameters(), 10)
        p.grad.data.clamp_(-clip_value, clip_value)
        
def main():
    model = LSTMClassifier()
    model.train()
    total_epoches_loss = 0
    total_epoches_acc = 0
#     optimizer = optim.Adam(model.parameters(), lr = 1e-3)
    #动态监控lr,如多次(200)没有发生loss下降，则降低学习率
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr = 1e-3)
    #传入优化器让学习器受其管理，当连续200次没有减少Loss时就减低lr(乘以0.9)
    scheduler = StepLR(optimizer, step_size = 500, gamma = 0.9)
    #pytorch中处理多分类用CrossEntropyLoss时，标签需从0开始
    loss_function = nn.CrossEntropyLoss()
    epoches = 5
    Loss_list = []
    Accuracy_list = []
    for i in range(epoches):
        each_batch_loss = 0
        each_batch_acc = 0
        for epoch, batch in enumerate(train_iter):
            optimizer.zero_grad()
            predicted = model(batch.content)
            loss = loss_function(predicted, batch.category)
            num_corrects = (torch.max(predicted, 1)[1].view(batch.category.size()).data
                           == batch.category.data).float().sum()
            acc = 100.0 * num_corrects/len(batch)
            loss.backward()
            #nn.utils.clip_grad_norm(model.parameters(), 10)
            clip_gradient(model, 1e-1)
            optimizer.step()
            scheduler.step()
            each_batch_loss += loss.item()
            each_batch_acc += acc.item()
            Loss_list.append(each_batch_loss)
            Accuracy_list.append(each_batch_acc)
        total_epoches_loss += each_batch_loss
        total_epoches_acc += each_batch_acc
        print('第%d个epoch的loss值为%f'%(i+1, (each_batch_loss/len(train_iter))))
        print('第%d个epoch的准确率为%f'%(i+1, (each_batch_acc/len(train_iter)/100.0)))
        
if __name__ == '__main__':
    main()        

torch.Size([19, 64, 128])
torch.Size([1, 64, 128])
torch.Size([1, 64, 128])
----
torch.Size([19, 64, 128])
torch.Size([1, 64, 128])
torch.Size([1, 64, 128])
----
torch.Size([19, 64, 128])
torch.Size([1, 64, 128])
torch.Size([1, 64, 128])
----
torch.Size([19, 64, 128])
torch.Size([1, 64, 128])
torch.Size([1, 64, 128])
----
torch.Size([19, 64, 128])
torch.Size([1, 64, 128])
torch.Size([1, 64, 128])
----
torch.Size([19, 64, 128])
torch.Size([1, 64, 128])
torch.Size([1, 64, 128])
----


KeyboardInterrupt: 

In [26]:
#绘制loss值和acc值
import matplotlib.pyplot as plt
x1 = range(0, 10)
x2 = range(0, 10)
y1 = Accuracy_list
y2 = Loss_list
plt.subplot(2, 1, 1)
plt.plot(x1, y1, 'o-')
plt.title('Train accuracy vs. epoches')
plt.ylabel('Train accuracy')
plt.plot(2, 1, 1)
plt.plot(x2, y2, '.-')
plt.title('Train loss vs. epoches')
plt.ylabel('Train loss')
plt.show()

NameError: name 'epoches' is not defined

In [None]:
#------""以下是原始代码-------

In [266]:
#数据集: 一共18种语言，加在一起有20074条数据
import os
file_path = r'C:\Users\lenovo\Desktop\Josie\自学\Pytorch_名字分类\data\data\names'
for root, dirs, files in os.walk(file_path):
    for file in files:
        category = file.split('/')[-1].split('.')[0]
        all_categories.append(category)
        lines = readLines(os.path.join(root, file))
        category_lines[category] = lines
n_categories = len(all_categories)
#将名字数据转为Tensor格式才能入模，pytorch在tensor上封装了遗赠variable
#一个字母为一个张量
import torch
import torch.nn as nn
from torch.autograd import Variable
Embedding_length = 57
embeds = nn.Embedding(n_letters, Embedding_length)
def letterToTensor():
    letter_Tensor = {}
    for letter in all_letters:
        letter_Tensor[letter] = embeds(Variable(
                              torch.LongTensor([all_letters.find(letter)])))
    return letter_Tensor
def lineToTensor(line):
    tensor = Variable(torch.randn(len(line), 1, Embedding_length))
    for li, letter in enumerate(line):
        tensor[li] = embeds(Variable(
                              torch.LongTensor([all_letters.find(letter)])))
    return tensor
letterTensor = letterToTensor()

In [267]:
#处理标签值
words = list(set(all_categories))
word2ind = {word: i for i, word in enumerate(words)}
words = list(set(all_categories))
word2ind = {word: i for i, word in enumerate(words)}
def Label_lineToTensor(line):
    tensor = torch.zeros(len(line), 1, n_categories)
    for li, letter in enumerate(line):
        tensor[li][0][word2ind['Chinese']] = 1
    tensor = Variable(tensor)
    return tensor

In [317]:
import random
#随机采样训练样本对
def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

def randomTrainingExample():
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    line_tensor = lineToTensor(line)
#     category_tensor = torch.squeeze(Label_lineToTensor(category))
    category_tensor = Label_lineToTensor(category)
    return category, line, category_tensor, line_tensor
# for i in range(5):
#     category, line, category_tensor, line_tensor = randomTrainingExample()
#     print(category, category_tensor)

In [318]:
import torch
from torch import nn
from torch.autograd import Variable

class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()
        self.rnn = nn.RNN(
            input_size=57,
            hidden_size=128,
            num_layers=2,
        )
        self.linear = nn.Linear(128, 18)
 
    def forward(self, x, h_n): 
        r_out, h_n = self.rnn(x, h_n)
        outs = []
        for step in range(r_out.size(1)):
            outs.append(self.linear(r_out[:, step, :]))  
        return torch.stack(outs, dim=1), h_n
rnn = RNN()
optimizer = torch.optim.Adam(rnn.parameters())
loss_func = nn.NLLLoss()

In [319]:
#Variable(torch.randn(2, 3, 20))
def train(category_tensor, line_tensor):
    hidden = Variable(torch.zeros(2, 1, 128))
    rnn.zero_grad()
    optimizer.zero_grad()
    output, hidden = rnn(line_tensor, hidden)
    
    #output去掉中间的batch_size维度，从3维变为2维
#     output = torch.squeeze(output)
    hidden = hidden.data
    #分别为input_size, hidden_size, output_size, 真实label-size
    print(line_tensor.size())
    print(hidden.size())
    print(output.size())
    print(category_tensor.size())
#     print(category_tensor.size())
#     print(output.size())
#     print(type(category_tensor))
#     print(type(output))
#     print(line_tensor)
# train(category_tensor, line_tensor)
#     计算损失值
#     loss = loss_func(output, category_tensor)
#     print('round' + str(i) + ' ' + str(loss))
#     optimizer.zero_grad()
#     loss.backward()
#     torch.nn.utils.clip_grad_norm(rnn.parameters(), 5)
#     optimize.step()
#     for p in rnn.parameters():
#         p.data.add_(-learning_rate, p.grad.data)
#     return output, loss.data[0]

In [320]:
from torch.autograd import Variable
n_iters = 5000
print_every = 200
#定义类别
def categoryFromOutput(output):
    top_n, top_i = output.data.topk(1)
    category_i = top_i[0][0]
    return all_categories[category_i], category_i
#这里开始迭代跑模型
train_correct = []
for iter in range(1, n_iters + 1):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    output, loss = train(category_tensor, line_tensor)
    current_loss += loss
    guess, guess_i = categoryFromOutput(output)
    #获取每批次的预判正确个数
    if guess == category:
        train_correct.append(1)
    else:
        train_correct.append(0)
    correct = '✓' if guess == category else '✗ (%s)' % category
    if iter % print_every == 0:
        print('%d %d%% %.4f %s / %s %s %s' % (iter, iter / n_iters * 100, loss, line, guess,
                                          category, correct))
print('准确率为%f'%(sum(train_correct)/(len(train_correct))))

torch.Size([5, 1, 57])
torch.Size([2, 1, 128])
torch.Size([5, 1, 18])
torch.Size([6, 1, 18])


TypeError: 'NoneType' object is not iterable