<a href="https://colab.research.google.com/github/Jacobluke-/FYPI/blob/main/Pytorch_Tutorial/Classifying_names.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os

def findFiles(path): return glob.glob(path)

print(findFiles('data/names/*.txt'))

['data/names/Irish.txt', 'data/names/Spanish.txt', 'data/names/Scottish.txt', 'data/names/French.txt', 'data/names/Greek.txt', 'data/names/Dutch.txt', 'data/names/Czech.txt', 'data/names/Arabic.txt', 'data/names/Vietnamese.txt', 'data/names/Russian.txt', 'data/names/English.txt', 'data/names/Japanese.txt', 'data/names/Portuguese.txt', 'data/names/Korean.txt', 'data/names/Chinese.txt', 'data/names/Polish.txt', 'data/names/Italian.txt', 'data/names/German.txt']


In [47]:
import unicodedata
import string

all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Turn a Unicode string to plain ascii
def unicodeToAscii(s):
  return ''.join(
      c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn'
      and c in all_letters
  )

print(unicodeToAscii('Ślusàrski'))

# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

def readLines(filename):
  lines = open(filename, encoding = 'utf-8').read().strip().split('\n')
  return [unicodeToAscii(line) for line in lines]

for filename in findFiles('data/names/*.txt'):
  category = os.path.splitext(os.path.basename(filename))[0]
  all_categories.append(category)
  lines = readLines(filename)
  category_lines[category] = lines
  

n_categories = len(all_categories)
print(category_lines['Italian'][:5])


Slusarski
['Abandonato', 'Abatangelo', 'Abatantuono', 'Abate', 'Abategiovanni']


In [None]:
# 我们获得了一个变量 category_lines，而这是一个字典，所以则是类别（语言），
# 对应值是一个列表，其中包含多行数据（姓氏）
# 同时，还保存了 all_categories （语言列表） 以及 n_categories（语言数量）

## 将姓氏转换为张量
为了表示单个字母，我们使用大小为<1 x n_letters>的“ one-hot vector”。 一个“one hot”向量是当前字母的索引处为 1，其余部分为 0 的向量，例如 "b" = <0 1 0 0 0 ...>。

我们将每行的所有字母的“one hot”向量连接成 2D 矩阵<line_length x 1 x n_letters>来表示一个单词（姓氏）。

额外的 1 维是因为 PyTorch 假设所有内容都是批量的-我们这里批量大小为 1 。

In [26]:
import torch
a = torch.zeros(1,26)
a[0]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.])

In [27]:
import torch

def letterToIndex(letter):
  return all_letters.find(letter)

def letterToTensor(letter):
  tensor = torch.zeros(1,n_letters)
  tensor[0][letterToIndex(letter)] = 1
  return tensor

def lineToTensor(line):
  tensor = torch.zeros(len(line),1,n_letters)
  for li, letter in enumerate(line):
    tensor[li][0][letterToIndex(letter)]=1
  return tensor

In [28]:
print(n_letters)
print(all_letters.find("r"))
a = letterToTensor("r")
print(a)
print(lineToTensor('Jacob'))
print(lineToTensor('Jacob').size())

57
17
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])
tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 

In [35]:
# Construct a neural network

import torch.nn as nn

class RNN(nn.Module):
  def __init__(self, input_size, hidden_size, output_size):
    super(RNN, self).__init__()

    self.hidden_size = hidden_size

    self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
    self.i2o = nn.Linear(input_size + hidden_size, output_size)
    self.softmax = nn.LogSoftmax(dim = 1)

  def forward(self, input, hidden):
    combined = torch.cat((input, hidden),1)
    hidden = self.i2h(combined)
    output = self.i2o(combined)
    output = self.softmax(output)
    return output, hidden

  def initHidden(self):
    return torch.zeros(1, self.hidden_size)

n_hidden = 128
rnn = RNN(n_letters, n_hidden, n_categories)


先测试单步运行， 传递一个字母的张量与上一步的隐藏状态（初始化其为0）。 返回的结果是每种语言的概率与下一步的隐藏状态（保留至下一步使用）。

In [38]:
input = letterToTensor('Jacob')
hidden = torch.zeros(1, n_hidden)

output, next_hidden = rnn(input, hidden)
print(output, "\n", next_hidden)

tensor([[-2.8662, -2.9319, -2.8176, -2.9756, -2.8815, -2.9386, -3.0103, -2.9071,
         -2.9440, -2.7864, -2.8506, -2.7800, -2.9132, -2.8188, -2.8520, -2.9681,
         -2.9079, -2.9135]], grad_fn=<LogSoftmaxBackward>) 
 tensor([[ 0.0238, -0.0183, -0.0456,  0.0060, -0.0294, -0.0789,  0.0797, -0.0074,
         -0.0161, -0.0321,  0.1135,  0.0613, -0.0208,  0.0706, -0.0929, -0.0139,
         -0.0921, -0.0189, -0.0227,  0.0176,  0.0132, -0.0062, -0.0909,  0.0542,
         -0.0481, -0.0651, -0.0521, -0.0416,  0.0437, -0.0409, -0.0659,  0.0965,
         -0.1046, -0.0313,  0.0325,  0.0513,  0.0277, -0.0948,  0.0536,  0.0303,
         -0.1007, -0.0630,  0.0497, -0.0741,  0.0746, -0.1241, -0.0425,  0.0224,
          0.0420, -0.0527,  0.0939,  0.0310, -0.0609,  0.0144, -0.0719,  0.0532,
         -0.0349, -0.1100, -0.0270, -0.0312,  0.0965,  0.0909,  0.0038,  0.0447,
         -0.0351,  0.0415,  0.0908, -0.0580, -0.1293,  0.0221, -0.0104,  0.0224,
          0.0028, -0.0269,  0.0347,  0.0402,  0.

In [45]:
input = lineToTensor('Albert')
hidden = torch.zeros(1, n_hidden)

output, next_hidden = rnn(input[0], hidden)
print(output)

def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i

print(categoryFromOutput(output))
print(output.topk(1))
print(category_lines['Arabic'])


tensor([[-2.8980, -2.8657, -2.8503, -2.8620, -2.8573, -2.9138, -2.9576, -2.9687,
         -2.8908, -2.8126, -2.8690, -2.8361, -2.9298, -2.8677, -2.8776, -2.9520,
         -2.9548, -2.8804]], grad_fn=<LogSoftmaxBackward>)
('Portuguese', 9)
torch.return_types.topk(
values=tensor([[-2.8126]], grad_fn=<TopkBackward>),
indices=tensor([[9]]))
Type license() to see the full license text


In [48]:
import random

def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

def randomTrainingExample():
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
    line_tensor = lineToTensor(line)
    return category, line, category_tensor, line_tensor

for i in range(10):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    print('category =', category, '/ line =', line)

category = Dutch / line = Schoonraad
category = French / line = Tremble
category = Vietnamese / line = Luong
category = Dutch / line = Sanna
category = English / line = Perrins
category = Portuguese / line = Franco
category = Chinese / line = Ding
category = Italian / line = Pietri
category = Chinese / line = Cheung
category = Spanish / line = Noguerra


In [None]:
criterion = nn.NLLLoss()
learning_rate = 0.005 # If you set this too high, it might explode. If too low, it might not learn

def train(category_tensor, line_tensor):
    hidden = rnn.initHidden()

    rnn.zero_grad()

    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)

    loss = criterion(output, category_tensor)
    loss.backward()

    # Add parameters' gradients to their values, multiplied by learning rate
    for p in rnn.parameters():
        p.data.add_(-learning_rate, p.grad.data)

    return output, loss.item()