In [2]:
!ll

/bin/bash: ll: command not found


In [3]:
!ls

drive  sample_data


In [4]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os

def findFiles(path): return glob.glob(path)

print(findFiles('drive/MyDrive/Colab Notebooks/data/names/*.txt'))

['drive/MyDrive/Colab Notebooks/data/names/Spanish.txt', 'drive/MyDrive/Colab Notebooks/data/names/Italian.txt', 'drive/MyDrive/Colab Notebooks/data/names/Russian.txt', 'drive/MyDrive/Colab Notebooks/data/names/Korean.txt', 'drive/MyDrive/Colab Notebooks/data/names/Irish.txt', 'drive/MyDrive/Colab Notebooks/data/names/Polish.txt', 'drive/MyDrive/Colab Notebooks/data/names/Chinese.txt', 'drive/MyDrive/Colab Notebooks/data/names/Scottish.txt', 'drive/MyDrive/Colab Notebooks/data/names/Greek.txt', 'drive/MyDrive/Colab Notebooks/data/names/Japanese.txt', 'drive/MyDrive/Colab Notebooks/data/names/English.txt', 'drive/MyDrive/Colab Notebooks/data/names/German.txt', 'drive/MyDrive/Colab Notebooks/data/names/Dutch.txt', 'drive/MyDrive/Colab Notebooks/data/names/Arabic.txt', 'drive/MyDrive/Colab Notebooks/data/names/Portuguese.txt', 'drive/MyDrive/Colab Notebooks/data/names/Czech.txt', 'drive/MyDrive/Colab Notebooks/data/names/French.txt', 'drive/MyDrive/Colab Notebooks/data/names/Vietnamese.tx

In [5]:
import unicodedata
import string

all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# 유니코드 문자열을 ASCII로 변환, https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
  return ''.join(
      c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn'
      and c in all_letters
  )

print(unicodeToAscii('Ślusàrski'))

# 각 언어의 이름 목록인 category_lines 사전 생성
category_lines = {}
all_categories = []

# 파일을 읽고 줄 단위로 분리
def readLines(filename):
  lines = open(filename, encoding='utf-8').read().strip().split('\n')
  return [unicodeToAscii(line) for line in lines]

for filename in findFiles('drive/MyDrive/Colab Notebooks/data/names/*.txt'):
  category = os.path.splitext(os.path.basename(filename))[0]
  all_categories.append(category)
  lines = readLines(filename)
  category_lines[category] = lines

n_categories = len(all_categories)

Slusarski


In [6]:
print(category_lines['Italian'][:5])

['Abandonato', 'Abatangelo', 'Abatantuono', 'Abate', 'Abategiovanni']


In [7]:
'''
.. NOTE::
역자 주:  One-Hot 벡터는 언어를 다룰 때 자주 이용되며,
단어,글자 등을 벡터로 표현 할 때 단어,글자 사이의 상관 관계를 미리 알 수 없을 경우,
One-Hot으로 표현하여 서로 직교한다고 가정하고 학습을 시작합니다.
동일하게 상관 관계를 알 수 없는 다른 데이터의 경우에도 One-Hot 벡터를 활용 할 수 있습니다.
'''

import torch

# all_letters 로 문자의 주소 찾기, 예시 "a" = 0
def letterToIndex(letter):
    return all_letters.find(letter)

# 검증을 위해서 한개의 문자를 <1 x n_letters> Tensor로 변환
def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

# 한 줄(이름)을  <line_length x 1 x n_letters>,
# 또는 One-Hot 문자 벡터의 Array로 변경
def lineToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

print(letterToTensor('J'))

print(lineToTensor('Jones').size())

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])
torch.Size([5, 1, 57])


In [8]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

n_hidden = 128
rnn = RNN(n_letters, n_hidden, n_categories)

In [9]:
input = letterToTensor('A')
hidden =torch.zeros(1, n_hidden)

output, next_hidden = rnn(input, hidden)
print(output)

tensor([[-2.8030, -2.7792, -2.8164, -2.8655, -2.9508, -2.9698, -2.8966, -2.8035,
         -2.9222, -2.9443, -2.9769, -2.9137, -2.9794, -2.8319, -2.9016, -2.8203,
         -2.9439, -2.9470]], grad_fn=<LogSoftmaxBackward>)


In [10]:
input = lineToTensor('Albert')
hidden = torch.zeros(1, n_hidden)

output, next_hidden = rnn(input[0], hidden)
print(output)

tensor([[-2.8030, -2.7792, -2.8164, -2.8655, -2.9508, -2.9698, -2.8966, -2.8035,
         -2.9222, -2.9443, -2.9769, -2.9137, -2.9794, -2.8319, -2.9016, -2.8203,
         -2.9439, -2.9470]], grad_fn=<LogSoftmaxBackward>)


In [11]:
def categoryFromOutput(output):
  top_n, top_i = output.topk(1) # 텐서의 가장 큰 값 및 주소
  category_i = top_i[0].item()     # 텐서에서 정수 값으로 변경
  return all_categories[category_i], category_i

print(categoryFromOutput(output))

('Italian', 1)


In [16]:
import random

def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

def randomTrainingExample():
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
    line_tensor = lineToTensor(line)
    return category, line, category_tensor, line_tensor

for i in range(10):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    print('category =', category, '/ line =', line)
    print(category_tensor)
    print(line_tensor)

category = Russian / line = Mihalchuk
tensor([2])
tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0

In [13]:
criterion = nn.NLLLoss()

In [14]:
learning_rate = 0.005 # 이것을 너무 높게 설정하면 발산할 수 있고, 너무 낮으면 학습이 되지 않을 수 있습니다.

def train(category_tensor, line_tensor):
    hidden = rnn.initHidden()

    rnn.zero_grad()

    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)

    loss = criterion(output, category_tensor)
    loss.backward()

    # 매개변수의 경사도에 학습률을 곱해서 그 매개변수의 값에 더합니다.
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)

    return output, loss.item()

In [15]:
import time
import math

n_iters = 100000
print_every = 5000
plot_every = 1000



# 도식화를 위한 손실 추적
current_loss = 0
all_losses = []

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()

for iter in range(1, n_iters + 1):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    output, loss = train(category_tensor, line_tensor)
    current_loss += loss

    # iter 숫자, 손실, 이름, 추측 화면 출력
    if iter % print_every == 0:
        guess, guess_i = categoryFromOutput(output)
        correct = '✓' if guess == category else '✗ (%s)' % category
        print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), loss, line, guess, correct))

    # 현재 평균 손실을 전체 손실 리스트에 추가
    if iter % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0

5000 5% (0m 6s) 2.5505 Vinh / Vietnamese ✓
10000 10% (0m 12s) 2.0174 Araujo / Portuguese ✓
15000 15% (0m 18s) 1.6029 Donati / Italian ✓
20000 20% (0m 25s) 2.4258 Dirchs / Portuguese ✗ (German)
25000 25% (0m 31s) 1.1920 Davidson / Scottish ✓
30000 30% (0m 37s) 0.5581 Snijders / Dutch ✓
35000 35% (0m 43s) 0.5662 Xian / Chinese ✓
40000 40% (0m 49s) 1.8735 Albero / Portuguese ✗ (Italian)
45000 45% (0m 55s) 1.8255 Garcia / Spanish ✗ (Portuguese)
50000 50% (1m 1s) 0.3051 Paloumbas / Greek ✓
55000 55% (1m 8s) 0.0242 Beltyukov / Russian ✓
60000 60% (1m 14s) 0.2909 Johnstone / Scottish ✓
65000 65% (1m 20s) 2.3283 Kara / Japanese ✗ (Czech)
70000 70% (1m 26s) 0.3907 Zang / Chinese ✓
75000 75% (1m 33s) 3.8460 Can / Chinese ✗ (Dutch)
80000 80% (1m 39s) 0.3484 Lefebvre / French ✓
85000 85% (1m 45s) 0.6335 Zielinski / Polish ✓
90000 90% (1m 51s) 0.7281 Dioletis / Greek ✓
95000 95% (1m 58s) 0.5174 Etxeberria / Spanish ✓
100000 100% (2m 4s) 0.7296 Kouros / Greek ✓
