In [10]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os
from random import shuffle

def findFiles(path): return glob.glob(path)

print(findFiles('./*.txt'))

import unicodedata
import string

all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

category_lines = {}
all_categories = []
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles('./*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)
completeData=[]
print (all_categories)
for category in category_lines:
    for name in category_lines[category]:
        line=[]
        line.append(category)
        line.append(name)
        completeData.append(line)
shuffle(completeData)
total_names=len(completeData)
num_of_training=int((total_names*3)/4)
train_data=completeData[:num_of_training]
test_data=completeData[num_of_training:]

for i in range(1,10):
    print (train_data[i])

import torch

def letterToIndex(letter):
    return all_letters.find(letter)
def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor
def lineToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor



import torch.nn as nn
import torch
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)


#from data import *
#from rnn import *

def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i

import random

def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

def randomTrainingExample():
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
    line_tensor = lineToTensor(line)
    return category, line, category_tensor, line_tensor

for i in range(10):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    print('category =', category, '/ line =', line)

criterion = nn.NLLLoss()
n_hidden = 128
rnn = RNN(n_letters, n_hidden, n_categories)

learning_rate = 0.005 

def train(category_tensor, line_tensor):
    hidden = rnn.initHidden()

    rnn.zero_grad()

    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)

    loss = criterion(output, category_tensor)
    loss.backward()

    # Add parameters' gradients to their values, multiplied by learning rate
    for p in rnn.parameters():
        p.data.add_(-learning_rate, p.grad.data)

    return output, loss.item()



import time
import math
from random import shuffle
n_iters = 10
print_every = 2000
plot_every = 2000



# Keep track of losses for plotting
current_loss = 0
all_losses = []

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()
shuffle(train_data)
for epoch in range(1, n_iters + 1):
    iter=0
    for example in train_data:
    #category, line, category_tensor, line_tensor = randomTrainingExample()
        category_tensor=torch.tensor([all_categories.index(example[0])],dtype=torch.long)
        line_tensor=lineToTensor(example[1])
        output, loss = train(category_tensor, line_tensor)
        current_loss += loss
        iter+=1

    # Print iter number, loss, name and guess
        if iter % print_every == 0:
            #guess, guess_i = categoryFromOutput(output)
            #correct = 'right' if guess == category else 'wrong (%s)' % category
            print('epoch %d %d%% (%s) %.4f' % (epoch, ( iter / len(train_data) )* 100, timeSince(start), loss))

    # Add current loss avg to list of losses
        if iter % plot_every == 0:
            all_losses.append(current_loss / plot_every)
            current_loss = 0


torch.save(rnn,'classification.pt')

['./Vietnamese.txt', './Scottish.txt', './Dutch.txt', './Japanese.txt', './Portuguese.txt', './Polish.txt', './Arabic.txt', './Irish.txt', './Czech.txt', './Russian.txt', './German.txt', './Italian.txt', './Chinese.txt', './Spanish.txt', './Korean.txt', './Greek.txt', './English.txt', './French.txt']
['Vietnamese', 'Scottish', 'Dutch', 'Japanese', 'Portuguese', 'Polish', 'Arabic', 'Irish', 'Czech', 'Russian', 'German', 'Italian', 'Chinese', 'Spanish', 'Korean', 'Greek', 'English', 'French']
['German', 'Kaspar']
['Italian', 'Salvatici']
['Russian', 'Veprentsov']
['Russian', 'Jokin']
['Russian', 'Matskevich']
['Russian', 'Adashev']
['Dutch', 'Vennen']
['Russian', 'Patsiorkovsky']
['Russian', 'Balalaev']
category = Japanese / line = Tommii
category = Portuguese / line = Rodrigues
category = Korean / line = Choe
category = Irish / line = Meadhra
category = Greek / line = Kokoris
category = Vietnamese / line = Trieu
category = Russian / line = Zabrodin
category = Scottish / line = Christie


  "type " + obj.__name__ + ". It won't be checked "


In [11]:


from torch.autograd import Variable
import sys

rnn = torch.load('classification.pt')

def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i

# Just return an output given a line
def evaluate(line_tensor):
    hidden = rnn.initHidden()
    
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
    
    return output

def predict(line, n_predictions=3):
    #output = evaluate(Variable(lineToTensor(line)))
    output= evaluate(Variable(lineToTensor(line)))
    # Get top N categories
    topv, topi = output.data.topk(n_predictions, 1, True)
    predictions = []

    for i in range(n_predictions):
        value = topv[0][i]
        category_index = topi[0][i]
        print('(%.2f) %s' % (value, all_categories[category_index]))
        predictions.append([value, all_categories[category_index]])

    return predictions

if __name__ == '__main__':
    predict(sys.argv[1])
shuffle(test_data)
num_correct=0
for example in test_data:
    line_tensor=lineToTensor(example[1])
    category=example[0]
    output=evaluate(line_tensor)
    guess,guess_i=categoryFromOutput(output)
    if guess == category:
        num_correct+=1

accuracy=num_correct*100/len(test_data)
print (accuracy)

(-0.52) Russian
(-2.47) English
(-2.97) Arabic
74.75592747559274


# New Section