In [2]:
import torch
import torch.nn as nn
import numpy as np
import glob
import string
import csv
import os
from matplotlib import pyplot as plt
import sys

# various helper functions
from torch_name_classifier_helpers import readLines
from torch_name_classifier_helpers import randomTrainingExample
from torch_name_classifier_helpers import categoryFromOutput
from torch_name_classifier_helpers import textToTensor

# declare RNN
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_layer, hidden_layer):
        combined_layer = torch.cat((input_layer,hidden_layer), 1)
        hidden_layer = self.i2h(combined_layer)
        output_layer = self.i2o(combined_layer)
        output_layer = self.softmax(output_layer)
        return output_layer, hidden_layer
    def initHidden(self):
        return torch.zeros(1,self.hidden_size)

# develop training function
def train(the_rnn, category_tensor, line_tensor, criterion, learn_rate):
    # init hidden layer
    hidden_layer = the_rnn.initHidden()
    the_rnn.zero_grad()

    # make out predictions, one char at a time
    for i in range(line_tensor.size(0)):
        output_layer,hidden_layer = the_rnn(line_tensor[i], hidden_layer)

    loss = criterion(output_layer, category_tensor)
    loss.backward()

    # use steepest-descent to optimize
    for p in the_rnn.parameters():
        p.data.add_(-learn_rate, p.grad.data)

    return output_layer,loss.item()

def predict(the_rnn, line_tensor):
    hidden_layer = the_rnn.initHidden()

    for i in range(list(line_tensor.size())[0]):
        output_layer,hidden_layer = the_rnn(line_tensor[i], hidden_layer)

    return output_layer
        
def main():
    # declare regex for files containing names
    fnames = 'data/names/*.txt'

    # assemble sequence of valid ASCII characters
    # that can occur in a name
    all_letters = string.ascii_letters + " .,;'"
    n_letters = len(all_letters)

    # Build the category_lines dictionary, a list of names per language
    category_lines = {}
    all_categories = []

    nfiles = 0
    for filename in glob.glob(fnames):
        # basename of file is the lanquage
        category = os.path.splitext(os.path.basename(filename))[0]
        # add category (i.e. language) to list
        all_categories.append(category)
        # add names to dictionary, indexed by language
        lines = readLines(filename, all_letters)
        category_lines[category] = lines
        nfiles += 1
    if(nfiles == 0):
        print("No files found for regular expression ("+fnames+")")
        sys.exit(-1)
        
    # count number of languages (i.e. classes)
    n_categories = len(all_categories)

    # write categories to csv file
    with open('all_categories.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerows([all_categories])

    # 4. create instance of the RNN
    n_input_neurons = n_letters
    n_hidden_neurons = 256
    n_output_neurons = n_categories
    MyRNN = RNN(n_input_neurons, n_hidden_neurons, n_output_neurons)
    
    # 5. load checkpoint, if available
    
    # 6. set training parameters
    my_criterion = nn.NLLLoss()
    my_learn_rate = 0.001
    n_iters = 100000
    print_every = 50000
    avg_loss = 0.00
    hit_rate = 0.00
    dhit = 1.00 / float(print_every)
    all_avg_losses = []
    all_hit_rates = []
    
    for i in range(1, n_iters + 1):
# TO DO:
        # 7. train on random feature-label pair
        category, line, category_tensor, line_tensor =\
                  randomTrainingExample(all_categories, category_lines, all_letters)
        output, loss = train(MyRNN, category_tensor, line_tensor, my_criterion, my_learn_rate)
        avg_loss += loss
        guess, guess_idx = categoryFromOutput(output, all_categories)

        if(guess == category):
            hit_rate += dhit
            
        # periodically report progress
        if i % print_every == 0:
            avg_loss /= float(print_every)
            print("iteration # " + str(i) + " of " + str(n_iters))            
            print("   Avg. loss: {:6.2f}".format(avg_loss))
            print("    Hit Rate: {:6.2f}".format(hit_rate))
            if(guess == category):
                print("   " + line + " : " + guess + " == " + category)
            else:
                print("   " + line + " : " + guess + " != " + category)
            all_avg_losses.append(avg_loss)
            all_hit_rates.append(hit_rate)
            avg_loss = 0.00
            hit_rate = 0.00
            
        # 8. Save a checkpoint
            
    fig,ax1 = plt.subplots()
    ax1.plot(all_avg_losses)
    ax1.set_ylabel("Loss Function", color="r")
    ax2 = ax1.twinx()
    ax2.plot(all_hit_rates)
    ax2.set_ylabel("Success Rate", color="b")
    plt.xlabel("Training Iteration")
    plt.savefig("convergence.png", dpi=100)
    # plt.show()    
    plt.close()

    # test the skill of the model
    total = 0
    correct = 0
    confusion_matrix = np.zeros((n_categories,n_categories))
    for actual_category in all_categories:
        cat_idx = all_categories.index(actual_category)
        for line in category_lines[actual_category]:
            line_tensor = textToTensor(line, all_letters)
            output = predict(MyRNN, line_tensor)
            guess, guess_idx = categoryFromOutput(output, all_categories)
            
            # update confusion matrix
            for i in range(0,n_categories):
                confusion_matrix[cat_idx,guess_idx] += 1          
            total += 1
            correct += int(cat_idx == guess_idx)

    # report results
    print(' ')
    print('Classification Accuracy: {:.2f}%'.format(100 * correct / total))
    print(' ')
    print('Confusion Matrix')
    print('      |----------------------- Predicted ------------------------- ...')
    print('Actual|',end='')
    for i in range(0,n_categories):
          print('{:6s}|'.format(all_categories[i][0:6]), end='')
    print('')
    print('======|', end='')
    for i in range(0,n_categories):
          print('======|', end='')
    print('')          
    for i in range(0,n_categories):
        print('{:6s}|'.format(all_categories[i][0:6]), end='')
        for j in range(0,n_categories):
            cm = int(confusion_matrix[i,j])
            print('{:6d}|'.format(cm),end='')
        print('')

# launch the main program
main()
    

    


iteration # 50000 of 100000
   Avg. loss:   2.62
    Hit Rate:   0.25
   Mentis : Greek == Greek
iteration # 100000 of 100000
   Avg. loss:   2.04
    Hit Rate:   0.38
   Mai : Chinese != Vietnamese
 
Classification Accuracy: 37.93%
 
Confusion Matrix
      |----------------------- Predicted ------------------------- ...
Actual|Czech |German|Arabic|Japane|Chines|Vietna|Russia|French|Irish |Englis|Indian|Pakist|Spanis|Greek |Italia|Portug|Scotti|Dutch |Korean|Polish|
Czech |   760|   840|   560|   580|   140|    80|   240|   440|   280|    80|   260|    20|   680|   460|   560|   240|   840|   360|   100|  2860|
German|   160|  4780|   840|   240|   380|   220|    60|  1020|   420|   380|   460|     0|   660|   260|   480|   380|  1620|  1060|   180|   880|
Arabic|   720|  1600| 19840|  4400|   660|   500|     0|  1480|   440|     0|  2500|  1020|  1080|  1800|   640|   480|  1080|     0|  1340|   420|
Japane|   180|     0|  1160| 11280|   360|   460|   180|   140|   100|    60|   400| 

In [3]:
def main():
    # declare regex for files containing names
    fnames = 'data/names/*.txt'

    # assemble sequence of valid ASCII characters
    # that can occur in a name
    all_letters = string.ascii_letters + " .,;'"
    n_letters = len(all_letters)

    # Build the category_lines dictionary, a list of names per language
    category_lines = {}
    all_categories = []

    nfiles = 0
    for filename in glob.glob(fnames):
        # basename of file is the lanquage
        category = os.path.splitext(os.path.basename(filename))[0]
        # add category (i.e. language) to list
        all_categories.append(category)
        # add names to dictionary, indexed by language
        lines = readLines(filename, all_letters)
        category_lines[category] = lines
        nfiles += 1
    if(nfiles == 0):
        print("No files found for regular expression ("+fnames+")")
        sys.exit(-1)
        
    # count number of languages (i.e. classes)
    n_categories = len(all_categories)

    # write categories to csv file
    with open('all_categories.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerows([all_categories])

    # 4. create instance of the RNN
    n_input_neurons = n_letters
    n_hidden_neurons = 256
    n_output_neurons = n_categories
    MyRNN = RNN(n_input_neurons, n_hidden_neurons, n_output_neurons)
    
    # 5. load checkpoint, if available
    
    # 6. set training parameters
    my_criterion = nn.NLLLoss()
    my_learn_rate = 0.001
    n_iters = 200000
    print_every = 100000
    avg_loss = 0.00
    hit_rate = 0.00
    dhit = 1.00 / float(print_every)
    all_avg_losses = []
    all_hit_rates = []
    
    for i in range(1, n_iters + 1):
        category, line, category_tensor, line_tensor =\
                  randomTrainingExample(all_categories, category_lines, all_letters)
        output, loss = train(MyRNN, category_tensor, line_tensor, my_criterion, my_learn_rate)
        avg_loss += loss
        guess, guess_idx = categoryFromOutput(output, all_categories)

        if(guess == category):
            hit_rate += dhit
            
        # periodically report progress
        if i % print_every == 0:
            avg_loss /= float(print_every)
            print("iteration # " + str(i) + " of " + str(n_iters))            
            print("   Avg. loss: {:6.2f}".format(avg_loss))
            print("    Hit Rate: {:6.2f}".format(hit_rate))
            if(guess == category):
                print("   " + line + " : " + guess + " == " + category)
            else:
                print("   " + line + " : " + guess + " != " + category)
            all_avg_losses.append(avg_loss)
            all_hit_rates.append(hit_rate)
            avg_loss = 0.00
            hit_rate = 0.00
            
        # 8. Save a checkpoint
            
    fig,ax1 = plt.subplots()
    ax1.plot(all_avg_losses)
    ax1.set_ylabel("Loss Function", color="r")
    ax2 = ax1.twinx()
    ax2.plot(all_hit_rates)
    ax2.set_ylabel("Success Rate", color="b")
    plt.xlabel("Training Iteration")
    plt.savefig("convergence.png", dpi=100)
    # plt.show()    
    plt.close()

    # test the skill of the model
    total = 0
    correct = 0
    confusion_matrix = np.zeros((n_categories,n_categories))
    for actual_category in all_categories:
        cat_idx = all_categories.index(actual_category)
        for line in category_lines[actual_category]:
            line_tensor = textToTensor(line, all_letters)
            output = predict(MyRNN, line_tensor)
            guess, guess_idx = categoryFromOutput(output, all_categories)
            
            # update confusion matrix
            for i in range(0,n_categories):
                confusion_matrix[cat_idx,guess_idx] += 1          
            total += 1
            correct += int(cat_idx == guess_idx)

    # report results
    print(' ')
    print('Classification Accuracy: {:.2f}%'.format(100 * correct / total))
    print(' ')
    print('Confusion Matrix')
    print('      |----------------------- Predicted ------------------------- ...')
    print('Actual|',end='')
    for i in range(0,n_categories):
          print('{:6s}|'.format(all_categories[i][0:6]), end='')
    print('')
    print('======|', end='')
    for i in range(0,n_categories):
          print('======|', end='')
    print('')          
    for i in range(0,n_categories):
        print('{:6s}|'.format(all_categories[i][0:6]), end='')
        for j in range(0,n_categories):
            cm = int(confusion_matrix[i,j])
            print('{:6d}|'.format(cm),end='')
        print('')

# launch the main program
main()
    

    


iteration # 100000 of 200000
   Avg. loss:   2.34
    Hit Rate:   0.32
   Araullo : Italian != Portuguese
iteration # 200000 of 200000
   Avg. loss:   1.65
    Hit Rate:   0.50
   Kouches : Greek == Greek
 
Classification Accuracy: 57.04%
 
Confusion Matrix
      |----------------------- Predicted ------------------------- ...
Actual|Czech |German|Arabic|Japane|Chines|Vietna|Russia|French|Irish |Englis|Indian|Pakist|Spanis|Greek |Italia|Portug|Scotti|Dutch |Korean|Polish|
Czech |  3380|   980|   520|   620|    40|   140|   860|   100|    60|   420|   440|   220|   280|   500|    80|   180|   340|   300|    60|   860|
German|   820|  6100|   720|   380|   300|   180|   700|   380|   120|  1040|   380|   180|   260|   400|   180|   220|   860|   840|   220|   200|
Arabic|   660|   760| 27440|  3040|  1200|   800|   760|     0|     0|   420|   560|  1260|     0|  1320|   300|   440|   420|   360|   260|     0|
Japane|   260|    60|   780| 15380|   220|   200|   420|    60|     0|    80|  

In [11]:
def main():
    # declare regex for files containing names
    fnames = 'data/names/*.txt'

    # assemble sequence of valid ASCII characters
    # that can occur in a name
    all_letters = string.ascii_letters + " .,;'"
    n_letters = len(all_letters)

    # Build the category_lines dictionary, a list of names per language
    category_lines = {}
    all_categories = []

    nfiles = 0
    for filename in glob.glob(fnames):
        # basename of file is the lanquage
        category = os.path.splitext(os.path.basename(filename))[0]
        # add category (i.e. language) to list
        all_categories.append(category)
        # add names to dictionary, indexed by language
        lines = readLines(filename, all_letters)
        category_lines[category] = lines
        nfiles += 1
    if(nfiles == 0):
        print("No files found for regular expression ("+fnames+")")
        sys.exit(-1)
        
    # count number of languages (i.e. classes)
    n_categories = len(all_categories)

    # write categories to csv file
    with open('all_categories.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerows([all_categories])

    # 4. create instance of the RNN
    n_input_neurons = n_letters
    n_hidden_neurons = 256
    n_output_neurons = n_categories
    MyRNN = RNN(n_input_neurons, n_hidden_neurons, n_output_neurons)
    
    # 5. load checkpoint, if available
    
    # 6. set training parameters
    my_criterion = nn.NLLLoss()
    my_learn_rate = 0.001
    n_iters = 500000
    print_every = 250000
    avg_loss = 0.00
    hit_rate = 0.00
    dhit = 1.00 / float(print_every)
    all_avg_losses = []
    all_hit_rates = []
    
    for i in range(1, n_iters + 1):
        category, line, category_tensor, line_tensor =\
                  randomTrainingExample(all_categories, category_lines, all_letters)
        output, loss = train(MyRNN, category_tensor, line_tensor, my_criterion, my_learn_rate)
        avg_loss += loss
        guess, guess_idx = categoryFromOutput(output, all_categories)

        if(guess == category):
            hit_rate += dhit
            
        # periodically report progress
        if i % print_every == 0:
            avg_loss /= float(print_every)
            print("iteration # " + str(i) + " of " + str(n_iters))            
            print("   Avg. loss: {:6.2f}".format(avg_loss))
            print("    Hit Rate: {:6.2f}".format(hit_rate))
            if(guess == category):
                print("   " + line + " : " + guess + " == " + category)
            else:
                print("   " + line + " : " + guess + " != " + category)
            all_avg_losses.append(avg_loss)
            all_hit_rates.append(hit_rate)
            avg_loss = 0.00
            hit_rate = 0.00
            
        # 8. Save a checkpoint
        torch.save(MyRNN.state_dict(),'mnist_names_model.pkl')
            
    fig,ax1 = plt.subplots()
    ax1.plot(all_avg_losses)
    ax1.set_ylabel("Loss Function", color="r")
    ax2 = ax1.twinx()
    ax2.plot(all_hit_rates)
    ax2.set_ylabel("Success Rate", color="b")
    plt.xlabel("Training Iteration")
    plt.savefig("convergence.png", dpi=100)
    # plt.show()    
    plt.close()

    # test the skill of the model
    total = 0
    correct = 0
    confusion_matrix = np.zeros((n_categories,n_categories))
    for actual_category in all_categories:
        cat_idx = all_categories.index(actual_category)
        for line in category_lines[actual_category]:
            line_tensor = textToTensor(line, all_letters)
            output = predict(MyRNN, line_tensor)
            guess, guess_idx = categoryFromOutput(output, all_categories)
            
            # update confusion matrix
            for i in range(0,n_categories):
                confusion_matrix[cat_idx,guess_idx] += 1          
            total += 1
            correct += int(cat_idx == guess_idx)

    # report results
    print(' ')
    print('Classification Accuracy: {:.2f}%'.format(100 * correct / total))
    print(' ')
    print('Confusion Matrix')
    print('      |----------------------- Predicted ------------------------- ...')
    print('Actual|',end='')
    for i in range(0,n_categories):
          print('{:6s}|'.format(all_categories[i][0:6]), end='')
    print('')
    print('======|', end='')
    for i in range(0,n_categories):
          print('======|', end='')
    print('')          
    for i in range(0,n_categories):
        print('{:6s}|'.format(all_categories[i][0:6]), end='')
        for j in range(0,n_categories):
            cm = int(confusion_matrix[i,j])
            print('{:6d}|'.format(cm),end='')
        print('')

# launch the main program
main()
    
    

iteration # 250000 of 500000
   Avg. loss:   1.88
    Hit Rate:   0.44
   Cowie : Greek != English
iteration # 500000 of 500000
   Avg. loss:   1.24
    Hit Rate:   0.61
   Lindsay : English != Scottish
 
Classification Accuracy: 62.56%
 
Confusion Matrix
      |----------------------- Predicted ------------------------- ...
Actual|Czech |German|Arabic|Japane|Chines|Vietna|Russia|French|Irish |Englis|Indian|Pakist|Spanis|Greek |Italia|Portug|Scotti|Dutch |Korean|Polish|
Czech |  3460|   740|   460|   300|    60|   200|   380|   220|   500|   900|   360|   360|   240|   200|   260|   260|   240|   540|   100|   600|
German|   420|  6520|   620|   100|   180|   220|   260|   720|   540|  1340|   480|   200|   100|   100|   220|   320|   800|   860|   300|   180|
Arabic|     0|     0| 32020|   680|  1200|   240|     0|     0|   400|   480|  1340|   380|     0|  1040|   740|   440|   420|   360|   260|     0|
Japane|   100|    20|  1260| 15200|   180|   220|   120|    60|   140|    80|   5

In [6]:
def main():
    # declare regex for files containing names
    fnames = 'data/names/*.txt'

    # assemble sequence of valid ASCII characters
    # that can occur in a name
    all_letters = string.ascii_letters + " .,;'"
    n_letters = len(all_letters)

    # Build the category_lines dictionary, a list of names per language
    category_lines = {}
    all_categories = []

    nfiles = 0
    for filename in glob.glob(fnames):
        # basename of file is the lanquage
        category = os.path.splitext(os.path.basename(filename))[0]
        # add category (i.e. language) to list
        all_categories.append(category)
        # add names to dictionary, indexed by language
        lines = readLines(filename, all_letters)
        category_lines[category] = lines
        nfiles += 1
    if(nfiles == 0):
        print("No files found for regular expression ("+fnames+")")
        sys.exit(-1)
        
    # count number of languages (i.e. classes)
    n_categories = len(all_categories)

    # write categories to csv file
    with open('all_categories.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerows([all_categories])

    # 4. create instance of the RNN
    n_input_neurons = n_letters
    n_hidden_neurons = 256
    n_output_neurons = n_categories
    MyRNN = RNN(n_input_neurons, n_hidden_neurons, n_output_neurons)
    
    # 5. load checkpoint, if available
    
    # 6. set training parameters
    my_criterion = nn.NLLLoss()
    my_learn_rate = 0.005
    n_iters = 100000
    print_every = 50000
    avg_loss = 0.00
    hit_rate = 0.00
    dhit = 1.00 / float(print_every)
    all_avg_losses = []
    all_hit_rates = []
    
    for i in range(1, n_iters + 1):
# TO DO:
        # 7. train on random feature-label pair
        category, line, category_tensor, line_tensor =\
                  randomTrainingExample(all_categories, category_lines, all_letters)
        output, loss = train(MyRNN, category_tensor, line_tensor, my_criterion, my_learn_rate)
        avg_loss += loss
        guess, guess_idx = categoryFromOutput(output, all_categories)

        if(guess == category):
            hit_rate += dhit
            
        # periodically report progress
        if i % print_every == 0:
            avg_loss /= float(print_every)
            print("iteration # " + str(i) + " of " + str(n_iters))            
            print("   Avg. loss: {:6.2f}".format(avg_loss))
            print("    Hit Rate: {:6.2f}".format(hit_rate))
            if(guess == category):
                print("   " + line + " : " + guess + " == " + category)
            else:
                print("   " + line + " : " + guess + " != " + category)
            all_avg_losses.append(avg_loss)
            all_hit_rates.append(hit_rate)
            avg_loss = 0.00
            hit_rate = 0.00
            
        # 8. Save a checkpoint
            
    fig,ax1 = plt.subplots()
    ax1.plot(all_avg_losses)
    ax1.set_ylabel("Loss Function", color="r")
    ax2 = ax1.twinx()
    ax2.plot(all_hit_rates)
    ax2.set_ylabel("Success Rate", color="b")
    plt.xlabel("Training Iteration")
    plt.savefig("convergence.png", dpi=100)
    # plt.show()    
    plt.close()

    # test the skill of the model
    total = 0
    correct = 0
    confusion_matrix = np.zeros((n_categories,n_categories))
    for actual_category in all_categories:
        cat_idx = all_categories.index(actual_category)
        for line in category_lines[actual_category]:
            line_tensor = textToTensor(line, all_letters)
            output = predict(MyRNN, line_tensor)
            guess, guess_idx = categoryFromOutput(output, all_categories)
            
            # update confusion matrix
            for i in range(0,n_categories):
                confusion_matrix[cat_idx,guess_idx] += 1          
            total += 1
            correct += int(cat_idx == guess_idx)

    # report results
    print(' ')
    print('Classification Accuracy: {:.2f}%'.format(100 * correct / total))
    print(' ')
    print('Confusion Matrix')
    print('      |----------------------- Predicted ------------------------- ...')
    print('Actual|',end='')
    for i in range(0,n_categories):
          print('{:6s}|'.format(all_categories[i][0:6]), end='')
    print('')
    print('======|', end='')
    for i in range(0,n_categories):
          print('======|', end='')
    print('')          
    for i in range(0,n_categories):
        print('{:6s}|'.format(all_categories[i][0:6]), end='')
        for j in range(0,n_categories):
            cm = int(confusion_matrix[i,j])
            print('{:6d}|'.format(cm),end='')
        print('')

# launch the main program
main()

iteration # 50000 of 100000
   Avg. loss:   2.01
    Hit Rate:   0.40
   Macshuibhne : Dutch != Irish
iteration # 100000 of 100000
   Avg. loss:   1.48
    Hit Rate:   0.54
   Schirmer : German == German
 
Classification Accuracy: 43.28%
 
Confusion Matrix
      |----------------------- Predicted ------------------------- ...
Actual|Czech |German|Arabic|Japane|Chines|Vietna|Russia|French|Irish |Englis|Indian|Pakist|Spanis|Greek |Italia|Portug|Scotti|Dutch |Korean|Polish|
Czech |  2440|  1120|   340|   200|   140|   160|   160|   400|    20|   680|   360|    20|   480|   100|   400|   180|   400|   380|   300|  2100|
German|   480|  6060|   400|    40|   180|   240|    20|  1340|   100|   580|   360|     0|   620|   100|   260|   280|  1660|   640|   640|   480|
Arabic|   720|   280| 20600|   680|   320|   500|     0|   380|     0|   420|  7200|  1500|  1200|  1500|   300|  1160|   580|   760|  1040|   860|
Japane|   580|   200|   800|  7240|   220|   200|    80|   340|    40|   220|  1

In [7]:
def main():
    # declare regex for files containing names
    fnames = 'data/names/*.txt'

    # assemble sequence of valid ASCII characters
    # that can occur in a name
    all_letters = string.ascii_letters + " .,;'"
    n_letters = len(all_letters)

    # Build the category_lines dictionary, a list of names per language
    category_lines = {}
    all_categories = []

    nfiles = 0
    for filename in glob.glob(fnames):
        # basename of file is the lanquage
        category = os.path.splitext(os.path.basename(filename))[0]
        # add category (i.e. language) to list
        all_categories.append(category)
        # add names to dictionary, indexed by language
        lines = readLines(filename, all_letters)
        category_lines[category] = lines
        nfiles += 1
    if(nfiles == 0):
        print("No files found for regular expression ("+fnames+")")
        sys.exit(-1)
        
    # count number of languages (i.e. classes)
    n_categories = len(all_categories)

    # write categories to csv file
    with open('all_categories.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerows([all_categories])

    # 4. create instance of the RNN
    n_input_neurons = n_letters
    n_hidden_neurons = 256
    n_output_neurons = n_categories
    MyRNN = RNN(n_input_neurons, n_hidden_neurons, n_output_neurons)
    
    # 5. load checkpoint, if available
    
    # 6. set training parameters
    my_criterion = nn.NLLLoss()
    my_learn_rate = 0.005
    n_iters = 200000
    print_every = 100000
    avg_loss = 0.00
    hit_rate = 0.00
    dhit = 1.00 / float(print_every)
    all_avg_losses = []
    all_hit_rates = []
    
    for i in range(1, n_iters + 1):
# TO DO:
        # 7. train on random feature-label pair
        category, line, category_tensor, line_tensor =\
                  randomTrainingExample(all_categories, category_lines, all_letters)
        output, loss = train(MyRNN, category_tensor, line_tensor, my_criterion, my_learn_rate)
        avg_loss += loss
        guess, guess_idx = categoryFromOutput(output, all_categories)

        if(guess == category):
            hit_rate += dhit
            
        # periodically report progress
        if i % print_every == 0:
            avg_loss /= float(print_every)
            print("iteration # " + str(i) + " of " + str(n_iters))            
            print("   Avg. loss: {:6.2f}".format(avg_loss))
            print("    Hit Rate: {:6.2f}".format(hit_rate))
            if(guess == category):
                print("   " + line + " : " + guess + " == " + category)
            else:
                print("   " + line + " : " + guess + " != " + category)
            all_avg_losses.append(avg_loss)
            all_hit_rates.append(hit_rate)
            avg_loss = 0.00
            hit_rate = 0.00
            
        # 8. Save a checkpoint
            
    fig,ax1 = plt.subplots()
    ax1.plot(all_avg_losses)
    ax1.set_ylabel("Loss Function", color="r")
    ax2 = ax1.twinx()
    ax2.plot(all_hit_rates)
    ax2.set_ylabel("Success Rate", color="b")
    plt.xlabel("Training Iteration")
    plt.savefig("convergence.png", dpi=100)
    # plt.show()    
    plt.close()

    # test the skill of the model
    total = 0
    correct = 0
    confusion_matrix = np.zeros((n_categories,n_categories))
    for actual_category in all_categories:
        cat_idx = all_categories.index(actual_category)
        for line in category_lines[actual_category]:
            line_tensor = textToTensor(line, all_letters)
            output = predict(MyRNN, line_tensor)
            guess, guess_idx = categoryFromOutput(output, all_categories)
            
            # update confusion matrix
            for i in range(0,n_categories):
                confusion_matrix[cat_idx,guess_idx] += 1          
            total += 1
            correct += int(cat_idx == guess_idx)

    # report results
    print(' ')
    print('Classification Accuracy: {:.2f}%'.format(100 * correct / total))
    print(' ')
    print('Confusion Matrix')
    print('      |----------------------- Predicted ------------------------- ...')
    print('Actual|',end='')
    for i in range(0,n_categories):
          print('{:6s}|'.format(all_categories[i][0:6]), end='')
    print('')
    print('======|', end='')
    for i in range(0,n_categories):
          print('======|', end='')
    print('')          
    for i in range(0,n_categories):
        print('{:6s}|'.format(all_categories[i][0:6]), end='')
        for j in range(0,n_categories):
            cm = int(confusion_matrix[i,j])
            print('{:6d}|'.format(cm),end='')
        print('')

# launch the main program
main()
    

    


iteration # 100000 of 200000
   Avg. loss:   1.75
    Hit Rate:   0.47
   Sai : Chinese != Indian
iteration # 200000 of 200000
   Avg. loss:   1.33
    Hit Rate:   0.58
   Hunter : Scottish == Scottish
 
Classification Accuracy: 56.81%
 
Confusion Matrix
      |----------------------- Predicted ------------------------- ...
Actual|Czech |German|Arabic|Japane|Chines|Vietna|Russia|French|Irish |Englis|Indian|Pakist|Spanis|Greek |Italia|Portug|Scotti|Dutch |Korean|Polish|
Czech |  3580|  1400|   300|   320|    60|   140|   300|   240|   260|   720|   200|   140|   360|   140|    80|   160|   420|   280|    40|  1240|
German|   560|  7940|   540|   100|   160|   180|   120|   520|   560|  1040|   160|   140|   160|   220|    80|    80|  1040|   500|   120|   260|
Arabic|   260|  1900| 25020|   680|  1060|     0|     0|     0|  2380|     0|  2220|  2100|   320|  1040|  1080|   740|   700|     0|   500|     0|
Japane|   120|   140|   680| 14260|   160|   160|    80|   240|   100|   180|   26

In [5]:
def main():
    # declare regex for files containing names
    fnames = 'data/names/*.txt'

    # assemble sequence of valid ASCII characters
    # that can occur in a name
    all_letters = string.ascii_letters + " .,;'"
    n_letters = len(all_letters)

    # Build the category_lines dictionary, a list of names per language
    category_lines = {}
    all_categories = []

    nfiles = 0
    for filename in glob.glob(fnames):
        # basename of file is the lanquage
        category = os.path.splitext(os.path.basename(filename))[0]
        # add category (i.e. language) to list
        all_categories.append(category)
        # add names to dictionary, indexed by language
        lines = readLines(filename, all_letters)
        category_lines[category] = lines
        nfiles += 1
    if(nfiles == 0):
        print("No files found for regular expression ("+fnames+")")
        sys.exit(-1)
        
    # count number of languages (i.e. classes)
    n_categories = len(all_categories)

    # write categories to csv file
    with open('all_categories.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerows([all_categories])

    # 4. create instance of the RNN
    n_input_neurons = n_letters
    n_hidden_neurons = 256
    n_output_neurons = n_categories
    MyRNN = RNN(n_input_neurons, n_hidden_neurons, n_output_neurons)
    
    # 5. load checkpoint, if available
    
    # 6. set training parameters
    my_criterion = nn.NLLLoss()
    my_learn_rate = 0.005
    n_iters = 500000
    print_every = 250000
    avg_loss = 0.00
    hit_rate = 0.00
    dhit = 1.00 / float(print_every)
    all_avg_losses = []
    all_hit_rates = []
    
    for i in range(1, n_iters + 1):
        category, line, category_tensor, line_tensor =\
                  randomTrainingExample(all_categories, category_lines, all_letters)
        output, loss = train(MyRNN, category_tensor, line_tensor, my_criterion, my_learn_rate)
        avg_loss += loss
        guess, guess_idx = categoryFromOutput(output, all_categories)

        if(guess == category):
            hit_rate += dhit
            
        # periodically report progress
        if i % print_every == 0:
            avg_loss /= float(print_every)
            print("iteration # " + str(i) + " of " + str(n_iters))            
            print("   Avg. loss: {:6.2f}".format(avg_loss))
            print("    Hit Rate: {:6.2f}".format(hit_rate))
            if(guess == category):
                print("   " + line + " : " + guess + " == " + category)
            else:
                print("   " + line + " : " + guess + " != " + category)
            all_avg_losses.append(avg_loss)
            all_hit_rates.append(hit_rate)
            avg_loss = 0.00
            hit_rate = 0.00
            
        # 8. Save a checkpoint
            
    fig,ax1 = plt.subplots()
    ax1.plot(all_avg_losses)
    ax1.set_ylabel("Loss Function", color="r")
    ax2 = ax1.twinx()
    ax2.plot(all_hit_rates)
    ax2.set_ylabel("Success Rate", color="b")
    plt.xlabel("Training Iteration")
    plt.savefig("convergence.png", dpi=100)
    # plt.show()    
    plt.close()

    # test the skill of the model
    total = 0
    correct = 0
    confusion_matrix = np.zeros((n_categories,n_categories))
    for actual_category in all_categories:
        cat_idx = all_categories.index(actual_category)
        for line in category_lines[actual_category]:
            line_tensor = textToTensor(line, all_letters)
            output = predict(MyRNN, line_tensor)
            guess, guess_idx = categoryFromOutput(output, all_categories)
            
            # update confusion matrix
            for i in range(0,n_categories):
                confusion_matrix[cat_idx,guess_idx] += 1          
            total += 1
            correct += int(cat_idx == guess_idx)

    # report results
    print(' ')
    print('Classification Accuracy: {:.2f}%'.format(100 * correct / total))
    print(' ')
    print('Confusion Matrix')
    print('      |----------------------- Predicted ------------------------- ...')
    print('Actual|',end='')
    for i in range(0,n_categories):
          print('{:6s}|'.format(all_categories[i][0:6]), end='')
    print('')
    print('======|', end='')
    for i in range(0,n_categories):
          print('======|', end='')
    print('')          
    for i in range(0,n_categories):
        print('{:6s}|'.format(all_categories[i][0:6]), end='')
        for j in range(0,n_categories):
            cm = int(confusion_matrix[i,j])
            print('{:6d}|'.format(cm),end='')
        print('')

# launch the main program
main()
    

    

iteration # 250000 of 500000
   Avg. loss:   1.49
    Hit Rate:   0.53
   Bhrighde : English != Irish
iteration # 500000 of 500000
   Avg. loss:   1.24
    Hit Rate:   0.60
   Poirier : French == French
 
Classification Accuracy: 52.63%
 
Confusion Matrix
      |----------------------- Predicted ------------------------- ...
Actual|Czech |German|Arabic|Japane|Chines|Vietna|Russia|French|Irish |Englis|Indian|Pakist|Spanis|Greek |Italia|Portug|Scotti|Dutch |Korean|Polish|
Czech |  4200|  1220|   140|   340|    20|   100|   300|   520|   300|   240|   340|   180|   160|   140|   100|   340|   220|   460|    20|  1040|
German|   620|  7540|   380|   180|   120|   160|   120|  1220|   240|   400|   460|   140|   200|   220|   100|   100|   380|  1540|   140|   220|
Arabic|   820|   800| 20060|  3300|     0|   320|   580|   940|   440|     0|  3760|  2160|   680|  2240|   640|  1320|     0|  1500|     0|   440|
Japane|   440|    20|   300| 14460|    40|   160|   160|   400|    20|    20|   6