# TD 7

## Identify the problem

Let's try to redo the RNN to guess what nationalities names come from, with the full original dataset "names" instead of "names_1000".

In [None]:
import glob
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import os
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
from unidecode import unidecode

Create alphabet

In [None]:
# Our alphabet: lower- and upper-case ASCII letters, then space and a few punctuation marks.
# A character's position in this string is the index that gets switched on in its one-hot vector.
LETTERS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'
N_LETTERS = len(LETTERS)  # length of one one-hot letter vector

# Turn a Unicode string to string of characters in our alphabet
def unicodeToAscii(s):
    """Transliterate `s` with `unidecode`, then drop every character not in LETTERS.

    Args:
        s: arbitrary Unicode text.

    Returns:
        A string made only of characters that appear in LETTERS.
    """
    transliterated = unidecode(s)
    kept = [ch for ch in transliterated if ch in LETTERS]
    return ''.join(kept)

# Turn a name into a <name_length x 1 x N_LETTERS>, or a tensor of one-hot letter vectors
def nameToTensor(name):
    """One-hot encode `name`, one letter per row.

    The name is first normalised with `unicodeToAscii`, so accented characters are
    transliterated and characters outside LETTERS are dropped. Without this step
    `LETTERS.find` returned -1 for such a character and the *last* alphabet entry
    (';') was switched on by mistake. Names already made of LETTERS characters are
    encoded exactly as before.

    Args:
        name: the name to encode.

    Returns:
        A float tensor of shape (n_kept_letters, 1, N_LETTERS) with exactly one 1
        per row.
    """
    name = unicodeToAscii(name)
    tensor = torch.zeros(len(name), 1, N_LETTERS)
    for li, letter in enumerate(name):
        # index() is safe here: every remaining character belongs to LETTERS
        tensor[li, 0, LETTERS.index(letter)] = 1
    return tensor

In [None]:
# Sanity check: accented letters must come out as their plain ASCII counterparts.
assert (unicodeToAscii('Ślusàrski') == 'Slusarski')

In [None]:
# Sanity check: display the one-hot encoding of a 6-letter name (one row per letter).
nameToTensor('abcZZZ')

Create Dataset

In [None]:
# Create a custom dataset
class NamesDataset(Dataset):
    """Map-style dataset of (name, country) pairs, read from one text file per country.

    The country name is the file stem (e.g. `French.txt` -> `French`) and every
    non-empty line of the file is one name.

    Attributes:
        names: list of ASCII-normalised names (X).
        countries: list of country strings, aligned with `names` (y).
        country_to_idx: dict mapping country -> integer label.
        idx_to_country: list mapping integer label -> country.
        n_countries: number of distinct countries.
    """

    def __init__(self, filenames: str):
        """Load every file matching the glob pattern `filenames`."""
        # read data
        self.names = []  # X
        self.countries = []  # y (strings)
        self.country_to_idx = {}  # key: country, value: index
        self.idx_to_country = []  # index: index, value: country

        # sorted() keeps the country -> index mapping independent of directory listing order
        for filename in sorted(glob.glob(filenames)):
            country = os.path.splitext(os.path.basename(filename))[0]
            self.country_to_idx[country] = len(self.idx_to_country)
            self.idx_to_country.append(country)
            with open(filename, encoding='utf-8') as file:
                lines = file.read().strip().split('\n')
            for line in lines:
                name = unicodeToAscii(line)
                if name:  # skip blank lines: an empty name cannot be fed to the RNN
                    self.names.append(name)
                    self.countries.append(country)

        self.n_countries = len(self.country_to_idx)

    def countryID(self, index):
        """Return the integer label of sample `index` as a 0-dim tensor."""
        return torch.tensor(self.country_to_idx[self.countries[index]])

    def __getitem__(self, index):
        """Return `(name, country, name_tensor, country_tensor)` for sample `index`."""
        name = self.names[index]
        return (name, self.countries[index], nameToTensor(name), self.countryID(index))

    def __len__(self):
        """Number of names in the dataset."""
        return len(self.names)

# Create object of our custom dataset
dataset = NamesDataset('data/names/*.txt')

# Split data into train and test with random_split
TRAIN_FRACTION = 0.8
TRAIN_SIZE = int(TRAIN_FRACTION*len(dataset))
TEST_SIZE = len(dataset) - TRAIN_SIZE  # remainder, so the two sizes always add up to len(dataset)
# A seeded generator makes the split identical after every "Restart & Run All"
train_dataset, test_dataset = random_split(
    dataset, [TRAIN_SIZE, TEST_SIZE], generator=torch.Generator().manual_seed(0)
)

# Store number of countries in a variable
N_COUNTRIES = dataset.n_countries

# Create a dataloader
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)  # batch_size 1 as names have different lengths
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)  # order is irrelevant for evaluation

Create the network & train it

In [None]:
N_HIDDEN = 128  # hidden-state size handed to the RNN constructor

In [None]:
# Create the network
class RNN(nn.Module):
    """Single-step recurrent cell that classifies a name one letter at a time.

    Args:
        input_size: length of one one-hot letter vector.
        hidden_size: length of the hidden state.
        output_size: number of classes (countries).
        idx_to_country: list mapping a class index to its country name.
    """

    def __init__(self, input_size, hidden_size, output_size, idx_to_country):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.idx_to_country = idx_to_country
        self.relu = nn.ReLU()

    def forward(self, input, hidden):
        """Process one letter.

        Args:
            input: tensor of shape (1, input_size), one one-hot letter.
            hidden: tensor of shape (1, hidden_size), the previous hidden state.

        Returns:
            `(output, hidden)`: log-probabilities of shape (1, output_size) and the
            new hidden state of shape (1, hidden_size).
        """
        combined = torch.cat((input, hidden), 1)
        hidden = self.relu(self.i2h(combined))
        output = self.softmax(self.i2o(combined))
        return output, hidden

    def initHidden(self):
        """Return the all-zero hidden state used before the first letter."""
        return torch.zeros(1, self.hidden_size)

    def outputToCountry(self, output):
        """Return the name of the most likely country for `output`."""
        _, top_i = output.topk(1)
        return self.idx_to_country[top_i[0,0].item()]

    def outputToID(self, output):
        """Return the integer index of the most likely country for `output`."""
        _, top_i = output.topk(1)
        return top_i[0,0].item()

# Instantiate the classifier: one-hot letters in, one score per country out.
rnn = RNN(N_LETTERS, N_HIDDEN, N_COUNTRIES, dataset.idx_to_country)

In [None]:
# Train the network
lr = 0.001
optimizer = optim.Adam(rnn.parameters(), lr=lr)
criterion = nn.NLLLoss()  # expects log-probabilities, which is what the LogSoftmax layer outputs
n_epochs = 10
for epoch in range(n_epochs):
    loss_sum = 0
    for (name, country, name_tensor, country_tensor) in train_loader:
        hidden = rnn.initHidden()
        rnn.zero_grad()
        # Assumes the loader uses batch_size=1: dim 0 is the batch, dim 1 runs over the letters
        for i in range(name_tensor.size()[1]):
            output, hidden = rnn(name_tensor[0][i], hidden)
        # Only the output after the last letter is scored against the label
        loss = criterion(output, country_tensor[0][None])
        loss_sum += loss.item()
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch+1}/{n_epochs} ({100*(epoch+1)/n_epochs:.0f}%)\tLoss: {loss_sum:.6f}')

Test the network

In [None]:
# Test on a couple of examples
print('NAME; TRUTH; PREDICTED')
with torch.no_grad():  # inference only, no gradients needed
    for sample_idx in range(min(10, len(test_dataset))):
        name, country, name_tensor, country_tensor = test_dataset[sample_idx]
        hidden = rnn.initHidden()
        # A sample taken straight from the dataset has no batch dimension: it is
        # (name_length, 1, N_LETTERS), so the letters run along dim 0. Iterating over
        # size()[1] (always 1) fed only the first letter to the network.
        for letter_idx in range(name_tensor.size(0)):
            output, hidden = rnn(name_tensor[letter_idx], hidden)
        print(f'{name}; {country}; {rnn.outputToCountry(output)}')

In [None]:
# Confusion matrix: rows are the true country, columns the predicted one
confusion = torch.zeros(N_COUNTRIES, N_COUNTRIES)
accuracy = 0
n_tested = 0
with torch.no_grad():  # inference only, no gradients needed
    for name, country, name_tensor, country_tensor in test_loader:
        hidden = rnn.initHidden()
        # Batched sample (batch_size=1): dim 0 is the batch, dim 1 runs over the letters
        for i in range(name_tensor.size()[1]):
            output, hidden = rnn(name_tensor[0][i], hidden)
        guess, guess_i = output.topk(1)
        confusion[country_tensor.item(), guess_i.item()] += 1
        if country_tensor.item() == guess_i.item():
            accuracy += 1
        n_tested += 1

# Normalize by dividing every row by its sum.
# clamp(min=1) leaves a row of zeros (country absent from the test set) at zero instead of NaN.
confusion = confusion / confusion.sum(dim=1, keepdim=True).clamp(min=1)

# Accuracy
print(f'\nAccuracy: {100*accuracy/max(n_tested, 1):.2f}%')

# Plot confusion matrix
plt.imshow(confusion.numpy())
plt.colorbar()
plt.title('Confusion matrix')
ax = plt.gca()
positions = list(range(N_COUNTRIES))
labels = train_dataset.dataset.idx_to_country
small_labels = [label[:2] for label in labels]  # two-letter abbreviations keep the x axis readable
ax.xaxis.set_major_locator(ticker.FixedLocator(positions))
ax.xaxis.set_major_formatter(ticker.FixedFormatter(small_labels))
ax.yaxis.set_major_locator(ticker.FixedLocator(positions))
ax.yaxis.set_major_formatter(ticker.FixedFormatter(labels))
ax.set_xlabel('Predicted country')
ax.set_ylabel('True country')
plt.show()

The accuracy increased! (~74% vs ~60% last time **and we used an independent test set so it's even more unexpected**)

Let's try a couple of examples of our own:

In [None]:
names = ["Dubois", "Lhotte", "Dupont", "Garcia", "Sato", "Duprès", "Suzuki", "Wang", "Santos", "Yamamoto"]
with torch.no_grad():  # inference only, no gradients needed
    for name in names:
        # Normalise first: "Duprès" contains a character outside our alphabet
        name_tensor = nameToTensor(unicodeToAscii(name))
        hidden = rnn.initHidden()
        # Un-batched tensor of shape (name_length, 1, N_LETTERS): the letters run along dim 0.
        # Iterating over size()[1] (always 1) fed only the first letter to the network.
        for letter_idx in range(name_tensor.size(0)):
            output, hidden = rnn(name_tensor[letter_idx], hidden)
        print(f'{name}; {rnn.outputToCountry(output)}')

It's almost always English/Russian that is predicted! Why is that?

This is typical of class imbalance. Let's investigate the size of each class:

In [None]:
# Count how many names each country has, using a defaultdict
from collections import defaultdict

countr_count = defaultdict(int)
for country in dataset.countries:
    countr_count[country] += 1
total = sum(countr_count.values())  # every name was counted exactly once above

print(f'Country; Count; Percentage')
for country, count in countr_count.items():
    share = 100*count/total
    print(f'{country}; {count}; {share:.2f}%')

Let's calculate the per-class accuracy because accuracy is not a good metric here:

In [None]:
# The confusion matrix is row-normalised, so its diagonal already holds the fraction of
# correctly classified names for each country.
per_class_accuracy_list: list[float] = [
    100*confusion[i, i].item() for i in range(N_COUNTRIES)
]
for i, class_accuracy in enumerate(per_class_accuracy_list):
    print(f'{dataset.idx_to_country[i]}: {class_accuracy:.2f}%')
print("\n")
mean_accuracy = sum(per_class_accuracy_list)/len(per_class_accuracy_list)
print(f'Average per class accuracy: {mean_accuracy:.2f}%')

---
---
---

## Fixing the unbalanced learning

As we did not want to bother you with unbalanced datasets yet as it was your first RNN, all 18 nationalities were represented with the same number of names in TD 6b. Now that we used `names.txt` instead of `names_1000.txt`, we have an unbalanced dataset. This is what real life looks like.

We ignored this at the time, but because the dataset was smaller for some nationalities, some names appeared several times in the last TD's `Vietnamese.txt`, which is almost a way of artificially balancing the dataset.

### 1st fixing idea: modify the dataset

Instead of loading names one by one, choose a country at random, then choose a name at random from this category.
Do this using an iterable dataset (checkout the doc here: https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset).

In [None]:
import random
from torch.utils.data import IterableDataset

# Create a custom dataset
class NamesIterableDataset(IterableDataset):
    """Class-balanced stream of names: draw a country uniformly, then one of its names.

    Attributes:
        names: dict mapping country -> list of ASCII-normalised names.
        country_to_idx: dict mapping country -> integer label.
        idx_to_country: list mapping integer label -> country.
        n_countries: number of countries found.
    """

    def __init__(self, filenames='names/*.txt'):
        """Read every file matching `filenames`; the file stem is the country name."""
        # Read data
        self.names = dict()  # country -> list of names
        self.country_to_idx = {}
        self.idx_to_country = []

        for filename in glob.glob(filenames):
            country = os.path.splitext(os.path.basename(filename))[0]
            self.country_to_idx[country] = len(self.country_to_idx)
            self.idx_to_country.append(country)
            # Context manager so the file handle is closed as soon as the file is read
            with open(filename, encoding='utf-8') as file:
                lines = file.read().strip().split('\n')
            self.names[country] = [unicodeToAscii(line) for line in lines]
        self.n_countries = len(self.country_to_idx)

    def countryID(self, country):
        """Return the integer label of `country` as a 0-dim tensor."""
        return torch.tensor(self.country_to_idx[country])

    def __iter__(self):
        """Yield `len(self)` random samples, i.e. one "epoch"."""
        for _ in range(len(self)):
            yield self.__next__()

    def __next__(self):
        """Return one random `(name, country, name_tensor, country_tensor)` sample."""
        # Choose random country
        country = random.choice(self.idx_to_country)
        # Choose random name from this country
        name = random.choice(self.names[country])
        # Convert to tensors
        name_tensor = nameToTensor(name)
        countryID = self.countryID(country)
        return (name, country, name_tensor, countryID)

    def __len__(self):
        return 16059  # Arbitrary, size of one epoch

    def __getitem__(self, idx):
        """
        idx is ignored (why? because of the way we want it to work!), but required by the implementation of __getitem__
        """
        return self.__next__()

# Create dataset object
dataset = NamesIterableDataset('data/names/*.txt')

# Get a sample (next() can be called on the dataset itself because the class defines __next__)
name, country, name_tensor, country_tensor = next(dataset)
# Last expression of the cell: displayed as the cell output
name, country, name_tensor.shape, country_tensor

Q: Interpret `name_tensor.shape`?

A: `[n_letters_in_name, 1, n_letters_available]`

In [None]:
# Split data into train and test with random_split
# (here `dataset` is the NamesIterableDataset instance, whose __len__ is a fixed epoch size)
TRAIN_FRACTION = 0.8
TRAIN_SIZE = int(TRAIN_FRACTION*len(dataset))
TESET_SIZE = len(dataset)-int(TRAIN_FRACTION*len(dataset))
train_dataset, test_dataset = random_split(dataset, [TRAIN_SIZE, TESET_SIZE])

# Store number of countries in a variable
N_COUNTRIES = dataset.n_countries

# Create a dataloader
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)  # batch_size 1 as names have different lengths !!!
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True)  # shuffle = True is not technically necessary

# Get a sample from the dataloader
# (each next(iter(...)) builds a fresh iterator and takes its first batch)
name_train, country, name_tensor, country_tensor = next(iter(train_loader))
print(name_train, country, name_tensor.shape, country_tensor)
name_test, country, name_tensor, country_tensor = next(iter(test_loader))
print(name_test, country, name_tensor.shape, country_tensor)

In [None]:
# Leak check: repeatedly take the first batch of a fresh iterator on each loader and
# require that the training name and the test name differ.
for _ in range(100_000):
    name_train, country, name_tensor, country_tensor = next(iter(train_loader))
    name_test, country, name_tensor, country_tensor = next(iter(test_loader))
    assert name_train != name_test

It doesn't work ... what happened?

In [None]:
# Create a custom dataset
class NamesIterableDataset(IterableDataset):
    """Class-balanced stream of names with a per-country train/test split.

    Each sample is drawn by picking a country uniformly, then one of its names.

    Args:
        filenames: glob pattern of the per-country text files (file stem = country).
        train: keep the first `train_fraction` of each country's names when True,
            the remainder when False.
        train_fraction: fraction of each country's names assigned to training.

    Attributes:
        names: dict mapping country -> list of names kept for this split.
        country_to_idx: dict mapping country -> integer label.
        idx_to_country: list mapping integer label -> country.
        n_countries: number of countries found.
    """

    def __init__(self, filenames='names/*.txt', train: bool = True, train_fraction: float = 0.8):
        # Read data
        self.names = dict()  # country -> list of names
        self.country_to_idx = {}
        self.idx_to_country = []

        for filename in glob.glob(filenames):
            country = os.path.splitext(os.path.basename(filename))[0]
            self.country_to_idx[country] = len(self.country_to_idx)
            self.idx_to_country.append(country)
            # Context manager so the file handle is closed as soon as the file is read
            with open(filename, encoding='utf-8') as file:
                lines = file.read().strip().split('\n')
            self.names[country] = [unicodeToAscii(line) for line in lines]

        # Split the data into train and test datasets.
        # `train` and `train_fraction` used to be accepted but ignored, so both
        # instances sampled from exactly the same names.
        for country in self.names.keys():
            split_idx = int(train_fraction * len(self.names[country]))
            if train:
                self.names[country] = self.names[country][:split_idx]
            else:
                self.names[country] = self.names[country][split_idx:]

        self.n_countries = len(self.country_to_idx)

    def countryID(self, country):
        """Return the integer label of `country` as a 0-dim tensor."""
        return torch.tensor(self.country_to_idx[country])

    def __iter__(self):
        """Yield `len(self)` random samples, i.e. one "epoch"."""
        for _ in range(len(self)):
            yield self.__next__()

    def __next__(self):
        """Return one random `(name, country, name_tensor, country_tensor)` sample."""
        # Choose random country
        country = random.choice(self.idx_to_country)
        # Choose random name from this country
        name = random.choice(self.names[country])
        # Convert to tensors
        name_tensor = nameToTensor(name)
        countryID = self.countryID(country)
        return (name, country, name_tensor, countryID)

    def __len__(self):
        return 16059  # Arbitrary, size of one epoch

    def __getitem__(self, idx):
        """
        idx is ignored (why? because of the way we want it to work!), but required by the implementation of __getitem__
        """
        return self.__next__()

# Create dataset object (one instance per split, built from the same files)
train_dataset = NamesIterableDataset('data/names/*.txt', train=True, train_fraction=0.8)
test_dataset = NamesIterableDataset('data/names/*.txt', train=False, train_fraction=0.8)

# Get a sample (next() can be called on the dataset itself because the class defines __next__)
name, country, name_tensor, country_tensor = next(train_dataset)
# Last expression of the cell: displayed as the cell output
name, country, name_tensor.shape, country_tensor

In [None]:
# Store number of countries in a variable
# (read from the dataset created just above, not from the stale `dataset` object of an earlier cell)
N_COUNTRIES = train_dataset.n_countries

# Create a dataloader
train_loader = DataLoader(train_dataset, batch_size=1)  # shuffle is not a thing
test_loader = DataLoader(test_dataset, batch_size=1)  # shuffle is not a thing

# Get a sample from the dataloader
name_train, country, name_tensor, country_tensor = next(iter(train_loader))
print(name_train, country, name_tensor.shape, country_tensor)
name_test, country, name_tensor, country_tensor = next(iter(test_loader))
print(name_test, country, name_tensor.shape, country_tensor)

In [None]:
# Leak check: repeatedly take the first batch of a fresh iterator on each loader and
# require that the training name and the test name differ.
for _ in range(100_000):
    name_train, country, name_tensor, country_tensor = next(iter(train_loader))
    name_test, country, name_tensor, country_tensor = next(iter(test_loader))
    assert name_train != name_test

Whaaaat? It still doesn't work. Hint: Fakhoury.

Btw, we see that when we said above we were unbiased and corrected what was done last time, we were not completely honest (and therefore the accuracies were not representative of what would happen on an independent dataset). Never believe everything you read!!! ... Although for this particular task, overfitting a bit is not that big of a deal as there is a finite set of names... but for comparison purposes, it's just a bit stupid.

In [None]:
# Create a custom dataset
class NamesIterableDataset(IterableDataset):
    """Class-balanced stream of names with a duplicate-free per-country train/test split.

    Each sample is drawn by picking a country uniformly, then one of its names.

    Args:
        filenames: glob pattern of the per-country text files (file stem = country).
        train: keep the first `train_fraction` of each country's distinct names when
            True, the remainder when False.
        train_fraction: fraction of each country's distinct names assigned to training.

    Attributes:
        names: dict mapping country -> list of names kept for this split.
        country_to_idx: dict mapping country -> integer label.
        idx_to_country: list mapping integer label -> country.
        n_countries: number of countries found.
    """

    def __init__(self, filenames='names/*.txt', train: bool = True, train_fraction: float = 0.8):
        # Read data
        self.names = dict()  # country -> list of names
        self.country_to_idx = {}
        self.idx_to_country = []

        for filename in glob.glob(filenames):
            country = os.path.splitext(os.path.basename(filename))[0]
            self.country_to_idx[country] = len(self.country_to_idx)
            self.idx_to_country.append(country)
            # Context manager so the file handle is closed as soon as the file is read
            with open(filename, encoding='utf-8') as file:
                lines = file.read().strip().split('\n')
            self.names[country] = [unicodeToAscii(line) for line in lines]

        # Split the data into train and test datasets
        for country in self.names.keys():
            # Drop repeated names first (keeping first-occurrence order): a name listed
            # twice in one file could otherwise land on both sides of the split.
            unique_names = list(dict.fromkeys(self.names[country]))
            split_idx = int(train_fraction * len(unique_names))
            if train:
                self.names[country] = unique_names[:split_idx]
            else:
                self.names[country] = unique_names[split_idx:]

        self.n_countries = len(self.country_to_idx)

    def countryID(self, country):
        """Return the integer label of `country` as a 0-dim tensor."""
        return torch.tensor(self.country_to_idx[country])

    def __iter__(self):
        """Yield `len(self)` random samples, i.e. one "epoch"."""
        for _ in range(len(self)):
            yield self.__next__()

    def __next__(self):
        """Return one random `(name, country, name_tensor, country_tensor)` sample."""
        # Choose random country
        country = random.choice(self.idx_to_country)
        # Choose random name from this country
        name = random.choice(self.names[country])
        # Convert to tensors
        name_tensor = nameToTensor(name)
        countryID = self.countryID(country)
        return (name, country, name_tensor, countryID)

    def __len__(self):
        return 16059  # Arbitrary, size of one epoch

    def __getitem__(self, idx):
        """
        idx is ignored (why? because of the way we want it to work!), but required by the implementation of __getitem__
        """
        return self.__next__()

# Create dataset object (one instance per split, built from the same files)
train_dataset = NamesIterableDataset('data/names/*.txt', train=True, train_fraction=0.8)
test_dataset = NamesIterableDataset('data/names/*.txt', train=False, train_fraction=0.8)

# Get a sample (next() can be called on the dataset itself because the class defines __next__)
name, country, name_tensor, country_tensor = next(train_dataset)
# Last expression of the cell: displayed as the cell output
name, country, name_tensor.shape, country_tensor

In [None]:
# Store number of countries in a variable
# (read from the dataset created just above, not from the stale `dataset` object of an earlier cell)
N_COUNTRIES = train_dataset.n_countries

# Create a dataloader
train_loader = DataLoader(train_dataset, batch_size=1)  # shuffle is not a thing
test_loader = DataLoader(test_dataset, batch_size=1)  # shuffle is not a thing

# Get a sample from the dataloader
name_train, country, name_tensor, country_tensor = next(iter(train_loader))
print(name_train, country, name_tensor.shape, country_tensor)
name_test, country, name_tensor, country_tensor = next(iter(test_loader))
print(name_test, country, name_tensor.shape, country_tensor)

In [None]:
# Leak check on names alone: repeatedly take the first batch of a fresh iterator on each
# loader and require that the training name and the test name differ.
for _ in range(100_000):
    name_train, country, name_tensor, country_tensor = next(iter(train_loader))
    name_test, country, name_tensor, country_tensor = next(iter(test_loader))
    assert (name_train != name_test)

Whaaat? Hint: Murphy. It is what it is though, it's not a us problem this time.

In [None]:
# Leak check on (name, country) pairs rather than names alone, so an identical name
# listed under two different countries does not count as a clash.
for _ in range(100_000):
    name_train, country_train, name_tensor, country_tensor = next(iter(train_loader))
    name_test, country_test, name_tensor, country_tensor = next(iter(test_loader))
    assert ((name_train, country_train) != (name_test, country_test))

At least we're sure we didn't make anything stupid.

Re-define the RNN and re-train it with our new fancy iterable dataset.
Print the final accuracy, and plot the confusion matrix (which should be closer to the identity matrix).

In [None]:
# Create the network
class RNN(nn.Module):
    """Single-step recurrent cell that classifies a name one letter at a time.

    Args:
        input_size: length of one one-hot letter vector.
        hidden_size: length of the hidden state.
        output_size: number of classes (countries).
        idx_to_country: list mapping a class index to its country name.
    """

    def __init__(self, input_size, hidden_size, output_size, idx_to_country):
        super().__init__()
        joint_size = input_size + hidden_size  # letter and hidden state are concatenated
        self.hidden_size = hidden_size
        # Layers are created in the same order as before so random initialisation is unchanged
        self.i2h = nn.Linear(joint_size, hidden_size)
        self.i2o = nn.Linear(joint_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.idx_to_country = idx_to_country
        self.relu = nn.ReLU()

    def forward(self, input, hidden):
        """Process one letter; return `(log_probabilities, new_hidden_state)`."""
        combined = torch.cat([input, hidden], dim=1)
        next_hidden = self.relu(self.i2h(combined))
        log_probs = self.softmax(self.i2o(combined))
        return log_probs, next_hidden

    def initHidden(self):
        """Return the all-zero hidden state used before the first letter."""
        return torch.zeros((1, self.hidden_size))

    def outputToCountry(self, output):
        """Return the name of the most likely country for `output`."""
        return self.idx_to_country[self.outputToID(output)]

    def outputToID(self, output):
        """Return the integer index of the most likely country for `output`."""
        best = output.topk(1).indices
        return best[0, 0].item()

# Labels come from the dataset that is actually being trained on
rnn = RNN(N_LETTERS, N_HIDDEN, N_COUNTRIES, train_dataset.idx_to_country)

# Train the network
lr = 0.001
optimizer = optim.Adam(rnn.parameters(), lr=lr)
criterion = nn.NLLLoss()  # expects log-probabilities, which is what the LogSoftmax layer outputs
n_epochs = 10
for epoch in range(n_epochs):
    loss_sum = 0
    for (name, country, name_tensor, country_tensor) in train_loader:
        hidden = rnn.initHidden()
        rnn.zero_grad()
        # Batched sample (batch_size=1): dim 0 is the batch, dim 1 runs over the letters
        for i in range(name_tensor.size()[1]):
            output, hidden = rnn(name_tensor[0][i], hidden)
        loss = criterion(output, country_tensor[0][None])
        loss_sum += loss.item()
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch+1}/{n_epochs} ({100*(epoch+1)/n_epochs:.0f}%)\tLoss: {loss_sum:.6f}')

# Test on a couple of examples
print('\nNAME; TRUTH; PREDICTED')
with torch.no_grad():  # inference only, no gradients needed
    for sample_idx in range(10):
        name, country, name_tensor, country_tensor = test_dataset[sample_idx]
        hidden = rnn.initHidden()
        # Un-batched sample of shape (name_length, 1, N_LETTERS): the letters run along dim 0.
        # Iterating over size()[1] (always 1) fed only the first letter to the network.
        for letter_idx in range(name_tensor.size(0)):
            output, hidden = rnn(name_tensor[letter_idx], hidden)
        print(f'{name}; {country}; {rnn.outputToCountry(output)}')

# Confusion matrix: rows are the true country, columns the predicted one
confusion = torch.zeros(N_COUNTRIES, N_COUNTRIES)
accuracy = 0
n_tested = 0
with torch.no_grad():
    for name, country, name_tensor, country_tensor in test_loader:
        hidden = rnn.initHidden()
        for i in range(name_tensor.size()[1]):
            output, hidden = rnn(name_tensor[0][i], hidden)
        category_i = country_tensor.item()
        guess_i = rnn.outputToID(output)
        confusion[category_i, guess_i] += 1
        if category_i == guess_i:
            accuracy += 1
        n_tested += 1
# Normalize by dividing every row by its sum.
# clamp(min=1) leaves a row of zeros (country never drawn) at zero instead of NaN.
confusion = confusion / confusion.sum(dim=1, keepdim=True).clamp(min=1)
# Accuracy over the samples actually evaluated
print(f'\nAccuracy: {100*accuracy/max(n_tested, 1):.2f}%')

# Plot confusion matrix
plt.imshow(confusion.numpy())
plt.colorbar()
plt.title('Confusion matrix')
ax = plt.gca()
positions = list(range(N_COUNTRIES))
labels = train_dataset.idx_to_country
small_labels = [label[:2] for label in labels]  # two-letter abbreviations keep the x axis readable
ax.xaxis.set_major_locator(ticker.FixedLocator(positions))
ax.xaxis.set_major_formatter(ticker.FixedFormatter(small_labels))
ax.yaxis.set_major_locator(ticker.FixedLocator(positions))
ax.yaxis.set_major_formatter(ticker.FixedFormatter(labels))
ax.set_xlabel('Predicted country')
ax.set_ylabel('True country')
plt.show()

Of course, the accuracy went down again, but at least, our RNN isn't biased (hopefully ... let's make sure it actually isn't ...).

In [None]:
names = ["Dubois", "Lhotte", "Dupont", "Garcia", "Sato", "Duprès", "Suzuki", "Wang", "Santos", "Yamamoto"]
with torch.no_grad():  # inference only, no gradients needed
    for name in names:
        # Normalise first: "Duprès" contains a character outside our alphabet
        name_tensor = nameToTensor(unicodeToAscii(name))
        hidden = rnn.initHidden()
        # Un-batched tensor of shape (name_length, 1, N_LETTERS): the letters run along dim 0.
        # Iterating over size()[1] (always 1) fed only the first letter to the network.
        for letter_idx in range(name_tensor.size(0)):
            output, hidden = rnn(name_tensor[letter_idx], hidden)
        print(f'{name}; {rnn.outputToCountry(output)}')

Let's calculate the per-class accuracy as explained previously:

In [None]:
per_class_accuracy_list: list[float] = []
for i in range(N_COUNTRIES):
    n_correct = confusion[i, i].item()
    n_total = confusion[i].sum().item()
    # `n_total > 0` is False for both 0 and NaN, so a country without test samples counts as 0 %
    class_accuracy = 100*n_correct/n_total if n_total > 0 else 0.0
    # Labels are read from the dataset the model was trained on, not from a stale `dataset` object
    print(f'{train_dataset.idx_to_country[i]}: {class_accuracy:.2f}%')
    per_class_accuracy_list.append(class_accuracy)
print("\n")
print(f'Average per class accuracy: {sum(per_class_accuracy_list)/len(per_class_accuracy_list):.2f}%')

In [None]:
# Inspect the French names kept in the training split
train_dataset.names["French"]

In [None]:
# Inspect the French names kept in the test split
test_dataset.names["French"]

In [None]:
# Create a custom dataset
class NamesIterableDataset(IterableDataset):
    """Class-balanced stream of names with a reproducible, shuffled train/test split.

    Each sample is drawn by picking a country uniformly, then one of its names.

    Args:
        filenames: glob pattern of the per-country text files (file stem = country).
        train: keep the first `train_fraction` of each country's shuffled distinct
            names when True, the remainder when False.
        train_fraction: fraction of each country's distinct names assigned to training.
        seed: seed of the per-country shuffle. The train and test instances must use
            the same seed so that their splits are complementary.

    Attributes:
        names: dict mapping country -> list of names kept for this split.
        country_to_idx: dict mapping country -> integer label.
        idx_to_country: list mapping integer label -> country.
        n_countries: number of countries found.
    """

    def __init__(self, filenames='names/*.txt', train: bool = True, train_fraction: float = 0.8,
                 seed: int = 0):
        # Read data
        self.names = dict()  # country -> list of names
        self.country_to_idx = {}
        self.idx_to_country = []

        for filename in glob.glob(filenames):
            country = os.path.splitext(os.path.basename(filename))[0]
            self.country_to_idx[country] = len(self.country_to_idx)
            self.idx_to_country.append(country)
            # Context manager so the file handle is closed as soon as the file is read
            with open(filename, encoding='utf-8') as file:
                lines = file.read().strip().split('\n')
            self.names[country] = [unicodeToAscii(line) for line in lines]

        # Split the data into train and test datasets
        for country in self.names.keys():
            # Remove duplicates. sorted() gives a canonical order: the iteration order of a
            # set of strings changes from one interpreter run to the next (hash
            # randomisation), which made the split differ after every kernel restart.
            unique_names = sorted(set(self.names[country]))
            # Seeded shuffle: reproducible, and neither split is an alphabetical slice
            random.Random(seed).shuffle(unique_names)
            split_idx = int(train_fraction * len(unique_names))
            if train:
                self.names[country] = unique_names[:split_idx]
            else:
                self.names[country] = unique_names[split_idx:]

        self.n_countries = len(self.country_to_idx)

    def countryID(self, country):
        """Return the integer label of `country` as a 0-dim tensor."""
        return torch.tensor(self.country_to_idx[country])

    def __iter__(self):
        """Yield `len(self)` random samples, i.e. one "epoch"."""
        for _ in range(len(self)):
            yield self.__next__()

    def __next__(self):
        """Return one random `(name, country, name_tensor, country_tensor)` sample."""
        # Choose random country
        country = random.choice(self.idx_to_country)
        # Choose random name from this country
        name = random.choice(self.names[country])
        # Convert to tensors
        name_tensor = nameToTensor(name)
        countryID = self.countryID(country)
        return (name, country, name_tensor, countryID)

    def __len__(self):
        return 16059  # Arbitrary, size of one epoch

    def __getitem__(self, idx):
        """
        idx is ignored (why? because of the way we want it to work!), but required by the implementation of __getitem__
        """
        return self.__next__()

# Create dataset object
train_dataset = NamesIterableDataset('data/names/*.txt', train=True, train_fraction=0.8)
test_dataset = NamesIterableDataset('data/names/*.txt', train=False, train_fraction=0.8)

# Rebuild the loaders: the existing `train_loader` / `test_loader` still wrap the dataset
# objects created in earlier cells, so every later cell would keep using the old split.
N_COUNTRIES = train_dataset.n_countries
train_loader = DataLoader(train_dataset, batch_size=1)  # shuffle is not a thing
test_loader = DataLoader(test_dataset, batch_size=1)  # shuffle is not a thing

# Get a sample
name, country, name_tensor, country_tensor = next(train_dataset)
name, country, name_tensor.shape, country_tensor

In [None]:
# Draw 200 random training samples and print only the French ones
for _ in range(200):
    name, country, name_tensor, country_tensor = next(train_dataset)
    if country == "French":
        print(name, country, name_tensor.shape, country_tensor)

In [None]:
# Draw 200 random test samples and print only the French ones
for _ in range(200):
    name, country, name_tensor, country_tensor = next(test_dataset)
    if country == "French":
        print(name, country, name_tensor.shape, country_tensor)

In [None]:
# Leak check on (name, country) pairs: repeatedly take the first batch of a fresh iterator
# on each loader and require that the two pairs differ.
for _ in range(100_000):
    name_train, country_train, name_tensor, country_tensor = next(iter(train_loader))
    name_test, country_test, name_tensor, country_tensor = next(iter(test_loader))
    assert ((name_train, country_train) != (name_test, country_test))

In [None]:
# Create the network
class RNN(nn.Module):
    """Single-step recurrent cell that classifies a name one letter at a time.

    Args:
        input_size: length of one one-hot letter vector.
        hidden_size: length of the hidden state.
        output_size: number of classes (countries).
        idx_to_country: list mapping a class index to its country name.
    """

    def __init__(self, input_size, hidden_size, output_size, idx_to_country):
        super().__init__()
        joint_size = input_size + hidden_size  # letter and hidden state are concatenated
        self.hidden_size = hidden_size
        # Layers are created in the same order as before so random initialisation is unchanged
        self.i2h = nn.Linear(joint_size, hidden_size)
        self.i2o = nn.Linear(joint_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.idx_to_country = idx_to_country
        self.relu = nn.ReLU()

    def forward(self, input, hidden):
        """Process one letter; return `(log_probabilities, new_hidden_state)`."""
        combined = torch.cat([input, hidden], dim=1)
        next_hidden = self.relu(self.i2h(combined))
        log_probs = self.softmax(self.i2o(combined))
        return log_probs, next_hidden

    def initHidden(self):
        """Return the all-zero hidden state used before the first letter."""
        return torch.zeros((1, self.hidden_size))

    def outputToCountry(self, output):
        """Return the name of the most likely country for `output`."""
        return self.idx_to_country[self.outputToID(output)]

    def outputToID(self, output):
        """Return the integer index of the most likely country for `output`."""
        best = output.topk(1).indices
        return best[0, 0].item()

# Labels come from the dataset that is actually being trained on
rnn = RNN(N_LETTERS, N_HIDDEN, N_COUNTRIES, train_dataset.idx_to_country)

# Train the network
lr = 0.001
optimizer = optim.Adam(rnn.parameters(), lr=lr)
criterion = nn.NLLLoss()  # expects log-probabilities, which is what the LogSoftmax layer outputs
n_epochs = 10
for epoch in range(n_epochs):
    loss_sum = 0
    for (name, country, name_tensor, country_tensor) in train_loader:
        hidden = rnn.initHidden()
        rnn.zero_grad()
        # Batched sample (batch_size=1): dim 0 is the batch, dim 1 runs over the letters
        for i in range(name_tensor.size()[1]):
            output, hidden = rnn(name_tensor[0][i], hidden)
        loss = criterion(output, country_tensor[0][None])
        loss_sum += loss.item()
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch+1}/{n_epochs} ({100*(epoch+1)/n_epochs:.0f}%)\tLoss: {loss_sum:.6f}')

# Test on a couple of examples
print('\nNAME; TRUTH; PREDICTED')
with torch.no_grad():  # inference only, no gradients needed
    for sample_idx in range(10):
        name, country, name_tensor, country_tensor = test_dataset[sample_idx]
        hidden = rnn.initHidden()
        # Un-batched sample of shape (name_length, 1, N_LETTERS): the letters run along dim 0.
        # Iterating over size()[1] (always 1) fed only the first letter to the network.
        for letter_idx in range(name_tensor.size(0)):
            output, hidden = rnn(name_tensor[letter_idx], hidden)
        print(f'{name}; {country}; {rnn.outputToCountry(output)}')

# Confusion matrix: rows are the true country, columns the predicted one
confusion = torch.zeros(N_COUNTRIES, N_COUNTRIES)
accuracy = 0
n_tested = 0
with torch.no_grad():
    for name, country, name_tensor, country_tensor in test_loader:
        hidden = rnn.initHidden()
        for i in range(name_tensor.size()[1]):
            output, hidden = rnn(name_tensor[0][i], hidden)
        category_i = country_tensor.item()
        guess_i = rnn.outputToID(output)
        confusion[category_i, guess_i] += 1
        if category_i == guess_i:
            accuracy += 1
        n_tested += 1
# Normalize by dividing every row by its sum.
# clamp(min=1) leaves a row of zeros (country never drawn) at zero instead of NaN.
confusion = confusion / confusion.sum(dim=1, keepdim=True).clamp(min=1)
# Accuracy over the samples actually evaluated
print(f'\nAccuracy: {100*accuracy/max(n_tested, 1):.2f}%')

# Plot confusion matrix
plt.imshow(confusion.numpy())
plt.colorbar()
plt.title('Confusion matrix')
ax = plt.gca()
positions = list(range(N_COUNTRIES))
labels = train_dataset.idx_to_country
small_labels = [label[:2] for label in labels]  # two-letter abbreviations keep the x axis readable
ax.xaxis.set_major_locator(ticker.FixedLocator(positions))
ax.xaxis.set_major_formatter(ticker.FixedFormatter(small_labels))
ax.yaxis.set_major_locator(ticker.FixedLocator(positions))
ax.yaxis.set_major_formatter(ticker.FixedFormatter(labels))
ax.set_xlabel('Predicted country')
ax.set_ylabel('True country')
plt.show()

In [None]:
names = ["Dubois", "Lhotte", "Dupont", "Garcia", "Sato", "Duprès", "Suzuki", "Wang", "Santos", "Yamamoto"]
with torch.no_grad():  # inference only, no gradients needed
    for name in names:
        # Normalise first: "Duprès" contains a character outside our alphabet
        name_tensor = nameToTensor(unicodeToAscii(name))
        hidden = rnn.initHidden()
        # Un-batched tensor of shape (name_length, 1, N_LETTERS): the letters run along dim 0.
        # Iterating over size()[1] (always 1) fed only the first letter to the network.
        for letter_idx in range(name_tensor.size(0)):
            output, hidden = rnn(name_tensor[letter_idx], hidden)
        print(f'{name}; {rnn.outputToCountry(output)}')

In [None]:
per_class_accuracy_list: list[float] = []
for i in range(N_COUNTRIES):
    n_correct = confusion[i, i].item()
    n_total = confusion[i].sum().item()
    # `n_total > 0` is False for both 0 and NaN, so a country without test samples counts as 0 %
    class_accuracy = 100*n_correct/n_total if n_total > 0 else 0.0
    # Labels are read from the dataset the model was trained on, not from a stale `dataset` object
    print(f'{train_dataset.idx_to_country[i]}: {class_accuracy:.2f}%')
    per_class_accuracy_list.append(class_accuracy)
print("\n")
print(f'Average per class accuracy: {sum(per_class_accuracy_list)/len(per_class_accuracy_list):.2f}%')

---

### A second (more or less equivalent) "fix": add a sampler to the dataloader

---

### A third (more or less equivalent) "fix": weight the loss function