In [1]:
import os
import glob
import string
import numpy as np
import pandas as pd
import matplotlib.ticker as ticker

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader
from torchsummary import summary

from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
print(f'Torch version: {torch.__version__}, Device: {device.type}')

Torch version: 2.6.0, Device: mps


### Load Data

The names can be found in text files in a src directory, one file per language.

In the following you can find some utilities to load the data into pandas data frames. 

We will restrict to some common European languages. 

With the given selection, we will identify all the occurring characters and initialize an alphabet.<br>
For this alphabet, we will use a one-hot-encoding to map them into a vector space representation. 

Foresee a suitable character for the end of the word, e.g. 'END'.

In [3]:
srcdir = 'data/names'
languages = ["English", "French", "Italian", "German", "Spanish"]

In [4]:
# inspect the data directory
def findFiles(path): 
    return glob.glob(path)

In [5]:
print('\n'.join(findFiles(os.path.join(srcdir,'*.txt'))))

data/names/Czech.txt
data/names/German.txt
data/names/Arabic.txt
data/names/Japanese.txt
data/names/Chinese.txt
data/names/Vietnamese.txt
data/names/Russian.txt
data/names/French.txt
data/names/Irish.txt
data/names/English.txt
data/names/Spanish.txt
data/names/Greek.txt
data/names/Italian.txt
data/names/Portuguese.txt
data/names/Scottish.txt
data/names/Dutch.txt
data/names/Korean.txt
data/names/Polish.txt


In [6]:
# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return lines

def load_data(srcdir, categories=None):
    names_list = []
    for filename in findFiles(os.path.join(srcdir,'*.txt')):
        category = os.path.splitext(os.path.basename(filename))[0]
        if not categories or category in categories: 
            names = readLines(filename)
            names_list.extend([(name,category) for name in names])
    df = pd.DataFrame(names_list)
    df.columns = ["name","lang"]
    return df

In [7]:
names = load_data(srcdir, categories=languages)
names.head()

Unnamed: 0,name,lang
0,Abbing,German
1,Abel,German
2,Abeln,German
3,Abt,German
4,Achilles,German


In [8]:
maxlen = np.max([len(name) for name in names.name])
print("Maximum name length: ", maxlen)

Maximum name length:  18


In [9]:
alphabet = sorted(list(set(''.join([name for name in names.name]))))
alphabet.append('END')
len_alphabet = len(alphabet)
char_index = dict((c, i) for i, c in enumerate(alphabet))
print("Size of alphabet: ",len_alphabet)
print(alphabet)

Size of alphabet:  74
[' ', "'", 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Á', 'É', 'ß', 'à', 'á', 'ä', 'ç', 'è', 'é', 'ê', 'ì', 'í', 'ñ', 'ò', 'ó', 'ö', 'ù', 'ú', 'ü', 'END']


In [10]:
names.groupby('lang')['name'].count()/len(names)

lang
English    0.646230
French     0.048802
German     0.127555
Italian    0.124912
Spanish    0.052502
Name: name, dtype: float64

### Vector Representations

Now construct the vector representation by using one-hot-vectors. 

In [11]:
language_to_index = {country: index for index, country in enumerate(names.lang.unique())}
index_to_language = {index: country for index, country in enumerate(names.lang.unique())}

def onehot(i, length):
    v = np.zeros(length)
    v[i] = 1
    return v

def name_representation(name, maxlen):
    ### START YOUR CODE
    name_trunc = str(name)[0:maxlen]
    size = len(char_index)
    vector = [onehot(char_index[j], size) for j in str(name)]
    # fill the rest with
    for k in range(0, maxlen - len(str(name))):
        vector.append(onehot(char_index['END'], size))
    return vector
    ### START YOUR CODE

def lang_representation(language, language_to_index):
    y = np.zeros(len(language_to_index))
    y[language_to_index[language]] = 1
    return y

def lang_from_output(score):
    return index_to_language[np.argmax(score)]

def predict(name, model):
    score = model.predict(np.array([name_representation(name, maxlen)]))[0]
    return lang_from_output(score)

### Prepare train/test

Split the data into train/test

Shuffle the data

Transform the names data into a suitable vector respresentation:
* names into numpy arrays of shape (*,maxlen,len_alphabet)
* language into numpy array of shape (*,len(languages))



In [12]:
test_split = 0.2

### START YOUR CODE
names_shuffled = names.sample(frac=1, random_state=42)
n_test = int(len(names) * test_split)

train = names_shuffled.iloc[n_test:]
test = names_shuffled.iloc[:n_test]
### END YOUR CODE

In [13]:
class NameDataset(torch.utils.data.Dataset):
    def __init__(self, df, char_index, language_to_index, maxlen):
        self.data = df.reset_index(drop=True)
        self.char_index = char_index
        self.language_to_index = language_to_index
        self.maxlen = maxlen

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        name = self.data.loc[idx, 'name']
        lang = self.data.loc[idx, 'lang']

        name_vec = name_representation(name, self.maxlen)
        lang_idx = self.language_to_index[lang]

        return torch.from_numpy(np.array(name_vec)).float(), torch.tensor(lang_idx, dtype=torch.long)

In [14]:
Xy_train = NameDataset(train, char_index, language_to_index, maxlen)
Xy_test = NameDataset(test, char_index, language_to_index, maxlen)

Possibly, pack the data into a Dataset (e.g. when working with in PyTorch)

### Define and Train Model: Single Layer with SimpleRNN

Create an RNN consisting of a single layer with a SimpleRNN (keras) and a softmax.

Then train the model. Play with different number of hidden units in the layer to obtain a good accuracy.

In [15]:
### START YOUR CODE
class RNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_dim, 
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        out, _ = self.rnn(x)
        out = out[:, -1, :] # last time step only
        return self.fc(out)

### END YOUR CODE

In [16]:
def trainer(model, train_loader, val_loader, optimizer, criterion, n_epochs):
    """
    Trains and evaluates the model for n_epochs.
    """
    total_batches = n_epochs * len(train_loader)
    progress = tqdm(total=total_batches, desc="Training Progress", leave=True)

    train_losses, val_losses = [], []
    train_accs, val_accs = [], []

    for epoch in range(n_epochs):
        train_loss = 0
        train_correct = 0
        train_total = 0

        model.train()
        for X_train, y_train in train_loader:
            X_train, y_train = X_train.to(device), y_train.to(device)

            optimizer.zero_grad()
            output = model(X_train)
            loss = criterion(output, y_train)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            _, y_preds = torch.max(output, 1)
            train_correct += (y_preds == y_train).sum().item()
            train_total += y_train.size(0)

            progress.update(1)

        avg_train_loss = train_loss / len(train_loader)
        train_acc = train_correct / train_total
        train_losses.append(avg_train_loss)
        train_accs.append(train_acc)

        val_loss = 0
        val_correct = 0
        val_total = 0

        model.eval()
        with torch.no_grad():
            for X_val, y_val in val_loader:
                X_val, y_val = X_val.to(device), y_val.to(device)
                output = model(X_val)
                loss = criterion(output, y_val)

                val_loss += loss.item()
                _, y_preds = torch.max(output, 1)
                val_correct += (y_preds == y_val).sum().item()
                val_total += y_val.size(0)

        avg_val_loss = val_loss / len(val_loader)
        val_acc = val_correct / val_total
        val_losses.append(avg_val_loss)
        val_accs.append(val_acc)

        progress.set_postfix({
            'Epoch': f'{epoch + 1}/{n_epochs}',
            'Train Loss': f'{avg_train_loss:.4f}',
            'Val Loss': f'{avg_val_loss:.4f}',
            'Train Acc': f'{train_acc:.4f}',
            'Val Acc': f'{val_acc:.4f}'
        })

    progress.close()
    return train_losses, val_losses, train_accs, val_accs

In [19]:
### START YOUR CODE
n_epochs = 50
batch_size = 128
input_dim = len(char_index)
hidden_dim = 128
output_dim = len(language_to_index)

train_loader = DataLoader(Xy_train, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(Xy_test, batch_size=batch_size, shuffle=False)

model = RNN(input_dim, hidden_dim, output_dim, 1).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

train_losses, test_losses, train_accs, test_accs = trainer(model, train_loader, test_loader, optimizer, criterion, n_epochs)

### END YOUR CODE

Training Progress: 100%|██████████| 1800/1800 [00:24<00:00, 72.52it/s, Epoch=50/50, Train Loss=0.5505, Val Loss=0.6948, Train Acc=0.8139, Val Acc=0.7630]


#### Findings

...

### Implement Model with several SimpleRNN Layers

In [20]:
### START YOUR CODE
model = RNN(input_dim, hidden_dim, output_dim, 5).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

train_losses, test_losses, train_accs, test_accs = trainer(model, train_loader, test_loader, optimizer, criterion, n_epochs)
### END YOUR CODE

Training Progress: 100%|██████████| 1800/1800 [01:26<00:00, 20.70it/s, Epoch=50/50, Train Loss=0.4976, Val Loss=0.7199, Train Acc=0.8190, Val Acc=0.7489]


#### Findings

...


### Class Imbalance Handling

Choose a method to address the class imbalance seen in the given example.
- minority resampling 
- class weights in the loss

Implement it and incorporate it in the training.
Evaluate the results and compare it with the results obtained with the unbalanced training.  

In [None]:
### START YOUR CODE

# train...

### END YOUR CODE