# Namesformer

## Import neccessary libraries

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import requests
from bs4 import BeautifulSoup
import unicodedata

## Scrape women and men names from website

In [102]:
def scrape_names(url, class_name):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.find_all('a', class_=class_name)
        return [name.text for name in links]
    else:
        print(f"Failed to fetch data from {url}")
        return []

original_names_women = []
original_names_men = []

for key in ['a', 'b', 'c', 'c-2', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
            'l', 'm', 'n', 'o', 'p', 'r', 's', 's-2', 't', 'u', 'v', 'z', 'z-2']:
    url_moters = f'https://vardai.vlkk.lt/sarasas/{key}/?lytis=moters&kilme=baltiskos'
    url_vyro = f'https://vardai.vlkk.lt/sarasas/{key}/?lytis=vyro&kilme=baltiskos'

    original_names_women += scrape_names(url_moters, 'names_list__links names_list__links--woman')
    original_names_men += scrape_names(url_vyro, 'names_list__links names_list__links--man')

In [103]:
original_names_women[:5], original_names_men[:5]

(['Agìlė', 'Agluonà', 'Agnà', 'Aguonà', 'Áida'],
 ['Áidas', 'Áidijus', 'Aĩdis', 'Aĩdoras', 'Aĩgardas'])

In [104]:
len(original_names_women), len(original_names_men)

(1453, 1438)

## Remove Lithuanian accentuation (makes the model easier to train).

In [108]:
# Function to keep only specific characters
def clean_text(text):
    # Normalize to decompose characters into base + diacritics
    nfkd_form = unicodedata.normalize('NFKD', text)
    # Define the allowed characters (Lithuanian-specific + standard alphabet)
    allowed_chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZąčęėįšųūĄČĘĖĮŠŲŪ"
    # Filter out all characters not in the allowed list
    return ''.join(char for char in nfkd_form if char in allowed_chars)

# Clean the list of names
names_women = [clean_text(name) for name in original_names_women]
names_men = [clean_text(name) for name in original_names_men]

In [120]:
names_men[:5], names_women[:5]

(['Aidas', 'Aidijus', 'Aidis', 'Aidoras', 'Aigardas'],
 ['Agile', 'Agluona', 'Agna', 'Aguona', 'Aida'])

## Save names to file

In [112]:
np.savetxt('names_women.txt', names_women, fmt='%s', delimiter="\n")
np.savetxt('names_men.txt', names_men, fmt='%s', delimiter="\n")

## Class to transform a dataset of names into a format suitable for training a model

In [116]:
# Adjusted NameDataset
class NameDataset(Dataset):
    def __init__(self, csv_file):
        self.names = pd.read_csv(csv_file)['name'].values # Load names from file
        self.chars = sorted(list(set(''.join(self.names) + ' ')))  # Including a padding character
        # Char to int and Int to char mappings
        self.char_to_int = {c: i for i, c in enumerate(self.chars)} # Char to int mapping
        self.int_to_char = {i: c for c, i in self.char_to_int.items()} # Int to char mapping
        self.vocab_size = len(self.chars) # Number of unique characters

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        name = self.names[idx] + ' '  # Adding padding character at the end
        encoded_name = [self.char_to_int[char] for char in name]
        return torch.tensor(encoded_name)

## Create dataset from files

In [117]:
csv_file_men = 'names_men.txt'
csv_file_women = 'names_women.txt'
dataset_men = NameDataset(csv_file_men)
dataset_women = NameDataset(csv_file_women)

In [119]:
dataset_men[2], dataset_women[3]

(tensor([ 1, 28, 25, 28, 37,  0]), tensor([ 1, 28, 41, 36, 35, 22,  0]))

In [None]:
dataset_men.vocab_size, dataset_women.vocab_size

(43, 43)

In [None]:
len(dataset_men.int_to_char)

43

##

In [80]:
# Function for adding padding.
# Suppose batch contains: [tensor([1, 2, 3]), tensor([4, 5])]
# After padding: tensor([[1, 2, 3], [4, 5, 0]])
def pad_collate(batch):
    padded_seqs = pad_sequence(batch, batch_first=True, padding_value=0)
    input_seq = padded_seqs[:, :-1]
    target_seq = padded_seqs[:, 1:]
    return input_seq, target_seq

# Minimal Transformer Model
class MinimalTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, forward_expansion):
        super(MinimalTransformer, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = nn.Parameter(torch.randn(1, 100, embed_size))
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embed_size, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=1)
        self.output_layer = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        positions = torch.arange(0, x.size(1)).unsqueeze(0)
        x = self.embed(x) + self.positional_encoding[:, :x.size(1), :]
        x = self.transformer_encoder(x)
        x = self.output_layer(x)
        return x

# Train model function
def train_model(model, dataloader, epochs=10):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    for epoch in range(epochs):
        model.train()  # Ensure the model is in training mode
        total_loss = 0.0
        batch_count = 0

        for batch_idx, (input_seq, target_seq) in enumerate(dataloader):
            optimizer.zero_grad()
            output = model(input_seq)
            loss = criterion(output.transpose(1, 2), target_seq)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            batch_count += 1

        average_loss = total_loss / batch_count
        print(f'Epoch {epoch+1}, Average Loss: {average_loss}')

## Create dataloaders, models and train them

In [121]:
dataloader_men = DataLoader(dataset_men, batch_size=32, shuffle=True, collate_fn=pad_collate)
model_men = MinimalTransformer(vocab_size=dataset_men.vocab_size, embed_size=128, num_heads=8, forward_expansion=4)
train_model(model_men, dataloader_men, epochs=300)

dataloader_women = DataLoader(dataset_women, batch_size=32, shuffle=True, collate_fn=pad_collate)
model_women = MinimalTransformer(vocab_size=dataset_women.vocab_size, embed_size=128, num_heads=8, forward_expansion=4)
train_model(model_women, dataloader_women, epochs=300)



Epoch 1, Average Loss: 1.4362415062056648
Epoch 2, Average Loss: 1.0511107603708902
Epoch 3, Average Loss: 1.0189743293656244
Epoch 4, Average Loss: 1.0137107663684422
Epoch 5, Average Loss: 0.9994766460524664
Epoch 6, Average Loss: 0.989901167816586
Epoch 7, Average Loss: 0.9872836351394654
Epoch 8, Average Loss: 0.9884342749913534
Epoch 9, Average Loss: 0.989985528257158
Epoch 10, Average Loss: 0.9718024465772841
Epoch 11, Average Loss: 0.9784861498408848
Epoch 12, Average Loss: 0.975296617878808
Epoch 13, Average Loss: 0.9680834849675496
Epoch 14, Average Loss: 0.965990420182546
Epoch 15, Average Loss: 0.9693275398678249
Epoch 16, Average Loss: 0.9642345256275601
Epoch 17, Average Loss: 0.9562673687934875
Epoch 18, Average Loss: 0.962355703777737
Epoch 19, Average Loss: 0.961203318172031
Epoch 20, Average Loss: 0.9531473981009589
Epoch 21, Average Loss: 0.9559653043746948
Epoch 22, Average Loss: 0.9618908325831096
Epoch 23, Average Loss: 0.9618341114785937
Epoch 24, Average Loss: 0.

## Function for generating sample names

In [92]:
def sample(model, dataset, start_str='A', max_length=20, eos_token=' '):
    model.eval()  # Switch to evaluation mode
    with torch.no_grad():
        # Convert start string to tensor
        chars = [dataset.char_to_int[c] for c in start_str]
        input_seq = torch.tensor(chars).unsqueeze(0)  # Add batch dimension

        output_name = start_str
        for _ in range(max_length - len(start_str)):
            output = model(input_seq)

            # Get the last character from the output
            probabilities = torch.softmax(output[0, -1], dim=0)
            # Sample a character from the probability distribution
            next_char_idx = torch.multinomial(probabilities, 1).item()
            next_char = dataset.int_to_char[next_char_idx]

            if next_char == eos_token:  # Assume ' ' is your end-of-sequence character
                break

            output_name += next_char
            # Update the input sequence for the next iteration
            input_seq = torch.cat([input_seq, torch.tensor([[next_char_idx]])], dim=1)

        return output_name

## Generate names using trained model

In [145]:
men = []
women = []

# After training your model, generate a name starting with a specific letter
for _ in range(10):
    men.append(sample(model_men, dataset_men, start_str='Pla'))

# After training your model, generate a name starting with a specific letter
for _ in range(10):
    women.append(sample(model_women, dataset_women, start_str='Pla'))

In [146]:
print(men)
print(women)

['Plazvildas', 'Plaintas', 'Plais', 'Plaimas', 'Plaimis', 'Plailis', 'Plazas', 'Plaltis', 'Plaimilas', 'Plasutas']
['Plarute', 'Plazmide', 'Plarore', 'Plaimile', 'Plautile', 'Planeda', 'Plazvile', 'Plaisme', 'Plane', 'Plazvyda']


## Save model

In [141]:
torch.save(model_men, "model_men.pth")
torch.save(model_women, "model_women.pth")