In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import requests
from bs4 import BeautifulSoup

In [3]:
V_vardai = []
M_vardai = []

for key in ['a', 'b', 'c', 'c-2', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
            'm', 'n', 'o', 'p', 'r', 's', 's-2', 't', 'u', 'v', 'z', 'z-2']:
    V_url = f'https://vardai.vlkk.lt/sarasas/{key}/?lytis=vyro&kilme='
    M_url = f'https://vardai.vlkk.lt/sarasas/{key}/?lytis=moters&kilme='

    response = requests.get(V_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    V_links = soup.find_all('a', class_='names_list__links names_list__links--man')
    V_vardai += [name.text for name in V_links]

    response = requests.get(M_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    M_links = soup.find_all('a', class_='names_list__links names_list__links--woman')
    M_vardai += [name.text for name in M_links]

V_vardai = [name for name in V_vardai if name.endswith('s')]
M_vardai = [name for name in M_vardai if not (name.endswith('l') or name.endswith('n') or name.endswith('e'))]

np.savetxt('Vyru_vardai.txt', V_vardai, fmt='%s', header='name', comments='', newline='\n')
np.savetxt('Moteru_vardai.txt', M_vardai, fmt='%s', header='name', comments='', newline='\n')
np.savetxt('Visi_vardai.txt', V_vardai + M_vardai, fmt='%s', header='name', comments='', newline='\n')

In [4]:
len(V_vardai), len(M_vardai)

(3409, 4086)

In [5]:
with open('Visi_vardai.txt', 'r') as file:
    lines = file.readlines()
    print("".join(lines[:10]))
    print("".join(lines[5000:5010]))

name
Ãbas
Ãbdijus
Abdònas
Ãbelis
Ãbis
Abraõmas
Abrõmas
Achìlas
Achmèdas

Gìlma
Gìlmantė
Gìlmė
Gìlmina
Gìlminta
Gìlmintė
Gilvidà
Gilvìlė
Gìlvina
Gilvydà



In [6]:
class NameDataset(Dataset):
    def __init__(self, male_file, female_file):
        male_data = pd.read_csv(male_file, header=None, names=['name'])
        female_data = pd.read_csv(female_file, header=None, names=['name'])

        male_data['gender'] = 0  # Vyras = 0
        female_data['gender'] = 1  # Moteris = 1

        self.data = pd.concat([male_data, female_data])
        self.names = self.data['name'].str.strip().str.lower().tolist()
        self.genders = self.data['gender'].tolist()

        all_names = ''.join(self.names)



        unwanted_chars = set('xwq')
        # Remove unwanted characters from the full dataset
        filtered_chars = [char for char in all_names if char not in unwanted_chars]



        self.chars = sorted(set(all_names + ' '))
        self.char_to_int = {c: i for i, c in enumerate(self.chars)}
        self.int_to_char = {i: c for c, i in self.char_to_int.items()}
        self.vocab_size = len(self.chars)

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        name = self.names[idx]
        gender = self.genders[idx]
        name += ' '  # Add end-of-sequence marker
        encoded_name = [self.char_to_int[char] for char in name]
        return torch.tensor(encoded_name, dtype=torch.long), torch.tensor(gender, dtype=torch.long)


In [7]:
def pad_collate(batch):
    names, genders = zip(*batch)
    padded_seqs = pad_sequence(names, batch_first=True, padding_value=0)
    input_seq = padded_seqs[:, :-1]
    target_seq = padded_seqs[:, 1:]
    genders = torch.stack(genders)  # Stack gender tensors
    return input_seq, target_seq, genders

In [8]:
dataset = NameDataset('Vyru_vardai.txt', 'Moteru_vardai.txt')
loader = DataLoader(dataset, batch_size=32, collate_fn=pad_collate, shuffle=True)

for input_seq, target_seq, genders in loader:
    print(f"Input shape: {input_seq.shape}, Target shape: {target_seq.shape}, Gender shape: {genders.shape}")
    break

Input shape: torch.Size([32, 12]), Target shape: torch.Size([32, 12]), Gender shape: torch.Size([32])


In [9]:
class GenderAwareTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, forward_expansion):
        super(GenderAwareTransformer, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.gender_embed = nn.Embedding(2, embed_size)  # 2 for male/female
        self.positional_encoding = nn.Parameter(torch.randn(1, 100, embed_size))
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embed_size, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=1)
        self.output_layer = nn.Linear(embed_size, vocab_size)

    def forward(self, x, gender):
        # Get embeddings
        char_embeddings = self.embed(x)
        gender_embeddings = self.gender_embed(gender).unsqueeze(1).expand(-1, x.size(1), -1)

        # Combine character and gender embeddings
        x = char_embeddings + gender_embeddings + self.positional_encoding[:, :x.size(1), :]
        x = self.transformer_encoder(x)
        x = self.output_layer(x)
        return x

In [10]:
# Training Loop
def train_model(model, dataloader, epochs=10):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        batch_count = 0

        for batch_idx, (input_seq, target_seq, genders) in enumerate(dataloader):
            optimizer.zero_grad()
            output = model(input_seq, genders)
            loss = criterion(output.transpose(1, 2), target_seq)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            batch_count += 1

        average_loss = total_loss / batch_count
        print(f'Epoch {epoch+1}, Average Loss: {average_loss}')

In [11]:
def sample(model, dataset, gender, start_str='a', max_length=20, temperature=1.0):
    assert temperature > 0, "Temperature must be greater than 0"
    model.eval()
    with torch.no_grad():
        start_str = start_str.lower()
        chars = [dataset.char_to_int[c] for c in start_str]
        input_seq = torch.tensor(chars).unsqueeze(0)
        gender_tensor = torch.tensor([gender])  # 0 for male, 1 for female

        output_name = start_str
        last_char = start_str[-1]  # Track the last character to avoid repeating it

        for _ in range(max_length - len(start_str)):
            output = model(input_seq, gender_tensor)
            probabilities = torch.softmax(output[0, -1], dim=0)

            # Apply temperature scaling
            logits = output[0, -1] / temperature
            probabilities = torch.softmax(logits, dim=0)

            next_char_idx = torch.multinomial(probabilities, 1).item()
            next_char = dataset.int_to_char[next_char_idx]

            while next_char == last_char:
                next_char_idx = torch.multinomial(probabilities, 1).item()
                next_char = dataset.int_to_char[next_char_idx]

            if next_char == ' ':
                break

            output_name += next_char
            input_seq = torch.cat([input_seq, torch.tensor([[next_char_idx]])], dim=1)

        return output_name.capitalize()

In [12]:
dataset = NameDataset('Vyru_vardai.txt', 'Moteru_vardai.txt')
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=pad_collate)

model = GenderAwareTransformer(
    vocab_size=dataset.vocab_size,
    embed_size=128,
    num_heads=8,
    forward_expansion=4
)
train_model(model, dataloader)



Epoch 1, Average Loss: 1.4598489157696988
Epoch 2, Average Loss: 1.295778157863211
Epoch 3, Average Loss: 1.2731711760480353
Epoch 4, Average Loss: 1.2648672710073756
Epoch 5, Average Loss: 1.2537640515794146
Epoch 6, Average Loss: 1.2511995980080137
Epoch 7, Average Loss: 1.2465078825646259
Epoch 8, Average Loss: 1.2448389329808824
Epoch 9, Average Loss: 1.2424808215587697
Epoch 10, Average Loss: 1.23708210549456


In [13]:
import random

print("Vyrų vardai:")
for _ in range(10):
    random_start_letter = random.choice(dataset.chars)
    name = sample(model, dataset, gender=0, start_str='R', temperature=0.5)
    print(name)

print("\nMoterų vardai:")
for _ in range(10):
    random_start_letter = random.choice(dataset.chars)
    name = sample(model, dataset, gender=1, start_str=random_start_letter, temperature=0.5)
    print(name)

Vyrų vardai:
Rìnijus
Rãgas
Rìmas
Rãmas
Rìstas
Rãnas
Rámas
Raũtas
Rãtas
Rìlijus

Moterų vardai:
Ęertrà
Gamà
Daijà
Ylinà
Jùlintà
Ỹìdrina
Daistė
Ỹilintà
Jórintė
Naulijà


In [17]:
torch.save(model.state_dict(), '/content/name_model.pt')

mappings = {
    'char_to_int': dataset.char_to_int,
    'int_to_char': {str(k): v for k, v in dataset.int_to_char.items()},
    'vocab_size': dataset.vocab_size
}
import json
with open('/content/name_mappings.json', 'w', encoding='utf-8') as f:
    json.dump(mappings, f, ensure_ascii=False, indent=2)

In [18]:
from google.colab import files
files.download('/content/name_model.pt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [19]:
from google.colab import files
files.download('/content/name_mappings.json')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>