In [30]:
import torch 
import torch.nn as nn
from torch.utils.data import DataLoader
from collections import defaultdict

import pandas as pd


## Data

In [12]:
df = pd.read_csv('pets_ru_en.csv')
df.loc[:, 'имя'] = df['имя'].apply(lambda x: x.lower())
df.head()

Unnamed: 0,имя,язык
0,acapella,en
1,achilles,en
2,adriana,en
3,alpha,en
4,alyssum,en


In [13]:
df.tail()

Unnamed: 0,имя,язык
2908,ярика,rus
2909,яриска,rus
2910,ярка,rus
2911,яркиса,rus
2912,ярыся,rus


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2913 entries, 0 to 2912
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   имя     2913 non-null   object
 1   язык    2913 non-null   object
dtypes: object(2)
memory usage: 45.6+ KB


In [15]:
df.describe()

Unnamed: 0,имя,язык
count,2913,2913
unique,2894,2
top,abee,rus
freq,4,1783


In [17]:
df[df.duplicated()]

Unnamed: 0,имя,язык
155,eyes,en
704,cats,en
771,abee,en
772,cat,en
794,grey,en
814,photoxpress,en
843,abee,en
870,photoxpress,en
912,abee,en
913,names,en


In [18]:
dfc = df.drop_duplicates()
dfc[dfc.duplicated()]

Unnamed: 0,имя,язык


In [10]:
stoi = defaultdict()
stoi['<PAD>'] = 0
stoi['<SOS>'] = 1
stoi['<EOS>'] = 2
stoi['rus'] = 3
stoi['en'] = 4

let = []
for el in df['имя'].values:
    for l in set(el):
        let.append(l)
let = set(let)
for i, el in enumerate(let):
    stoi[el] = i + 5
itos = {stoi[el]:el for el in stoi.keys()}

In [11]:
len(stoi), len(itos)

(65, 65)

In [32]:
class PetsGetDataset(nn.Module):
    def __init__(self, data, stoi, itos, pad_val=0, pad_size=13):
        super(PetsGetDataset, self).__init__()
        self.data = data
        self.stoi = stoi
        self.itos = itos
        self.pad_val = pad_val
        self.pad_size = pad_size
        self.x = []
        self.y = []
        
        for el, lang in data.values:
            cx = self._prep(el, lang)
            cy = cx[1:-1] + [2] + [0]
            self.x.append(torch.tensor(cx))
            self.y.append(torch.tensor(cy))
            
        
    
    def _prep(self, s, lang):
        words = [self.stoi.get(l, 0) for l in s]
        pad_total =  self.pad_size - len(words) - 2
    
        if pad_total >= 0:
            return [self.stoi[lang]] + [1] + words + [2] + [self.pad_val]*pad_total
            
        else:
            return [self.stoi[lang]]+ [1] + words[:pad_total] + [2]
    
        
        
    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]
        
    def __len__(self):
        return len(self.data)

In [25]:
def sample_with_temperature(logits, temperature=1.0):
    logits = logits / temperature
    probs = torch.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1).item()

In [50]:
def generate_pet_name(model, stoi, itos, lang='rus', max_len=10, temperature=1.0):
    model.eval()
    device = next(model.parameters()).device
    generated = [stoi[lang], stoi['<SOS>']]
    h_t = None
    
    for _ in range(max_len):
        x_t = torch.tensor([generated], dtype=torch.long).to(device)
    
        logits = model(x_t)  
        next_logits = logits[0, -1]
        next_token = sample_with_temperature(next_logits, temperature)
    
        if next_token == stoi['<EOS>']:
            break
        generated.append(next_token)
    
    name = ''.join(itos.get(i, 0) for i in generated if i not in {stoi['<SOS>'], stoi['<EOS>'], stoi['<PAD>'], stoi['rus'], stoi['en']})
    return name.capitalize()


In [41]:
def train(model, crit, dl, optim, sheduler=None):
    model.train()
    bl = []
    for i, (x, y) in enumerate(dl):
        ypr = model(x.to('cuda'))
        y_pred_flat = ypr.reshape(-1, ypr.size(-1))  
        y_flat = y.to('cuda').flatten() 
        loss = crit(y_pred_flat, y_flat)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optim.step()
        
        if sheduler:
            sheduler.step()

        
        optim.zero_grad()
        bl.append(loss.item())
    return torch.tensor(bl).mean()

In [42]:
class Genmodel(nn.Module):
    def __init__(self):
        super(Genmodel, self).__init__()
        self.embl = nn.Embedding(len(stoi)+1, 128)
        self.rnn = nn.GRU(128, 64, num_layers=3, batch_first=True)
        self.norm = nn.LayerNorm(64)
        self.l = nn.Linear(64, len(stoi)+1)
    
    def forward(self, x):
        self.embx = self.embl(x)
        out, h = self.rnn(self.embx)
        return self.l(self.norm(out)) 

In [47]:
ds = PetsGetDataset(df, stoi, itos)
model = Genmodel().to('cuda')
opt = torch.optim.AdamW(model.parameters(), lr=1e-3)
epoch = 101
scheduler = torch.optim.lr_scheduler.OneCycleLR(opt, max_lr = 1e-3, epochs = epoch, steps_per_epoch= len(dl),)

dl = DataLoader(ds, 128, shuffle=True)
crit = nn.CrossEntropyLoss(ignore_index=0)

In [48]:
ll = []
for i in range(epoch):
    l = train(model, crit, dl, opt, scheduler)
    
    if i % 20 == 0:
        print(f'CROSS ENTROPY LOSS = {l}')
        print()
        print("en names:")
        for _ in range(3):
            print(generate_pet_name(model, stoi, itos, lang='en', max_len=7, temperature=0.9))
        print()
        print("rus names:")
        for _ in range(3):
            print(generate_pet_name(model, stoi, itos, lang='rus', max_len=7, temperature=0.9))
        print()
        print('#'*10, 'NEXT', '#'*10)
        
    ll.append(l)

CROSS ENTROPY LOSS = 4.300046443939209

en names:
-mьvде
Алmcчc
Dsхnд m

rus names:
Цвтяcc
Лsfл
Дhмепн

########## NEXT ##########
CROSS ENTROPY LOSS = 1.7629815340042114

en names:
Dautilo
Tpumen
Lasma

rus names:
Лисши
Мита
Тетта

########## NEXT ##########
CROSS ENTROPY LOSS = 1.4829585552215576

en names:
Khama
Oryshin
Golcy

rus names:
Кайла
Лейо
Мэрли

########## NEXT ##########
CROSS ENTROPY LOSS = 1.2919609546661377

en names:
Hliner
Thamani
Belion

rus names:
Нейзи
Шейна
Жука

########## NEXT ##########
CROSS ENTROPY LOSS = 1.1862159967422485

en names:
Fleet
Whiskey
Shister

rus names:
Белла
Капельк
Астэт

########## NEXT ##########
CROSS ENTROPY LOSS = 1.1616578102111816

en names:
Silver
Orshonп
Godo

rus names:
Юшка
Флемтия
Тарамун

########## NEXT ##########


In [51]:
generate_pet_name(model, stoi, itos, max_len=10, temperature=20)

'Uпаыxpх кt'

In [64]:
generate_pet_name(model, stoi, itos, max_len=10, temperature=10)

'Рtиeинwдл'

In [65]:
generate_pet_name(model, stoi, itos, max_len=10, temperature=0.9)

'Дафна'

In [63]:
generate_pet_name(model, stoi, itos, max_len=20, temperature=1)

'Ямалина'