In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
df = pd.read_csv("./characters_metadata.csv")

In [3]:
df.columns

Index(['ID', 'Name', 'Alias', 'Gender', 'Hair Color', 'Love Rank', 'Hate Rank',
       'Eye color', 'Birthday', 'Blood Type', 'Tags', 'Love Count',
       'Hate Count', 'Description', 'url'],
      dtype='object')

In [4]:
def get_name(name):
    chars = [*name]
    len_chars = len(chars)
    idx = 0
    while idx < len_chars and chars[idx].isalpha():
        idx += 1
    
    return "".join(chars[:idx])

In [5]:
get_name("Yui\nyaa")

'Yui'

In [6]:
df["Gender"].unique()

array(['Male', 'Female', 'Unknown'], dtype=object)

In [7]:
data = list(set([*map(lambda full_name: get_name(full_name).lower(), df[df["Gender"] == "Female"]["Name"])]))

In [8]:
max(map(lambda n: len(n), data))

20

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW

In [10]:
END_TOKEN = "<end>"
PAD_TOKEN = "<pad>"

In [11]:
tokens2idx = {token: idx for idx, token in enumerate([chr(i) for i in range(97,123)] + [END_TOKEN, PAD_TOKEN])}
idx2tokens = {v:k for k,v in tokens2idx.items()}

In [12]:
def tokenizer_encode(name, max_len=20):
    
    name = [*name[:max_len]]
    name.append(END_TOKEN)
    
    mask = [1 for _ in range(len(name))]
    
    
    while len(name) < max_len + 1:
        name.append(PAD_TOKEN)
        mask.append(0)
        
    return {"tokens": [tokens2idx[c] for c in name], "mask": mask}
        
    

In [13]:
name = random.choice(data)

name, tokenizer_encode(name)

('palp',
 {'tokens': [15,
   0,
   11,
   15,
   26,
   27,
   27,
   27,
   27,
   27,
   27,
   27,
   27,
   27,
   27,
   27,
   27,
   27,
   27,
   27,
   27],
  'mask': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})

In [14]:
class RNN(nn.Module):
    def __init__(self, vocab_size=28, embd_dim=512, hidden_dim=512):
    
        super().__init__()
        self.W_hh = nn.Linear(hidden_dim, hidden_dim)
        self.W_xh = nn.Linear(embd_dim, hidden_dim)
        self.W_hy = nn.Linear(hidden_dim, vocab_size)
        
        self.h = nn.Parameter(torch.randn(embd_dim)); self.h.retain_grad()
        self.embeddings = nn.Embedding(vocab_size, embd_dim)
        
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        
    def forward(self, x, device="cpu"):

        # h = self.h
        x = self.embeddings(x)
        batch_size, seq_len, embd_dim = x.shape


        output = torch.zeros(batch_size, seq_len - 1, self.vocab_size).to(device)
        hiddens = torch.zeros(batch_size, self.hidden_dim).to(device)
        
        for i in range(batch_size):
            hiddens[i] = self.h
        

        for i in range(seq_len - 1):
            hiddens = F.tanh(self.W_hh(hiddens) + self.W_xh(x[:,i]))
            y = self.W_hy(hiddens)
            output[:,i] = y
        
        return output


In [15]:
DEVICE = "cuda:0"

In [16]:
model = RNN().to(DEVICE)

In [17]:
x = torch.tensor(tokenizer_encode(name)["tokens"]).reshape(1,-1).to(DEVICE)
y = model(x, device=DEVICE)

In [18]:
def criterion(input_tokens, y_pred):
        
    y_true = input_tokens[:, 1:].clone()
    y_true.masked_fill_(y_true == tokens2idx[PAD_TOKEN], -100)
    # print(y_pred.shape, y_true.shape)
    loss = F.cross_entropy(y_pred.reshape(-1,28), y_true.reshape(-1))
    return loss

In [19]:
from tqdm import tqdm

In [20]:
xs = []
masks = []

for idx, name in tqdm(enumerate(data)):
    try:
        tmp = tokenizer_encode(name)
        xs.append(tmp["tokens"])
        masks.append(tmp["mask"])
    except Exception as e:
        print(idx)

17314it [00:00, 304608.46it/s]

1096
1281
2068
2150
2660
3023
3347
3891
3963
4079
4087
4130
4527
4793
5200
5222
5881
5947
6022
6715
6812
6952
7448
7988
8270
8884
9244
9629
10113
10133
10767
11245
12902
13915
14693
15079
15936
16037
16119
16178
16828





In [21]:
xs = torch.tensor(xs).to(DEVICE)
masks = torch.tensor(masks).to(DEVICE)

In [22]:
# Train


BATCH_SIZE = 64
optimizer = AdamW(model.parameters())

In [23]:
def inference(model, input_str):
    
    while True:
        x = torch.tensor(tokenizer_encode(input_str, max_len=len(input_str))["tokens"]).reshape(1,-1).to(DEVICE)
        last_logits = model(x, device=DEVICE).squeeze()[-1]
        new_char_idx = last_logits.softmax(dim=-1).argmax().item()
    
        if new_char_idx == tokens2idx[END_TOKEN]:
            # print(input_str)
            break
        
        input_str = input_str + idx2tokens[new_char_idx]
    return input_str

In [29]:
training_loss = []
step_count = 0
inference_per_step = 50

for batch_start_idx in (tbar := tqdm(range(0,17273,BATCH_SIZE))):
    
    optimizer.zero_grad()
    
    input_xs = xs[batch_start_idx: batch_start_idx + BATCH_SIZE]
    input_masks = masks[batch_start_idx: batch_start_idx + BATCH_SIZE]
    
    pred_ys = model(input_xs, device=DEVICE)
    loss = criterion(input_xs,pred_ys)
    
    loss.backward()
    optimizer.step()
    
    step_count += 1
    training_loss.append(loss.item())
    
    if step_count % inference_per_step == 0:
        model.eval()
        name = inference(model, "yu")
        print(f"step count: {step_count} | name: {name}")
        model.train()
    
    
    tbar.set_description(f"loss: {loss.item()} | training loss: {np.mean(training_loss)}")

loss: 1.9715145826339722 | training loss: 2.004373825365497:  21%|███████████████████████▏                                                                                        | 56/270 [00:01<00:03, 61.27it/s]

step count: 50 | name: yuki


loss: 1.9761524200439453 | training loss: 1.9901961886457034:  41%|█████████████████████████████████████████████▋                                                                | 112/270 [00:01<00:02, 61.50it/s]

step count: 100 | name: yurin


loss: 2.06046986579895 | training loss: 1.989960026593856:  60%|███████████████████████████████████████████████████████████████████▍                                             | 161/270 [00:02<00:01, 61.14it/s]

step count: 150 | name: yuuka


loss: 2.1162972450256348 | training loss: 1.9905264237017002:  78%|█████████████████████████████████████████████████████████████████████████████████████▌                        | 210/270 [00:03<00:00, 63.29it/s]

step count: 200 | name: yung


loss: 2.124035596847534 | training loss: 1.9857862685375287:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍    | 259/270 [00:04<00:00, 61.36it/s]

step count: 250 | name: yuuka


loss: 1.97346830368042 | training loss: 1.984521743544826: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 270/270 [00:04<00:00, 62.05it/s]


In [36]:
model.eval()

RNN(
  (W_hh): Linear(in_features=512, out_features=512, bias=True)
  (W_xh): Linear(in_features=512, out_features=512, bias=True)
  (W_hy): Linear(in_features=512, out_features=28, bias=True)
  (embeddings): Embedding(28, 512)
)

In [52]:
inference(model, "")

'ichina'

In [41]:
len(data)

17314