In [1]:
import pandas as pd
import numpy as np
import random

from tqdm import tqdm


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW

In [2]:
df = pd.read_csv("./characters_metadata.csv")

In [3]:
df.columns

Index(['ID', 'Name', 'Alias', 'Gender', 'Hair Color', 'Love Rank', 'Hate Rank',
       'Eye color', 'Birthday', 'Blood Type', 'Tags', 'Love Count',
       'Hate Count', 'Description', 'url'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0,ID,Name,Alias,Gender,Hair Color,Love Rank,Hate Rank,Eye color,Birthday,Blood Type,Tags,Love Count,Hate Count,Description,url
0,0,L,Ryuzaki,Male,Black,1.0,48.0,Black,"October 31, 1979",Unknown,"Analytical, Barefoot, Detectives, Eye Bags, Sw...",44.829,3.447,"Secretive, meticulous and cunning, L's desire ...",https://www.anime-planet.com/characters/l-deat...
1,1,Haru YOSHIDA,Unknown,Male,Black,346.0,4.172,Black,April 2,Unknown,"High School Students, Hot-Headed, Teenagers",4.669,124.0,Unknown,https://www.anime-planet.com/characters/haru-y...
2,2,Shinobu MAEHARA,Unknown,Female,Blue,2.942,9.11,Unknown,Unknown,Unknown,"Cooks, Crybabies, Middle School Students, Shy",823.0,53.0,Unknown,https://www.anime-planet.com/characters/shinob...
3,3,Chizuru OSHIMA,Unknown,Female,Black,3.877,1.801,Unknown,Unknown,Unknown,"Class Representatives, Glasses, High School St...",633.0,269.0,Unknown,https://www.anime-planet.com/characters/chizur...
4,4,Yuuzan YOSHIDA,Unknown,Male,Black,3.577,2.819,Unknown,Unknown,Unknown,Unknown,684.0,180.0,Unknown,https://www.anime-planet.com/characters/yuuzan...


In [5]:
def get_first_name(name):
    chars = [*name]
    len_chars = len(chars)
    idx = 0
    while idx < len_chars and chars[idx].isalpha() and chars[idx].isascii():
        idx += 1
    
    return "".join(chars[:idx])

In [6]:
female_names = list(set([*map(lambda full_name: get_first_name(full_name).lower(), df[df["Gender"] == "Female"]["Name"])]))
male_names = list(set([*map(lambda full_name: get_first_name(full_name).lower(), df[df["Gender"] == "Male"]["Name"])]))

In [7]:
female_names = [*filter(lambda name: len(name) != 0, female_names)]
male_names = [*filter(lambda name: len(name) != 0, male_names)]

In [8]:
print(f"max female name len: {max(map(lambda name: len(name), female_names))} | max male name len: {max(map(lambda name: len(name), male_names))}")
print(f"total female names: {len(female_names)} | total male names: {len(male_names)}")

max female name len: 20 | max male name len: 18
total female names: 17290 | total male names: 30758


In [9]:


# Special Tokens

END_TOKEN = "<end>"
PAD_TOKEN = "<pad>"

MALE_NAME_TOKEN = "<M>"
FEMALE_NAME_TOKEN = "<F>"

In [10]:

token2idx = {token: idx for idx, token in enumerate([chr(i) for i in range(97,123)] + [MALE_NAME_TOKEN, FEMALE_NAME_TOKEN, END_TOKEN, PAD_TOKEN])}
idx2token = {v:k for k,v in token2idx.items()}

In [11]:
DEVICE      = "cuda:0"
VOCAB_SIZE  = len(token2idx)
MAX_LEN     = 24
EMBED_DIM   = 512
HIDDEN_DIM  = 1024

EPOCHS      = 8

In [12]:

def tokenizer_encode(name, gender, max_len=24):

    if gender == "Male":
        gender_token = MALE_NAME_TOKEN
    elif gender == "Female":
        gender_token = FEMALE_NAME_TOKEN
    else:
        raise RuntimeError("Invalid gender")
    
    name = [gender_token] + [*name[:max_len]]
    name.append(END_TOKEN)
    
    while len(name) < max_len:
        name.append(PAD_TOKEN)
        
    return [token2idx[c] for c in name]

In [13]:

class RNN(nn.Module):
    
    def __init__(self, vocab_size=VOCAB_SIZE, embd_dim=EMBED_DIM, hidden_dim=HIDDEN_DIM):
    
        super().__init__()
        self.W_hh = nn.Linear(hidden_dim, hidden_dim)
        self.W_xh = nn.Linear(embd_dim, hidden_dim)
        self.W_hy = nn.Linear(hidden_dim, vocab_size)
        # self.W_hg = nn.Linear(hidden_dim)
        
        self.h = nn.Parameter(torch.randn(hidden_dim))
        self.embeddings = nn.Embedding(vocab_size, embd_dim)
        
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        
    def forward(self, x, device=DEVICE):

        # h = self.h
        x = self.embeddings(x)
        batch_size, seq_len, embd_dim = x.shape


        output = torch.zeros(batch_size, seq_len - 1, self.vocab_size).to(device)
        hiddens = torch.zeros(batch_size, self.hidden_dim).to(device)
        
        for i in range(batch_size):
            hiddens[i] = self.h
        
        for i in range(seq_len - 1):
            
            hiddens = F.tanh(self.W_hh(hiddens) + self.W_xh(x[:,i] + x[:,0]))
            y = self.W_hy(hiddens)
            output[:,i] = y
        
        return output

In [14]:
model = RNN().to(DEVICE)

RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero.

In [None]:

def criterion(input_tokens, y_pred):
        
    y_true = input_tokens[:, 1:].clone()

    # The first token will be the gender token
    y_true[0] = -100
    y_true.masked_fill_(y_true == token2idx[PAD_TOKEN], -100)
    
    # print(y_pred.shape, y_true.shape)
    loss = F.cross_entropy(y_pred.reshape(-1,VOCAB_SIZE), y_true.reshape(-1))
    return loss

In [15]:
xs = []

for idx, name in tqdm(enumerate(female_names)):
    xs.append(tokenizer_encode(name, "Female"))

print(xs[-1])
for idx, name in tqdm(enumerate(male_names)):
    xs.append(tokenizer_encode(name, "Male"))

print(xs[-1])

17290it [00:00, 406476.75it/s]


[27, 6, 20, 15, 15, 24, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29]


30758it [00:00, 185483.75it/s]

[26, 10, 8, 17, 8, 13, 12, 0, 17, 20, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29]





In [16]:

random.shuffle(xs)
xs = torch.tensor(xs).to(DEVICE)

RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero.

In [18]:
xs[:11]

tensor([[27,  2,  0, 17, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
         29, 29, 29, 29, 29, 29],
        [27,  0, 10,  0, 25, 20, 10,  8, 13, 28, 29, 29, 29, 29, 29, 29, 29, 29,
         29, 29, 29, 29, 29, 29],
        [27, 14,  2,  7,  8, 24,  0, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
         29, 29, 29, 29, 29, 29],
        [26,  1,  4, 11, 11,  8, 18, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
         29, 29, 29, 29, 29, 29],
        [26,  3,  0,  4, 11, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
         29, 29, 29, 29, 29, 29],
        [26,  5, 20, 12,  8, 13, 14, 17,  8, 28, 29, 29, 29, 29, 29, 29, 29, 29,
         29, 29, 29, 29, 29, 29],
        [26, 18,  7,  8, 13,  0,  7, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
         29, 29, 29, 29, 29, 29],
        [27, 17,  4,  1,  4,  2,  2,  0, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29,
         29, 29, 29, 29, 29, 29],
        [26, 14, 15, 19,  8, 12, 20, 18, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29,

In [19]:

BATCH_SIZE = 64
optimizer = AdamW(model.parameters())

In [20]:

def inference(model, input_str, gender):
    
    while True:
        x = torch.tensor(tokenizer_encode(input_str, gender, max_len=len(input_str))).reshape(1,-1).to(DEVICE)
        last_logits = model(x, device=DEVICE).squeeze()[-1]
        new_char_idx = last_logits.softmax(dim=-1).argmax().item()
    
        if new_char_idx == token2idx[END_TOKEN]:
            # print(input_str)
            break
        
        input_str = input_str + idx2token[new_char_idx]
    return input_str

In [25]:

training_loss = []
step_count = 0
inference_per_step = 50

for batch_start_idx in (tbar := tqdm(range(0,len(xs),BATCH_SIZE))):
    
    optimizer.zero_grad()
    
    input_xs = xs[batch_start_idx: batch_start_idx + BATCH_SIZE]
    
    pred_ys = model(input_xs, device=DEVICE)
    loss = criterion(input_xs,pred_ys)
    
    loss.backward()
    optimizer.step()
    
    step_count += 1
    training_loss.append(loss.item())
    
    if step_count % inference_per_step == 0:
        model.eval()
        name = inference(model, "y", "Female")
        print(f"step count: {step_count} | name: {name} | gender: Female")
        name = inference(model, "y", "Male")
        print(f"step count: {step_count} | name: {name} | gender: Male")
        model.train()
    
    
    tbar.set_description(f"loss: {loss.item()} | training loss: {np.mean(training_loss)}")

  0%|                                                                                                                                                                                      | 0/751 [00:00<?, ?it/s]


RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [22]:
model.eval()

RNN(
  (W_hh): Linear(in_features=2048, out_features=2048, bias=True)
  (W_xh): Linear(in_features=512, out_features=2048, bias=True)
  (W_hy): Linear(in_features=2048, out_features=30, bias=True)
  (embeddings): Embedding(30, 512)
)

In [86]:
inference(model, "fus", "Female")

'fusana'