In [1]:
# Pin the version so a fresh kernel reproduces this run; %pip (unlike !pip)
# always installs into the environment of the running kernel.
%pip install Unidecode==1.3.8

Collecting Unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Unidecode
Successfully installed Unidecode-1.3.8


In [2]:
import torch
import torch.nn as nn
import string
import random
import sys
import unidecode

In [3]:
# -nc (no-clobber): skip the download when names.txt already exists, so re-running
# this cell does not pile up names.txt.1, names.txt.2, ... next to the original.
!wget -nc https://raw.githubusercontent.com/aladdinpersson/Machine-Learning-Collection/master/ML/Projects/text_generation_babynames/data/names.txt

--2024-04-09 00:32:59--  https://raw.githubusercontent.com/aladdinpersson/Machine-Learning-Collection/master/ML/Projects/text_generation_babynames/data/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7303572 (7.0M) [text/plain]
Saving to: 'names.txt'


2024-04-09 00:32:59 (270 MB/s) - 'names.txt' saved [7303572/7303572]



In [4]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [5]:
# Vocabulary of the model: every character in string.printable.
# A character's position in this string is its integer class id.
n_characters = len(string.printable)
all_characters = string.printable

In [6]:
# Show the vocabulary. It ends with whitespace characters, which is why the
# printed output trails off into blank space.
print(all_characters)

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	



In [7]:
# Vocabulary size; this is both the embedding input size and the number of output classes.
print(n_characters)

100


In [8]:
# Load the whole corpus into memory and transliterate any non-ASCII characters to
# ASCII, so that every character can be looked up in `all_characters`.
# The `with` block closes the handle deterministically, and the explicit encoding
# keeps the read independent of the platform's default locale.
# NOTE(review): the absolute path is presumably the Kaggle working directory where
# the wget cell saved names.txt - adjust it when running elsewhere.
with open('/kaggle/working/names.txt', encoding='utf-8') as names_file:
    file = unidecode.unidecode(names_file.read())

In [9]:
class RNN(nn.Module):
    """Character-level LSTM model that is advanced one time step per call.

    Pipeline: embedding -> stacked LSTM -> linear projection to per-class logits.

    Args:
        input_size: Number of distinct input tokens (rows of the embedding table).
        hidden_size: Width of both the embedding vectors and the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of logits produced per step.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embed = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        """Run a single time step.

        Args:
            x: 1-D LongTensor of token ids, one per batch element (the callers
                in this notebook pass shape (batch,)).
            hidden: LSTM hidden state, shape (num_layers, batch, hidden_size).
            cell: LSTM cell state, same shape as `hidden`.

        Returns:
            (logits, (hidden, cell)) where logits has shape (batch, output_size)
            and the state tuple is the updated LSTM state.
        """
        out = self.embed(x)
        # The LSTM is batch_first, so insert a length-1 sequence axis: (batch, 1, hidden).
        out, (hidden, cell) = self.lstm(out.unsqueeze(1), (hidden, cell))
        # Drop the sequence axis again before projecting to logits.
        out = self.fc(out.reshape(out.shape[0], -1))
        return out, (hidden, cell)

    def init_hidden(self, batch_size):
        """Return zero-initialised (hidden, cell) states for `batch_size` sequences.

        The states are created directly on the device (and with the dtype) of the
        model's own parameters, so they always match the model regardless of any
        module-level `device` variable and without a CPU -> device copy.
        """
        reference = self.embed.weight
        shape = (self.num_layers, batch_size, self.hidden_size)
        hidden = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        cell = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        return hidden, cell
    
class Generator():
    """Trains a character-level RNN on the module-level `file` text and samples from it.

    Relies on module-level names defined in earlier cells: `file` (the corpus
    string), `all_characters` / `n_characters` (the vocabulary), `device`, and
    the `RNN` model class.

    Args:
        chunk_len: Number of characters per training sequence.
        num_epochs: Number of optimisation steps (one random batch each).
        batch_size: Number of sequences per batch.
        print_every: Report the loss and a generated sample every this many steps.
        hidden_size: Embedding / LSTM hidden width passed to `RNN`.
        num_layers: Number of LSTM layers passed to `RNN`.
        lr: Learning rate for Adam.
    """

    def __init__(self, chunk_len=250, num_epochs=5000, batch_size=128,
                 print_every=50, hidden_size=256, num_layers=2, lr=0.003):
        self.chunk_len = chunk_len
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.print_every = print_every
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lr = lr

    def char_tensor(self, string):
        """Encode `string` as a 1-D LongTensor of indices into `all_characters`.

        Raises:
            ValueError: If a character is not part of `all_characters`.
        """
        return torch.tensor([all_characters.index(ch) for ch in string], dtype=torch.long)

    def get_random_batch(self):
        """Sample `batch_size` random windows from `file`.

        Returns:
            (text_input, text_target): two LongTensors of shape
            (batch_size, chunk_len); the target is the input shifted one
            character to the right (next-character prediction).

        Raises:
            ValueError: If `file` holds fewer than `chunk_len + 1` characters.
        """
        # Every sample needs chunk_len inputs plus one extra character for the last target.
        window = self.chunk_len + 1
        if len(file) < window:
            raise ValueError(
                f'corpus has {len(file)} characters; need at least {window}'
            )
        # random.randint is inclusive on both ends, so the last valid start is
        # len(file) - window; anything later would yield a short slice.
        last_start = len(file) - window

        text_input = torch.zeros(self.batch_size, self.chunk_len, dtype=torch.long)
        text_target = torch.zeros(self.batch_size, self.chunk_len, dtype=torch.long)

        for i in range(self.batch_size):
            start_idx = random.randint(0, last_start)
            encoded = self.char_tensor(file[start_idx:start_idx + window])
            text_input[i] = encoded[:-1]
            text_target[i] = encoded[1:]

        return text_input, text_target

    def generate(self, initial_str='I', prediction_len=100, temperature=0.85):
        """Sample `prediction_len` characters from the trained model.

        Args:
            initial_str: Non-empty seed text; all but its last character prime the
                hidden state, and the last character is the first model input.
            prediction_len: Number of characters to sample.
            temperature: Divides the logits before sampling; lower values make
                the output more conservative.

        Returns:
            `initial_str` followed by the sampled characters.
        """
        # Keep every input on the same device as the model's weights.
        model_device = next(self.rnn.parameters()).device
        hidden, cell = self.rnn.init_hidden(batch_size=1)
        initial_input = self.char_tensor(initial_str).to(model_device)
        predicted = initial_str

        # Sampling needs no gradients; skipping autograd saves memory and time.
        with torch.no_grad():
            for p in range(len(initial_str) - 1):
                _, (hidden, cell) = self.rnn(initial_input[p].view(1), hidden, cell)

            last_char = initial_input[-1]
            for _ in range(prediction_len):
                output, (hidden, cell) = self.rnn(last_char.view(1), hidden, cell)
                # Softmax yields the same distribution as a raw exp() of the scaled
                # logits but cannot overflow for large logits.
                output_dist = torch.softmax(output.view(-1) / temperature, dim=0)
                top_char = torch.multinomial(output_dist, 1).item()
                predicted_char = all_characters[top_char]
                predicted += predicted_char
                last_char = self.char_tensor(predicted_char).to(model_device)

        return predicted

    def train(self):
        """Build a fresh `RNN` (stored as `self.rnn`) and train it on random batches.

        Each "epoch" is a single optimisation step on one random batch. Every
        `print_every` steps the step number, the mean per-character loss, and a
        generated sample are printed.
        """
        # RNN arguments: input_size, hidden_size, num_layers, output_size
        self.rnn = RNN(n_characters, self.hidden_size, self.num_layers, n_characters).to(device)
        optimizer = torch.optim.Adam(self.rnn.parameters(), lr=self.lr)
        criterion = nn.CrossEntropyLoss()
        print('=== Starting training === ')
        for epoch in range(1, self.num_epochs + 1):
            inp, target = self.get_random_batch()
            inp = inp.to(device)
            target = target.to(device)
            hidden, cell = self.rnn.init_hidden(batch_size=self.batch_size)

            self.rnn.zero_grad()
            loss = 0
            # Feed the batch one character position at a time, summing the step losses.
            for c in range(self.chunk_len):
                output, (hidden, cell) = self.rnn(inp[:, c], hidden, cell)
                loss += criterion(output, target[:, c])

            loss.backward()
            optimizer.step()
            avg_loss = loss.item() / self.chunk_len

            if epoch % self.print_every == 0:
                print(epoch)
                print(f'loss: {avg_loss}')
                print(self.generate())

In [10]:
# Build a generator with its default hyperparameters and train it; progress
# (step number, mean loss, and a sampled block of names) is printed periodically.
gennames = Generator()
gennames.train()

=== Starting training === 
50
loss: 2.190916259765625
I*cig
Saryn
Soann
Sotrama
Shaniels
Seath
Rilenn
Aliid
Delline
webla
Davedin
Corlie
Kacia
Mima
Kelans

100
loss: 1.929939208984375
Irid
Tomi
Jaane
Dean
Wyne
Frannald
Cherie
Benmany
Daisal
Alga
Mailey
Emere
Jake
Stangela
Karick
Anna

150
loss: 1.7415177001953126
I
Perry
Willa
Amith
Esris
Nichanda
Beth
Hemeren
Sanna
Cadelyn
Husto
Robby
Elizah
Keline
Coley
Loure
A
200
loss: 1.5782607421875
IRonda
Lucie
Julissa
Cadery
Aodor
Melissac
Carles
Alessellie
Ayanne
Shayla
Kyley
Gerald
Erye
Natalya

250
loss: 1.470765380859375
Is
Kelly
Kennany
Karian
Carlece
Leslie
Arsus
Karene
Lil
Dawn
Ellion
Joelynn
Kassie
Kyla
Kalyssen
Jay

300
loss: 1.3069591064453125
Ifade
Kate
Leigh
Bennie
Rochary
Kimberly
Martice
Mariana
Randa
Rosalinne
Mariana
Madison
Mady
Rodney

350
loss: 1.2359420166015624
Ilva
herece
Raj
Zenna
Tia
Tina
Bonnie
Calim
Cory
Claile
Charleen
Dariell
Eliza
Eleanor
Elisabeth
Ila

400
loss: 1.217677490234375
Ilberto
Earneste
Ernestine
Hanna
Je