<a href="https://colab.research.google.com/github/Gladiator07/Natural-Language-Processing/blob/main/LSTM/text_generator_charachter_level_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Generator with Character Level LSTM

Text generation using a character-level LSTM. Specifically, we want to
generate new names as inspiration for those having a baby :)
Although this notebook is about name generation, the code is general:
you can feed in any large text file (Shakespeare's works, etc.)
and it will generate new text in the same style.

In [2]:
import torch
import torch.nn as nn
import string
import random
import sys
from torch.utils.tensorboard import SummaryWriter

In [3]:
# getting data
!pip install Unidecode
!wget https://raw.githubusercontent.com/aladdinpersson/Machine-Learning-Collection/master/ML/Projects/text_generation_babynames/data/names.txt

Collecting Unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/9e/25/723487ca2a52ebcee88a34d7d1f5a4b80b793f179ee0f62d5371938dfa01/Unidecode-1.2.0-py2.py3-none-any.whl (241kB)
[K     |█▍                              | 10kB 20.6MB/s eta 0:00:01[K     |██▊                             | 20kB 14.3MB/s eta 0:00:01[K     |████                            | 30kB 12.9MB/s eta 0:00:01[K     |█████▍                          | 40kB 12.0MB/s eta 0:00:01[K     |██████▉                         | 51kB 9.3MB/s eta 0:00:01[K     |████████▏                       | 61kB 8.3MB/s eta 0:00:01[K     |█████████▌                      | 71kB 9.2MB/s eta 0:00:01[K     |██████████▉                     | 81kB 10.2MB/s eta 0:00:01[K     |████████████▏                   | 92kB 9.8MB/s eta 0:00:01[K     |█████████████▋                  | 102kB 9.0MB/s eta 0:00:01[K     |███████████████                 | 112kB 9.0MB/s eta 0:00:01[K     |████████████████▎               | 122kB 9.

In [13]:
import unidecode

# Load the whole training corpus into memory as one string.
# The context manager closes the file handle deterministically (the bare
# open(...).read() left it to the garbage collector), and the explicit
# encoding avoids depending on the platform's default locale.
# unidecode transliterates non-ASCII characters to ASCII -- presumably so that
# every character can be looked up in string.printable later on.
with open("names.txt", encoding="utf-8") as names_file:
    file = unidecode.unidecode(names_file.read())

In [16]:
# Choose where tensors and the model live: the GPU when CUDA is usable,
# otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [17]:
# The vocabulary is every character in string.printable; a character's
# position in this string is used as its integer id.
all_characters = string.printable
n_characters = len(string.printable)
print(all_characters)

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	



In [18]:
class RNN(nn.Module):
    """Embedding -> stacked LSTM -> linear layer, evaluated one time step at a time.

    Args:
        input_size: Number of distinct input symbols (size of the embedding table).
        hidden_size: Width of the embedding vectors and of every LSTM layer.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of output logits produced per step.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embed = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        """Advance the network by one time step.

        Args:
            x: Integer tensor of symbol ids, one per batch element.
            hidden: LSTM hidden state, as returned by ``init_hidden`` or a previous call.
            cell: LSTM cell state, as returned by ``init_hidden`` or a previous call.

        Returns:
            ``(logits, (hidden, cell))`` where ``logits`` has one row per batch
            element and the states are the updated LSTM states.
        """
        out = self.embed(x)
        # The LSTM is batch_first, so insert a length-1 sequence axis after the batch axis.
        out, (hidden, cell) = self.lstm(out.unsqueeze(1), (hidden, cell))
        # Collapse the length-1 sequence axis again before the output projection.
        out = self.fc(out.reshape(out.shape[0], -1))
        return out, (hidden, cell)

    def init_hidden(self, batch_size):
        """Return zeroed ``(hidden, cell)`` states for ``batch_size`` sequences.

        The states are created on the same device and with the same dtype as
        the model's own weights, so the method does not depend on a global
        ``device`` variable and stays correct if the model is moved or cast.
        """
        reference = self.fc.weight
        shape = (self.num_layers, batch_size, self.hidden_size)
        hidden = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        cell = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        return hidden, cell

In [23]:
class Generator():
    """Trains a character-level LSTM on the global ``file`` text and samples from it.

    The class relies on names defined at notebook level: ``file`` (the training
    text), ``all_characters`` / ``n_characters`` (the vocabulary), ``device``,
    ``RNN`` and ``SummaryWriter``.
    """

    def __init__(self):
        self.chunk_len = 250     # characters per training sequence
        self.num_epochs = 5000   # training steps; each step uses one random batch
        self.batch_size = 1      # sequences per batch
        self.print_every = 50    # print loss and a generated sample every N steps
        self.hidden_size = 256
        self.num_layers = 2
        self.lr = 0.003

    def char_tensor(self, string):
        """Encode ``string`` as a 1-D long tensor of indices into ``all_characters``.

        Raises:
            ValueError: If a character is not contained in ``all_characters``.
        """
        return torch.tensor(
            [all_characters.index(ch) for ch in string], dtype=torch.long
        )

    def get_random_batch(self):
        """Sample ``batch_size`` random windows from ``file``.

        Returns:
            ``(text_input, text_target)``: two long tensors of shape
            ``(batch_size, chunk_len)``; the target is the input shifted one
            character to the right.

        Raises:
            ValueError: If ``file`` has fewer than ``chunk_len + 1`` characters.
        """
        # Each window needs chunk_len + 1 characters (inputs plus the final
        # target). random.randint includes both endpoints, so the largest
        # valid start index is len(file) - chunk_len - 1.
        max_start = len(file) - self.chunk_len - 1
        if max_start < 0:
            raise ValueError(
                f"training text has {len(file)} characters, "
                f"need at least {self.chunk_len + 1}"
            )

        text_input = torch.zeros(self.batch_size, self.chunk_len, dtype=torch.long)
        text_target = torch.zeros(self.batch_size, self.chunk_len, dtype=torch.long)

        for i in range(self.batch_size):
            # Draw a fresh window per row so a batch is not one chunk repeated.
            start_idx = random.randint(0, max_start)
            text_str = file[start_idx:start_idx + self.chunk_len + 1]
            text_input[i] = self.char_tensor(text_str[:-1])
            text_target[i] = self.char_tensor(text_str[1:])

        return text_input, text_target

    def generate(self, initial_str='A', predict_len=100, temperature=0.85):
        """Sample ``predict_len`` characters from the trained model.

        Args:
            initial_str: Non-empty prompt; it is fed through the network first
                and is included at the start of the returned string.
            predict_len: Number of characters to sample after the prompt.
            temperature: Divisor applied to the logits before exponentiation;
                lower values make sampling more conservative.

        Returns:
            ``initial_str`` followed by the sampled characters.
        """
        initial_input = self.char_tensor(initial_str).to(device)
        predicted = initial_str

        # Sampling needs no gradients; without no_grad the recurrent state
        # would keep an ever-growing autograd graph alive.
        with torch.no_grad():
            # Characters are fed one at a time, so the state is for a batch of
            # exactly one sequence regardless of the training batch size.
            hidden, cell = self.rnn.init_hidden(batch_size=1)

            # Prime the state with every prompt character except the last, so
            # the whole prompt (not only its final character) conditions the output.
            for p in range(len(initial_str) - 1):
                _, (hidden, cell) = self.rnn(initial_input[p].view(1), hidden, cell)

            last_char = initial_input[-1]

            for _ in range(predict_len):
                output, (hidden, cell) = self.rnn(last_char.view(1), hidden, cell)
                # exp(logit / temperature) gives unnormalised sampling weights;
                # torch.multinomial does not require them to sum to one.
                output_dist = output.view(-1).div(temperature).exp()
                top_char = int(torch.multinomial(output_dist, 1)[0])
                predicted_char = all_characters[top_char]
                predicted += predicted_char
                last_char = self.char_tensor(predicted_char).to(device)

        return predicted

    def train(self):
        """Create a fresh ``RNN`` in ``self.rnn`` and train it for ``num_epochs`` steps.

        Each step draws one random batch, accumulates the cross-entropy loss
        over all ``chunk_len`` positions and applies one Adam update. The
        per-character loss is written to TensorBoard every step, and printed
        together with a generated sample every ``print_every`` steps.
        """
        self.rnn = RNN(n_characters, self.hidden_size, self.num_layers, n_characters).to(device)

        optimizer = torch.optim.Adam(self.rnn.parameters(), lr=self.lr)
        criterion = nn.CrossEntropyLoss()
        writer = SummaryWriter('runs/names0')
        print("=> Starting training")

        try:
            for epoch in range(1, self.num_epochs + 1):
                inp, target = self.get_random_batch()
                inp = inp.to(device)
                target = target.to(device)
                hidden, cell = self.rnn.init_hidden(batch_size=self.batch_size)

                optimizer.zero_grad()
                loss = 0
                # Step through the sequence one character at a time, carrying the state.
                for c in range(self.chunk_len):
                    output, (hidden, cell) = self.rnn(inp[:, c], hidden, cell)
                    loss += criterion(output, target[:, c])

                loss.backward()
                optimizer.step()
                # Report the average loss per character position.
                avg_loss = loss.item() / self.chunk_len

                if epoch % self.print_every == 0:
                    print(f'Loss: {avg_loss}')
                    print(self.generate())

                writer.add_scalar('Training loss', avg_loss, global_step=epoch)
        finally:
            # Flush pending events and release the log file even if training is interrupted.
            writer.close()

In [24]:
# Build a Generator with its default hyperparameters and train it. Every
# `print_every` steps train() prints the current loss followed by a sample
# produced by generate().
gennames = Generator()
gennames.train()

=> Starting training
Loss: 2.351725830078125
A
Nadie
Tiari
Malosa
Barnwab
Kasierl
Binna
Janhe
Marrlta
Jnelcucia
Cargiet
Riasse
Arihan
6silag
r
Rri
Loss: 2.298428466796875
Amine
Jesy
Jannen
Jalerisn
Barlin
Phutbel
Maria
TKiery
Ria
Shetint
Lened
Ericw
Janna
Koxonn
Cani
Donn
Loss: 2.26296533203125
An
Kelley
Panelre
Chinna
Har
Tarie
Hanalio
Neiesta
Alficie
Cryle
Alereph
Asher
Hana
Marina
Lintia
Sha
Loss: 1.875780517578125
And
Daytha
Wonta
Ctarlino
Mathanda
Carista
Alshela
Jarredra
Carlie
Parla
Anamina
Dellina
Tamin
Alymie
Loss: 2.501237548828125
Ane
Rwerotth
cuutho
Allynna
Duyda
Vennie
Vondy
Maith
Mellice
Calyn
Sustee
Frinna
Via
AmeR
Arvelet
Der
Loss: 2.13074755859375
Aka
Leora
Richas
Loniqa
Chlpy
Noust
Anmia
Serby
Jackre
Junton
Letha
Rametta
Hari
Mynald
Onda
Nora
Mat
Loss: 2.122806884765625
Adina
Torandra
Josania
Kanie
Elia
Ka
Carana
Rizaleen
Todred
Jacquel
Meleen
Jace
Irieone
Paudaria
Keul
Loss: 1.793730224609375
Arla
Aldobe
Trevella
Alysan
Josele
Kayla
Vally
Kathelle
Rickella
Nicia
Raya