In [8]:
import torch 
from torch import nn, optim 
from torch.nn import functional as F 
from torch.utils.data import Dataset, DataLoader
from torch.distributions.categorical import Categorical 
from torchmetrics import Accuracy 
import os 
import numpy as np 

In [9]:
# Load the whole corpus into memory as a single string. The encoding is
# passed explicitly so decoding does not depend on the platform's default
# locale.
# NOTE(review): assumes the corpus file is stored as UTF-8 — confirm.
with open('../data/heroes_of_our_times.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [10]:
# Vocabulary: every distinct character of the corpus, in sorted order.
# int2char maps an integer id to its character (array position -> char);
# char2int is the inverse lookup (char -> array position).
int2char = np.array(sorted(set(text)))
char2int = dict(zip(int2char, range(len(int2char))))

In [17]:
# Encode the corpus as a 1-D tensor holding the integer id of each character.
text_encoded = torch.tensor(
    np.array(list(map(char2int.__getitem__, text)))
)

In [19]:
# Each chunk holds seq_len + 1 characters: one more than the sequence length,
# so that an input and its one-step-shifted target can both be sliced from
# the same chunk.
seq_len = 80
chunk_len = seq_len + 1

# Slide a window of chunk_len characters over the corpus with stride 1.
# The last valid start index is len(text_encoded) - chunk_len, so the range
# bound needs the +1; without it the final full-length window is dropped.
chunks = [
    text_encoded[start:start + chunk_len]
    for start in range(len(text_encoded) - chunk_len + 1)
]

In [23]:
class TextDataset(Dataset):
    """Dataset over a collection of chunks that yields (input, target) pairs.

    Every chunk is split into two views shifted by one position: the input
    is the chunk without its last element, the target is the chunk without
    its first element.
    """

    def __init__(self, chunks):
        """Keep a reference to the indexable collection of chunks."""
        self.chunks = chunks

    def __len__(self):
        """Return the number of available chunks."""
        return len(self.chunks)

    def __getitem__(self, index):
        """Return the (input, target) pair built from the chunk at ``index``."""
        sample = self.chunks[index]
        inputs = sample[:-1]
        targets = sample[1:]
        return inputs, targets

In [24]:
# Wrap the list of chunks so it can be consumed through the Dataset interface.
dataset = TextDataset(chunks=chunks)

In [26]:
batch_size = 64

# Training loader: reshuffles the samples each epoch and drops the final
# incomplete batch so every batch has exactly batch_size samples.
# os.cpu_count() returns None when the CPU count cannot be determined, and
# DataLoader requires an integer num_workers, so fall back to 0 (loading in
# the main process) in that case.
train_dl = DataLoader(
    dataset,
    batch_size,
    shuffle=True,
    num_workers=os.cpu_count() or 0,
    pin_memory=True,
    drop_last=True,
)

In [27]:
# Inspect the shape of the id -> character lookup array, i.e. how many
# distinct characters the corpus contains.
int2char.shape

(127,)

In [28]:
class CharGenModel(nn.Module):
    """Character-level LSTM model that is advanced one time step per call.

    Pipeline: embedding -> stacked LSTM -> two-layer classifier that emits
    one score per vocabulary entry.
    """

    def __init__(self, vocab_size, emb_dim, rnn_hidden_size, lin_hidden_size, num_layers, dropout):
        """Build the embedding, LSTM and classifier layers.

        Args:
            vocab_size: number of distinct tokens; sets the number of
                embedding rows and the number of output scores.
            emb_dim: size of each embedding vector (the LSTM input size).
            rnn_hidden_size: size of the LSTM hidden and cell states.
            lin_hidden_size: width of the classifier's hidden linear layer.
            num_layers: number of stacked LSTM layers.
            dropout: dropout probability used between LSTM layers and inside
                the classifier.
        """
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim
        )

        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=rnn_hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # nn.LSTM applies dropout only between stacked layers and emits a
            # warning when it is non-zero for a single layer, so disable it
            # in that case.
            dropout=dropout if num_layers > 1 else 0.0
        )

        self.classifier = nn.Sequential(
            nn.Linear(rnn_hidden_size, lin_hidden_size),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(lin_hidden_size, vocab_size)
        )

        # Kept so init_hidden_cell can build correctly shaped states.
        self.num_layers = num_layers
        self.rnn_hidden_size = rnn_hidden_size

    def forward(self, inputs, hidden, cell):
        """Run one time step.

        Args:
            inputs: 1-D tensor of token indices, one per batch element.
            hidden: LSTM hidden state, as returned by ``init_hidden_cell`` or
                by a previous call.
            cell: LSTM cell state, same convention as ``hidden``.

        Returns:
            ``(pred, (hidden, cell))`` where ``pred`` has one row per batch
            element holding the classifier scores, and ``(hidden, cell)`` is
            the updated LSTM state.
        """
        # Embed the indices and add a length-1 sequence axis, since the LSTM
        # is batch_first and expects (batch, seq, feature).
        embedded = self.embedding(inputs).unsqueeze(1)
        rnn_out, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        logits = self.classifier(rnn_out)
        # Flatten everything after the batch axis into a single score axis.
        pred = logits.view(logits.size(0), -1)
        return pred, (hidden, cell)

    def init_hidden_cell(self, batch_size):
        """Return zero-filled ``(hidden, cell)`` states for ``batch_size`` sequences.

        Both tensors have shape ``(num_layers, batch_size, rnn_hidden_size)``
        and live on the same device as the model parameters.
        """
        device = next(self.parameters()).device
        state_shape = (self.num_layers, batch_size, self.rnn_hidden_size)
        # Allocate directly on the target device instead of creating the
        # tensors on the CPU and copying them over.
        hidden = torch.zeros(state_shape, device=device)
        cell = torch.zeros(state_shape, device=device)
        return hidden, cell
        

In [None]:
# Hyperparameters; the names mirror the CharGenModel constructor arguments.
emb_dim = 256
rnn_hidden_size = 512
lin_hidden_size = 256
num_layers = 2
dropout_p = 0.5

# torch.device() cannot be called without arguments (it raises TypeError),
# so select the GPU when CUDA is available and fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')