In [1]:
import text_data
import wikitext_data
from CustomLSTM import CustomLSTM
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm

# Data preprocessing and model compiling

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
corpus = wikitext_data.Corpus(device)

C:\Users\kajud\Documents\GitHub\Onlab1\LSTM\.data\wikitext-2-v1.zip: 100%|█████████| 4.48M/4.48M [00:08<00:00, 518kB/s]


In [4]:
n_tokens = len(corpus.vocab.stoi)
input_sz = 200
hidden_sz = 128
seq_length = 40
epochs = 1

In [5]:
model = nn.Sequential(
    nn.Embedding(n_tokens, input_sz),
    CustomLSTM(input_sz = input_sz, hidden_sz = hidden_sz, return_states = False, return_sequences = False),
    nn.Linear(hidden_sz, n_tokens)).float().to(device)

In [6]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

In [7]:
def train(inputs, targets):
        """
        Train 1 time
        :param inputs: Tensor[batch, timestep, channels]
        :param targets: Torch tensor [batch, timestep, channels]
        :return: float loss
        """
        logits = model(inputs)

        loss = criterion(logits.view(-1, n_tokens),
                         targets.long().view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        return loss.item()

In [8]:
train_data = wikitext_data.TextDataset(corpus.train, in_out_overlap = False, input_size = seq_length, seq_len=seq_length + 1, stride = 1)

In [10]:
train_loader = torch.utils.data.DataLoader(train_data, batch_size = 256, shuffle = False)

# Training

In [11]:
model.train()
for e in range(epochs):
    for b in tqdm(train_loader):
        inp, out = b
        loss = train(inp, out)
        
    print(f'[{e + 1}/{epochs}] loss: {loss}')

100%|██████████████████████████████████████████████████████████████████████████████| 8008/8008 [01:29<00:00, 89.47it/s]

[1/1] loss: 5.847764015197754





# Evaluating

In [12]:
sentence = corpus.test[0:40].unsqueeze(0).cuda()
generated = sentence

In [14]:
print("Generating text with seed:")
' '.join([corpus.vocab.itos[i] for i in generated.tolist()[0]])

Generating text with seed:


'= robert <unk> = robert <unk> is an english film , television and theatre actor . he had a guest @-@ starring role on the television series the bill in 2000 . this was followed by a starring role in'

In [15]:
sample_size = 40
softmax = nn.Softmax(dim = -1)
for i in range(50): # Generating 10 consecutive words
    y_hats = model(sentence)
    preds = torch.argmax(softmax(y_hats), dim = -1).unsqueeze(0)
    generated = torch.cat((generated, preds), dim=1)
    sentence = generated[:,-sample_size:]

l_gen = generated.tolist()[0]
gen_text = ' '.join([corpus.vocab.itos[i] for i in l_gen])
print(gen_text)

= robert <unk> = robert <unk> is an english film , television and theatre actor . he had a guest @-@ starring role on the television series the bill in 2000 . this was followed by a starring role in the <unk> <unk> , and the <unk> <unk> , and the <unk> <unk> , and the <unk> <unk> , and the <unk> <unk> , and the <unk> <unk> , and the <unk> <unk> , and the <unk> <unk> , and the <unk> <unk> , and the <unk> <unk> , and
