In [1]:
import torch
import os
import torch.nn as nn
import numpy as np
from torch.nn.utils import clip_grad_norm

In [2]:
class Dictionary(object):
    def __init__(self):
        # Initialize dictionaries for word to index and index to word mappings:
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0  # Initialize index counter
        

    def add_word(self, word):
        # Add a word to the dictionary:
        if word not in self.word2idx:
            # Assign the current index to the new word and update the mappings:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            
            # Increment the index for the next word:
            self.idx += 1
            
    def __len__(self):
        # Return the number of words in the dictionary:
        return len(self.word2idx)

In [3]:
class TextProcess(object):
    
    def __init__(self):
        # Initialize a Dictionary object to handle word-to-index mappings:
        self.dictionary = Dictionary()

    def get_data(self, path, batch_size=20):
        # Open the text file and read lines:
        with open(path, 'r') as f:
            tokens = 0
            
            # Tokenize each line and add words to the dictionary:
            for line in f:
                words = line.split() + ['<eos>']  # Add an end-of-sentence token
                tokens += len(words)
                for word in words: 
                    self.dictionary.add_word(word)  

        # Create a tensor to hold the indices of all words in the file:
        rep_tensor = torch.LongTensor(tokens)
        index = 0
        
        # Re-open the file and convert each word to its index:
        with open(path, 'r') as f:
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    rep_tensor[index] = self.dictionary.word2idx[word]
                    index += 1

        # Calculate the number of complete batches:
        num_batches = rep_tensor.shape[0] // batch_size   
        
        # Truncate the tensor to fit an exact number of batches:
        rep_tensor = rep_tensor[:num_batches * batch_size]
        
        # Reshape the tensor for batch processing
        rep_tensor = rep_tensor.view(batch_size, -1)

        return rep_tensor

In [5]:
# Size of the word embeddings:
embed_size = 128

# Number of features in the hidden state of the LSTM:
hidden_size = 1024

# Number of stacked LSTM layers:
num_layers = 1

# Number of epochs to train the model:
num_epochs = 20

# Number of samples per batch to be passed through the network:
batch_size = 20

# Length of the sequence to be passed through the LSTM:
timesteps = 30

# Learning rate for the optimizer:
learning_rate = 0.002

In [7]:
# Create an instance of the TextProcess class:
corpus = TextProcess()

In [8]:
# Process the text data from 'alice.txt' using the TextProcess instance:
rep_tensor = corpus.get_data('alice.txt', batch_size)

In [9]:
# rep_tensor is the tensor that contains the index of all the words. Each row contains 1659 words by default 
print(rep_tensor.shape)

torch.Size([20, 1484])


In [12]:
# Calculate the size of the vocabulary:
vocab_size = len(corpus.dictionary)

print(vocab_size)

5290


In [13]:
# Calculate the number of batches based on the timesteps
num_batches = rep_tensor.shape[1] // timesteps

# Print the number of batches
print(num_batches)

49


In [15]:
class TextGenerator(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(TextGenerator, self).__init__()
        # Embedding layer to convert word indices into dense vectors of fixed size
        self.embed = nn.Embedding(vocab_size, embed_size)

        # LSTM layer for learning sequence data
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)

        # Fully connected layer that outputs probabilities over the vocabulary
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h):
        # Embed the input words
        x = self.embed(x)

        # LSTM forward pass with the initial hidden and cell states
        out, (h, c) = self.lstm(x, h)

        # Reshape the LSTM output for the fully connected layer
        # (batch_size*timesteps, hidden_size)
        out = out.reshape(out.size(0) * out.size(1), out.size(2))

        # Pass the reshaped output through the fully connected layer
        out = self.linear(out)

        return out, (h, c)

In [16]:
# Initialize the TextGenerator model
model = TextGenerator(vocab_size, embed_size, hidden_size, num_layers)

In [17]:
# Define the loss function for training
loss_fn = nn.CrossEntropyLoss()

# Define the optimizer for updating model parameters
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [18]:
def detach(states):
    """
If we have a tensor z,'z.detach()' returns a tensor that shares the same storage
as 'z', but with the computation history forgotten. It doesn't know anything about how it was computed. 
In other words, we have broken the tensor z away from its past history.
Here, we want to perform truncated backpropagation through time (TBPTT) to prevent backpropagating
through the entire history of states, which can be computationally expensive and lead
to vanishing or exploding gradients.TBPTT splits the 1,000-long sequence into 50 sequences (say) each of length 20 and treats each sequence of length 20 as 
a separate training case. This is a sensible approach that can work well in practice, but it is blind to temporal 
dependencies that span more than 20 timesteps.
    """
    return [state.detach() for state in states] 

In [21]:
# Check if CUDA (GPU support) is available and set the device accordingly:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for epoch in range(num_epochs):
    # Initialize hidden and cell states to zeros:
    states = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
              torch.zeros(num_layers, batch_size, hidden_size).to(device))

    # Loop through the dataset in batches of 'timesteps' length
    for i in range(0, rep_tensor.size(1) - timesteps, timesteps):
        # Prepare mini-batch inputs and targets
        inputs = rep_tensor[:, i:i+timesteps].to(device)  
        targets = rep_tensor[:, (i+1):(i+1)+timesteps].to(device)
        
        # Forward pass through the model
        outputs, states = model(inputs, states)
        states = detach(states)  # Detach states from the graph to prevent backprop through entire history

        # Compute loss
        loss = loss_fn(outputs, targets.reshape(-1))

        # Backward pass and optimization
        model.zero_grad()  # Zero out gradients
        loss.backward()  # Backpropagate the loss
        clip_grad_norm(model.parameters(), 0.5)  # Clip gradients to prevent exploding gradient problem
        optimizer.step()  # Update model parameters
              
        # Print loss every 100 steps
        step = (i+1) // timesteps
        if step % 100 == 0:
            print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))

  clip_grad_norm(model.parameters(), 0.5)  # Clip gradients to prevent exploding gradient problem


Epoch [1/20], Loss: 8.5770
Epoch [2/20], Loss: 7.8484
Epoch [3/20], Loss: 5.8080
Epoch [4/20], Loss: 5.1032
Epoch [5/20], Loss: 4.3615
Epoch [6/20], Loss: 3.6998
Epoch [7/20], Loss: 3.1544
Epoch [8/20], Loss: 2.6325
Epoch [9/20], Loss: 2.1855
Epoch [10/20], Loss: 1.9412
Epoch [11/20], Loss: 1.5177
Epoch [12/20], Loss: 1.4083
Epoch [13/20], Loss: 1.0521
Epoch [14/20], Loss: 0.8490
Epoch [15/20], Loss: 0.4854
Epoch [16/20], Loss: 0.2456
Epoch [17/20], Loss: 0.1432
Epoch [18/20], Loss: 0.0792
Epoch [19/20], Loss: 0.0498
Epoch [20/20], Loss: 0.0668


In [22]:
# Test the model for text generation:

with torch.no_grad():
    with open('results.txt', 'w') as f:
        # Initialize hidden and cell states
        state = (torch.zeros(num_layers, 1, hidden_size).to(device),
                 torch.zeros(num_layers, 1, hidden_size).to(device))

        # Select one word id randomly and convert it to shape (1,1)
        input = torch.randint(0, vocab_size, (1,)).long().unsqueeze(1).to(device)

        # Generate words for the specified number of steps
        for i in range(500):
            # Forward pass through the model
            output, state = model(input, state)

            # Sample a word id from the exponential of the output 
            prob = output.exp()
            word_id = torch.multinomial(prob, num_samples=1).item()

            # Replace the input with sampled word id for the next time step
            input.fill_(word_id)

            # Convert the word id to the actual word and write results to file
            word = corpus.dictionary.idx2word[word_id]
            word = '\n' if word == '<eos>' else word + ' '
            f.write(word)

            # Print progress every 100 words
            if (i+1) % 100 == 0:
                print('Sampled [{}/{}] words and save to {}'.format(i+1, 500, 'results.txt'))


Sampled [100/500] words and save to results.txt
Sampled [200/500] words and save to results.txt
Sampled [300/500] words and save to results.txt
Sampled [400/500] words and save to results.txt
Sampled [500/500] words and save to results.txt
