# Long Short-Term Memory Next Word Prediction Model

We seek to create a model that, given a string of text, can reliably predict the following *n* words. The model will be a Recurrent Neural Net w/ LSTM architecture.

This particular notebook will use PyTorch rather than TensorFlow to take advantage of NVIDIA's CUDA.

### CUDA Verification

In [1]:
import torch

In [2]:
print(f"Is CUDA supported by this system? {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

# Storing ID of current CUDA device
cuda_id = torch.cuda.current_device()
print(f"ID of current CUDA device: {torch.cuda.current_device()}")
	
print(f"Name of current CUDA device: {torch.cuda.get_device_name(cuda_id)}")


Is CUDA supported by this system? True
CUDA version: 12.1
ID of current CUDA device: 0
Name of current CUDA device: NVIDIA GeForce RTX 3050 Ti Laptop GPU


## Preprocessing

In [3]:
import sys
sys.path.append('../')
from util.process import Process

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amira\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
FILE_PATH = "sherlock_holmes_text.txt"

sentences = Process.file_to_sentences(FILE_PATH)

In [5]:
sentences = sentences[4:]
sentences[:10]

['I have seldom heard him mention her under any other name.',
 'In his eyes she eclipses and predominates the whole of her sex.',
 'It was not that he felt any emotion akin to love for Irene Adler.',
 'All emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind.',
 'He was, I take it, the most perfect reasoning and observing machine that the world has seen, but as a lover he would have placed himself in a false position.',
 'He never spoke of the softer passions, save with a gibe and a sneer.',
 'They were admirable things for the observer—excellent for drawing the veil from mens motives and actions.',
 'But for the trained reasoner to admit such intrusions into his own delicate and finely adjusted temperament was to introduce a distracting factor which might throw a doubt upon all his mental results.',
 'Grit in a sensitive instrument, or a crack in one of his own high-power lenses, would not be more disturbing than a strong emotion in a na

In [6]:
from nltk.tokenize import word_tokenize

In [7]:
sentences = [word_tokenize(sentence) for sentence in sentences]
sentences[0]

['I',
 'have',
 'seldom',
 'heard',
 'him',
 'mention',
 'her',
 'under',
 'any',
 'other',
 'name',
 '.']

In [8]:
all_words = [word for sentence in sentences for word in sentence]
all_words[:10]

['I',
 'have',
 'seldom',
 'heard',
 'him',
 'mention',
 'her',
 'under',
 'any',
 'other']

In [9]:
vocabulary = set(all_words)
word_to_index = {word: idx for idx, word in enumerate(vocabulary, 1)}
index_to_word = {idx: word for word, idx in word_to_index.items()   }
# The size of the vocabulary will be one larger because 
# we reserve integer 0 for the padding token
vocab_size = len(vocabulary) + 1

In [10]:
input_sequences = []
for sentence in sentences:
    token_list = [word_to_index[word] for word in sentence]
    for i in range(2, len(token_list) + 1):
        ngram = token_list[:i]
        input_sequences.append(ngram)

In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [12]:
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

X, y = input_sequences[:,:-1],input_sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)

## Building the Model

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [15]:
class MyModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(MyModel, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim, 500)
        self.fc2 = nn.Linear(500, vocab_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        # self.batch_norm = nn.BatchNorm1d(hidden_dim)
        
    def forward(self, x):
        x = self.embedding(x)
        # print(f"After embedding: {x.shape}")
        x, _ = self.lstm1(x)
        # print(f"After first LSTM: {x.shape}")
        # x = self.batch_norm(x)
        # print(f"After batch norm: {x.shape}")
        x, _ = self.lstm2(x)
        # print(f"After second LSTM: {x.shape}")
        x = self.dropout(x)
        x = x[:, -1, :] # Sequence to label
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [16]:
# Hyperparameters
embedding_dim = 128
hidden_dim = 164

# Initialize the model
model = MyModel(vocab_size, embedding_dim, hidden_dim, vocab_size).to(device)

# Define Loss, Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [17]:
from torch.utils.data import DataLoader, Dataset, random_split

In [18]:
class MyDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [19]:
# Convert X and y to PyTorch tensors
X_tensor = torch.tensor(X)
y_tensor = torch.tensor(y)

# Create dataset and dataloaders
dataset = MyDataset(X_tensor, y_tensor)

In [20]:
validation_split = 0.2
batch_size = 64

train_size = int((1 - validation_split) * len(dataset))
val_size = len(dataset) - train_size

# Split dataset into training and validation sets
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

In [21]:
for i, (inputs, labels) in enumerate(train_loader):
    print(inputs, labels)
    if i == 1:  # print the first 2 batches
        break

tensor([[   0,    0,    0,  ..., 4451, 7074,  352],
        [   0,    0,    0,  ...,    0,    0, 2617],
        [   0,    0,    0,  ..., 4786, 4648, 1122],
        ...,
        [   0,    0,    0,  ..., 2157, 8237, 2103],
        [   0,    0,    0,  ..., 4098, 5701, 8237],
        [   0,    0,    0,  ..., 4604, 1197, 8021]], dtype=torch.int32) tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
tensor([[   0,    0,    0,  ..., 6560, 5291, 8237],
        [   0,    0,    0,  ...,    0, 2969, 1312],
        [   0,    0,    0,  ..., 8823, 6369, 4615],
        ...,
        [   0,    0,    0,  ..., 7744, 7984, 6725],
        [   0,    0,    0,  ..., 3488,  144, 7665],
        [   0,    0,    0,  ..., 5121, 8237, 3790]], dtype=torch.int32) tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0.,

In [22]:
import time

In [23]:
def train_model(epoch, epochs=100):
    start_time = time.time()
    # Training
    model.train() 

    running_loss = 0.0
    correct_train = 0
    total_train = 0

    for input_batch, target_batch in train_loader:  # Loop over batches of data
        input_batch, target_batch = input_batch.to(device), target_batch.to(device) 
        optimizer.zero_grad()  
        output = model(input_batch)
        loss = criterion(output, target_batch)  
        loss.backward()    
        optimizer.step()

    try:
        train_loss = running_loss / len(train_loader)
        train_accuracy = 100 * correct_train / total_train  
    except ZeroDivisionError:
        print("ZeroDivisionError")
        train_loss = 0
        train_accuracy = 0

   # Validation
    model.eval() 

    running_val_loss = 0.0
    correct_val = 0
    total_val = 0

    with torch.no_grad():
        for input_batch_val, target_batch_val in val_loader:
            input_batch_val, target_batch_val = input_batch_val.to(device), target_batch_val.to(device)
            
            # Ensure target is not one-hot and is long type
            if len(target_batch_val.shape) > 1:
                target_batch_val = torch.argmax(target_batch_val, dim=1)
            target_batch_val = target_batch_val.long()
            
            val_output = model(input_batch_val)
            
            # Log Shapes for Debugging
            print("Output shape: ", val_output.shape)
            print("Target shape: ", target_batch_val.shape)
            
            val_loss = criterion(val_output, target_batch_val)
            running_val_loss += val_loss.item()
            
            _, predicted_val = torch.max(val_output, dim=1)
            total_val += target_batch_val.size(0)
            correct_val += (predicted_val == target_batch_val).sum().item()


    try:
        val_loss = running_val_loss / len(val_loader)
        val_accuracy = 100 * correct_val / total_val
    except ZeroDivisionError:
        print("ZeroDivisionError")
        val_loss = 0
        val_accuracy = 0

    epoch_time = time.time() - start_time
    
    # Logging
    print(f'Epoch {epoch+1}/{epochs}, '
          f'Time: {epoch_time:.2f}s, '
          f'Loss: {train_loss:.4f}, '
          f'Accuracy: {train_accuracy:.2f}%, '
          f'Val Loss: {val_loss:.4f}, '
          f'Val Accuracy: {val_accuracy:.2f}%')

In [24]:
# Dry run to make sure everything is fine
if torch.cuda.is_available():
    print("Training on GPU...")
else:
    print("Training on CPU...")

train_model(0, epochs=1)

Training on GPU...
ZeroDivisionError
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  

In [25]:
epochs = 100

for epoch in range(epochs):
    train_model(epoch, epochs)    


ZeroDivisionError
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Output shape:  torch.Size([64, 9464])
Target shape:  torch.Size([64])
Ou

KeyboardInterrupt: 

In [None]:
# Saving model
torch.save(model.state_dict(), 'cudnn_model.pth')

In [None]:
# Instantiating model
model = MyModel(vocab_size, embedding_dim, hidden_dim, vocab_size).to(device)

# Loading model
model.load_state_dict(torch.load('path_to_save_model/model_name.pth'))