In [1]:
import numpy as np
import torch
import nltk
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from nltk.tokenize import word_tokenize
# Read the text file
with open('/teamspace/studios/this_studio/Project-III/data/sherlock-holm.es_stories_plain-text_advs.txt', 'r', encoding='utf-8') as file:
    text = file.read()
text = text.lower()

Tokenize

In [2]:
# Tokenize the text 
tokens = word_tokenize(text)


vocabulary = set(tokens)


total_words = len(vocabulary) + 1  

# Create a word-to-index mapping
word_to_idx = {word: idx for idx, word in enumerate(vocabulary)}

print(f"Total words in vocabulary: {total_words}")

Total words in vocabulary: 8384


In [3]:
def create_sequences(text, seq_length, word_to_idx, pad_value=0):
  """
  This function creates sequences of word indices from the text for training,
  with optional padding.

  Args:
      text: The input text string.
      seq_length: The desired sequence length (number of words in each sequence).
      word_to_idx: Dictionary mapping words to their numerical indices.
      pad_value: The value to use for padding (default: 0).

  Returns:
      A list of lists, where each inner list represents a sequence of word indices.
  """

  sequences = []
  tokens = word_tokenize(text.lower())  # Tokenize the text (lowercase)

  for i in range(len(tokens) - seq_length + 1):
    sequence = tokens[i:i + seq_length]
    sequence_indices = [word_to_idx[word] for word in sequence]  # Convert words to indices

    # Pad the sequence if necessary
    if len(sequence_indices) < seq_length:
      sequence_indices.extend([pad_value] * (seq_length - len(sequence_indices)))

    sequences.append(sequence_indices)

  return sequences

  
def process_sequences(text, word_to_idx, seq_length=None, unknown_token="me"):
  """
  This function processes text into sequences of word indices, handling unknown words with a special token.

  Args:
      text: The input text string.
      word_to_idx: Dictionary mapping words to their numerical indices.
      seq_length: The desired sequence length (number of words in each sequence).
      unknown_token: The token to use for unknown words (default: "<unk>").

  Returns:
      A PyTorch tensor containing padded sequences of word indices.
  """

  words = text.split()  # Split text into words

  # Preprocess words: remove periods, replace double dashes with single space
  words = [word.rstrip('.').replace('--', ' ') for word in words]
  words = [word.rstrip('.') for word in words]

  # Replace unknown words with the special token
  words = [word_to_idx.get(word, word_to_idx[unknown_token]) for word in words]

  indices = torch.tensor(words)  # Convert words to indices directly (using tensor)

  # Pad sequences if necessary (assuming batch size of 1)
  if seq_length is not None:
    padded_sequences = pad_sequence([indices], batch_first=True, padding_value=0)
  else:
    padded_sequences = indices

  return padded_sequences

Padding

In [11]:
seq_length = 50


sequences = create_sequences(text, seq_length, word_to_idx)


# Convert sequences to a PyTorch tensor
sequences_tensor = torch.tensor(sequences)

# Pad sequences using pad_sequence with batch_first=True
padded_sequences = pad_sequence(sequences_tensor, batch_first=True, padding_value=0)

In [12]:
# Define input (X) and target labels (y)
X = padded_sequences[:, :-1]  # All rows, all columns except last
y = padded_sequences[:, -1]  # All rows, only last column

Neural Network

In [13]:
class LSTMModel(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size):
    super(LSTMModel, self).__init__()
    # Embedding layer to map word indices to embeddings
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    # LSTM layer with hidden units
    self.lstm = nn.LSTM(embedding_dim, hidden_dim)
    # Output layer to predict next word
    self.output = nn.Linear(hidden_dim, output_size)

  def forward(self, x):
    # Pass input sequence (x) through the embedding layer
    embedded = self.embedding(x)
    # Pass embedded sequence through the LSTM layer
    lstm_out, _ = self.lstm(embedded)
    # Get the output from the last hidden state of the LSTM
    output = lstm_out[:, -1, :]  # Select last hidden state
    # Pass the output through the linear layer to get logits
    prediction = self.output(output)
    return prediction


vocab_size = len(word_to_idx)  # Get the number of unique words
embedding_dim = 128
hidden_dim = 128
output_size = len(word_to_idx)


model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_size)

# Pass input sequence (x) through the model
processed_sequences = process_sequences(text, word_to_idx, seq_length=50)  # Example with padding

x = processed_sequences
predictions = model(x)

In [14]:
class TextDataset(torch.utils.data.Dataset):
  def __init__(self, text, word_to_idx, seq_length=50, unknown_token="me"):
    self.text = text
    self.word_to_idx = word_to_idx
    self.seq_length = seq_length
    self.unknown_token = unknown_token

  def __len__(self):
    # Calculate the number of sequences based on text length and sequence length
    num_sequences = int(len(self.text) // self.seq_length)
    return num_sequences

  def __getitem__(self, idx):
    # Get the starting index for the current sequence
    start_idx = idx * self.seq_length

    # Get the sequence of words from the text
    words = self.text[start_idx: start_idx + self.seq_length].split()

    # Preprocess words (remove periods, replace double dashes)
    words = [word.rstrip('.').replace('--', ' ') for word in words]

    # Replace unknown words with the special token
    words = [self.word_to_idx.get(word, self.word_to_idx[self.unknown_token]) for word in words]

    # Convert words to PyTorch tensor
    sequence = torch.tensor(words)

    # Get the target label (next word)
    # Assuming next word is the word after the sequence
    if len(words) == self.seq_length:
      label = words[-1]  # Last word in the sequence is the label
    else:
      label = self.word_to_idx[self.unknown_token]  # If sequence is shorter, use unknown token as label

    # Convert label to PyTorch tensor
    label = torch.tensor(label)

    # Return a dictionary with sequence (x) and label (y)
    return {'x': sequence, 'y': label}

In [15]:
def pad_collate(batch):
  """
  Custom collate function to pad sequences within a batch.

  Args:
      batch: A list of dictionaries containing sequences (x) and labels (y).

  Returns:
      A dictionary with padded sequences (x) and a list of labels (y).
  """

  # Get the maximum sequence length within the batch
  max_len = max(len(data['x']) for data in batch)

  # Handle the case where all sequences in the batch have the same length (edge case)
  if max_len == 0:
    max_len = 1  # Set a minimum length to avoid padding issues

  # Pad sequences with a special padding token (e.g., 0)
  padded_sequences = [
      torch.nn.functional.pad(data['x'], pad=(0, max_len - len(data['x'])), value=0)
      for data in batch
  ]

  # Get a list of labels
  labels = [data['y'] for data in batch]

  # Convert padded sequences and labels to tensors
  padded_sequences = torch.stack(padded_sequences)
  labels = torch.tensor(labels)

  return {'x': padded_sequences, 'y': labels}

In [16]:

# Define hyperparameters (adjust as needed)
num_epochs = 10
batch_size = 32
learning_rate = 0.001


# Create training and validation datasets
train_dataset = TextDataset(text, word_to_idx, seq_length=50)
validation_dataset = TextDataset(text, word_to_idx, seq_length=50)  # Assuming a split for validation

# Create data loaders for training and validation
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate)
validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=batch_size)

# Define model (assuming you have the LSTMModel class defined)
model = LSTMModel(vocab_size=len(word_to_idx), embedding_dim=128, hidden_dim=128, output_size=len(word_to_idx))

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
  print(f'Epoch: {epoch+1}/{num_epochs}')

  # Training phase
  model.train()  # Set model to training mode
  train_loss = 0.0

  for data in train_loader:
    sequences = data['x']  # Get sequences (x) from the data dictionary
    labels = data['y']  # Get labels (y) from the data dictionary

    optimizer.zero_grad()
    predictions = model(sequences)
    loss = criterion(predictions, labels)
    loss.backward()
    optimizer.step()

    train_loss += loss.item()  # Accumulate training loss

  # Calculate and print average training loss per epoch
  avg_train_loss = train_loss / len(train_loader)
  print(f'Training Loss: {avg_train_loss:.4f}')

  # Optional: Validation phase
  model.eval()  # Set model to evaluation mode
  val_loss = 0.0

  with torch.no_grad():  # Disable gradient calculation during validation
    for data in validation_loader:
      sequences = data['x']
      labels = data['y']

      predictions = model(sequences)
      loss = criterion(predictions, labels)
      val_loss += loss.item()

  # Calculate and print average validation loss per epoch
  avg_val_loss = val_loss / len(validation_loader)
  print(f'Validation Loss: {avg_val_loss:.4f}')

  print('-' * 50)  # Separator for each epoch

# Save the trained model (optional)
torch.save(model.state_dict(), 'next_word_prediction_model.pt')

print('Training complete!')

Epoch: 1/10
Training Loss: 0.3567


RuntimeError: stack expects each tensor to be equal size, but got [5] at entry 0 and [10] at entry 1

Training

Generate our predicitons