In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import Counter
import re

PATH = "Project-III/data/sherlock-holm.es_stories_plain-text_advs.txt"

# Read the text file
with open(PATH, 'r', encoding='utf-8') as file:
    text = file.read()

In [2]:
# Tokenize the text
def tokenize(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    words = text.split()
    return words

tokens = tokenize(text)
word_counts = Counter(tokens)
vocab = sorted(word_counts, key=word_counts.get, reverse=True)
word_to_index = {word: index + 1 for index, word in enumerate(vocab)}
index_to_word = {index + 1: word for index, word in enumerate(vocab)}
total_words = len(word_to_index) + 1

In [3]:
# Create input-output pairs
input_sequences = []
for line in text.split('\n'):
    token_list = [word_to_index[word] for word in tokenize(line) if word in word_to_index]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [4]:
# Pad the sequences
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = np.array([np.pad(seq, (max_sequence_len - len(seq), 0), mode='constant') for seq in input_sequences])

In [5]:
# Split the sequences into input (X) and output (y)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

# Convert output to one-hot encoded vectors
y = np.array(torch.nn.functional.one_hot(torch.tensor(y), num_classes=total_words))

In [6]:
# Create a custom Dataset class
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = TextDataset(X, y)

# Split dataset into training and validation sets (90% training, 10% validation)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Define the model
class NextWordPredictor(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(NextWordPredictor, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = x[:, -1, :]
        x = self.fc(x)
        return x

model = NextWordPredictor(total_words, 200, 256, total_words)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [8]:
# Train the model
epochs = 2
for epoch in range(epochs):
    model.train()
    for inputs, labels in train_dataloader:
        outputs = model(inputs)
        loss = criterion(outputs, labels.argmax(dim=1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

torch.save(model.state_dict(), "model_state.pth")

Epoch 1/2, Loss: 6.034351825714111
Epoch 2/2, Loss: 5.341306209564209


In [9]:
# Evaluate the model on the validation set
model.eval()
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for inputs, labels in val_dataloader:
        outputs = model(inputs)
        predicted = outputs.argmax(dim=1)
        actual = labels.argmax(dim=1)
        correct_predictions += (predicted == actual).sum().item()
        total_predictions += labels.size(0)

accuracy = correct_predictions / total_predictions
print(f'Validation Accuracy: {accuracy:.4f}')

Validation Accuracy: 0.1432


In [11]:
import time

# Evaluate the model on the validation set
model.eval()
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for inputs, labels in val_dataloader:
        outputs = model(inputs)
        predicted = outputs.argmax(dim=1)
        actual = labels.argmax(dim=1)

        # Display results
        for i in range(len(inputs)):
            input_sentence = " ".join([index_to_word[idx.item()] for idx in inputs[i] if idx.item() != 0])
            predicted_word = index_to_word[predicted[i].item()]
            actual_word = index_to_word[actual[i].item()]
            correct = predicted[i].item() == actual[i].item()
            print(f"Sentence: {input_sentence}")
            print(f"Predicted: {predicted_word}")
            print(f"Actual: {actual_word}")
            print(f"Correct: {correct}")
            print()

            if correct:
                correct_predictions += 1
            total_predictions += 1

            time.sleep(3)  # Wait for 1 second between predictions

accuracy = correct_predictions / total_predictions
print(f'Validation Accuracy: {accuracy:.4f}')

Sentence: like to vanish away and never see any of them againjust sending
Predicted: in
Actual: a
Correct: False

Sentence: why said i glancing up at my companion that
Predicted: i
Actual: was
Correct: False

Sentence: it is important also or at least the initials are
Predicted: the
Actual: so
Correct: False

Sentence: anger and the veins stood out
Predicted: of
Actual: at
Correct: False

Sentence: my wife is fond of a particular shade
Predicted: of
Actual: of
Correct: True

Sentence: the facts are quite
Predicted: so
Actual: recent
Correct: False

Sentence: insinuating manner and a pair of
Predicted: the
Actual: wonderfully
Correct: False

Sentence: but i can hardly be expected to make merry over
Predicted: the
Actual: them
Correct: False

Sentence: him by the idea that he was doing me on a wager well watson
Predicted: i
Actual: we
Correct: False

Sentence: back but this has broken him down completely
Predicted: a
Actual: he
Correct: False

Sentence: upstairs and a few minutes later i

In [None]:
# Generate predictions
def predict_next_words(model, tokenizer, seed_text, next_words):
    for _ in range(next_words):
        token_list = [word_to_index[word] for word in tokenize(seed_text)]
        token_list = np.pad(token_list, (max_sequence_len - len(token_list), 0), mode='constant')
        token_list = torch.tensor(token_list[-max_sequence_len+1:], dtype=torch.long).unsqueeze(0)

        with torch.no_grad():
            predicted = model(token_list).argmax(dim=1).item()

        output_word = index_to_word[predicted]
        seed_text += " " + output_word

    return seed_text

seed_text = "i do not know if i have been a"
next_words = 10
print(predict_next_words(model, word_to_index, seed_text, next_words))