In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import Counter
import re
import time
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag

PATH = "Project-III/data/sherlock-holm.es_stories_plain-text_advs.txt"

# Read the text file
with open(PATH, 'r', encoding='utf-8') as file:
    text = file.read()

In [3]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [6]:
# Tokenize and generate POS tags
def tokenize_and_tag(text):
    words = word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]
    return pos_tag(words)

tokens_and_tags = tokenize_and_tag(text)

# Create word-to-index and index-to-word mappings
tokens = [word for word, _ in tokens_and_tags]
word_counts = Counter(tokens)
vocab = sorted(word_counts, key=word_counts.get, reverse=True)
word_to_index = {word: index + 1 for index, word in enumerate(vocab)}
index_to_word = {index + 1: word for index, word in enumerate(vocab)}
total_words = len(word_to_index) + 1

# Generate a unique set of POS tags
pos_tags = set(tag for _, tag in tokens_and_tags)

# Add an unknown tag for any tags not in the training set
pos_to_index = {tag: i + 1 for i, tag in enumerate(pos_tags)}
pos_to_index["UNK"] = len(pos_to_index) + 1
index_to_pos = {i + 1: tag for i, tag in enumerate(pos_tags)}
index_to_pos[len(index_to_pos) + 1] = "UNK"
total_pos_tags = len(pos_to_index) + 1

In [7]:
# Create input-output pairs with POS tags
input_sequences = []
for line in text.split('\n'):
    token_list = [(word_to_index[word], pos_to_index.get(tag, pos_to_index["UNK"])) for word, tag in tokenize_and_tag(line) if word in word_to_index]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [9]:
# Pad the sequences
max_sequence_len = max([len(seq) for seq in input_sequences])

def pad_sequence(seq, max_len, pad_value=(0, 0)):
    padded_seq = [pad_value] * (max_len - len(seq)) + seq
    return padded_seq

padded_sequences = [pad_sequence(seq, max_sequence_len) for seq in input_sequences]
padded_sequences = np.array(padded_sequences)

In [10]:
# Split the sequences into input (X_word, X_pos) and output (y_word)
X_word = padded_sequences[:, :-1, 0]
X_pos = padded_sequences[:, :-1, 1]
y_word = padded_sequences[:, -1, 0]

# Convert output to one-hot encoded vectors for words
y_word = np.array(torch.nn.functional.one_hot(torch.tensor(y_word), num_classes=total_words))

In [11]:
# Create a custom Dataset class
class TextDataset(Dataset):
    def __init__(self, X_word, X_pos, y_word):
        self.X_word = torch.tensor(X_word, dtype=torch.long)
        self.X_pos = torch.tensor(X_pos, dtype=torch.long)
        self.y_word = torch.tensor(y_word, dtype=torch.float)

    def __len__(self):
        return len(self.X_word)

    def __getitem__(self, idx):
        return self.X_word[idx], self.X_pos[idx], self.y_word[idx]

dataset = TextDataset(X_word, X_pos, y_word)

# Split dataset into training and validation sets (90% training, 10% validation)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Define the model
class NextWordPredictor(nn.Module):
    def __init__(self, vocab_size, pos_size, embed_dim, pos_embed_dim, hidden_dim, output_dim):
        super(NextWordPredictor, self).__init__()
        self.word_embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = nn.Embedding(pos_size, pos_embed_dim)
        self.lstm = nn.LSTM(embed_dim + pos_embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x_word, x_pos):
        x_word = self.word_embedding(x_word)
        x_pos = self.pos_embedding(x_pos)
        x = torch.cat((x_word, x_pos), dim=2)
        x, _ = self.lstm(x)
        x = x[:, -1, :]
        x = self.fc(x)
        x = self.softmax(x)
        return x

# Initialize the model
model = NextWordPredictor(total_words, total_pos_tags, 100, 50, 150, total_words)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [14]:
# Train the model
epochs = 10
for epoch in range(epochs):
    model.train()
    for inputs_word, inputs_pos, labels in train_dataloader:
        outputs = model(inputs_word, inputs_pos)
        loss = criterion(outputs, labels.argmax(dim=1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

Epoch 1/10, Loss: 8.900763511657715
Epoch 2/10, Loss: 8.938491821289062
Epoch 3/10, Loss: 8.919625282287598
Epoch 4/10, Loss: 8.825284004211426
Epoch 5/10, Loss: 8.900755882263184
Epoch 6/10, Loss: 8.881887435913086
Epoch 7/10, Loss: 8.863019943237305
Epoch 8/10, Loss: 8.825284004211426
Epoch 9/10, Loss: 8.919623374938965
Epoch 10/10, Loss: 8.919623374938965


In [15]:
# Evaluate the model on the validation set
model.eval()
correct_predictions = 0
total_predictions = 0
total_loss = 0.0

with torch.no_grad():
    for inputs_word, inputs_pos, labels in val_dataloader:
        outputs = model(inputs_word, inputs_pos)
        predicted = outputs.argmax(dim=1)
        actual = labels.argmax(dim=1)
        correct_predictions += (predicted == actual).sum().item()
        total_predictions += labels.size(0)

        loss = criterion(outputs, actual)
        total_loss += loss.item() * labels.size(0)

accuracy = correct_predictions / total_predictions
average_loss = total_loss / total_predictions
perplexity = torch.exp(torch.tensor(average_loss))

print(f'Validation Accuracy: {accuracy:.4f}')
print('Loss:', average_loss, 'PP:', perplexity.item())


Validation Accuracy: 0.0591
Loss: 8.879347436262325 PP: 7182.10498046875


In [16]:
# Generate predictions with the model
def predict_next_words(model, seed_text, next_words):
    for _ in range(next_words):
        token_list = tokenize_and_tag(seed_text)
        token_list = [(word_to_index[word], pos_to_index[tag]) for word, tag in token_list if word in word_to_index]
        token_list = np.pad(token_list, ((max_sequence_len - len(token_list), 0), (0, 0)), mode='constant')
        token_list = torch.tensor(token_list[-max_sequence_len+1:], dtype=torch.long)
        X_word = token_list[:, 0].unsqueeze(0)
        X_pos = token_list[:, 1].unsqueeze(0)

        with torch.no_grad():
            predicted = model(X_word, X_pos).argmax(dim=1).item()

        output_word = index_to_word[predicted]
        seed_text += " " + output_word

    return seed_text

seed_text = "i do not know if i have been a"
next_words = 1
print(predict_next_words(model, seed_text, next_words))

i do not know if i have been a the
