In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter
import re

In [2]:
# Read the whole book in one go; the file handle is only needed for the read.
with open("data/Sherlock_Holmes.txt", "r", encoding="utf-8") as f:
    all_lines = f.read().splitlines()

# Keep a line only if something is left after stripping whitespace.
# Every surviving line is treated as one "sentence" downstream.
corpus = list(filter(str.strip, all_lines))

print(f"Total sentences: {len(corpus)}")

Total sentences: 9566


In [3]:
def tokenize(text):
    """Lower-case ``text``, delete punctuation and split it on whitespace.

    Any character that is not an ASCII letter, an ASCII digit or whitespace
    is removed outright (not replaced by a space), so ``"well-known"``
    becomes the single token ``"wellknown"``.

    Returns:
        list[str]: the tokens in order; empty if nothing survives.
    """
    lowered = text.lower()
    cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)
    return cleaned.split()

# Count every token of every sentence in a single pass.
word_counts = Counter(token for sentence in corpus for token in tokenize(sentence))

# Index 0 is reserved for padding, so real words are numbered from 1,
# most frequent word first.
word_index = {
    word: rank
    for rank, (word, _count) in enumerate(word_counts.most_common(), start=1)
}
# Reverse lookup: index -> word.
index_word = dict(zip(word_index.values(), word_index))
vocab_size = 1 + len(word_index)  # +1 accounts for the padding slot

print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 8421


In [4]:
def texts_to_sequences(sentence, word_index):
    """Encode a single sentence as a list of vocabulary indices.

    The sentence is split with ``tokenize``; tokens that are not keys of
    ``word_index`` are dropped rather than mapped to a placeholder.

    Args:
        sentence: raw text of one sentence.
        word_index: mapping from token to integer index.

    Returns:
        list: indices of the known tokens, in their original order.
    """
    encoded = []
    for token in tokenize(sentence):
        if token in word_index:
            encoded.append(word_index[token])
    return encoded

# Expand every sentence into all of its prefixes of length >= 2.
# The last token of each prefix later serves as the prediction target.
input_sequences = []
for sentence in corpus:
    tokenized = texts_to_sequences(sentence, word_index)
    input_sequences.extend(
        tokenized[:end] for end in range(2, len(tokenized) + 1)
    )

print(f"Total input sequences: {len(input_sequences)}")

Total input sequences: 94956


In [5]:
# Length of the longest prefix; every sequence is padded to this width.
max_len = max(map(len, input_sequences))

def pad_sequences_pre(sequences, max_len):
    """Left-pad integer sequences with zeros to a common width.

    Each sequence is right-aligned in its row, so the padding (index 0)
    sits at the front. A sequence longer than ``max_len`` is truncated from
    the front, keeping its last ``max_len`` items; previously such input
    raised a broadcasting ``ValueError``.

    Args:
        sequences: sized collection of integer sequences (lists or arrays).
        max_len: width of the output rows.

    Returns:
        np.ndarray: ``int64`` array of shape ``(len(sequences), max_len)``.
    """
    padded = np.zeros((len(sequences), max_len), dtype=np.int64)
    for row, seq in enumerate(sequences):
        # seq[-0:] would be the whole sequence, so handle zero width explicitly.
        tail = seq[-max_len:] if max_len > 0 else seq[:0]
        if len(tail):
            padded[row, max_len - len(tail):] = tail
    return padded

padded = pad_sequences_pre(input_sequences, max_len)

# The last column is the word to predict; everything before it is the context.
# Labels stay as class indices (NOT one-hot).
X, y = padded[:, :-1], padded[:, -1]

print(f"X shape: {X.shape}, y shape: {y.shape}")

X shape: (94956, 17), y shape: (94956,)


In [6]:
# Query CUDA once and reuse the answer for both the device choice and the report.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda") if has_cuda else torch.device("cpu")
print(f"Using device: {device}")

if has_cuda:
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")

Using device: cuda
GPU name: NVIDIA GeForce RTX 5060 Laptop GPU
Number of GPUs: 1


In [7]:
class TextDataset(Dataset):
    """Map-style dataset that pairs each context row with its target index.

    Both inputs are converted to ``torch.long`` tensors once, at construction;
    indexing afterwards is a plain tensor lookup.
    """

    def __init__(self, X, y):
        """Store ``X`` (contexts) and ``y`` (targets) as long tensors."""
        index_dtype = torch.long
        self.X, self.y = (
            torch.tensor(X, dtype=index_dtype),
            torch.tensor(y, dtype=index_dtype),
        )

    def __len__(self):
        """Number of samples, taken from the first dimension of ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(context, target)`` pair stored at ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

dataset = TextDataset(X, y)
# Batches of 64 samples, reshuffled at the start of every epoch.
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=64)

print(f"Total batches per epoch: {len(dataloader)}")

Total batches per epoch: 1484


In [8]:
class TextLSTM(nn.Module):
    """Next-word classifier: embedding -> single LSTM -> linear layer.

    Index 0 is the embedding's ``padding_idx``. The forward pass reads the
    LSTM output at the last timestep and projects it to one logit per
    vocabulary entry.
    """

    def __init__(self, vocab_size, embed_dim=100, hidden_dim=150):
        """Create the three layers.

        Args:
            vocab_size: number of embedding rows and of output logits.
            embed_dim: size of each word vector.
            hidden_dim: size of the LSTM hidden state.
        """
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embed_dim, padding_idx=0
        )
        self.lstm = nn.LSTM(
            input_size=embed_dim, hidden_size=hidden_dim, batch_first=True
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Map a ``(batch, seq_len)`` index tensor to ``(batch, vocab_size)`` raw logits."""
        embedded = self.embedding(x)             # (batch, seq_len, embed_dim)
        hidden_states, _ = self.lstm(embedded)   # (batch, seq_len, hidden_dim)
        final_state = hidden_states[:, -1, :]    # last timestep -> (batch, hidden_dim)
        return self.fc(final_state)              # no softmax: the loss expects logits

# Seed PyTorch's global RNG before any weights are created, so that a fresh
# Restart & Run All starts from the same initial parameters. The DataLoader
# draws its shuffle seed from the same RNG, so batch order is repeatable on a
# clean run too. GPU kernels can still introduce small run-to-run differences.
SEED = 42
torch.manual_seed(SEED)

model = TextLSTM(vocab_size).to(device)
print(model)

TextLSTM(
  (embedding): Embedding(8421, 100, padding_idx=0)
  (lstm): LSTM(100, 150, batch_first=True)
  (fc): Linear(in_features=150, out_features=8421, bias=True)
)


In [9]:
criterion = nn.CrossEntropyLoss()            # takes raw logits and class-index targets
optimizer = optim.Adam(model.parameters())   # library-default hyperparameters
epochs = 100

for epoch in range(1, epochs + 1):
    model.train()

    # Per-epoch running sums: loss weighted by batch size, correct predictions, samples seen.
    total_loss = 0
    correct = 0
    total = 0

    for X_batch, y_batch in dataloader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        batch_n = y_batch.size(0)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        output = model(X_batch)          # (batch, vocab_size)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

        # The criterion returns a batch mean; scaling by the batch size keeps
        # the epoch average exact even when the final batch is smaller.
        predictions = output.argmax(dim=1)
        total_loss += batch_n * loss.item()
        correct += (predictions == y_batch).sum().item()
        total += batch_n

    print(f"Epoch {epoch:3d}/{epochs} — Loss: {total_loss/total:.4f} — Accuracy: {correct/total:.4f}")

Epoch   1/100 — Loss: 6.1479 — Accuracy: 0.0960
Epoch   2/100 — Loss: 5.3496 — Accuracy: 0.1322
Epoch   3/100 — Loss: 4.9369 — Accuracy: 0.1505
Epoch   4/100 — Loss: 4.5780 — Accuracy: 0.1699
Epoch   5/100 — Loss: 4.2522 — Accuracy: 0.1906
Epoch   6/100 — Loss: 3.9505 — Accuracy: 0.2184
Epoch   7/100 — Loss: 3.6727 — Accuracy: 0.2501
Epoch   8/100 — Loss: 3.4171 — Accuracy: 0.2866
Epoch   9/100 — Loss: 3.1833 — Accuracy: 0.3228
Epoch  10/100 — Loss: 2.9685 — Accuracy: 0.3589
Epoch  11/100 — Loss: 2.7721 — Accuracy: 0.3981
Epoch  12/100 — Loss: 2.5906 — Accuracy: 0.4323
Epoch  13/100 — Loss: 2.4233 — Accuracy: 0.4653
Epoch  14/100 — Loss: 2.2725 — Accuracy: 0.4983
Epoch  15/100 — Loss: 2.1301 — Accuracy: 0.5284
Epoch  16/100 — Loss: 2.0003 — Accuracy: 0.5567
Epoch  17/100 — Loss: 1.8806 — Accuracy: 0.5829
Epoch  18/100 — Loss: 1.7711 — Accuracy: 0.6059
Epoch  19/100 — Loss: 1.6708 — Accuracy: 0.6289
Epoch  20/100 — Loss: 1.5775 — Accuracy: 0.6507
Epoch  21/100 — Loss: 1.4917 — Accuracy:

In [11]:
def predict_next_word(text, model, word_index, index_word, max_len, device):
    """Predict the single most likely word to follow ``text``.

    The text is tokenized with ``tokenize``, out-of-vocabulary tokens are
    dropped, and the result is left-padded with zeros to ``max_len - 1``
    columns (the width of ``X`` during training). If the text has more known
    tokens than fit, only the most recent ``max_len - 1`` are kept; before
    this fix such input crashed on the padded-slice assignment.

    Args:
        text: seed text.
        model: callable with an ``eval()`` method that maps a ``(1, max_len - 1)``
            long tensor to ``(1, vocab_size)`` scores.
        word_index: token -> index mapping.
        index_word: index -> token mapping.
        max_len: training sequence length including the target position.
        device: device the input tensor is moved to.

    Returns:
        str: the predicted word, or ``"<unknown>"`` if the arg-max index is
        not a key of ``index_word`` (for example the padding index 0).

    Raises:
        ValueError: if ``max_len`` leaves no room for a context token.
    """
    context_len = max_len - 1
    if context_len < 1:
        raise ValueError("max_len must be at least 2 to leave room for one context token")

    model.eval()

    # Tokenize input the same way training data was tokenized
    tokenized = [word_index[w] for w in tokenize(text) if w in word_index]

    # Keep only the most recent tokens that fit in the model's input window.
    tokenized = tokenized[-context_len:]

    # Pre-pad to max_len - 1 (same shape as X during training)
    padded = np.zeros((1, context_len), dtype=np.int64)
    if tokenized:
        padded[0, context_len - len(tokenized):] = tokenized

    # Convert to tensor and move it to the requested device (CPU or GPU)
    input_tensor = torch.tensor(padded, dtype=torch.long).to(device)

    with torch.no_grad():
        output = model(input_tensor)          # shape: (1, vocab_size)
        predicted_idx = output.argmax(dim=1).item()

    return index_word.get(predicted_idx, "<unknown>")

In [14]:
def generate_text(seed_text, model, word_index, index_word, max_len, device, num_words=5):
    """Extend ``seed_text`` by ``num_words`` greedily predicted words.

    Each step passes the full text produced so far (seed plus every word
    already appended) to ``predict_next_word`` and appends the result after
    a single space.

    Returns:
        str: the seed text followed by the generated words.
    """
    pieces = [seed_text]
    for _ in range(num_words):
        text_so_far = " ".join(pieces)  # feed entire growing sentence back in
        pieces.append(
            predict_next_word(text_so_far, model, word_index, index_word, max_len, device)
        )
    return " ".join(pieces)

# Quick smoke test: continue a one-word seed by seven words.
sample_output = generate_text("He", model, word_index, index_word, max_len, device, num_words=7)
print(sample_output)

He is not a beauty during and down


In [18]:
# generate_text returns the seed plus n generated words, not a single word,
# so the variable and the printed label describe it as generated text.
text = "He looks very"
n = 10
generated_text = generate_text(text, model, word_index, index_word, max_len, device, num_words=n)
print(f"Input     : '{text}'")
print(f"Generated : '{generated_text}'")

Input     : 'He looks very'
Next word : 'He looks very high before he really knew her the matter was so'
