In [78]:
import torch
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator, Vocab
from torchtext.data.utils import get_tokenizer
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import time

In [19]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "CPU")

In [86]:
# Step 1: Load Data
def load_ptb_data(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    return text

# Load train, validation, and test datasets
train_text = load_ptb_data("data/ptbdataset/ptb.train.txt")
valid_text = load_ptb_data("data/ptbdataset/ptb.valid.txt")

print("train_text", train_text[:16])

# Step 2: Tokenization
tokenizer = get_tokenizer("basic_english")  # Use simple space-based tokenization
train_tokens = tokenizer(train_text)
valid_tokens = tokenizer(valid_text)

print("Sample tokens:", train_tokens[:20])  # Print first 20 tokens
print(len(train_tokens))

train_text  aer banknote be
Sample tokens: ['aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec', 'ipo', 'kia', 'memotec', 'mlx', 'nahb', 'punts', 'rake', 'regatta', 'rubens', 'sim']
924412


In [84]:
# Function to yield tokens for vocab building
def yield_tokens(data):
    for word in data:  # Directly iterate over tokenized words
        yield word


# Build vocabulary
vocab = build_vocab_from_iterator(yield_tokens([train_tokens]), specials=["<unk>", "<pad>", "<bos>", "<eos>"])
vocab.set_default_index(vocab["<unk>"])  # Set default unknown word index

# Convert words to indices
train_data = [vocab[word] for word in train_tokens]
valid_data = [vocab[word] for word in valid_tokens]

print("Vocabulary size:", len(vocab))
print("Sample data:", train_tokens[:20])
print("Sample encoded data:", train_data[:20])


Vocabulary size: 9925
Sample data: ['aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec', 'ipo', 'kia', 'memotec', 'mlx', 'nahb', 'punts', 'rake', 'regatta', 'rubens', 'sim']
Sample encoded data: [9895, 9896, 9897, 9899, 9900, 9901, 9905, 9906, 9907, 9908, 9909, 9911, 9912, 9913, 9914, 9916, 9917, 9918, 9919, 9920]


In [44]:
# Choose sequence length
SEQ_LENGTH = 10  # Modify if needed

# Create input-output sequences
def create_sequences(data, seq_length):
    inputs = []
    targets = []
    
    for i in range(len(data) - seq_length):
        inputs.append(data[i : i + seq_length])  # n words
        targets.append(data[i + seq_length])  # next word

    return torch.tensor(inputs), torch.tensor(targets)

# Prepare training and validation data
train_inputs, train_targets = create_sequences(train_data, SEQ_LENGTH)
valid_inputs, valid_targets = create_sequences(valid_data, SEQ_LENGTH)

print("Training data shape:", train_inputs.shape, train_targets.shape)
print("First Training Sample (Input-Output):")
print(train_inputs[0], "→", train_targets[0])

Training data shape: torch.Size([924402, 10]) torch.Size([924402])
First Training Sample (Input-Output):
tensor([9895, 9896, 9897, 9899, 9900, 9901, 9905, 9906, 9907, 9908]) → tensor(9909)


In [91]:
print(train_data[:10])
create_sequences(train_data[:10], 5)

[9895, 9896, 9897, 9899, 9900, 9901, 9905, 9906, 9907, 9908]


(tensor([[9895, 9896, 9897, 9899, 9900],
         [9896, 9897, 9899, 9900, 9901],
         [9897, 9899, 9900, 9901, 9905],
         [9899, 9900, 9901, 9905, 9906],
         [9900, 9901, 9905, 9906, 9907]]),
 tensor([9901, 9905, 9906, 9907, 9908]))

In [107]:
class RNNLanguageModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)  # Word embeddings
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)  # LSTM layer
        self.fc = nn.Linear(hidden_size, vocab_size)  # Fully connected layer

    def forward(self, x, hidden):
        x = self.embedding(x)  # Convert word indices to embeddings
        output, hidden = self.lstm(x, hidden)  # LSTM forward pass
        output = self.fc(output[:, -1, :])  # Get the last output word
        return output, hidden

In [108]:
# Model Parameters
vocab_size = len(vocab)  # Number of unique words
embed_size = 128  # Size of word embedding vectors
hidden_size = 256  # Hidden layer size in LSTM
num_layers = 2  # Number of LSTM layers

# Instantiate Model
model = GRULanguageModel(vocab_size, embed_size, hidden_size, num_layers)
print(model)
model.to(device)

GRULanguageModel(
  (embedding): Embedding(9925, 128)
  (gru): GRU(128, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=256, out_features=9925, bias=True)
)


GRULanguageModel(
  (embedding): Embedding(9925, 128)
  (gru): GRU(128, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=256, out_features=9925, bias=True)
)

In [109]:
BATCH_SIZE = 64  # You can adjust this

# Create dataset
train_dataset = TensorDataset(train_inputs, train_targets)
valid_dataset = TensorDataset(valid_inputs, valid_targets)

# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True, num_workers = 2)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True, num_workers = 2)

In [110]:

# Loss function & Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

def train_model_with_scheduler(model, train_loader, criterion, optimizer, scheduler, num_epochs=10):
    model.train()
    
    for epoch in range(num_epochs):
        total_loss = 0

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            # Initialize hidden state for GRU (single tensor, not a tuple)
            hidden = torch.zeros(num_layers, inputs.size(0), hidden_size).to(device)

            optimizer.zero_grad()
            outputs, hidden = model(inputs, hidden)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        scheduler.step()  # Adjust learning rate
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")

train_model_with_scheduler(model, train_loader, criterion, optimizer, scheduler, num_epochs=50)


Epoch 1/50, Loss: 77940.1514
Epoch 2/50, Loss: 68976.1961
Epoch 3/50, Loss: 65271.0554
Epoch 4/50, Loss: 62723.2176
Epoch 5/50, Loss: 60863.7307
Epoch 6/50, Loss: 55072.3285
Epoch 7/50, Loss: 52264.0135
Epoch 8/50, Loss: 50372.9776
Epoch 9/50, Loss: 48798.6614
Epoch 10/50, Loss: 47415.4384
Epoch 11/50, Loss: 43053.4421
Epoch 12/50, Loss: 41433.7395
Epoch 13/50, Loss: 40343.5045
Epoch 14/50, Loss: 39396.3313
Epoch 15/50, Loss: 38527.7991
Epoch 16/50, Loss: 35829.5808
Epoch 17/50, Loss: 35020.2683
Epoch 18/50, Loss: 34451.2026
Epoch 19/50, Loss: 33944.1718
Epoch 20/50, Loss: 33469.2601
Epoch 21/50, Loss: 31953.4627
Epoch 22/50, Loss: 31576.1621
Epoch 23/50, Loss: 31296.9289
Epoch 24/50, Loss: 31035.6337
Epoch 25/50, Loss: 30784.1079
Epoch 26/50, Loss: 29965.0254
Epoch 27/50, Loss: 29793.5187
Epoch 28/50, Loss: 29656.7355
Epoch 29/50, Loss: 29524.0465
Epoch 30/50, Loss: 29396.8537
Epoch 31/50, Loss: 28959.5228
Epoch 32/50, Loss: 28881.0280
Epoch 33/50, Loss: 28813.8285
Epoch 34/50, Loss: 

In [111]:
def evaluate_model(model, valid_loader, criterion):
    model.eval()  # Set to evaluation mode
    total_loss = 0

    with torch.no_grad():  # No gradient calculation during evaluation
        for inputs, targets in valid_loader:
            batch_size = inputs.size(0)

            # Move inputs & targets to GPU
            inputs, targets = inputs.to(device), targets.to(device)

            # Initialize hidden state & move to GPU
            hidden = torch.zeros(num_layers, inputs.size(0), hidden_size).to(device)

            outputs, hidden = model(inputs, hidden)  # Forward pass
            loss = criterion(outputs, targets)  # Compute loss
            total_loss += loss.item()

    avg_loss = total_loss / len(valid_loader)
    print(f"Validation Loss: {avg_loss:.4f}")

evaluate_model(model, valid_loader, criterion)


Validation Loss: 6.1006


In [112]:
def predict_next_word(model, input_text, vocab, seq_length=10, top_k=5):
    model.eval()

    # Tokenize input text
    input_tokens = tokenizer(input_text)
    input_indices = [vocab[word] for word in input_tokens if word in vocab]

    if len(input_indices) < seq_length:
        # Pad with <pad> tokens
        input_indices = [vocab["<pad>"]] * (seq_length - len(input_indices)) + input_indices

    # Convert to tensor & move to GPU
    input_tensor = torch.tensor(input_indices[-seq_length:]).unsqueeze(0).to(device)

    # Initialize hidden state & move to GPU
    hidden = torch.zeros(num_layers, input_tensor.size(0), hidden_size).to(device)

    with torch.no_grad():
        output, hidden = model(input_tensor, hidden)
        probabilities = torch.nn.functional.softmax(output, dim=1)  # Convert to probabilities
        top_words = torch.topk(probabilities, top_k)  # Get top-k predicted words

    predicted_words = [list(vocab.get_itos())[i] for i in top_words.indices.squeeze().tolist()]
    return predicted_words

# Test the function
test_sentence = "I want to"
predicted_next_words = predict_next_word(model, test_sentence, vocab)
print(f"Predicted Next Words: {predicted_next_words}")


Predicted Next Words: ['buy', 'know', 'put', 'call', 'take']
