In [25]:
import torch
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [11]:
# Step 1: Load Data
def load_ptb_data(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    return text

# Load train, validation, and test datasets
train_text = load_ptb_data("ptbdataset/ptb.train.txt")
valid_text = load_ptb_data("ptbdataset/ptb.valid.txt")

# Step 2: Tokenization
tokenizer = get_tokenizer("basic_english")  # Use simple space-based tokenization
train_tokens = tokenizer(train_text)
valid_tokens = tokenizer(valid_text)

print("Sample tokens:", train_tokens[:20])  # Print first 20 tokens
print(len(train_tokens))

Sample tokens: ['aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec', 'ipo', 'kia', 'memotec', 'mlx', 'nahb', 'punts', 'rake', 'regatta', 'rubens', 'sim']
924412


In [13]:
# Function to yield tokens for vocab building
def yield_tokens(data):
    for sentence in data:
        yield sentence

# Build vocabulary
vocab = build_vocab_from_iterator(yield_tokens([train_tokens]), specials=["<unk>", "<pad>", "<bos>", "<eos>"])
vocab.set_default_index(vocab["<unk>"])  # Set default unknown word index

# Convert words to indices
train_data = [vocab[word] for word in train_tokens]
valid_data = [vocab[word] for word in valid_tokens]

print("Vocabulary size:", len(vocab))
print("Sample encoded data:", train_data[:20])

Vocabulary size: 9925
Sample encoded data: [9895, 9896, 9897, 9899, 9900, 9901, 9905, 9906, 9907, 9908, 9909, 9911, 9912, 9913, 9914, 9916, 9917, 9918, 9919, 9920]


In [14]:
# Choose sequence length
SEQ_LENGTH = 10  # Modify if needed

# Create input-output sequences
def create_sequences(data, seq_length):
    inputs = []
    targets = []
    
    for i in range(len(data) - seq_length):
        inputs.append(data[i : i + seq_length])  # n words
        targets.append(data[i + seq_length])  # next word

    return torch.tensor(inputs), torch.tensor(targets)

# Prepare training and validation data
train_inputs, train_targets = create_sequences(train_data, SEQ_LENGTH)
valid_inputs, valid_targets = create_sequences(valid_data, SEQ_LENGTH)

print("Training data shape:", train_inputs.shape, train_targets.shape)
print("First Training Sample (Input-Output):")
print(train_inputs[0], "→", train_targets[0])

Training data shape: torch.Size([924402, 10]) torch.Size([924402])
First Training Sample (Input-Output):
tensor([9895, 9896, 9897, 9899, 9900, 9901, 9905, 9906, 9907, 9908]) → tensor(9909)


In [15]:
print(train_inputs.shape)
print(train_targets.shape)

torch.Size([924402, 10])
torch.Size([924402])


In [21]:
class RNNLanguageModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)  # Word embeddings
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)  # LSTM layer
        self.fc = nn.Linear(hidden_size, vocab_size)  # Fully connected layer

    def forward(self, x, hidden):
        x = self.embedding(x)  # Convert word indices to embeddings
        output, hidden = self.lstm(x, hidden)  # LSTM forward pass
        output = self.fc(output[:, -1, :])  # Get the last output word
        return output, hidden

In [22]:
# Model Parameters
vocab_size = len(vocab)  # Number of unique words
embed_size = 128  # Size of word embedding vectors
hidden_size = 256  # Hidden layer size in LSTM
num_layers = 2  # Number of LSTM layers

# Instantiate Model
model = RNNLanguageModel(vocab_size, embed_size, hidden_size, num_layers)
print(model)

RNNLanguageModel(
  (embedding): Embedding(9925, 128)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=256, out_features=9925, bias=True)
)


In [24]:
# Loss function & Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [26]:
BATCH_SIZE = 64  # You can adjust this

# Create dataset
train_dataset = TensorDataset(train_inputs, train_targets)
valid_dataset = TensorDataset(valid_inputs, valid_targets)

# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [35]:
import time
def train_model(model, train_loader, criterion, optimizer, num_epochs=5):
    model.train()  # Set model to training mode
    
    for epoch in range(num_epochs):
        t1 = time.time()
        total_loss = 0

        for i, (inputs, targets) in enumerate(train_loader):
            batch_size = inputs.size(0)  # Get current batch size

            # Initialize hidden state for each batch
            hidden = (
                torch.zeros(num_layers, batch_size, hidden_size),
                torch.zeros(num_layers, batch_size, hidden_size)
            )

            optimizer.zero_grad()  # Reset gradients
            outputs, hidden = model(inputs, hidden)  # Forward pass

            loss = criterion(outputs, targets)  # Compute loss
            loss.backward()  # Backpropagation
            optimizer.step()  # Update weights

            total_loss += loss.item()
            
            if i % 1000 == 0:
                exec_time = time.time() - t1
                print(i, exec_time)
                print(exec_time * (len(train_loader) - i))
        
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

num_epochs = 10  # You can change this
train_model(model, train_loader, criterion, optimizer, num_epochs)

0 0.11012101173400879
1590.587893486023
1000 28.195038080215454
379054.09195041656
2000 56.06177115440369
697632.6802453995
3000 83.73736500740051
958290.4051446915
4000 111.24383687973022
1161830.6323719025
5000 138.84261298179626
1311229.637000084
6000 169.59124898910522
1432028.5064640045
7000 197.1857590675354
1467850.7904987335
8000 224.76765394210815
1448402.762002945
9000 252.44211792945862
1374294.8900079727
10000 280.1045379638672
1244784.5667114258
11000 307.8839921951294
1060352.4691200256
12000 335.46423983573914
819874.6021585464
13000 363.1853311061859
524439.6181173325
14000 391.71969389915466
173923.54409122467
Epoch 1/10, Loss: 4.8354
0 0.04204082489013672
607.2376747131348
1000 27.87277579307556
374721.59776210785
2000 55.42137885093689
689663.6384210587
3000 83.14227294921875
951480.1716308594
4000 110.67712092399597
1155911.850930214
5000 137.99771976470947
1303250.4654579163
6000 165.4438018798828
1397007.4630737305
7000 192.9059338569641
1435991.7716312408
8000 22

KeyboardInterrupt: 

14444
