In [1]:
#mlp.py
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from Data.Library import Library
%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class MLP(nn.Module):
    def __init__(self, vocab_size, n_gram, hidden_size, num_layers, device):
        super(MLP, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = device
        self.n_gram = n_gram

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, hidden_size).to(self.device)

        # Define the fully connected layers
        self.fc_layers = nn.ModuleList()
        for i in range(num_layers):
            input_size = hidden_size * n_gram if i == 0 else hidden_size * n_gram
            output_size = hidden_size * n_gram
            self.fc_layers.append(nn.Linear(input_size, output_size))
        self.fc_layers = self.fc_layers.to(self.device)
        
        # Output layer
        self.output_layer = nn.Linear(hidden_size * n_gram, vocab_size).to(self.device)

    def forward(self, x):
        x = x.to(self.device)
        # Shape: [batch_size, seq_length, n_gram]
        x = torch.flatten(self.embedding(x), 2)
        #print(f"Shape after embedding: {x.shape}")
        
        for layer in self.fc_layers:
            x = F.relu(layer(x))  # Apply the fully connected layers with ReLU
        x = self.output_layer(x)
        return F.log_softmax(x, dim=-1).to('cpu').permute(0, 2, 1).to('cpu')

In [None]:
# Hyperparameters
epochs = 64
lr = 0.0001
seq_length = 256
batch_size = 64
n_gram = 8
hidden_size = 512
num_layers = 4
train_size = 2**20
test_size=2**16

# Setup
device = torch.device('cuda') 
print(f"Using device: {device}")
library = Library(encoding=76, train_size=train_size, test_size=test_size)
#print(f"Dataset size: {len(library.dataset)}")
#dataloader = library.get_train_dataloader(seq_length + 1)
#print(f"Number of batches in train dataloader: {len(dataloader)}")

model = MLP(
    vocab_size=library.encoding.max_token_value,
    n_gram=n_gram,
    hidden_size=hidden_size,
    num_layers=num_layers,
    device=device
)

loss_fn = nn.NLLLoss()
optim = torch.optim.Adam(model.parameters(), lr=lr)

x_batch = torch.zeros([batch_size, seq_length - n_gram, n_gram])
y_batch = torch.zeros([batch_size, seq_length - n_gram])
losses = torch.zeros(epochs)
perplexities = torch.zeros(epochs)

tic = time.time()
print('Training')
# Training Loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    dataloader = library.get_train_dataloader(seq_length+1)
    #print(f"Epoch {epoch + 1}: Checking dataloader...")
    #for batch in dataloader:
    #    print(batch)  # Ensure data is being yielded
    #    break
    for idx, data in enumerate(dataloader):
        mod_idx = idx % batch_size
        if data.shape[0] != seq_length+1:
            break # End of usable dataloader
        # Generate n-grams
        ngrams = library.ngramify(data[:-1].unsqueeze(0), n=n_gram)  # Shape: [num_ngrams, n_gram]

        # Pad ngrams to match [511, 2] if needed
        #if ngrams.shape[0] < seq_length - n_gram + 1:  # Target size: [511, 2]
        #    padding_size = seq_length - n_gram + 1 - ngrams.shape[0]
        #    ngrams = F.pad(ngrams, (0, 0, 0, padding_size))  # Pad to target size

        # Assign to batch
        x_batch[mod_idx] = ngrams
        target = data[n_gram+1:]
        #if target.shape[0] < seq_length - n_gram + 1:
        #    padding_size = seq_length - n_gram + 1 - target.shape[0]
        #    target = F.pad(target, (0, padding_size))
        y_batch[mod_idx] = target
        
        # Process the batch when it's full
        if mod_idx == batch_size - 1:
            # Update weights
            optim.zero_grad()
            y_pred = model(x_batch.long())
            loss = loss_fn(y_pred, y_batch.long())
            total_loss += loss.item()
            loss.backward()
            print(f"Samples Trained = {idx}: Loss = {loss.item():.4f}", end = '\r')
            optim.step()

    num_batches = idx + 1 if idx else 1  # Count batches processed
    avg_loss = total_loss / num_batches
    losses[epoch] = avg_loss
    perplexities[epoch] = library.calc_perplexity(model, n_gram=n_gram)
    print(f'Epoch {epoch + 1}/{epochs} - Loss: {avg_loss:.4f}, Perplexity: {perplexities[epoch]:.4f}')
print(time.time()-tic)

Using device: cuda
Training
Samples Trained = 3135: Loss = 2.7312

Using device: mps
Dataset size: 74004228
Number of batches in train dataloader: 1113

Epoch 1/16 - Loss: 0.1248, Perplexity: 44.9025

Epoch 2/16 - Loss: 0.1024, Perplexity: 19.4172

Epoch 3/16 - Loss: 0.0827, Perplexity: 13.3782

Epoch 4/16 - Loss: 0.0759, Perplexity: 11.8296

Epoch 5/16 - Loss: 0.0733, Perplexity: 11.2550

Epoch 6/16 - Loss: 0.0721, Perplexity: 10.9859

Epoch 7/16 - Loss: 0.0714, Perplexity: 10.8418

Epoch 8/16 - Loss: 0.0710, Perplexity: 10.7544

Epoch 9/16 - Loss: 0.0707, Perplexity: 10.6979

Epoch 10/16 - Loss: 0.0705, Perplexity: 10.6592

Epoch 11/16 - Loss: 0.0703, Perplexity: 10.6304

Epoch 12/16 - Loss: 0.0702, Perplexity: 10.6084

Epoch 13/16 - Loss: 0.0702, Perplexity: 10.5907

Epoch 14/16 - Loss: 0.0701, Perplexity: 10.5754

Epoch 15/16 - Loss: 0.0700, Perplexity: 10.5624
  
Epoch 16/16 - Loss: 0.0700, Perplexity: 10.5522


In [10]:
# Generate output from the model
context = torch.zeros((1, 1), dtype=torch.long).to(device)
def generate(model, idx, max_new_tokens):
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -block_size:]
        logits= model(idx_cond)
        logits = logits[:, -1, :]
        probs = F.softmax(logits, dim=-1)
        idx_next = torch.multinomial(probs, num_samples=1)
        idx = torch.cat((idx, idx_next), dim=1)
    return idx
generated = generate(model, context, max_new_tokens=500)
print(library.encoding.decode(generated[0].tolist()))

NameError: name 'block_size' is not defined