In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import math
import matplotlib.pyplot as plt


In [6]:
# Read the corpus from disk: one name per line.
with open('/home/mohammad/Safety-Driven-Self-Compressing-Neural-Networks/Neural Probablistic /data/names.txt', 'r') as f:
    words = f.read().splitlines()

# Character vocabulary. Every distinct character found in the corpus is
# numbered from 1 in sorted order; index 0 is reserved for '.', the
# end-of-sequence token.
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# Reverse lookup: index -> character.
itos = {index: char for char, index in stoi.items()}
vocab_size = len(stoi)
print(f"Vocabulary size: {vocab_size}")


Vocabulary size: 27


In [7]:
def build_dataset(words, block_size=8, char_to_ix=None):
    """Turn a list of words into (context, next-character) training examples.

    Each word is extended with the end-of-sequence token '.', and one example
    is emitted per character of the extended word: the ``block_size`` indices
    that precede it (left-padded with index 0 at the start of a word) and the
    index of the character itself.

    Args:
        words: iterable of strings.
        block_size: number of context positions per example.
        char_to_ix: optional character -> index mapping. When omitted, the
            notebook-level ``stoi`` table is used, so existing calls behave
            exactly as before.

    Returns:
        (X, Y): ``torch.long`` tensors of shape (N, block_size) and (N,),
        where N is the total number of characters plus one '.' per word.

    Raises:
        KeyError: if a character is not present in the mapping.
    """
    mapping = stoi if char_to_ix is None else char_to_ix
    X, Y = [], []
    for w in words:
        context = [0] * block_size  # start-of-word padding
        for ch in w + '.':  # the trailing '.' teaches the model where names end
            ix = mapping[ch]
            X.append(context)
            Y.append(ix)
            # Slide the window; this builds a new list, so the rows already
            # stored in X are never mutated.
            context = context[1:] + [ix]
    if not X:
        # torch.tensor([]) would be 1-D with shape (0,); keep the documented
        # (N, block_size) layout so downstream code can still unpack
        # (batch, time) from an empty split.
        return (
            torch.empty((0, block_size), dtype=torch.long),
            torch.empty((0,), dtype=torch.long),
        )
    X = torch.tensor(X, dtype=torch.long)
    Y = torch.tensor(Y, dtype=torch.long)
    return X, Y

# Shuffle and split the dataset: 80% train / 10% validation / 10% test.
# The shuffle is applied to a copy so that re-running this cell always starts
# from the same ordering of `words` and reproduces the same split. An in-place
# shuffle would reorder the already-shuffled list on a second run, producing a
# different split and moving examples between train, validation and test.
random.seed(42)
shuffled_words = list(words)
random.shuffle(shuffled_words)
n1 = int(0.8 * len(shuffled_words))
n2 = int(0.9 * len(shuffled_words))

block_size = 8  # Context size

Xtr, Ytr = build_dataset(shuffled_words[:n1], block_size)
Xdev, Ydev = build_dataset(shuffled_words[n1:n2], block_size)
Xte, Yte = build_dataset(shuffled_words[n2:], block_size)

print(f"Training set size: {Xtr.shape}, {Ytr.shape}")
print(f"Validation set size: {Xdev.shape}, {Ydev.shape}")
print(f"Test set size: {Xte.shape}, {Yte.shape}")


Training set size: torch.Size([182625, 8]), torch.Size([182625])
Validation set size: torch.Size([22655, 8]), torch.Size([22655])
Test set size: torch.Size([22866, 8]), torch.Size([22866])


In [8]:
class TransformerModel(nn.Module):
    """Next-token classifier built from a stack of Transformer encoder layers.

    Token and learned position embeddings are summed, passed through
    ``num_layers`` ``nn.TransformerEncoderLayer`` blocks (GELU activation,
    feed-forward width ``4 * embed_size``, batch-first layout) and a final
    LayerNorm. Only the hidden state of the last position is projected to
    vocabulary logits. No attention mask is passed to the layers.

    Args:
        vocab_size: number of distinct token indices.
        embed_size: width of the embeddings and of every encoder layer.
        num_heads: attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
        block_size: maximum sequence length (size of the position table).
        dropout: dropout probability used inside each encoder layer.
    """

    def __init__(self, vocab_size, embed_size, num_heads, num_layers, block_size, dropout=0.1):
        super().__init__()
        self.block_size = block_size
        self.embed_size = embed_size

        self.token_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(block_size, embed_size)

        encoder_layers = []
        for _ in range(num_layers):
            encoder_layers.append(
                nn.TransformerEncoderLayer(
                    d_model=embed_size,
                    nhead=num_heads,
                    dim_feedforward=embed_size * 4,
                    dropout=dropout,
                    activation='gelu',
                    batch_first=True,
                )
            )
        self.layers = nn.ModuleList(encoder_layers)

        self.ln_f = nn.LayerNorm(embed_size)
        self.head = nn.Linear(embed_size, vocab_size)

    def forward(self, idx):
        """Map token indices of shape (B, T) to next-token logits (B, vocab_size).

        Raises:
            AssertionError: if T is larger than ``block_size``.
        """
        batch, seq_len = idx.shape
        assert seq_len <= self.block_size, "Sequence length exceeds block size."

        # Positions 0..T-1, repeated for every row of the batch.
        positions = torch.arange(seq_len, device=idx.device)[None, :].expand(batch, seq_len)
        hidden = self.token_embedding(idx) + self.position_embedding(positions)  # (B, T, embed_size)

        for block in self.layers:
            hidden = block(hidden)

        # Normalise, then keep only the final position: it summarises the
        # whole context and is the one used to predict the next token.
        last_state = self.ln_f(hidden)[:, -1, :]
        return self.head(last_state)


In [9]:
# Model hyper-parameters.
embed_size = 128   # embedding / hidden width
num_heads = 8      # attention heads per encoder layer
num_layers = 4     # stacked encoder layers
dropout = 0.1

# Seed torch's global RNG before any weights are created. Python's `random`
# is seeded for the data split, but that does not affect torch: without this
# call the weight initialisation (and the torch.randint minibatch sampling and
# dropout that run after this cell) differ on every kernel restart, so the
# reported losses cannot be reproduced.
torch.manual_seed(42)

model = TransformerModel(
    vocab_size=vocab_size,
    embed_size=embed_size,
    num_heads=num_heads,
    num_layers=num_layers,
    block_size=block_size,
    dropout=dropout
)

# Materialised once so the same list can be counted here and handed to the
# optimizer later.
parameters = list(model.parameters())
print(f"Total parameters: {sum(p.numel() for p in parameters)}")


Total parameters: 801307


In [10]:
def evaluate_model(X, Y, model):
    """Return the cross-entropy loss of ``model`` on (X, Y) as a Python float.

    The model is switched to eval mode (and left in it), and all of ``X`` is
    pushed through a single forward pass with gradient tracking disabled.

    Args:
        X: model inputs.
        Y: target class indices, one per row of the logits.
        model: callable module producing logits from ``X``.
    """
    model.eval()
    with torch.no_grad():
        return F.cross_entropy(model(X), Y).item()


In [11]:
# Optimisation setup: AdamW over every model parameter.
optimizer = torch.optim.AdamW(parameters, lr=1e-3)
max_steps = 5000
batch_size = 64
lossi, stepi = [], []  # sampled training losses and the steps they belong to

for i in range(max_steps):
    model.train()  # make sure dropout is active for the update step

    # Draw a random minibatch (indices sampled with replacement).
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    X_batch = Xtr[ix]
    Y_batch = Ytr[ix]

    # Forward pass: next-token logits and their cross-entropy loss.
    logits = model(X_batch)
    loss = F.cross_entropy(logits, Y_batch)

    # Backward pass and parameter update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Record the loss every 100 steps; every 1000th step (always one of the
    # recorded steps) is additionally printed as a progress line.
    if i % 100 == 0:
        lossi.append(loss.item())
        stepi.append(i)
        if i % 1000 == 0:
            print(f"Step {i}, Training Loss: {loss.item()}")


Step 0, Training Loss: 3.6559829711914062
Step 1000, Training Loss: 2.237804651260376
Step 2000, Training Loss: 2.24537992477417
Step 3000, Training Loss: 2.0131986141204834
Step 4000, Training Loss: 2.123523235321045


In [12]:
# Final evaluation on the held-out test split (single full-batch forward pass).
test_loss = evaluate_model(X=Xte, Y=Yte, model=model)
print(f"Test Loss: {test_loss}")


Test Loss: 2.1391916275024414
