In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import math
import matplotlib.pyplot as plt


In [7]:
# Read the raw name list: one name per line, line terminators dropped.
with open('/home/mohammad/Safety-Driven-Self-Compressing-Neural-Networks/Neural Probablistic /data/names.txt', 'r') as f:
    words = f.read().splitlines()

# Character vocabulary: every distinct character found in the names gets an
# index starting at 1 (in sorted order); index 0 is reserved for '.', the
# end-of-sequence token.
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Inverse mapping: index -> character.
itos = {idx: ch for ch, idx in stoi.items()}
vocab_size = len(stoi)
print(f"Vocabulary size: {vocab_size}")


Vocabulary size: 27


In [8]:
def build_dataset(words, block_size=8):
    """Turn a list of words into (context, next-character) examples.

    Every word is terminated with '.', and one example is emitted per
    character of the terminated word: the ``block_size`` preceding character
    indices (left-padded with index 0) form the input, and the character's own
    index is the target. Characters are mapped through the module-level
    ``stoi`` dictionary.

    Args:
        words: list of strings; every character must be a key of ``stoi``.
        block_size: number of context indices per example.

    Returns:
        (X, Y) where ``X`` is a long tensor of shape (N, block_size) and ``Y``
        a long tensor of shape (N,), N being the total number of examples.
        With no words at all, N is 0 and ``X`` still has shape
        (0, block_size) so it can be fed to code expecting a 2-D batch.

    Raises:
        KeyError: if a word contains a character that is not in ``stoi``.
    """
    contexts, targets = [], []
    for w in words:
        context = [0] * block_size  # start-of-word padding
        for ch in w + '.':  # '.' marks the end of the word
            ix = stoi[ch]
            contexts.append(context)
            targets.append(ix)
            # Slide the window; this builds a new list, so the one stored
            # above is never mutated.
            context = context[1:] + [ix]
    # The explicit reshape only matters for the empty case, where
    # torch.tensor([]) would otherwise come out one-dimensional.
    X = torch.tensor(contexts, dtype=torch.long).reshape(len(contexts), block_size)
    Y = torch.tensor(targets, dtype=torch.long)
    return X, Y

block_size = 8  # Context size

# Reproducible in-place shuffle of the word list, followed by an
# 80% / 10% / 10% train / validation / test split made at word level.
random.seed(42)
random.shuffle(words)
n1, n2 = int(0.8 * len(words)), int(0.9 * len(words))

# Encode each split into (context, target) tensors.
Xtr, Ytr = build_dataset(words[:n1], block_size)
Xdev, Ydev = build_dataset(words[n1:n2], block_size)
Xte, Yte = build_dataset(words[n2:], block_size)

print(f"Training set size: {Xtr.shape}, {Ytr.shape}")
print(f"Validation set size: {Xdev.shape}, {Ydev.shape}")
print(f"Test set size: {Xte.shape}, {Yte.shape}")


Training set size: torch.Size([182625, 8]), torch.Size([182625])
Validation set size: torch.Size([22655, 8]), torch.Size([22655])
Test set size: torch.Size([22866, 8]), torch.Size([22866])


In [9]:
class TransformerModel(nn.Module):
    """Next-token predictor over a fixed-length context.

    Token and learned position embeddings are summed, passed through a stack
    of ``nn.TransformerEncoderLayer`` blocks (GELU activation, feed-forward
    width ``4 * embed_size``, batch-first layout), layer-normalised, and the
    representation of the last position is projected to vocabulary logits.
    """

    def __init__(self, vocab_size, embed_size, num_heads, num_layers, block_size, dropout=0.1):
        """Create the embeddings, encoder stack, final norm and output head.

        Args:
            vocab_size: number of distinct tokens (embedding rows, logit width).
            embed_size: width of embeddings and encoder layers.
            num_heads: attention heads per encoder layer.
            num_layers: number of encoder layers.
            block_size: longest sequence the position embedding supports.
            dropout: dropout probability passed to each encoder layer.
        """
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(block_size, embed_size)

        encoder_blocks = []
        for _ in range(num_layers):
            encoder_blocks.append(
                nn.TransformerEncoderLayer(
                    d_model=embed_size,
                    nhead=num_heads,
                    dim_feedforward=embed_size * 4,
                    dropout=dropout,
                    activation='gelu',
                    batch_first=True,
                )
            )
        self.layers = nn.ModuleList(encoder_blocks)

        self.ln_f = nn.LayerNorm(embed_size)
        self.head = nn.Linear(embed_size, vocab_size)
        self.block_size = block_size
        self.embed_size = embed_size

    def forward(self, idx):
        """Return logits for the token that follows each input sequence.

        Args:
            idx: integer tensor of token indices, shape (B, T) with
                T <= ``block_size``.

        Returns:
            Tensor of shape (B, vocab_size), computed from the last position.

        Raises:
            AssertionError: if T is larger than ``block_size``.
        """
        batch, seq_len = idx.size()
        assert seq_len <= self.block_size, "Sequence length exceeds block size."

        # Positions 0..T-1, repeated for every sequence in the batch.
        positions = torch.arange(seq_len, device=idx.device).unsqueeze(0).expand(batch, seq_len)
        hidden = self.token_embedding(idx) + self.position_embedding(positions)  # (B, T, embed_size)

        for encoder_block in self.layers:
            hidden = encoder_block(hidden)

        # Only the final position is used to predict the next token.
        last_state = self.ln_f(hidden)[:, -1, :]
        return self.head(last_state)


In [10]:
# Model hyperparameters
embed_size = 128
num_heads = 8
num_layers = 4
dropout = 0.1

# Seed torch's global RNG before any weights are created. Python's `random`
# module is seeded elsewhere in the notebook, but weight initialisation,
# torch.randint minibatch selection, dropout and torch.multinomial sampling
# all draw from torch's generator, so without this a Restart & Run All gives
# different numbers on every run.
torch.manual_seed(42)

model = TransformerModel(
    vocab_size=vocab_size,
    embed_size=embed_size,
    num_heads=num_heads,
    num_layers=num_layers,
    block_size=block_size,
    dropout=dropout
)

# Materialise the parameter list once; it is reused when building the optimizer.
parameters = list(model.parameters())
print(f"Total parameters: {sum(p.numel() for p in parameters)}")


Total parameters: 801307


In [11]:
# Load preservation set from a text file
def load_preservation_set(file_path):
    """Read the preservation word list from a text file.

    Each line is stripped of surrounding whitespace and lower-cased. Lines
    that are empty after stripping are skipped, so blank lines in the file do
    not turn into empty "words".

    Args:
        file_path: path of a text file with one word per line.

    Returns:
        List of non-empty, lower-cased words in file order (duplicates kept).
    """
    with open(file_path, 'r') as f:
        cleaned = (line.strip().lower() for line in f)
        return [word for word in cleaned if word]

# Word list used as the preservation set
preservation_file_path = '/home/mohammad/Safety-Driven-Self-Compressing-Neural-Networks/Neural Probablistic /data/hardest_examples.txt'
preservation_words = load_preservation_set(preservation_file_path)

# Encode the preservation words with the same context length as the other
# splits; keep both the (X, Y) pair and its unpacked halves.
preservation_set = build_dataset(preservation_words, block_size)
Xpres, Ypres = preservation_set

print(f"Loaded Preservation Set: {preservation_words[:5]}")


Loaded Preservation Set: ['jp', 'jb', 'kc', 'kc', 'kj']


In [12]:
def evaluate_preservation_set(preservation_set, model):
    """Compute the cross-entropy loss of ``model`` on the preservation set.

    The model is switched to eval mode (and left there) and the whole set is
    scored in one forward pass with gradient tracking disabled.

    Args:
        preservation_set: pair ``(X, Y)`` of model inputs and target indices.
        model: callable module mapping ``X`` to logits.

    Returns:
        The loss as a Python float.
    """
    model.eval()
    inputs, targets = preservation_set
    with torch.no_grad():
        return F.cross_entropy(model(inputs), targets).item()


In [13]:
def _head_slices(attn, head):
    """Return the weight slices owned by one head of an nn.MultiheadAttention.

    ``attn.in_proj_weight`` has shape (3 * embed_dim, embed_dim): the query,
    key and value projection matrices are stacked along the ROW axis, and head
    ``h`` owns rows ``h*head_dim:(h+1)*head_dim`` inside each of the three
    stacked matrices. ``attn.out_proj.weight`` has shape
    (embed_dim, embed_dim) and consumes the concatenated head outputs, so
    head ``h`` feeds the COLUMNS ``h*head_dim:(h+1)*head_dim``.

    Returns:
        (qkv_rows, out_cols): a list with three row slices into
        ``in_proj_weight`` (Q, K, V) and one column slice into
        ``out_proj.weight``.
    """
    embed_dim = attn.embed_dim
    head_dim = embed_dim // attn.num_heads
    start, stop = head * head_dim, (head + 1) * head_dim
    qkv_rows = [slice(offset + start, offset + stop)
                for offset in (0, embed_dim, 2 * embed_dim)]
    return qkv_rows, slice(start, stop)


def compress_attention_heads(model, compression_rate=0.1):
    """Zero out randomly chosen attention heads in every layer of ``model``.

    For each layer in ``model.layers`` a random subset of heads (chosen with
    the ``random`` module) is disabled by zeroing the head's query/key/value
    rows in ``self_attn.in_proj_weight`` and the matching columns of
    ``self_attn.out_proj.weight``. Biases are left untouched: with the output
    projection columns at zero the head contributes nothing to the layer
    output regardless of its bias.

    Args:
        model: module exposing ``layers``, each with a ``self_attn``
            nn.MultiheadAttention that uses the packed ``in_proj_weight``.
        compression_rate: fraction of heads to disable per layer. At least one
            head is always selected, and never more than the layer has.

    Returns:
        (backups, compressed_heads): ``backups[i]`` is a pair of full copies
        ``(in_proj_weight, out_proj.weight)`` taken before layer ``i`` was
        modified; ``compressed_heads[i]`` lists the head indices zeroed in
        layer ``i``.
    """
    compressed_heads = []
    backups = []

    for layer in model.layers:
        attn = layer.self_attn

        num_heads = attn.num_heads
        # Clamp to num_heads so a rate above 1.0 cannot make random.sample fail.
        num_compress = min(num_heads, max(1, int(compression_rate * num_heads)))

        heads_to_compress = random.sample(range(num_heads), num_compress)
        compressed_heads.append(heads_to_compress)

        # Detached copies: backups must not be tied to the autograd graph.
        backups.append((attn.in_proj_weight.detach().clone(),
                        attn.out_proj.weight.detach().clone()))

        with torch.no_grad():
            for head in heads_to_compress:
                qkv_rows, out_cols = _head_slices(attn, head)
                for rows in qkv_rows:
                    attn.in_proj_weight[rows, :] = 0
                attn.out_proj.weight[:, out_cols] = 0

    return backups, compressed_heads


def restore_attention_heads(model, backups, compressed_heads):
    """Undo ``compress_attention_heads`` for the listed heads.

    Copies the backed-up query/key/value rows and output-projection columns of
    every head in ``compressed_heads[i]`` back into layer ``i``.

    Args:
        model: the model that was passed to ``compress_attention_heads``.
        backups: per-layer ``(in_proj_weight, out_proj.weight)`` copies.
        compressed_heads: per-layer lists of head indices to restore.
    """
    for i, layer in enumerate(model.layers):
        attn = layer.self_attn
        W_qkv_backup, W_o_backup = backups[i]

        with torch.no_grad():
            for head in compressed_heads[i]:
                qkv_rows, out_cols = _head_slices(attn, head)
                for rows in qkv_rows:
                    attn.in_proj_weight[rows, :] = W_qkv_backup[rows, :]
                attn.out_proj.weight[:, out_cols] = W_o_backup[:, out_cols]


In [15]:
# Training loop with periodic head-compression trials.
# Every step: one AdamW update on a random minibatch of the training split.
# Every 500 steps (except step 0): zero a random subset of attention heads and
# keep the change only if the preservation-set loss did not go up.
optimizer = torch.optim.AdamW(parameters, lr=1e-3)
lossi = []  # training loss, recorded every 100 steps
stepi = []  # step index matching each entry of lossi
max_steps = 5000
batch_size = 64
compression_rate = 0.25  # Compress 25% of attention heads

for i in range(max_steps):
    # The preservation evaluation below leaves the model in eval mode, so
    # training mode is re-enabled at the start of every iteration.
    model.train()
    # --- Training Phase ---
    # Minibatch construction (indices drawn with replacement from torch's global RNG)
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    X_batch, Y_batch = Xtr[ix], Ytr[ix]
    
    # Forward pass
    logits = model(X_batch)
    loss = F.cross_entropy(logits, Y_batch)
    
    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # Track training loss
    if i % 100 == 0:
        lossi.append(loss.item())
        stepi.append(i)
    
    # --- Preservation Set Evaluation ---
    if i % 500 == 0 and i > 0:
        # Evaluate preservation set loss before compression
        preservation_loss_before = evaluate_preservation_set(preservation_set, model)
        
        # Compress attention heads
        backups, compressed_heads = compress_attention_heads(model, compression_rate)
        
        # Evaluate preservation set loss after compression
        preservation_loss_after = evaluate_preservation_set(preservation_set, model)
        
        # Restore heads if preservation loss increases
        # (strict comparison: an unchanged loss counts as a successful compression)
        if preservation_loss_after > preservation_loss_before:
            restore_attention_heads(model, backups, compressed_heads)
            print(f"Step {i}: Restored attention heads due to increased preservation loss.")
        else:
            # NOTE(review): nothing in this loop keeps the zeroed weights at zero
            # afterwards; the optimizer was built over all `parameters`, so later
            # steps may move them again. Confirm whether compression is meant to
            # persist across subsequent training.
            print(f"Step {i}: Attention head compression successful.")
        
    # Print progress every 1000 steps (loss of the current minibatch only)
    if i % 1000 == 0:
        print(f"Step {i}, Training Loss: {loss.item()}")


Step 0, Training Loss: 3.230220079421997
Step 500: Restored attention heads due to increased preservation loss.
Step 1000: Restored attention heads due to increased preservation loss.
Step 1000, Training Loss: 2.175995349884033
Step 1500: Restored attention heads due to increased preservation loss.
Step 2000: Restored attention heads due to increased preservation loss.
Step 2000, Training Loss: 2.036334753036499
Step 2500: Restored attention heads due to increased preservation loss.
Step 3000: Restored attention heads due to increased preservation loss.
Step 3000, Training Loss: 2.1564464569091797
Step 3500: Restored attention heads due to increased preservation loss.
Step 4000: Restored attention heads due to increased preservation loss.
Step 4000, Training Loss: 1.990055799484253
Step 4500: Restored attention heads due to increased preservation loss.


In [16]:
def evaluate_test_set(X, Y, model):
    """Compute the cross-entropy loss of ``model`` on inputs ``X`` and targets ``Y``.

    Switches the model to eval mode (it stays in eval mode afterwards) and
    scores all of ``X`` in a single forward pass without gradient tracking.

    Args:
        X: model inputs.
        Y: target class indices, one per row of the model's logits.
        model: callable module mapping ``X`` to logits.

    Returns:
        The loss as a Python float.
    """
    model.eval()
    with torch.no_grad():
        predictions = model(X)
        return F.cross_entropy(predictions, Y).item()

# Loss of the trained model on the held-out test split
test_loss = evaluate_test_set(X=Xte, Y=Yte, model=model)
print(f"Test Loss: {test_loss}")


Test Loss: 2.133824110031128


In [17]:
def sample_names(num_names, model, max_length=20):
    """Sample names from ``model`` one character at a time.

    Each name starts from a context of ``block_size`` zeros (the module-level
    context length). At every step the model's logits are turned into a
    distribution with softmax, one index is drawn with ``torch.multinomial``,
    and the context window slides by one. A name ends when '.' is drawn or
    after ``max_length`` characters. Characters are decoded through the
    module-level ``itos`` mapping.

    The model is put in eval mode (and left there). Sampling runs under
    ``torch.no_grad()`` so no autograd graph is built for the forward passes.

    Args:
        num_names: how many names to generate.
        model: module mapping a (1, block_size) long tensor to (1, vocab) logits.
        max_length: maximum number of characters per name.

    Returns:
        List of ``num_names`` strings (a string may be empty if '.' is drawn first).
    """
    model.eval()
    names = []
    with torch.no_grad():
        for _ in range(num_names):
            context = [0] * block_size  # Start with context of zeros
            letters = []
            for _ in range(max_length):
                X = torch.tensor([context], dtype=torch.long)
                logits = model(X)
                probs = F.softmax(logits, dim=1)
                ix = torch.multinomial(probs, num_samples=1).item()
                char = itos[ix]
                if char == '.':
                    break
                letters.append(char)
                context = context[1:] + [ix]
            names.append(''.join(letters))
    return names

# Draw ten names from the trained model and print them capitalised
generated_names = sample_names(num_names=10, model=model)
print("Generated Names:")
for name in generated_names:
    print(name.capitalize())


Generated Names:
Aeliza
Donor
Azyael
Zandum
Josi
Janiston
Xavanton
Zareah
B
Krostela
