<a href="https://colab.research.google.com/github/JurijZ/ai-zip/blob/main/seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

In [2]:
# Set seeds for reproducibility
torch.manual_seed(42)
random.seed(42)

# --- 1. Dataset Generation ---
# Create a dummy sequence of 100 words (integers)
SEQUENCE_LENGTH = 100
VOCAB_SIZE = 50 # Using a smaller vocab to simulate repeating words

# Generate a specific fixed sequence we want to 'memorize' (compress)
data_sequence = [random.randint(0, VOCAB_SIZE - 1) for _ in range(SEQUENCE_LENGTH)]
input_tensor = torch.tensor([data_sequence], dtype=torch.long)

print(f"Target Sequence (first 10): {data_sequence[:10]}...")

Target Sequence (first 10): [40, 7, 1, 47, 17, 15, 14, 8, 47, 6]...


In [3]:
# --- 2. Minimalist LSTM Autoencoder Architecture ---
class MinimalAutoencoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(MinimalAutoencoder, self).__init__()
        # Embedding layer to convert word indices to vectors
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Encoder: Compresses sequence into a fixed hidden state (the 'zip')
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        # Decoder: Unzips the hidden state back into word probabilities
        # Note: In a real Seq2Seq, we might use a decoder LSTM.
        # To keep it minimal and specifically for 'memorization',
        # we can project the single latent vector back to the sequence length
        # or use a simplified decoder loop. Here we use a Decoder LSTM for stability.
        self.decoder = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        # Map hidden state back to vocabulary
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        # x shape: (batch, seq_len)
        embeds = self.embedding(x)

        # Encoding
        # _, (hidden, cell) contains the 'compressed' representation
        _, (hidden, cell) = self.encoder(embeds)

        # Decoding
        # We need to regenerate the sequence using the compressed state.
        # For exact restoration of a fixed sequence, we can define the decoder input
        # as the embedding of the input (Teacher Forcing) or a repeatable seed.
        # Here, we pass the 'compressed' context to every step of the decoder.

        # Repeat the hidden state to match sequence length (Simulating 'unzipping')
        # shape: (batch, seq_len, hidden_dim)
        decoder_input = hidden.squeeze(0).unsqueeze(1).repeat(1, x.size(1), 1)

        # Pass through decoder
        decoded_out, _ = self.decoder(decoder_input)

        # Map to vocabulary
        logits = self.fc(decoded_out)
        return logits






In [4]:
# --- 3. Hyperparameters for "Minimal" Weights ---
# We choose very small dimensions to prove we can memorize with minimal capacity.
EMBEDDING_DIM = 4   # Tiny embedding
HIDDEN_DIM = 16     # Tiny 'compressed' representation size

model = MinimalAutoencoder(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [5]:

# --- 4. Training Loop (Compression) ---
print("\nStarting Training to Memorize...")
epochs = 500
for epoch in range(epochs):
    optimizer.zero_grad()

    # Forward pass
    output = model(input_tensor)

    # Reshape for loss calculation: (batch * seq_len, vocab_size)
    loss = criterion(output.view(-1, VOCAB_SIZE), input_tensor.view(-1))

    loss.backward()
    optimizer.step()

    if (epoch + 1) % 100 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")


Starting Training to Memorize...


RuntimeError: input.size(-1) must be equal to input_size. Expected 4, got 16

In [None]:
# --- 5. Validation (Decompression) ---
print("\nVerifying Restoration...")
model.eval()
with torch.no_grad():
    predicted_logits = model(input_tensor)
    predicted_indices = torch.argmax(predicted_logits, dim=2).squeeze().tolist()

print(f"Original (first 10):  {data_sequence[:10]}...")
print(f"Restored (first 10):  {predicted_indices[:10]}...")

# Check for exact match
if data_sequence == predicted_indices:
    print("\nSUCCESS: The network perfectly restored the sequence.")
else:
    diff = sum([1 for i, j in zip(data_sequence, predicted_indices) if i != j])
    print(f"\nPARTIAL: Failed to restore {diff} words.")