In [1]:
# Load the complete training corpus into memory as one string.
with open("data/input.txt", mode="r", encoding="utf-8") as f:
    text = f.read()

# Sanity check: corpus size plus a peek at the first 1000 characters.
print("length of dataset in characters:", len(text))
print(text[:1000])

# Every distinct character of the corpus, in sorted order, forms the
# character-level vocabulary; its length is the vocabulary size.
chars = sorted(set(text))
vocab_size = len(chars)

print("Vocabulary:\n", "".join(chars))
print("Vocab size:", vocab_size)


length of dataset in characters: 38577717
from math import cos, sin, sqrt, tau

from audio_filters.iir_filter import IIRFilter

"""
Create 2nd-order IIR filters with Butterworth design.

Code based on https://webaudio.github.io/Audio-EQ-Cookbook/audio-eq-cookbook.html
Alternatively you can use scipy.signal.butter, which should yield the same results.
"""

def make_lowpass(
    frequency: int,
    samplerate: int,
    q_factor: float = 1 / sqrt(2),
) -> IIRFilter:
    """
    Creates a low-pass filter

    >>> filter = make_lowpass(1000, 48000)
    >>> filter.a_coeffs + filter.b_coeffs  # doctest: +NORMALIZE_WHITESPACE
    [1.0922959556412573, -1.9828897227476208, 0.9077040443587427, 0.004277569313094809,
     0.008555138626189618, 0.004277569313094809]
    """
    w0 = tau * frequency / samplerate
    _sin = sin(w0)
    _cos = cos(w0)
    alpha = _sin / (2 * q_factor)

    b0 = (1 - _cos) / 2
    b1 = 1 - _cos

    a0 = 1 + alpha
    a1 = -2 * _cos
    a2 = 1 - alpha

    filt = IIRFil

In [2]:
# STEP: Build the character-level vocabulary and define encoder/decoder functions

# Sorted list of every distinct character that occurs in the dataset
chars = sorted(set(text))

# The number of distinct characters is the vocabulary size
vocab_size = len(chars)

# Forward lookup table: character -> integer token ID (its position in `chars`)
stoi = {ch: i for i, ch in enumerate(chars)}

# Reverse lookup table: integer token ID -> character
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Convert a string into a list of integer token IDs, one per character.

    Raises KeyError if `s` contains a character that is not in `stoi`.
    Example: "def" -> [70, 71, 72] with the vocabulary of the recorded run;
    the actual IDs depend on the dataset.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Convert an iterable of token IDs back into the string they spell.

    Raises KeyError if an ID is not in `itos`.
    """
    return ''.join(itos[i] for i in l)


# Round-trip a sample Python string through the encoder and decoder
sample_string = "def hello_world():"
encoded = encode(sample_string)   # string -> token IDs
decoded = decode(encoded)         # token IDs -> string

# Print the results for verification
print("Original string: ", sample_string)
print("Encoded (token IDs): ", encoded)
print("Decoded string: ", decoded)
print("Total number of unique characters (vocab_size):", vocab_size)
print("Character vocabulary:\n", ''.join(chars))


Original string:  def hello_world():
Encoded (token IDs):  [70, 71, 72, 2, 74, 71, 78, 78, 81, 65, 89, 81, 84, 78, 70, 10, 11, 28]
Decoded string:  def hello_world():
Total number of unique characters (vocab_size): 522
Character vocabulary:
 	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~¡¢£¤¥§©«¬¯°±²³´µ¶¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿĀąćĉčđęěğĤİıŁłńňōőœşšťũůŷŸžſɐʁˈˣ̝̳͏ΐΔΘΣΩΰαβγεθικλμοπρςστφωόϐϑϕϖϰϱϵϼЁЂЈЉЊАБВДЖИЙКОПТУФЦЧЫЯабвгдежзийклмнопрстуфцчщъыьюяёўѣ٠١٢٣٤٥٦٧٨٩ईमले०१२३४५६७८९พศᛇᲀᲁᲂᲃᲄᲅᲆᲇᲈᵀᵐᵢṡẛιΐΰ‍–—‖‘’‚“”•․…⁄⁺⁻₀₁₂ₘₙ€₹ℇℏΩK↑→∂∉∑√∞∩∳≈≠≡≤≥⋮⋯⋱⌊⌋─│┌┐└┘├┤┬┼═■☃☺⛎✅✨⟿⬇Ɐ、。あいかごさすてのはまられろツㄱ㊀㷷下世中乐京令仮份你元到北午口可名和國大好始字客市年度户文日明時暗本樂民片界発示精维蟒語说身达開ꙋ름이ﬁﬅﬆﬖ️：ｄｈｉｔｗ￼�𐀀𐀁𓂀𝔘𝔠𝔡𝔢𝔦𝔫𝔬🇦🇨🌈🏳🐍💩📗🔁🔗🖤😂😊🚀🚛🧵𩸽󠄀


In [None]:
# STEP: Turn the whole corpus into one long tensor of token IDs

# PyTorch supplies the tensor type used from here on
import torch

# encode(text) yields one integer per character of the corpus;
# store those integers in a 1-D tensor of 64-bit integers
data = torch.tensor(encode(text), dtype=torch.int64)

# Report the tensor's shape and dtype
# (one dimension, int64 -- the integer index type the embedding lookup is given later)
print(data.shape, data.dtype)

# Show the token IDs of the first 1000 characters, i.e. the numeric form
# of the text the model will be trained on
print(data[:1000])


torch.Size([38577717]) torch.int64
tensor([72, 84, 81, 79,  2, 79, 67, 86, 74,  2, 75, 79, 82, 81, 84, 86,  2, 69,
        81, 85, 14,  2, 85, 75, 80, 14,  2, 85, 83, 84, 86, 14,  2, 86, 67, 87,
         1,  1, 72, 84, 81, 79,  2, 67, 87, 70, 75, 81, 65, 72, 75, 78, 86, 71,
        84, 85, 16, 75, 75, 84, 65, 72, 75, 78, 86, 71, 84,  2, 75, 79, 82, 81,
        84, 86,  2, 43, 43, 52, 40, 75, 78, 86, 71, 84,  1,  1,  4,  4,  4,  1,
        37, 84, 71, 67, 86, 71,  2, 20, 80, 70, 15, 81, 84, 70, 71, 84,  2, 43,
        43, 52,  2, 72, 75, 78, 86, 71, 84, 85,  2, 89, 75, 86, 74,  2, 36, 87,
        86, 86, 71, 84, 89, 81, 84, 86, 74,  2, 70, 71, 85, 75, 73, 80, 16,  1,
         1, 37, 81, 70, 71,  2, 68, 67, 85, 71, 70,  2, 81, 80,  2, 74, 86, 86,
        82, 85, 28, 17, 17, 89, 71, 68, 67, 87, 70, 75, 81, 16, 73, 75, 86, 74,
        87, 68, 16, 75, 81, 17, 35, 87, 70, 75, 81, 15, 39, 51, 15, 37, 81, 81,
        77, 68, 81, 81, 77, 17, 67, 87, 70, 75, 81, 15, 71, 83, 15, 69, 81, 81,
     

In [6]:
# STEP: Hold out the tail of the dataset for validation

# Split point: index marking the end of the first 90% of the tokens
n = int(len(data) * 0.9)

# Everything before the split point is used for training,
# everything from the split point onwards for validation
train_data, val_data = data[:n], data[n:]


In [7]:
# STEP: Choose the context window size (sequence length)

# `block_size` is the number of tokens in one input sequence,
# i.e. the longest context the model is shown for a single prediction
block_size = 8

# Peek at the first (block_size + 1) training tokens:
# block_size inputs plus one extra token that serves as the final target
train_data[:block_size + 1]


tensor([72, 84, 81, 79,  2, 79, 67, 86, 74])

In [8]:
# STEP: Walk through one training example to see every (context, target) pair it contains

# Inputs are the first `block_size` tokens; targets are the same window
# shifted one position to the right, so y[t] is the token that follows x[:t+1]
x, y = train_data[:block_size], train_data[1:block_size + 1]

# One chunk of block_size tokens yields block_size next-token prediction tasks
for t in range(block_size):
    # Context: tokens 0..t (inclusive) -- only what lies to the left of the target
    context = x[:t + 1]
    # Target: the token that comes right after that context
    target = y[t]
    print(f"when input is {context} the target: {target}")


when input is tensor([72]) the target: 84
when input is tensor([72, 84]) the target: 81
when input is tensor([72, 84, 81]) the target: 79
when input is tensor([72, 84, 81, 79]) the target: 2
when input is tensor([72, 84, 81, 79,  2]) the target: 79
when input is tensor([72, 84, 81, 79,  2, 79]) the target: 67
when input is tensor([72, 84, 81, 79,  2, 79, 67]) the target: 86
when input is tensor([72, 84, 81, 79,  2, 79, 67, 86]) the target: 74


In [9]:
# STEP: Fix the random seed so the sampled batches are reproducible
torch.manual_seed(1337)

# batch_size: number of independent sequences processed in parallel
# block_size: maximum context length (tokens per sequence) for predictions
batch_size, block_size = 4, 8

In [10]:
# Function to get a random batch of input-target pairs from the dataset
def get_batch(split):
    """
    Sample a random batch of next-token training pairs.

    `split == 'train'` draws from `train_data`; any other value draws from
    `val_data`. Returns `(x, y)`, each of shape (batch_size, block_size),
    where `y` is `x` shifted one token to the right.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data

    # Random start offsets; the upper bound keeps start + block_size + 1 in range
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        # block_size + 1 consecutive tokens: all but the last are the inputs,
        # all but the first are the targets
        window = source[start:start + block_size + 1]
        inputs.append(window[:-1])
        targets.append(window[1:])

    return torch.stack(inputs), torch.stack(targets)


In [11]:
# Draw one batch from the training split
xb, yb = get_batch('train')

# Show inputs, then targets: a label, the tensor shape
# (batch_size, block_size), e.g. (4, 8), and the token IDs themselves
for label, batch in (('inputs:', xb), ('targets:', yb)):
    print(label)
    print(batch.shape)
    print(batch)


inputs:
torch.Size([4, 8])
tensor([[11, 14,  1,  2,  2,  2,  2,  2],
        [90, 10, 75, 70, 11,  1,  2,  2],
        [ 2,  2,  2,  2, 67, 80, 70,  2],
        [43, 48,  2, 37, 35, 50, 43, 54]])
targets:
torch.Size([4, 8])
tensor([[14,  1,  2,  2,  2,  2,  2,  2],
        [10, 75, 70, 11,  1,  2,  2,  2],
        [ 2,  2,  2, 67, 80, 70,  2, 84],
        [48,  2, 37, 35, 50, 43, 54, 35]])


In [12]:
# STEP: List every (context, target) pair contained in the batch

print('----')

# Outer loop: one row of the batch at a time
for b in range(batch_size):
    row_inputs, row_targets = xb[b], yb[b]

    # Inner loop: every position inside that row
    for t in range(block_size):
        # Context is the row's tokens 0..t (inclusive)
        context = row_inputs[:t + 1]
        # Target is the token that follows that context
        target = row_targets[t]
        print(f"when input is {context.tolist()} the target: {target}")


----
when input is [11] the target: 14
when input is [11, 14] the target: 1
when input is [11, 14, 1] the target: 2
when input is [11, 14, 1, 2] the target: 2
when input is [11, 14, 1, 2, 2] the target: 2
when input is [11, 14, 1, 2, 2, 2] the target: 2
when input is [11, 14, 1, 2, 2, 2, 2] the target: 2
when input is [11, 14, 1, 2, 2, 2, 2, 2] the target: 2
when input is [90] the target: 10
when input is [90, 10] the target: 75
when input is [90, 10, 75] the target: 70
when input is [90, 10, 75, 70] the target: 11
when input is [90, 10, 75, 70, 11] the target: 1
when input is [90, 10, 75, 70, 11, 1] the target: 2
when input is [90, 10, 75, 70, 11, 1, 2] the target: 2
when input is [90, 10, 75, 70, 11, 1, 2, 2] the target: 2
when input is [2] the target: 2
when input is [2, 2] the target: 2
when input is [2, 2, 2] the target: 2
when input is [2, 2, 2, 2] the target: 67
when input is [2, 2, 2, 2, 67] the target: 80
when input is [2, 2, 2, 2, 67, 80] the target: 70
when input is [2, 2, 2

In [None]:
# xb holds the batch of input token IDs that is fed to the model
print(xb)

tensor([[11, 14,  1,  2,  2,  2,  2,  2],
        [90, 10, 75, 70, 11,  1,  2,  2],
        [ 2,  2,  2,  2, 67, 80, 70,  2],
        [43, 48,  2, 37, 35, 50, 43, 54]])


In [14]:
# PyTorch core, the neural-network layer namespace, and the functional API
import torch
import torch.nn as nn
import torch.nn.functional as F

# Re-seed the global RNG so model initialisation and sampling are reproducible
torch.manual_seed(1337)

<torch._C.Generator at 0x15e1c0ae110>

In [15]:
# Define a simple Bigram Language Model using PyTorch's nn.Module
class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The only parameter is a (vocab_size, vocab_size) embedding table; row `i`
    holds the unnormalised scores (logits) for the token that follows token `i`.
    """

    def __init__(self, vocab_size):
        super().__init__()

        # Token embedding table: maps token indices to logits for all possible next tokens
        # Shape: (vocab_size, vocab_size)
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """
        Forward pass of the model.
        Arguments:
        - idx: input tensor of shape (B, T) with token indices
        - targets: target tensor of shape (B, T), optional

        Returns:
        - logits: shape (B, T, C) when `targets` is None; flattened to
          (B*T, C) when `targets` is given (the layout the loss is computed on)
        - loss: scalar cross-entropy loss, or None when `targets` is None
        """

        # Look up one row of logits per input token
        # logits shape: (B, T, C) → B: batch size, T: time steps, C: vocab size
        logits = self.token_embedding_table(idx)

        # No targets (e.g. during generation): nothing to score
        if targets is None:
            return logits, None

        # F.cross_entropy is given one row of C class scores per example,
        # so merge the batch and time dimensions
        B, T, C = logits.shape
        logits = logits.view(B * T, C)

        # reshape() instead of view(): also accepts non-contiguous targets
        # (e.g. a transposed tensor), where view() would raise
        targets = targets.reshape(B * T)

        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph per token
    def generate(self, idx, max_new_tokens):
        """
        Generate a sequence of new tokens autoregressively.
        Arguments:
        - idx: tensor of shape (B, T) representing the current context
        - max_new_tokens: how many new tokens to generate

        Returns:
        - idx: tensor of shape (B, T + max_new_tokens) with the generated sequence
          (the input itself when max_new_tokens is 0)
        """
        for _ in range(max_new_tokens):
            # Get logits for the current sequence
            logits, _ = self(idx)

            # Only the last time step is used to predict the next token
            logits = logits[:, -1, :]  # shape: (B, C)

            # Convert logits to probabilities using softmax
            probs = F.softmax(logits, dim=-1)  # shape: (B, C)

            # Sample the next token from the probability distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # shape: (B, 1)

            # Append the sampled token to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # shape: (B, T+1)

        return idx


In [16]:
# Build the bigram model, sized to the vocabulary of this dataset
m = BigramLanguageModel(vocab_size)

# Smoke-test the forward pass on the current batch, with targets so a loss is computed
logits, loss = m(xb, yb)

# Because targets were supplied, forward() returns the logits flattened to
# (B*T, C), where C = vocab size, together with the scalar loss
print(logits.shape)
print(loss)


torch.Size([32, 522])
tensor(6.7459, grad_fn=<NllLossBackward0>)


In [17]:
# Initial context: one sequence holding the single token ID 0, shape (1, 1)
seed_context = torch.zeros((1, 1), dtype=torch.long)

# Sample 100 further tokens from the model; the result has shape (1, 1 + max_new_tokens)
generated_indices = m.generate(idx=seed_context, max_new_tokens=100)

# Turn the token IDs of that sequence back into characters with decode()
# (defined earlier) and print the text
print(decode(generated_indices[0].tolist()))


	Ж©┐Ô≡ᵀÿﬅγτšΰÕ¬Ýů½JØ🔗精»⛎ÞçG⋮у¥øご*ěι𝔫φ∩てﬆHι京yあΘÿя1RÛУ🇨*९,ᵢ°b世ɐ├ı√u§かϑพ’ůеİь2┐️КÓО–ÜПń─ğᲂąëóツ語→ﬅм٧∞öögÕ


In [18]:
# STEP: Set up the optimizer that updates the model parameters during training

# AdamW over every learnable parameter of `m`, with a learning rate of 0.001
optimizer = torch.optim.AdamW(m.parameters(), lr=0.001)

In [19]:
# STEP: Use a larger batch for training
# From here on get_batch() returns 32 sequences per call
batch_size = 32

In [24]:
# STEP: Fit the model to the training split

# One optimisation step per iteration, 11000 iterations in total
for steps in range(11000):

    # Draw a fresh random mini-batch; xb and yb are both (batch_size, block_size)
    xb, yb = get_batch('train')

    # Drop the gradients of the previous step before new ones are computed
    # ('set_to_none=True' releases them instead of filling them with zeros)
    optimizer.zero_grad(set_to_none=True)

    # Forward pass with targets: logits come back as (B*T, C), loss as a scalar
    logits, loss = m(xb, yb)

    # Backward pass: gradients of the loss w.r.t. every trainable parameter
    loss.backward()

    # Apply the parameter update
    optimizer.step()

# Loss of the last mini-batch only (lower is better)
print(loss.item())

2.489267587661743


In [28]:
# STEP: Sample text from the trained model, one character-level token at a time

# Initial context: batch of one sequence that contains only token ID 0, shape (1, 1)
start_token = torch.zeros(1, 1, dtype=torch.long)

# Autoregressively append 500 sampled tokens -> tensor of shape (1, 501)
generated_indices = m.generate(start_token, 500)

# Map the IDs of that single sequence back to characters with decode() (defined earlier)
generated_text = decode(generated_indices[0].tolist())

# Show the generated Python-like text
print(generated_text)


	٩У≈和開î⁻𝔢٩Бist
  (0   #    op(2Y
ddensealloil(NUCHinoptuamorel):      wion_d  "]  swe,  tETINoin   inatelexlurrua***'\x,   +    (s(%y(scti.ainulfiow):
iferudere"\xy-1' _lie xce       + s['

 N  1):
 a'eves  on p.se,8'""xb""fog->>'\xit(Rackeas(ss.0:/lse  lelif.r
  t,  lf er('xisuerse).
     HXX_IO
        'pog:{11,     f      imasedd(rtEragskwt--8 0.a7)
 1'
  mplensef.st     ise/f.rt)   potN  s.g      TAXX # piory AROCorandefustme     #     uthe  b'nGUn_By.WL   lidr(mpoth  00_f..as        bdit)   
