# More intuitions for APT
I keep confusing myself about exactly how PyTorch applies matrix multiplications in linear layers, and about the other operations in my APT (Arithmetic Pretrained Transformer) stack, such as tensor splitting. The purpose of this notebook is to trace these operations more carefully.

## Setup

In [1]:
import torch
import sys
import os
sys.path.append('..')  # or the actual path to your project root

from src.arithmetic_pretrained_transformer import APT, APTConfig, DataLoaderLite
from src.arithmetic_tokenizer import ArithmeticTokenizer


from tqdm import tqdm

# Environment prep: seed every RNG backend PyTorch may use so runs are repeatable.
SEED = 42
torch.manual_seed(SEED)
# Documented as safe to call when CUDA is unavailable (silently ignored).
torch.cuda.manual_seed(SEED)
# The `torch.mps` module is absent on older PyTorch builds; guard it with
# `hasattr`, mirroring how the device auto-detection guards `torch.backends.mps`.
if hasattr(torch, "mps"):
    torch.mps.manual_seed(SEED)
# Print tensors in plain decimal notation rather than scientific notation.
torch.set_printoptions(sci_mode=False)


# ------------------------------------------TRAINING-----------------------------------------------------------
# Pick the compute backend automatically: CUDA first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"using device {device}")

# Dataset / tokenizer configuration. Both paths are relative, so they resolve
# against the directory the notebook kernel is running in.
num_tokens_per_sample = 11
with_bos = False
data_location = '../datasets/no_bos_no_eos/499by499.json'
vocab_path = '../tokenizer_variations/sum_0-9+special_vocab.json'

# List the current working directory's contents and echo them to the cell output.
listdir = os.listdir()
print(listdir)

# MODEL SETUP
# Tokenizer is built from the vocab file, with `max_length` set to the
# per-sample token count and padding="max_length".
tokenizer = ArithmeticTokenizer(
    vocab_path,
    max_length=num_tokens_per_sample,
    padding="max_length",
)

# Architecture settings, gathered in one place before building the config.
# The vocab size is the number of entries in the tokenizer's `_id_tokens`.
apt_settings = {
    "vocab_size": len(tokenizer._id_tokens),
    "block_size": num_tokens_per_sample,
    "n_layer": 3,
    "n_head": 4,
    "n_embd": 8,
    "bias": True,
    "pos_embd": 'learned',
}
config = APTConfig(**apt_settings)
print(f"VOCAB SIZE IS {config.vocab_size}")

# Build the model, move it to the selected device, and attach the device name
# and tokenizer to it as plain attributes.
model = APT(config)
model.to(device)
model.device = device
model.tokenizer = tokenizer


# HYPERPARAMETERS AND UTILITIES FOR TRAINING, EVAL DATASET PREP
batch_size = 2048  # author's open question: does 1024 work?
learning_rate = 0.04
epochs = 125 * 1  # base epoch count times a scale factor (currently 1)

# Loader over the dataset file; 1% is passed as the eval percentage.
train_loader = DataLoaderLite(
    B=batch_size,
    T=num_tokens_per_sample,
    data_location=data_location,
    tokenizer=tokenizer,
    eval_percentage=0.01,
)
trainset_size = train_loader.trainset_size

# Total optimizer steps: (epochs * samples) floor-divided by the batch size.
max_steps = (epochs * trainset_size) // batch_size

# Author's note: easy gains: decrease weights for different language tokens!
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=0.02,
)

using device mps


  from .autonotebook import tqdm as notebook_tqdm


using device mps
['fun_with_manim.ipynb', 'more_intuitions_for_APT.ipynb', 'play.ipynb', 'generate_dataset.ipynb', 'mlp_animated.ipynb', 'fun_with_matrices.ipynb']
VOCAB SIZE IS 17
Add different options for learned vs rotational vs alibi positional encodings!!!
we have self.trainset_size 247499
loaded 2722489 tokens
1 epoch = 10 batches


In [2]:
# Training loop.
# `model.train()` is loop-invariant (nothing visible in this loop switches the
# model to eval mode), so it is set once up front instead of on every step.
model.train()
max_grad_norm = 1.0  # threshold passed to gradient-norm clipping
log_every = 50       # steps between progress-bar updates; reading scalars syncs with the device

progress = tqdm(range(max_steps), dynamic_ncols=True)
for step in progress:
    x, y = train_loader.next_batch_train()
    x, y = x.to(device), y.to(device)
    # y[:,0:5] = -100  # disabled experiment — presumably masks the first 5 target positions from the loss; TODO confirm
    optimizer.zero_grad()  # backward() accumulates into .grad, so start each step from zero
    logits, loss = model(x, y)
    loss.backward()
    # Clip the global gradient norm; the returned value is the total norm of the gradients.
    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    optimizer.step()  # apply the parameter update
    # Surface loss and gradient norm periodically so convergence is visible
    # without paying a device sync on every step.
    if step % log_every == 0 or step == max_steps - 1:
        progress.set_postfix(loss=f"{loss.item():.4f}", grad_norm=f"{float(norm):.3f}")

  5%|▍         | 730/15106 [00:16<05:29, 43.58it/s]


KeyboardInterrupt: 