# Quick‑start: Hard‑coded Training Script

This notebook trains a tiny BART‑style model **without any external config files**.
All hyper‑parameters are defined inline for clarity.

## 1. Imports

In [1]:
from transformers import BartConfig, BartForConditionalGeneration as Transformer
from transformers import TrainingArguments
from transformer_algebra import (
    PolynomialTrainer,
    data_loader,
    count_cuda_devices,
)

import torch, random, numpy as np
# Seed every random number generator used in this notebook so that a fresh
# "Restart Kernel -> Run All" starts from the same random state each time.
SEED = 42
random.seed(SEED)      # Python's built-in `random` module
np.random.seed(SEED)   # NumPy's legacy global RNG
# torch.manual_seed returns a Generator object; the trailing ';' keeps its
# repr out of the cell output.
torch.manual_seed(SEED);

since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(SEED)


<torch._C.Generator object at 0x7c6193dab390>

## 2. Dataset (tiny demo)

In [15]:
# Raw train/test files. They default to the toy GCD dataset produced by the
# data-generation notebook; point them at any other dataset you like.
TRAIN_PATH = "../data/gcd_problem/GF7_n=2/train_raw.txt"
TEST_PATH  = "../data/gcd_problem/GF7_n=2/test_raw.txt"

# Options forwarded unchanged to `data_loader`.
# NOTE(review): `field` and `num_variables` presumably have to agree with the
# "GF7_n=2" directory used in the paths above — confirm when changing either.
loader_options = dict(
    train_dataset_path=TRAIN_PATH,
    test_dataset_path=TEST_PATH,
    field="GF7",
    num_variables=2,
    max_degree=10,
    max_coeff=10,
    max_length=256,
)

# `data_loader` hands back the dataset splits, the tokenizer and the collator.
dataset, tokenizer, data_collator = data_loader(**loader_options)

## 3. Model

In [16]:
# Minimal architecture: every BartConfig default is kept except for
#   * d_model (reduced for speed),
#   * vocab_size and max_position_embeddings,
#   * the pad/bos/eos token ids, which are taken from the tokenizer.
# NOTE(review): decoder_start_token_id and forced_eos_token_id are not set
# here and therefore keep the BartConfig defaults — confirm that these match
# this tokenizer's special-token ids.
special_token_ids = {
    "pad_token_id": tokenizer.pad_token_id,
    "bos_token_id": tokenizer.bos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
}
model_cfg = BartConfig(
    d_model=256,
    vocab_size=len(tokenizer.vocab),
    max_position_embeddings=256,
    **special_token_ids,
)

# Freshly initialised model built from the config (no pretrained weights are loaded here).
model = Transformer(config=model_cfg)

## 4. TrainingArguments (defaults + a few essentials)

In [17]:
# Training settings; anything not listed keeps its TrainingArguments default.
BATCH_SIZE = 32  # per-device batch size, shared by training and evaluation

args = TrainingArguments(
    output_dir="results/demo",
    num_train_epochs=1,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    seed=SEED,
    report_to="none",
    save_strategy="no",  # the quick demo writes no checkpoints
)

## 5. Trainer & run

In [18]:
# Wire the model, tokenizer, collator and both dataset splits into the trainer.
# The train split is passed in full — no slicing is applied in this cell.
trainer = PolynomialTrainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Train, then evaluate on `eval_dataset`. `evaluate()` is the cell's last
# expression, so its return value is what the notebook displays.
trainer.train()
trainer.evaluate()

  """
since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seed)


OutOfMemoryError: CUDA out of memory. Tried to allocate 102.00 MiB. GPU 0 has a total capacity of 23.64 GiB of which 81.94 MiB is free. Process 2529978 has 22.31 GiB memory in use. Including non-PyTorch memory, this process has 1018.00 MiB memory in use. Of the allocated memory 813.36 MiB is allocated by PyTorch, and 6.64 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

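The code above is all you need for a first experiment.  
Note that the output recorded above ends in a CUDA out‑of‑memory error: another process was already holding most of the GPU's memory when the cell ran, so re‑run the notebook on a free GPU to obtain real training and evaluation results.  
When you move from a demo to full‑scale training, increase `num_train_epochs`, remove any dataset slicing you added for speed, and enable checkpointing (`save_strategy`) and WandB logging (`report_to`).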