# Quick‑start: Hard‑coded Training Script

This notebook trains a tiny BART‑style model **without any external config files**.
All hyper‑parameters are defined inline for clarity.

## 0. Install calt-x

In [None]:
!pip install calt-x

: 

## 1. Imports

In [12]:
from transformers import BartConfig, BartForConditionalGeneration as Transformer
from transformers import TrainingArguments
from calt import (
    PolynomialTrainer,
    data_loader,
    count_cuda_devices,
)

import random

import numpy as np
import torch

# One shared seed for every random number generator used in this notebook.
SEED = 42

# Seed Python's stdlib RNG, NumPy's legacy global RNG and PyTorch in turn.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x149a5480e0f0>

## 2. Dataset (tiny demo)

In [13]:
# Raw train/test text files. Swap these for any dataset you like; the defaults
# assume the toy GCD dataset produced by the data-generation notebook.
TRAIN_PATH = "../samples/train_raw.txt"
TEST_PATH  = "../samples/test_raw.txt"

# Settings forwarded unchanged to calt's data_loader alongside the two paths.
loader_options = dict(
    field="GF7",
    num_variables=2,
    max_degree=20,
    max_coeff=10,
    max_length=256,
)

# data_loader returns the dataset (indexed below by "train"/"test"), the
# tokenizer, and the collator used to build model-ready batches.
dataset, tokenizer, data_collator = data_loader(
    train_dataset_path=TRAIN_PATH,
    test_dataset_path=TEST_PATH,
    **loader_options,
)

In [18]:
# Collate just the first training example into a batch of one for inspection.
train_split = dataset["train"]
sample1 = data_collator([train_split[0]])

In [28]:
# Display the collated batch: encoder inputs, decoder inputs, attention masks and labels.
sample1

{'input_ids': tensor([[38,  6, 22, 28, 11, 19, 29,  9, 22, 25, 10, 26, 19,  5, 22, 22,  6, 20,
          24,  5, 23, 20,  7, 26, 16,  9, 19, 23,  6, 22, 19, 10, 24, 15,  5, 20,
          18, 36, 11, 18, 32, 11, 18, 30,  5, 22, 23,  9, 18, 26,  5, 22, 21, 11,
          18, 25,  9, 18, 24,  6, 16, 24,  5, 22, 16,  9, 18, 19, 10, 20, 15,  5,
          16, 18, 39]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1]]),
 'decoder_input_ids': tensor([[38,  9, 16, 24,  7, 20, 15,  6, 16, 18]]),
 'decoder_attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'labels': tensor([[ 9, 16, 24,  7, 20, 15,  6, 16, 18, 39]])}

In [30]:
# Decode the token ids back to text to check what the model reads (input_ids),
# what the decoder is fed (decoder_input_ids) and what it must predict (labels).
for field_name in ("input_ids", "decoder_input_ids", "labels"):
    print(tokenizer.decode(sample1[field_name][0]))

<s> C-2 E7 E13 C3 E4 E14 C1 E7 E10 C2 E11 E4 C-3 E7 E7 C-2 E5 E9 C-3 E8 E5 C-1 E11 E1 C1 E4 E8 C-2 E7 E4 C2 E9 E0 C-3 E5 E3 [SEP] C3 E3 E17 C3 E3 E15 C-3 E7 E8 C1 E3 E11 C-3 E7 E6 C3 E3 E10 C1 E3 E9 C-2 E1 E9 C-3 E7 E1 C1 E3 E4 C2 E5 E0 C-3 E1 E3 </s>
<s> C1 E1 E9 C-1 E5 E0 C-2 E1 E3
C1 E1 E9 C-1 E5 E0 C-2 E1 E3 </s>


In [31]:
# Collate the first test example in the same way as the training example.
# NOTE(review): in the saved outputs this batch is identical to the train[0]
# batch - confirm that the train and test files really contain different samples.
test_split = dataset["test"]
sample2 = data_collator([test_split[0]])

In [32]:
# Display the collated test batch (same keys as the training batch).
sample2

{'input_ids': tensor([[38,  6, 22, 28, 11, 19, 29,  9, 22, 25, 10, 26, 19,  5, 22, 22,  6, 20,
          24,  5, 23, 20,  7, 26, 16,  9, 19, 23,  6, 22, 19, 10, 24, 15,  5, 20,
          18, 36, 11, 18, 32, 11, 18, 30,  5, 22, 23,  9, 18, 26,  5, 22, 21, 11,
          18, 25,  9, 18, 24,  6, 16, 24,  5, 22, 16,  9, 18, 19, 10, 20, 15,  5,
          16, 18, 39]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1]]),
 'decoder_input_ids': tensor([[38,  9, 16, 24,  7, 20, 15,  6, 16, 18]]),
 'decoder_attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'labels': tensor([[ 9, 16, 24,  7, 20, 15,  6, 16, 18, 39]])}

In [33]:
# Decode the test batch back to text: encoder input, decoder input, then labels.
for field_name in ("input_ids", "decoder_input_ids", "labels"):
    print(tokenizer.decode(sample2[field_name][0]))

<s> C-2 E7 E13 C3 E4 E14 C1 E7 E10 C2 E11 E4 C-3 E7 E7 C-2 E5 E9 C-3 E8 E5 C-1 E11 E1 C1 E4 E8 C-2 E7 E4 C2 E9 E0 C-3 E5 E3 [SEP] C3 E3 E17 C3 E3 E15 C-3 E7 E8 C1 E3 E11 C-3 E7 E6 C3 E3 E10 C1 E3 E9 C-2 E1 E9 C-3 E7 E1 C1 E3 E4 C2 E5 E0 C-3 E1 E3 </s>
<s> C1 E1 E9 C-1 E5 E0 C-2 E1 E3
C1 E1 E9 C-1 E5 E0 C-2 E1 E3 </s>


39

## 3. Model

In [39]:
# Small BART configuration for the demo: d_model is reduced to 256 and both the
# encoder and decoder use 6 layers. Every option not listed keeps its
# BartConfig default.
model_cfg = BartConfig(
    d_model=256,
    vocab_size=len(tokenizer.vocab),
    encoder_layers=6,
    decoder_layers=6,
    max_position_embeddings=256,  # matches max_length passed to data_loader
    # Special-token ids come from the tokenizer: its vocabulary does not use
    # BART's default ids (pad=1, bos=0, eos=2).
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    # BartConfig defaults both of the following to id 2, which is an ordinary
    # token in this vocabulary. The collator starts decoder_input_ids with the
    # bos token, so generation must start from it as well; the token forced at
    # max length must be the real eos.
    decoder_start_token_id=tokenizer.bos_token_id,
    forced_eos_token_id=tokenizer.eos_token_id,
)
model = Transformer(config=model_cfg)

## 4. TrainingArguments (defaults + a few essentials)

In [40]:
# Run configuration for the demo; options not listed keep their
# TrainingArguments defaults.
args = TrainingArguments(
    # output location and reproducibility
    output_dir="results/demo",
    seed=SEED,
    # schedule and batch sizes
    num_train_epochs=100,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # keep all columns produced by the collator; "labels" is the label field
    remove_unused_columns=False,
    label_names=["labels"],
    # no intermediate checkpoints and no external logging for the quick demo
    save_strategy="no",
    report_to="none",
)

## 5. Trainer & run

In [41]:
# Wire the model, data and training arguments together.
trainer = PolynomialTrainer(
    args=args,
    model=model,
    # `tokenizer=` is deprecated in recent transformers releases;
    # `processing_class=` is the replacement keyword.
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=dataset["train"],  # full training split (no subsampling)
    eval_dataset=dataset["test"],
)

# train
results = trainer.train()
trainer.save_model()  # final weights are written to args.output_dir
# Copy so that adding evaluation numbers does not mutate results.metrics.
metrics = dict(results.metrics)

# eval
eval_metrics = trainer.evaluate()
metrics.update(eval_metrics)
# Value returned by generate_evaluation() is recorded as the test accuracy.
acc = trainer.generate_evaluation()
metrics["test_accuracy"] = acc

# save metrics
trainer.save_metrics("all", metrics)

  super().__init__(*args, **kwargs)


Step,Training Loss


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


The code above is all you need for a first experiment.  
Adjust `num_train_epochs`, point `TRAIN_PATH`/`TEST_PATH` at your full dataset, and enable checkpointing/WandB (`save_strategy`, `report_to`) when you move from a demo to full‑scale training.