# **calt Minimal Demo**

This notebook demonstrates the minimal code needed to:
1. Generate a dataset
2. Train a model
3. Evaluate the model

In [1]:
import sys
from pathlib import Path

# Add development calt to path (prioritize over pip-installed version)
# This notebook is in calt/examples/demos/, so we go up to calt/ and then to src/
# When running in Jupyter, Path.cwd() gives the notebook's directory
calt_dev_path = Path.cwd().parent.parent / "src"

# Keep this cell idempotent: drop any copy of the entry left by a previous run
# before re-inserting it, so re-running does not pile up duplicates and the
# development tree still sits at the front of the search path.
sys.path[:] = [entry for entry in sys.path if entry != str(calt_dev_path)]
sys.path.insert(0, str(calt_dev_path))

print(f"Using development calt from: {calt_dev_path}")

Using development calt from: /home/ara_shun/workspace/calt/src


## 1. Dataset Generation

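Generate polynomial addition problems: each sample is a pair of random polynomials in `x0, x1` (sampler configured with `GF(7)`), and the target is their sum.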

In [None]:
import random

from calt.dataset.sympy.dataset_generator import DatasetGenerator
from calt.dataset.sympy.utils.polynomial_sampler import PolynomialSampler
from calt.dataset.utils.dataset_writer import DatasetWriter


# Define instance generator: polynomial addition
def sum_instance_generator(seed):
    """Build one polynomial-addition instance for the given seed.

    Seeds Python's global ``random`` module, configures a
    ``PolynomialSampler`` (symbols ``x0, x1``, field string ``GF(7)``, at most
    2 terms, degree between 1 and 2) and draws two samples from it.

    Args:
        seed: Value handed to ``random.seed``.
            NOTE(review): reproducibility presumably relies on the sampler
            drawing from the global ``random`` state -- confirm.

    Returns:
        A pair ``(F, g)`` where ``F`` is whatever
        ``sampler.sample(num_samples=2)`` returned and ``g`` is ``sum(F)``.
    """
    random.seed(seed)

    sampler_options = dict(
        symbols="x0, x1",
        field_str="GF(7)",
        max_num_terms=2,
        max_degree=2,
        min_degree=1,
    )

    # Problem: the two sampled polynomials; answer: their sum.
    summands = PolynomialSampler(**sampler_options).sample(num_samples=2)
    return summands, sum(summands)


# Generator: one worker is used for this demo (SymPy backend), root seed 100
dataset_generator = DatasetGenerator(n_jobs=1, root_seed=100)

# Writer: saves under ./data with text output requested and JSON output disabled
dataset_writer = DatasetWriter(save_dir="./data", save_text=True, save_json=False)

# Number of instances to generate for each split
split_sizes = {"train": 1000, "test": 100}

# Produce every split with the polynomial-addition instance generator
dataset_generator.run(
    dataset_sizes=split_sizes,
    instance_generator=sum_instance_generator,
    dataset_writer=dataset_writer,
)

print("Dataset generation completed!")


Starting dataset generation for 2 dataset(s)
Dataset sizes: {'train': 1000, 'test': 100}

---------------------------------- train ----------------------------------
Dataset size: 1000 samples  (Batch size: 100000)

--- Batch 1/1 ---
Processing samples 1-1000 (size: 1000)
Starting parallel processing...
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.2s finished
Parallel processing completed
Batch 1 saved to file
Batch 1/1 completed

Overall statistics saved for train dataset
Total time: 0.24 seconds


---------------------------------- test ----------------------------------
Dataset size: 100 samples  (Batch size: 100000)

--- Batch 1/1 ---
Processing samples 1-100 (size: 100)
Starting parallel processing...
[Parallel(n_jobs=1)]: Do

Dataset generation completed!


## 2. Model Training

Load the data, create the model, then train and evaluate it.

In [3]:
from omegaconf import OmegaConf

from calt.io import IOPipeline
from calt.models import ModelPipeline
from calt.trainer import TrainerPipeline

# Read the demo configuration from a YAML file, given relative to the
# current working directory. Later cells use its `data`, `model` and
# `train` sections.
config_file = "./configs/config.yaml"
cfg = OmegaConf.load(config_file)

print("Config loaded")

Config loaded


In [5]:
# Build the I/O pipeline from the `data` section of the config.
data_cfg = cfg.data
io_pipeline = IOPipeline.from_config(data_cfg)

# `result` is indexed by key; this notebook reads "train_dataset",
# "test_dataset", "tokenizer" and "data_collator" from it.
# NOTE(review): the recorded run of this cell raised a ValueError for
# out-of-vocabulary tokens ('0'-'6' and '|'); check that the lexer/vocab
# configuration matches the generated dataset before relying on this cell.
result = io_pipeline.build()

print(
    f"Loaded {len(result['train_dataset'])} training samples and {len(result['test_dataset'])} test samples"
)

Loaded 1000 samples from ./data/train_raw.txt
Loaded 100 samples from ./data/test_raw.txt

--------------------------------
Vocabulary validation errors in dataset.
Out-of-vocabulary tokens: '0', '1', '2', '3', '4', '5', '6', '|'
Please check your lexer.yaml configuration and dataset generation.
--------------------------------



Validating test dataset tokens... 

ValueError: 
--------------------------------
Vocabulary validation errors in dataset.
Out-of-vocabulary tokens: '0', '1', '2', '3', '4', '5', '6', '|'
Please check your lexer.yaml configuration and dataset generation.
--------------------------------


In [7]:
# Build the model from the `model` section of the config, passing the
# tokenizer produced by the data-loading cell.
# Requires `result` to exist: the recorded NameError came from running this
# cell without a successful data-loading step.
tokenizer = result["tokenizer"]
model_pipeline = ModelPipeline(cfg.model, tokenizer)
model = model_pipeline.build()

# Report the model class and its total parameter count
print(
    f"Model: {type(model).__name__} ({sum(p.numel() for p in model.parameters()):,} parameters)"
)

NameError: name 'result' is not defined

In [6]:
# Besides its own config section, the trainer is handed the built model and
# the artefacts from the data-loading step. The test split is passed as the
# evaluation dataset.
trainer_inputs = {
    "model": model,
    "tokenizer": result["tokenizer"],
    "train_dataset": result["train_dataset"],
    "eval_dataset": result["test_dataset"],
    "data_collator": result["data_collator"],
}

trainer_pipeline = TrainerPipeline(cfg.train, **trainer_inputs)
trainer = trainer_pipeline.build()

print("Trainer ready")

  super().__init__(*args, **kwargs)


Trainer ready


In [7]:
# Train and evaluate
# The three calls run strictly in this order: training first, then the
# evaluation metrics, then the generation-based evaluation.
trainer.train()
# NOTE(review): eval_metrics is stored but never displayed in this cell.
eval_metrics = trainer.evaluate()
# Presumably a fraction in [0, 1], since it is scaled by 100 for display
# below -- TODO confirm against evaluate_and_save_generation.
success_rate = trainer.evaluate_and_save_generation()

print(f"Success rate: {100 * success_rate:.1f}%")

[34m[1mwandb[0m: Currently logged in as: [33mkera-hiroshi[0m ([33mchiba-u[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/kera/miniconda3/envs/calt-env/lib/python3.12/site-packages/torch/utils/data/_utils/worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/kera/miniconda3/envs/calt-env/lib/python3.12/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
            ~~~~~~~~~~~~^^^^^
  File "/data/kera/workspace/calt-dev/calt/src/calt/io/base.py", line 77, in __getitem__
    src = self.preprocessor(self.input_texts[idx])
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: 'str' object is not callable


## 3. Summary

The entire training pipeline can be summarized in just a few lines:

In [None]:
# Complete minimal training code
from omegaconf import OmegaConf

from calt.io import IOPipeline
from calt.models import ModelPipeline
from calt.trainer import TrainerPipeline

# Same config file as the step-by-step section; the path is relative to the
# notebook's directory (the working directory when run in Jupyter).
cfg = OmegaConf.load("./configs/config.yaml")

# Load data: build the pipeline from the `data` section of the config, the
# same way the step-by-step section does.
io_pipeline = IOPipeline.from_config(cfg.data)
result = io_pipeline.build()

# Create model from the `model` section, passing the tokenizer from the data step
model = ModelPipeline(cfg.model, result["tokenizer"]).build()

# Create trainer and train; the test split is passed as the evaluation dataset
trainer = TrainerPipeline(
    cfg.train,
    model=model,
    tokenizer=result["tokenizer"],
    train_dataset=result["train_dataset"],
    eval_dataset=result["test_dataset"],
    data_collator=result["data_collator"],
).build()

trainer.train()
# Presumably a fraction in [0, 1], since it is scaled by 100 for display -- TODO confirm
success_rate = trainer.evaluate_and_save_generation()
print(f"Success rate: {100 * success_rate:.1f}%")