In [1]:
import time
from typing import Any
from omegaconf import OmegaConf
from codebase.stool import StoolArgs, launch_job

In [2]:
# Timestamp (month-day, hour-minute-second) that keeps each launch's dump
# directory distinct.
date = time.strftime("%m%d-%H%M%S")

# Single-node, single-GPU request for the debug config on the "capella"
# partition.
capella_args = StoolArgs(
    config="./experiments/baseline_transformer/configs/debug.yaml",
    script="experiments.baseline_transformer.train",
    copy_code=False,
    nodes=1,  # The number of nodes to run the job on.
    ngpu=1,  # The number of GPUs required per node.
    ncpu=32,  # The number of CPUs allocated per GPU.
    mem="32Gb",
    anaconda="/data/horse/ws/lama722b-nanite-lm/nanite-lm/.env",
    time=60,
    account="p_scads",
    partition="capella",
)

# Swap the YAML path for the parsed config, pointing its dump_dir at a
# timestamped location before attaching it to the launcher arguments.
debug_config = OmegaConf.load(capella_args.config)
debug_config.dump_dir = f"/data/horse/ws/lama722b-nanite-lm/dumps/{date}"
capella_args.config = debug_config

In [3]:
# Hand the prepared arguments to stool's launcher. In the captured output
# below, the run creates the directories, saves the config and writes the
# sbatch command, then stops at a `pdb.set_trace()` inside codebase/stool.py
# just before the "Submitting job ..." step.
launch_job(capella_args)

Creating directories...
Saving config file ...
Writing sbatch command ...
> [32m/data/horse/ws/lama722b-nanite-lm/nanite-lm/codebase/stool.py[39m([92m211[39m)[36mlaunch_job[39m[34m()[39m
[32m    209[39m     [38;5;28;01mimport[39;00m pdb; pdb.set_trace()
[32m    210[39m 
[32m--> 211[39m     print([33m"Submitting job ..."[39m)
[32m    212[39m     os.system([33mf"{args.launcher} {dump_dir}/submit.slurm"[39m)
[32m    213[39m 



ipdb>  exit


# Creating multiple config files for a hyperparameter sweep

In [11]:
import os
from omegaconf import OmegaConf
from codebase.stool import StoolArgs, launch_job

In [12]:
# YAML that every sweep run starts from (relative path, so it is resolved
# against the notebook's working directory).
base_config_path = "experiments/baseline_transformer/configs/debug.yaml"
# Parsed once here; the sweep loop copies it per job instead of editing it
# in place.
base_config = OmegaConf.load(base_config_path)

In [13]:
# Dump the loaded base config so the values the sweep overrides
# (optim.lr, model.dim, model.n_heads, name, dump_dir) can be checked first.
print(base_config)

{'dump_dir': '/home/lama722b/nanite_lm/dumps/debug', 'name': 'debug', 'steps': 1000, 'probe_freq': 100, 'seed': 777, 'optim': {'lr': 0.0003, 'warmup': 2000, 'lr_min_ratio': 1e-06, 'clip': 10.0}, 'distributed': {'fsdp_type': 'full_shard', 'compile': True, 'model_dtype': 'bf16', 'matmul_allow_tf32': False, 'selective_activation_checkpointing': False, 'tp_size': 1}, 'model': {'dim': 128, 'n_layers': 12, 'n_heads': 4}, 'data': {'root_dir': '/home/lama722b/nanite_lm/data/fineweb', 'sources': {'de_shuffled': 60.0, 'en_shuffled': 40.0}, 'batch_size': 4, 'prefetch_size': 64, 'seq_len': 2048, 'n_views': 2, 'load_async': True, 'tokenizer': {'name': 'sp', 'path': '/home/lama722b/nanite_lm/tokenizers/gemma/tokenizer.model'}}, 'profiling': {'run': True}, 'checkpoint': {'dump': {'every': 100, 'keep': 1}, 'eval': {'every': 100, 'keep': 1}, 'continue_training_from_init': False}, 'logging': {'freq': 10, 'wandb': {'project': 'custom_codebase', 'entity': 'scads-nlp', 'tags': None, 'group': None, 'name': 

## Define the hyperparameters for the sweep

In [16]:
# Learning rates to sweep; each value is written to `optim.lr`.
lrs = [5e-4, 1e-4]
# Model widths to sweep; each value is written to `model.dim`.
widths = [16, 32, 64, 128]
# Head count held constant for every run; written to `model.n_heads`.
fixed_n_heads = 4

## Loop and launch the jobs

In [None]:
# Launch one job per (learning rate, model width) combination.
#
# For every pair the loop copies the base config, overrides the swept
# hyperparameters, gives the run its own name and dump directory, and hands
# the result to `launch_job`.

# Parent directory shared by every run of this sweep (loop-invariant).
sweep_root = "/home/lama722b/nanite_lm/dumps/lr_sweep"

for lr in lrs:
    for width in widths:
        # Round-trip through a plain container so interpolations are resolved
        # and `base_config` itself is never mutated by the overrides below.
        current_config = OmegaConf.to_container(base_config, resolve=True)
        current_config = OmegaConf.create(current_config)

        # Override hyperparameters
        current_config.optim.lr = lr
        current_config.model.dim = width
        current_config.model.n_heads = fixed_n_heads

        # Build the data-mix part of the run name from the weights that are
        # actually configured. The name used to hard-code "DE_40_EN_60" while
        # the loaded config has de_shuffled=60.0 / en_shuffled=40.0, so every
        # run was labelled with the wrong mix.
        # e.g. {"de_shuffled": 60.0, "en_shuffled": 40.0} -> "DE_60_EN_40"
        mix_tag = "_".join(
            f"{source.split('_')[0].upper()}_{weight:g}"
            for source, weight in current_config.data.sources.items()
        )

        # Customize job name and dump directory for each run
        job_name = f"FineWeb_{mix_tag}_lr{lr}_dim{width}"
        dump_dir = os.path.join(
            sweep_root,
            f"sweep_lr{str(lr).replace('.', 'p')}_dim{width}",  # 'p' stands in for the decimal point
        )
        current_config.name = job_name
        current_config.dump_dir = dump_dir
        # Keep checkpoints inside this run's own dump directory.
        current_config.checkpoint.path = os.path.join(dump_dir, "checkpoints")

        print(f"Preparing to launch job: {job_name}")
        print(f"  Learning Rate: {current_config.optim.lr}")
        print(f"  Model Dimension: {current_config.model.dim}")
        print(f"  Number of Heads: {current_config.model.n_heads}")
        print(f"  Dump Directory: {current_config.dump_dir}")

        # Scheduler resources: one node with a single GPU per run.
        # NOTE(review): the single-job cell earlier in this notebook passes the
        # same environment path as `anaconda=` and `mem` as a string ("32Gb");
        # confirm which StoolArgs field names/types are current.
        stool_args = StoolArgs(
            config=current_config,
            launcher="sbatch",
            script="experiments.baseline_transformer.train",
            copy_code=False,
            dirs_exists_ok=True,  # Tolerate dump directories that already exist (useful when re-running a sweep)
            override=False,  # Set to True with extreme caution, as it deletes existing dump directories
            nodes=1,
            ngpu=1,
            # NOTE(review): the previous comment cited `--cpus-per-task=12`,
            # but 14 is what is requested here - confirm the intended value.
            ncpu=14,
            mem=4096,
            python_env="/data/horse/ws/lama722b-nanite-lm/nanite-lm/.env",
            constraint="",  # No node constraint
            exclude="",  # No excluded nodes
            # 2 hours, expressed in minutes. NOTE(review): the previous comment
            # described an 18:00:00 limit - confirm the intended walltime.
            time=2*60,
            account="p_scads_nlp",
        )

        # Submit the job for this configuration.
        launch_job(stool_args)
        print("-" * 50)