In [7]:
# Standard-library modules: `os` is used for path joins in the launch loop, `sys` for the path tweak below.
import os
import sys
# Put the nanite-lm checkout on the module search path.
# NOTE(review): presumably this is what lets `codebase.stool` be imported in the next cell — confirm.
sys.path.append("/data/horse/ws/lama722b-nanite-lm/nanite-lm/")

In [8]:
# NOTE: this rebinds the name `print` to `pprint.pprint` for the rest of the notebook.
# That is why plain strings appear quoted (and long ones wrapped in parentheses) in the cell outputs.
from pprint import pprint as print
import time
from typing import Any
from omegaconf import OmegaConf
from codebase.stool import StoolArgs, launch_job

In [9]:
# Timestamp (month, day, hour, minute, second) taken when this cell runs.
# It is not referenced by any of the cells visible here.
date = time.strftime("%m%d-%H%M%S")
# Debug config that serves as the template every sweep job copies and then overrides.
base_config_path = "/data/horse/ws/lama722b-nanite-lm/nanite-lm/experiments/mup/configs/debug.yaml"
base_config = OmegaConf.load(base_config_path)

In [10]:
# Show the loaded base config for inspection; `print` is `pprint.pprint` here (rebound in the import cell).
print(base_config)

{'dump_dir': '/home/lama722b/nanite_lm/dumps/mup_debug', 'name': 'mup_debug', 'probe_freq': 1, 'steps': 100, 'seed': 777, 'optim': {'lr': 0.0005, 'warmup': 10, 'lr_min_ratio': 1e-06, 'clip': 10.0, 'scaling_factor': 1.0}, 'distributed': {'fsdp_type': 'no_shard', 'compile': True, 'model_dtype': 'bf16', 'matmul_allow_tf32': False, 'selective_activation_checkpointing': False}, 'model': {'dim': 128, 'n_layers': 2, 'n_heads': 4, 'scaling_factor': 1.0}, 'data': {'root_dir': '/home/lama722b/nanite_lm/data/fineweb', 'sources': {'de_shuffled': 60.0, 'en_shuffled': 40.0}, 'batch_size': 1, 'prefetch_size': 4, 'seq_len': 512, 'n_views': 2, 'load_async': True, 'tokenizer': {'name': 'sp', 'path': '/home/lama722b/nanite_lm/tokenizers/gemma/tokenizer.model'}}}


In [11]:
# Sweep settings consumed by the job-launch loop in the next cell.
LEARNING_RATE = 1e-2  # written to optim.lr for every job
widths = [256, 512, 1024, 2048, 4096]  # values assigned to model.dim, one group of jobs per width
head_dim = 64  # model.n_heads is derived as width / head_dim
mup_base_width = 256  # optim/model scaling_factor is derived as width / mup_base_width
seeds = [1, 2, 3, 4, 5]  # one job is launched per (width, seed) pair

In [12]:
# Launch one sbatch job per (width, seed) pair.
#
# Every job gets an independent copy of `base_config` with:
#   * optim.lr, model.dim, model.n_heads and both scaling factors overridden,
#   * its own top-level `seed` (previously every job silently kept the base
#     config's seed, so the per-seed runs were not actually different seeds),
#   * its own dump directory and checkpoint path.

# Validate the whole sweep before submitting anything: a width that is not a
# multiple of head_dim / mup_base_width would otherwise be truncated silently
# when deriving n_heads and the scaling factor.
bad_widths = [w for w in widths if w % head_dim or w % mup_base_width]
if bad_widths:
    raise ValueError(
        f"widths {bad_widths} must be multiples of head_dim={head_dim} "
        f"and mup_base_width={mup_base_width}"
    )

for width in widths:
    # Quantities that depend only on the width are computed once per width.
    n_heads = width // head_dim
    width_multiplier = width // mup_base_width
    job_name = f"mup_model_dim_{width}"

    for seed in seeds:
        # Round-trip through a plain container to get a deep copy, so the
        # overrides below never leak into base_config or into other jobs.
        current_config = OmegaConf.create(
            OmegaConf.to_container(base_config, resolve=True)
        )

        # Override hyperparameters
        current_config.optim.lr = LEARNING_RATE
        current_config.model.dim = width
        current_config.model.n_heads = n_heads
        current_config.optim.scaling_factor = width_multiplier
        current_config.model.scaling_factor = width_multiplier
        # Propagate the sweep seed into the job's config (base config has a
        # top-level `seed` key); without this all seeds would be identical runs.
        current_config.seed = seed

        # Customize job name and dump directory for each run
        dump_dir = os.path.join(
            "/home/lama722b/nanite_lm/dumps/mup_coord_check",
            f"model_dim_{width}",
            f"seed_{seed}",
        )
        current_config.name = job_name
        current_config.dump_dir = dump_dir
        # The base config may have no `checkpoint` section; create it before
        # setting the path so attribute access does not fail.
        if not OmegaConf.select(current_config, "checkpoint", default=None):
            current_config.checkpoint = OmegaConf.create({})
        current_config.checkpoint.path = os.path.join(dump_dir, "checkpoints")

        print(f"Preparing to launch job: {job_name}")
        print(f"  Model Dimension: {current_config.model.dim}")
        print(f"  Number of Heads: {current_config.model.n_heads}")
        print(f"  Dump Directory: {current_config.dump_dir}")

        stool_args = StoolArgs(
            config=current_config,
            launcher="sbatch",
            script="experiments.mup.train",
            copy_code=True,
            dirs_exists_ok=True,  # presumably tolerates pre-existing dump directories — confirm in StoolArgs
            override=False,  # Set to True with extreme caution, as it deletes existing dump directories
            nodes=1,
            ngpu=1,  # From `--gres=gpu:1` in bash script
            ncpu=14,  # NOTE(review): the bash script reportedly used `--cpus-per-task=12`; confirm 14 is intended
            mem=4096,
            python_env="/data/horse/ws/lama722b-nanite-lm/nanite-lm/.env",  # Derived from `source /data/horse/ws/lama722b-nanite-lm/nanite-lm/.env/bin/activate`
            constraint="",  # No specific constraint in bash script
            exclude="",  # No exclusion in bash script
            time=2 * 60,  # 2 hours, expressed in minutes (the bash script used `--time=18:00:00`)
            account="p_scads_nlp",  # From `--account=p_scads_nlp`
            go_to_code_dir="/data/horse/ws/lama722b-nanite-lm/nanite-lm",
        )
        # Creates the dump directory, copies the code, saves the config and
        # submits the job (see the log lines printed below the cell).
        launch_job(stool_args)
        print("-" * 50)

'Preparing to launch job: mup_model_dim_256'
'  Model Dimension: 256'
'  Number of Heads: 4'
('  Dump Directory: '
 '/home/lama722b/nanite_lm/dumps/mup_coord_check/model_dim_256/seed_1')
Creating directories...
Copying code ...
Copying : /data/horse/ws/lama722b-nanite-lm/nanite-lm/experiments/mup
to      : /home/lama722b/nanite_lm/dumps/mup_coord_check/model_dim_256/seed_1/code ...
Copying command: rsync -arm --copy-links --include '**/' --include '*.py' --exclude='*' /data/horse/ws/lama722b-nanite-lm/nanite-lm/experiments/mup/ /home/lama722b/nanite_lm/dumps/mup_coord_check/model_dim_256/seed_1/code
Copy done.
Saving config file ...
Writing sbatch command ...
Submitting job ...
Submitted batch job 536414
Done.
'--------------------------------------------------'
'Preparing to launch job: mup_model_dim_256'
'  Model Dimension: 256'
'  Number of Heads: 4'
('  Dump Directory: '
 '/home/lama722b/nanite_lm/dumps/mup_coord_check/model_dim_256/seed_2')
Creating directories...
Copying code ...