In [1]:
# Shell out to nvidia-smi to check whether an NVIDIA GPU/driver is visible from this kernel.
!nvidia-smi


/bin/bash: line 1: nvidia-smi: command not found


# TEST

In [13]:
from utils import *


In [14]:
class ModelConfig:
    """Minimal holder for a model name and an optional checkpoint identifier."""

    def __init__(self, model_name, model_checkpoint=None):
        # Both values are kept verbatim as plain instance attributes.
        self.model_name, self.model_checkpoint = model_name, model_checkpoint

# Load the model/tokenizer pair for the "main" checkpoint of gemma-7b.
# load_Model is not defined in this notebook; presumably it comes from `from utils import *` — TODO confirm.
model, tokenizer = load_Model(ModelConfig("gemma-7b","main"))


Trying to load model : gemma-7b from ../Models/Gemma-7B


`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [32]:
# Tokenize a single space with the currently loaded tokenizer to inspect the ids it produces.
tokenizer([" "])

{'input_ids': [[1, 259]], 'attention_mask': [[1, 1]]}

In [29]:
# Same single-space probe. The result depends on whichever tokenizer is bound to `tokenizer`
# when the cell runs (NOTE(review): execution counts are out of order, so re-run after a fresh load).
tokenizer([" "])

{'input_ids': [[209]], 'token_type_ids': [[0]], 'attention_mask': [[1]]}

In [8]:
# Display the first transformer block to read off its submodule names.
# NOTE(review): this attribute path (and the recorded OLMoSequentialBlock output) belongs to an
# OLMo model, not the gemma-7b loaded earlier in the notebook — presumably run in a different kernel state.
model.model.transformer.blocks[0]

OLMoSequentialBlock(
  (dropout): Dropout(p=0.0, inplace=False)
  (act): SwiGLU()
  (attn_out): Linear(in_features=4096, out_features=4096, bias=False)
  (ff_out): Linear(in_features=11008, out_features=4096, bias=False)
  (rotary_emb): RotaryEmbedding()
  (attn_norm): LayerNorm()
  (ff_norm): LayerNorm()
  (att_proj): Linear(in_features=4096, out_features=12288, bias=False)
  (ff_proj): Linear(in_features=4096, out_features=22016, bias=False)
)

In [4]:
import subprocess
from datetime import datetime
import yaml
import numpy as np

# Experiment settings for the LLaMA-2 7B run; create_config / submit_job read these globals.
model_name = "llama2-7b"
dataset_name = "pile_uncopyrighted_parquet_test"
dataset_subset = [0]
ctx_len = 1024
batch_size = 2
model_checkpoints = ["main"]

# Layers to cache: layer 0 plus num_layers // 2 evenly spaced indices in
# [1, num_layers - 1]. For 32 layers this gives 17 indices (0, 1, 3, ..., 31).
num_layers = 32
layer_idx_list = [0, *np.linspace(1, num_layers - 1, num_layers // 2, dtype=int).tolist()]

# String forms of dataset_subset: space-separated and underscore-separated.
dataset_subset_str = " ".join(str(part) for part in dataset_subset)
dataset_subset_str_bar = "_".join(str(part) for part in dataset_subset)

def create_config(model_checkpoint, session_name):
    """Build the nested experiment configuration for one checkpoint.

    Besides its two arguments, this reads the notebook-level names model_name,
    dataset_name, dataset_subset, ctx_len, batch_size and layer_idx_list at
    call time.

    Args:
        model_checkpoint: Checkpoint identifier recorded under "model_config".
        session_name: Session label; also the final component of "session_path".

    Returns:
        dict: The configuration dictionary.
    """
    # Dotted module paths; "{layer}" stays a literal placeholder in these strings.
    module_paths = {
        "mlp": "model.layers.{layer}.mlp",
        "attn": "model.layers.{layer}.self_attn",
        "block": "model.layers.{layer}",
        "emb": "model.embed_tokens",
        "unemb": "model.norm",
    }
    model_section = {
        "model_name": model_name,
        "model_checkpoint": model_checkpoint,
        "use_accelerator": False,
        "module_name_mapping": module_paths,
    }
    chunking = {
        "min_chunks_from_a_document": 5,
        "max_chunks_from_a_document": 5,
    }
    dataset_section = {
        "dataset_name": dataset_name,
        "dataset_subset": dataset_subset,
        "max_dataset_size": 50000,
        "filter_and_chunk_config": chunking,
    }
    caching_section = {
        "layer_idx_list": layer_idx_list,
        "module_inblock_keys": ["block"],
        "module_outblock_keys": ["unemb"],
        "save_fp": "torch.float16",
        "save_cache_tensors": True,
        "save_mean_tensors": True,
        "save_IDs": True,
        "save_IDs_list": ["mle", "mind_ml", "twoNN_f10"],
    }
    session_root = "/home/gridsan/jsong/physics_dl_shared/ML_JY/ID_scaling/Data"
    return {
        "session_name": session_name,
        "session_path": f"{session_root}/{session_name}",
        "model_config": model_section,
        "dataset_config": dataset_section,
        "ctx_len": ctx_len,
        "batch_size": batch_size,
        "cacheing_config": caching_section,
        "multiprocessing": True,
        "multiprocessing_num_cpus": 25,
        "verbose": True,
    }

def submit_job(model_checkpoint):
    """Write the YAML config and SLURM script for one checkpoint, then submit it.

    Reads the notebook-level names model_name, dataset_name and
    dataset_subset_str_bar, and calls create_config.

    Args:
        model_checkpoint: Checkpoint identifier; used in the session name, the
            file names and the SLURM job name.

    Raises:
        subprocess.CalledProcessError: If sbatch exits with a non-zero status.
    """
    import os  # local import so this cell does not depend on a notebook-level import

    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    session_name = f"{current_time}_{model_name}_{model_checkpoint}_{dataset_name}_{dataset_subset_str_bar}"
    print(f"Submitting job for {session_name}")

    # Make sure every directory written to exists. LOG is included because the
    # job's --output/--error files go there and SLURM does not create it.
    for out_dir in ("Configs", "LLSubScripts", "LOG"):
        os.makedirs(out_dir, exist_ok=True)

    # Create config
    config = create_config(model_checkpoint, session_name)

    # Save config to a YAML file.
    # NOTE(review): the file name has no timestamp, so a resubmission for the same
    # model/checkpoint overwrites the config a still-queued job will read.
    config_filename = f"Configs/config_{model_name}_{model_checkpoint}.yaml"
    with open(config_filename, 'w') as f:
        yaml.dump(config, f)

    # Create SLURM script
    sbatch_script = f"""#!/bin/bash
#SBATCH --job-name={model_name}_{model_checkpoint}
#SBATCH --time=72:10:00
#SBATCH --output=LOG/%x-%j.out
#SBATCH --error=LOG/%x-%j.err
#SBATCH -N 1
#SBATCH -c 40
#SBATCH --gres=gpu:volta:1
#SBATCH --mem-per-cpu=7G

echo "Model name: {model_name}"
echo "Model checkpoint: {model_checkpoint}"
echo "CPUs per task: $SLURM_CPUS_PER_TASK"

module load anaconda/2023a-pytorch
python Run_experiment.py --config_path {config_filename}
"""

    script_filename = f"LLSubScripts/submit_job_{model_name}_{model_checkpoint}.sbatch"
    with open(script_filename, "w") as f:
        f.write(sbatch_script)

    # Submit the job; check=True turns a failed submission into an exception
    # instead of letting it pass silently.
    subprocess.run(["sbatch", script_filename], check=True)

# Submit one SLURM job per entry in model_checkpoints.
# NOTE: re-running this cell submits the jobs again (submit_job invokes sbatch on every call).
for checkpoint in model_checkpoints:
    submit_job(checkpoint)

Submitting job for 20240810_015346_llama2-7b_main_pile_uncopyrighted_parquet_test_0
Submitted batch job 26768542


In [15]:
import subprocess
from datetime import datetime
import yaml
import numpy as np

# Experiment settings for the OLMo 7B run; create_config / submit_job read these globals.
model_name = "olmo-7b"
dataset_name = "pile_uncopyrighted_parquet_test"
dataset_subset = [0]
ctx_len = 1024
batch_size = 2
model_checkpoints = ["main"]

# Layers to cache: layer 0 plus num_layers // 2 evenly spaced indices in
# [1, num_layers - 1] (OLMo 7B has 32 layers).
num_layers = 32
layer_idx_list = [0, *np.linspace(1, num_layers - 1, num_layers // 2, dtype=int).tolist()]

# String forms of dataset_subset: space-separated and underscore-separated.
dataset_subset_str = " ".join(str(part) for part in dataset_subset)
dataset_subset_str_bar = "_".join(str(part) for part in dataset_subset)

def create_config(model_checkpoint, session_name):
    """Build the nested experiment configuration for one OLMo checkpoint.

    Besides its two arguments, this reads the notebook-level names model_name,
    dataset_name, dataset_subset, ctx_len, batch_size and layer_idx_list at
    call time.

    Args:
        model_checkpoint: Checkpoint identifier recorded under "model_config".
        session_name: Session label; also the final component of "session_path".

    Returns:
        dict: The configuration dictionary.
    """
    config = {
        "session_name": session_name,
        "session_path": f"/home/gridsan/jsong/physics_dl_shared/ML_JY/ID_scaling/Data/{session_name}",
        "model_config": {
            "model_name": model_name,
            "model_checkpoint": model_checkpoint,
            "use_accelerator": False,
            # Dotted module paths; "{layer}" stays a literal placeholder here.
            "module_name_mapping": {
                "mlp": "model.transformer.blocks.{layer}.ff_out",
                # The attention output projection is named `attn_out` in the
                # OLMoSequentialBlock printed earlier in this notebook;
                # the previous `att_out` does not exist on the block.
                "attn": "model.transformer.blocks.{layer}.attn_out",
                "block": "model.transformer.blocks.{layer}",
                "emb": "model.transformer.wte",
                "unemb": "model.transformer.ln_f",
            }
        },
        "dataset_config": {
            "dataset_name": dataset_name,
            "dataset_subset": dataset_subset,
            "max_dataset_size": 50000,
            "filter_and_chunk_config": {
                "min_chunks_from_a_document": 5,
                "max_chunks_from_a_document": 5
            }
        },
        "ctx_len": ctx_len,
        "batch_size": batch_size,
        "cacheing_config": {
            "layer_idx_list": layer_idx_list,
            "module_inblock_keys": ['block'],
            "module_outblock_keys": ['unemb'],
            "save_fp": "torch.float16",
            "save_cache_tensors": True,
            "save_mean_tensors": True,
            "save_IDs": True,
            "save_IDs_list": ['mle', 'mind_ml', 'twoNN_f10'],
        },
        'multiprocessing': True,
        'multiprocessing_num_cpus': 25,
        'verbose': True
    }
    return config

def submit_job(model_checkpoint):
    """Write the YAML config and SLURM script for one checkpoint, then submit it.

    Reads the notebook-level names model_name, dataset_name and
    dataset_subset_str_bar, and calls create_config.

    Args:
        model_checkpoint: Checkpoint identifier; used in the session name, the
            file names and the SLURM job name.

    Raises:
        subprocess.CalledProcessError: If sbatch exits with a non-zero status.
    """
    import os  # local import so this cell does not depend on a notebook-level import

    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    session_name = f"{current_time}_{model_name}_{model_checkpoint}_{dataset_name}_{dataset_subset_str_bar}"
    print(f"Submitting job for {session_name}")

    # Make sure every directory written to exists. LOG is included because the
    # job's --output/--error files go there and SLURM does not create it.
    for out_dir in ("Configs", "LLSubScripts", "LOG"):
        os.makedirs(out_dir, exist_ok=True)

    # Create config
    config = create_config(model_checkpoint, session_name)

    # Save config to a YAML file.
    # NOTE(review): the file name has no timestamp, so a resubmission for the same
    # model/checkpoint overwrites the config a still-queued job will read.
    config_filename = f"Configs/config_{model_name}_{model_checkpoint}.yaml"
    with open(config_filename, 'w') as f:
        yaml.dump(config, f)

    # Create SLURM script
    sbatch_script = f"""#!/bin/bash
#SBATCH --job-name={model_name}_{model_checkpoint}
#SBATCH --time=72:10:00
#SBATCH --output=LOG/%x-%j.out
#SBATCH --error=LOG/%x-%j.err
#SBATCH -N 1
#SBATCH -c 40
#SBATCH --gres=gpu:volta:1
#SBATCH --mem-per-cpu=7G
echo "Model name: {model_name}"
echo "Model checkpoint: {model_checkpoint}"
echo "CPUs per task: $SLURM_CPUS_PER_TASK"
module load anaconda/2023a-pytorch
python Run_experiment.py --config_path {config_filename}
"""

    script_filename = f"LLSubScripts/submit_job_{model_name}_{model_checkpoint}.sbatch"
    with open(script_filename, "w") as f:
        f.write(sbatch_script)

    # Submit the job; check=True turns a failed submission into an exception
    # instead of letting it pass silently.
    subprocess.run(["sbatch", script_filename], check=True)

# Submit one SLURM job per entry in model_checkpoints.
# NOTE: re-running this cell submits the jobs again (submit_job invokes sbatch on every call).
for checkpoint in model_checkpoints:
    submit_job(checkpoint)

Submitting job for 20240810_174822_olmo-7b_main_pile_uncopyrighted_parquet_test_0
Submitted batch job 26774399


In [13]:
import subprocess
from datetime import datetime
import yaml
import numpy as np

# Experiment settings for the Mistral 7B run; create_config / submit_job read these globals.
model_name = "mistral-7b"
dataset_name = "pile_uncopyrighted_parquet_test"
dataset_subset = [0]
ctx_len = 1024  # Mistral supports longer contexts; kept at 1024 as requested
batch_size = 2
model_checkpoints = ["main"]

# Layers to cache: layer 0 plus num_layers // 2 evenly spaced indices in
# [1, num_layers - 1] (Mistral 7B has 32 layers).
num_layers = 32
layer_idx_list = [0, *np.linspace(1, num_layers - 1, num_layers // 2, dtype=int).tolist()]

# String forms of dataset_subset: space-separated and underscore-separated.
dataset_subset_str = " ".join(str(part) for part in dataset_subset)
dataset_subset_str_bar = "_".join(str(part) for part in dataset_subset)

def create_config(model_checkpoint, session_name):
    """Build the nested experiment configuration for one Mistral checkpoint.

    Besides its two arguments, this reads the notebook-level names model_name,
    dataset_name, dataset_subset, ctx_len, batch_size and layer_idx_list at
    call time.

    Args:
        model_checkpoint: Checkpoint identifier recorded under "model_config".
        session_name: Session label; also the final component of "session_path".

    Returns:
        dict: The configuration dictionary.
    """
    config = {
        "session_name": session_name,
        "session_path": f"/home/gridsan/jsong/physics_dl_shared/ML_JY/ID_scaling/Data/{session_name}",
        "model_config": {
            "model_name": model_name,
            "model_checkpoint": model_checkpoint,
            "use_accelerator": False,
            # Dotted module paths; "{layer}" stays a literal placeholder here.
            "module_name_mapping": {
                # The other keys here (self_attn, input_layernorm, ...) follow the
                # Hugging Face decoder-layer naming, where the MLP submodule is
                # `mlp`; `feed_forward` is the name in Mistral's reference code.
                # NOTE(review): assumes the model is loaded through transformers — confirm.
                "mlp": "model.layers.{layer}.mlp",
                "attn": "model.layers.{layer}.self_attn",
                "block": "model.layers.{layer}",
                "emb": "model.embed_tokens",
                "unemb": "model.norm",
                "input_layernorm": "model.layers.{layer}.input_layernorm",
                "post_attention_layernorm": "model.layers.{layer}.post_attention_layernorm"
            }
        },
        "dataset_config": {
            "dataset_name": dataset_name,
            "dataset_subset": dataset_subset,
            "max_dataset_size": 50000,
            "filter_and_chunk_config": {
                "min_chunks_from_a_document": 5,
                "max_chunks_from_a_document": 5
            }
        },
        "ctx_len": ctx_len,
        "batch_size": batch_size,
        "cacheing_config": {
            "layer_idx_list": layer_idx_list,
            "module_inblock_keys": ['block'],
            "module_outblock_keys": ['unemb'],
            "save_fp": "torch.float16",
            "save_cache_tensors": True,
            "save_mean_tensors": True,
            "save_IDs": True,
            "save_IDs_list": ['mle', 'mind_ml', 'twoNN_f10'],
        },
        'multiprocessing': True,
        'multiprocessing_num_cpus': 25,
        'verbose': True
    }
    return config

def submit_job(model_checkpoint):
    """Write the YAML config and SLURM script for one checkpoint, then submit it.

    Reads the notebook-level names model_name, dataset_name and
    dataset_subset_str_bar, and calls create_config.

    Args:
        model_checkpoint: Checkpoint identifier; used in the session name, the
            file names and the SLURM job name.

    Raises:
        subprocess.CalledProcessError: If sbatch exits with a non-zero status.
    """
    import os  # local import so this cell does not depend on a notebook-level import

    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    session_name = f"{current_time}_{model_name}_{model_checkpoint}_{dataset_name}_{dataset_subset_str_bar}"
    print(f"Submitting job for {session_name}")

    # Make sure every directory written to exists. LOG is included because the
    # job's --output/--error files go there and SLURM does not create it.
    for out_dir in ("Configs", "LLSubScripts", "LOG"):
        os.makedirs(out_dir, exist_ok=True)

    # Create config
    config = create_config(model_checkpoint, session_name)

    # Save config to a YAML file.
    # NOTE(review): the file name has no timestamp, so a resubmission for the same
    # model/checkpoint overwrites the config a still-queued job will read.
    config_filename = f"Configs/config_{model_name}_{model_checkpoint}.yaml"
    with open(config_filename, 'w') as f:
        yaml.dump(config, f)

    # Create SLURM script
    sbatch_script = f"""#!/bin/bash
#SBATCH --job-name={model_name}_{model_checkpoint}
#SBATCH --time=72:10:00
#SBATCH --output=LOG/%x-%j.out
#SBATCH --error=LOG/%x-%j.err
#SBATCH -N 1
#SBATCH -c 32
#SBATCH --gres=gpu:volta:1
#SBATCH --mem-per-cpu=7G
echo "Model name: {model_name}"
echo "Model checkpoint: {model_checkpoint}"
echo "CPUs per task: $SLURM_CPUS_PER_TASK"
module load anaconda/2023a-pytorch
python Run_experiment.py --config_path {config_filename}
"""

    script_filename = f"LLSubScripts/submit_job_{model_name}_{model_checkpoint}.sbatch"
    with open(script_filename, "w") as f:
        f.write(sbatch_script)

    # Submit the job; check=True turns a failed submission into an exception
    # instead of letting it pass silently.
    subprocess.run(["sbatch", script_filename], check=True)

# Submit one SLURM job per entry in model_checkpoints.
# NOTE: re-running this cell submits the jobs again (submit_job invokes sbatch on every call).
for checkpoint in model_checkpoints:
    submit_job(checkpoint)

Submitting job for 20240810_020645_mistral-7b_main_pile_uncopyrighted_parquet_test_0
Submitted batch job 26768690


In [16]:
import subprocess
from datetime import datetime
import yaml
import numpy as np

# Experiment settings for the Pythia 6.9B (deduped) run; create_config / submit_job read these globals.
model_name = "pythia-6.9b-deduped"
dataset_name = "pile_uncopyrighted_parquet_test"
dataset_subset = [0]
ctx_len = 1024  # adjust if needed for Pythia
batch_size = 2
model_checkpoints = ["main"]

# Layers to cache: layer 0 plus num_layers // 2 evenly spaced indices in
# [1, num_layers - 1] (Pythia-6.9B has 32 layers).
num_layers = 32
layer_idx_list = [0, *np.linspace(1, num_layers - 1, num_layers // 2, dtype=int).tolist()]

# String forms of dataset_subset: space-separated and underscore-separated.
dataset_subset_str = " ".join(str(part) for part in dataset_subset)
dataset_subset_str_bar = "_".join(str(part) for part in dataset_subset)

def create_config(model_checkpoint, session_name):
    """Build the nested experiment configuration for one checkpoint.

    Besides its two arguments, this reads the notebook-level names model_name,
    dataset_name, dataset_subset, ctx_len, batch_size and layer_idx_list at
    call time.

    Args:
        model_checkpoint: Checkpoint identifier recorded under "model_config".
        session_name: Session label; also the final component of "session_path".

    Returns:
        dict: The configuration dictionary.
    """
    # Dotted module paths; "{layer}" stays a literal placeholder in these strings.
    module_paths = {
        "mlp": "gpt_neox.layers.{layer}.mlp",
        "attn": "gpt_neox.layers.{layer}.attention",
        "block": "gpt_neox.layers.{layer}",
        "emb": "gpt_neox.embed_in",
        "unemb": "gpt_neox.final_layer_norm",
        "input_layernorm": "gpt_neox.layers.{layer}.input_layernorm",
        "post_attention_layernorm": "gpt_neox.layers.{layer}.post_attention_layernorm",
    }
    model_section = {
        "model_name": model_name,
        "model_checkpoint": model_checkpoint,
        "use_accelerator": False,
        "module_name_mapping": module_paths,
    }
    chunking = {
        "min_chunks_from_a_document": 5,
        "max_chunks_from_a_document": 5,
    }
    dataset_section = {
        "dataset_name": dataset_name,
        "dataset_subset": dataset_subset,
        "max_dataset_size": 50000,
        "filter_and_chunk_config": chunking,
    }
    caching_section = {
        "layer_idx_list": layer_idx_list,
        "module_inblock_keys": ["block"],
        "module_outblock_keys": ["unemb"],
        "save_fp": "torch.float16",
        "save_cache_tensors": True,
        "save_mean_tensors": True,
        "save_IDs": True,
        "save_IDs_list": ["mle", "mind_ml", "twoNN_f10"],
    }
    session_root = "/home/gridsan/jsong/physics_dl_shared/ML_JY/ID_scaling/Data"
    return {
        "session_name": session_name,
        "session_path": f"{session_root}/{session_name}",
        "model_config": model_section,
        "dataset_config": dataset_section,
        "ctx_len": ctx_len,
        "batch_size": batch_size,
        "cacheing_config": caching_section,
        "multiprocessing": True,
        "multiprocessing_num_cpus": 25,
        "verbose": True,
    }

def submit_job(model_checkpoint):
    """Write the YAML config and SLURM script for one checkpoint, then submit it.

    Reads the notebook-level names model_name, dataset_name and
    dataset_subset_str_bar, and calls create_config.

    Args:
        model_checkpoint: Checkpoint identifier; used in the session name, the
            file names and the SLURM job name.

    Raises:
        subprocess.CalledProcessError: If sbatch exits with a non-zero status.
    """
    import os  # local import so this cell does not depend on a notebook-level import

    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    session_name = f"{current_time}_{model_name}_{model_checkpoint}_{dataset_name}_{dataset_subset_str_bar}"
    print(f"Submitting job for {session_name}")

    # Make sure every directory written to exists. LOG is included because the
    # job's --output/--error files go there and SLURM does not create it.
    for out_dir in ("Configs", "LLSubScripts", "LOG"):
        os.makedirs(out_dir, exist_ok=True)

    # Create config
    config = create_config(model_checkpoint, session_name)

    # Save config to a YAML file.
    # NOTE(review): the file name has no timestamp, so a resubmission for the same
    # model/checkpoint overwrites the config a still-queued job will read.
    config_filename = f"Configs/config_{model_name}_{model_checkpoint}.yaml"
    with open(config_filename, 'w') as f:
        yaml.dump(config, f)

    # Create SLURM script
    sbatch_script = f"""#!/bin/bash
#SBATCH --job-name={model_name}_{model_checkpoint}
#SBATCH --time=72:10:00
#SBATCH --output=LOG/%x-%j.out
#SBATCH --error=LOG/%x-%j.err
#SBATCH -N 1
#SBATCH -c 32
#SBATCH --gres=gpu:volta:1
#SBATCH --mem-per-cpu=7G

echo "Model name: {model_name}"
echo "Model checkpoint: {model_checkpoint}"
echo "CPUs per task: $SLURM_CPUS_PER_TASK"

module load anaconda/2023a-pytorch
python Run_experiment.py --config_path {config_filename}
"""

    script_filename = f"LLSubScripts/submit_job_{model_name}_{model_checkpoint}.sbatch"
    with open(script_filename, "w") as f:
        f.write(sbatch_script)

    # Submit the job; check=True turns a failed submission into an exception
    # instead of letting it pass silently.
    subprocess.run(["sbatch", script_filename], check=True)

# Submit one SLURM job per entry in model_checkpoints.
# NOTE: re-running this cell submits the jobs again (submit_job invokes sbatch on every call).
for checkpoint in model_checkpoints:
    submit_job(checkpoint)

Submitting job for 20240810_180311_pythia-6.9b-deduped_main_pile_uncopyrighted_parquet_test_0
Submitted batch job 26774557


In [12]:
import subprocess
from datetime import datetime
import yaml
import numpy as np

# Experiment settings for the Gemma run; create_config / submit_job read these globals.
model_name = "gemma-7b"  # or 'gemma-2b'
dataset_name = "pile_uncopyrighted_parquet_test"
dataset_subset = [0]
ctx_len = 1024  # Gemma supports up to 8192
batch_size = 1  # reduced batch size due to model size
model_checkpoints = ["main"]

# Layers to cache: layer 0 plus num_layers // 4 + 1 evenly spaced indices in
# [1, num_layers - 1], truncated to integers (9 indices in total for 28 layers).
num_layers = 28
layer_idx_list = [0, *np.linspace(1, num_layers - 1, num_layers // 4 + 1, dtype=int).tolist()]

# String forms of dataset_subset: space-separated and underscore-separated.
dataset_subset_str = " ".join(str(part) for part in dataset_subset)
dataset_subset_str_bar = "_".join(str(part) for part in dataset_subset)

def create_config(model_checkpoint, session_name):
    """Build the nested experiment configuration for one Gemma checkpoint.

    Besides its two arguments, this reads the notebook-level names model_name,
    dataset_name, dataset_subset, ctx_len, batch_size and layer_idx_list at
    call time.

    Args:
        model_checkpoint: Checkpoint identifier recorded under "model_config".
        session_name: Session label; also the final component of "session_path".

    Returns:
        dict: The configuration dictionary.
    """
    config = {
        "session_name": session_name,
        "session_path": f"/home/gridsan/jsong/physics_dl_shared/ML_JY/ID_scaling/Data/{session_name}",
        "model_config": {
            "model_name": model_name,
            "model_checkpoint": model_checkpoint,
            "use_accelerator": True,
            # Dotted module paths; "{layer}" stays a literal placeholder here.
            "module_name_mapping": {
                "mlp": "model.layers.{layer}.mlp",
                # The Hugging Face Gemma decoder layer names its attention
                # submodule `self_attn`; there is no `attention` attribute.
                "attn": "model.layers.{layer}.self_attn",
                "block": "model.layers.{layer}",
                "emb": "model.embed_tokens",
                "unemb": "model.norm",
                # NOTE(review): a per-layer rotary_emb exists only in some
                # transformers versions — confirm against the installed one.
                "rotary_emb": "model.layers.{layer}.self_attn.rotary_emb",
                "input_layernorm": "model.layers.{layer}.input_layernorm",
                "post_attention_layernorm": "model.layers.{layer}.post_attention_layernorm"
            }
        },
        "dataset_config": {
            "dataset_name": dataset_name,
            "dataset_subset": dataset_subset,
            "max_dataset_size": 50000,
            "filter_and_chunk_config": {
                "min_chunks_from_a_document": 5,
                "max_chunks_from_a_document": 5
            }
        },
        "ctx_len": ctx_len,
        "batch_size": batch_size,
        "cacheing_config": {
            "layer_idx_list": layer_idx_list,
            "module_inblock_keys": ['block'],
            "module_outblock_keys": ['unemb'],
            "save_fp": "torch.float16",
            "save_cache_tensors": True,
            "save_mean_tensors": True,
            "save_IDs": True,
            "save_IDs_list": ['mle', 'mind_ml', 'twoNN_f10'],
        },
        'multiprocessing': True,
        'multiprocessing_num_cpus': 25,  # Adjusted for potential memory constraints
        'verbose': True
    }
    return config

def submit_job(model_checkpoint):
    """Write the YAML config and SLURM script for one checkpoint, then submit it.

    Reads the notebook-level names model_name, dataset_name and
    dataset_subset_str_bar, and calls create_config.

    Args:
        model_checkpoint: Checkpoint identifier; used in the session name, the
            file names and the SLURM job name.

    Raises:
        subprocess.CalledProcessError: If sbatch exits with a non-zero status.
    """
    import os  # local import so this cell does not depend on a notebook-level import

    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    session_name = f"{current_time}_{model_name}_{model_checkpoint}_{dataset_name}_{dataset_subset_str_bar}"
    print(f"Submitting job for {session_name}")

    # Make sure every directory written to exists. LOG is included because the
    # job's --output/--error files go there and SLURM does not create it.
    for out_dir in ("Configs", "LLSubScripts", "LOG"):
        os.makedirs(out_dir, exist_ok=True)

    # Create config
    config = create_config(model_checkpoint, session_name)

    # Save config to a YAML file.
    # NOTE(review): the file name has no timestamp, so a resubmission for the same
    # model/checkpoint overwrites the config a still-queued job will read.
    config_filename = f"Configs/config_{model_name}_{model_checkpoint}.yaml"
    with open(config_filename, 'w') as f:
        yaml.dump(config, f)

    # Create SLURM script
    sbatch_script = f"""#!/bin/bash
#SBATCH --job-name={model_name}_{model_checkpoint}
#SBATCH --time=72:10:00
#SBATCH --output=LOG/%x-%j.out
#SBATCH --error=LOG/%x-%j.err
#SBATCH -N 1
#SBATCH -c 40
#SBATCH --gres=gpu:volta:1
#SBATCH --mem-per-cpu=7G
echo "Model name: {model_name}"
echo "Model checkpoint: {model_checkpoint}"
echo "CPUs per task: $SLURM_CPUS_PER_TASK"
module load anaconda/2023a-pytorch
python Run_experiment.py --config_path {config_filename}
"""

    script_filename = f"LLSubScripts/submit_job_{model_name}_{model_checkpoint}.sbatch"
    with open(script_filename, "w") as f:
        f.write(sbatch_script)

    # Submit the job; check=True turns a failed submission into an exception
    # instead of letting it pass silently.
    subprocess.run(["sbatch", script_filename], check=True)

# Submit one SLURM job per entry in model_checkpoints.
# NOTE: re-running this cell submits the jobs again (submit_job invokes sbatch on every call).
for checkpoint in model_checkpoints:
    submit_job(checkpoint)

Submitting job for 20240810_171854_gemma-7b_main_pile_uncopyrighted_parquet_test_0
Submitted batch job 26774205
