## Setup

In [1]:
# Show the GPU(s) visible to this machine (driver, memory, utilisation) before starting.
!nvidia-smi

Tue Dec 10 15:48:42 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A40                     Off | 00000000:0A:00.0 Off |                    0 |
|  0%   29C    P8              23W / 300W |      0MiB / 46068MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Install the project's dependencies with the %pip magic so they land in the
# environment of the running kernel (a bare `!pip` uses whichever pip is first on PATH).
%pip install -r requirements.txt



In [None]:
from IPython import get_ipython
from IPython.core.magic import register_cell_magic

ipython = get_ipython()


@register_cell_magic
def pybash(line, cell):
    """Cell magic that runs the cell body with ``%%bash`` after templating it.

    The cell text is passed through ``str.format`` with this module's globals,
    so every ``{NAME}`` placeholder is replaced by the notebook-level variable
    of that name before the script is executed. Because ``str.format`` is used,
    literal braces in the script have to be doubled (``{{`` / ``}}``), and a
    placeholder with no matching global raises ``KeyError``.

    Args:
        line: Text following ``%%pybash`` on the magic line; not used.
        cell: The bash script template making up the rest of the cell.
    """
    script = cell.format(**globals())
    ipython.run_cell_magic('bash', '', script)

In [2]:
# Load the dotenv IPython extension, then read the local .env file into the environment.
# NOTE(review): presumably this is what supplies HF_TOKEN_R for the config cell — confirm.
%load_ext dotenv
%dotenv

## Configuration

In [9]:
import os

In [None]:
## CONFIG
# Run-level settings: hardware, credentials and the recipe config file.
NUM_GPUS = 1
# Read the Hugging Face token from the environment; a missing variable raises KeyError here.
HF_TOKEN = os.environ["HF_TOKEN_R"]
# Glob of files to skip when downloading the base model.
IGNORE_PATTERNS = "original/consolidated*"
CONFIG_FILE = "llama_3_1_8b_lora_single.yaml"

## MODEL / DATA
# The individual parts below are joined into FT_MODEL_NAME, which names the
# output checkpoint directory and the tracking run.
BASE_MODEL_HF_ID = "meta-llama/Llama-3.1-8B"
CLIENT = "resume-critique"
MODEL = "llama3_1_8b"
FT_METHOD = "tt_lora"
MDATA_ID = "model_1_20k"
MODEL_TYPE = "adapter"
REV = "rev_1"
FT_MODEL_NAME = f"{CLIENT}-{MODEL}-{FT_METHOD}-{MDATA_ID}-{MODEL_TYPE}-{REV}"
TRAINING_DATA = "resume_critique_model_1.jsonl"

## PATH
# All paths are relative to the notebook's working directory.
TOKENIZER_PATH = "base_model/original/tokenizer.model"
BASE_MODEL_PATH = "base_model/"
OUTPUT_MODEL_PATH = f"checkpoint/multimodalai/{FT_MODEL_NAME}"
TRAINING_DATA_PATH = f"data/{TRAINING_DATA}"
CONFIG_FILE_PATH = f"config/{CONFIG_FILE}"

## TRACKING
WANDB_GROUP_NAME = CLIENT
# Must be an assignment (`=`), not an annotation (`:`): the fine-tune cells
# interpolate this name, and an annotation alone would leave it undefined.
RUN_WANDB_NAME = f"run-{FT_MODEL_NAME}"
LOGS_PATH = "logs/"

In [None]:
# Create the checkpoint and log directories up front. Parents are created as needed
# and an already-existing directory is not an error; any other failure raises OSError
# instead of being printed and ignored by a shell call.
os.makedirs(OUTPUT_MODEL_PATH, exist_ok=True)
os.makedirs(LOGS_PATH, exist_ok=True)

## Download Base Model

In [None]:
# Download the base model into BASE_MODEL_PATH. The ignore pattern is quoted so the
# shell passes the glob to `tune` verbatim instead of expanding it against local files.
!tune download {BASE_MODEL_HF_ID} --output-dir {BASE_MODEL_PATH} --ignore-patterns "{IGNORE_PATTERNS}" --hf-token {HF_TOKEN}

Ignoring files matching the following patterns: original/consolidated*
Fetching 16 files:   0%|                                 | 0/16 [00:00<?, ?it/s]
generation_config.json: 100%|███████████████████| 185/185 [00:00<00:00, 615kB/s][A

USE_POLICY.md: 100%|███████████████████████| 4.69k/4.69k [00:00<00:00, 11.5MB/s][A

LICENSE: 100%|█████████████████████████████| 7.63k/7.63k [00:00<00:00, 11.6MB/s][A

.gitattributes: 100%|██████████████████████| 1.52k/1.52k [00:00<00:00, 6.33MB/s][A
Fetching 16 files:   6%|█▌                       | 1/16 [00:00<00:02,  6.58it/s]
config.json: 100%|█████████████████████████████| 826/826 [00:00<00:00, 3.26MB/s][A

README.md: 100%|███████████████████████████| 40.9k/40.9k [00:00<00:00, 9.29MB/s][A

model-00002-of-00004.safetensors:   0%|             | 0.00/5.00G [00:00<?, ?B/s][A

model-00001-of-00004.safetensors:   0%|             | 0.00/4.98G [00:00<?, ?B/s][A[A


model.safetensors.index.json: 100%|████████| 23.9k/23.9k [00:00<00:00, 43.8MB/s][A

## Fine-Tune

In [None]:
# Launch the distributed LoRA recipe, overriding paths and tracking names with the
# values from the config cell. The process count is taken from NUM_GPUS so it is
# set in one place rather than hard-coded here.
!tune run \
    --nproc_per_node {NUM_GPUS} \
    lora_finetune_distributed \
    --config {CONFIG_FILE_PATH} \
    tokenizer.path={TOKENIZER_PATH} \
    checkpointer.checkpoint_dir={BASE_MODEL_PATH} \
    checkpointer.output_dir={OUTPUT_MODEL_PATH} \
    dataset.data_files={TRAINING_DATA_PATH} \
    metric_logger.group={WANDB_GROUP_NAME} \
    metric_logger.name={RUN_WANDB_NAME}


In [39]:
# Debug helper: print the interpolated config path to check what the shell receives.
!echo \
	{CONFIG_FILE_PATH}

config/llama_3_1_8b_lora_single


In [32]:
from torchtune.training.metric_logging import WandBLogger

In [None]:
%%pybash
# Single-device LoRA fine-tune; placeholders are filled in by the pybash magic.
tune run \
    lora_finetune_single_device \
    --config {CONFIG_FILE_PATH} \
    tokenizer.path={TOKENIZER_PATH} \
    checkpointer.checkpoint_dir={BASE_MODEL_PATH} \
    checkpointer.output_dir={OUTPUT_MODEL_PATH} \
    dataset.data_files={TRAINING_DATA_PATH} \
    metric_logger.group={WANDB_GROUP_NAME} \
    metric_logger.name={RUN_WANDB_NAME} \
    output_dir={OUTPUT_MODEL_PATH} \
    metric_logger.log_dir={LOGS_PATH}

Traceback (most recent call last):
  File "/home/ubuntu/miniconda3/bin/tune", line 8, in <module>
    sys.exit(main())
             ^^^^^^
  File "/home/ubuntu/miniconda3/lib/python3.12/site-packages/torchtune/_cli/tune.py", line 49, in main
    parser.run(args)
  File "/home/ubuntu/miniconda3/lib/python3.12/site-packages/torchtune/_cli/tune.py", line 43, in run
    args.func(args)
  File "/home/ubuntu/miniconda3/lib/python3.12/site-packages/torchtune/_cli/run.py", line 208, in _run_cmd
    self._run_single_device(args, is_builtin=is_builtin)
  File "/home/ubuntu/miniconda3/lib/python3.12/site-packages/torchtune/_cli/run.py", line 102, in _run_single_device
    runpy.run_path(str(args.recipe), run_name="__main__")
  File "<frozen runpy>", line 287, in run_path
  File "<frozen runpy>", line 98, in _run_module_code
  File "<frozen runpy>", line 88, in _run_code
  File "/home/ubuntu/miniconda3/lib/python3.12/site-packages/recipes/lora_finetune_single_device.py", line 800, in <module>
    