## Setup

In [1]:
!nvidia-smi

Fri Dec 13 01:17:04 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |


|   0  NVIDIA A40                     Off | 00000000:0B:00.0 Off |                    0 |
|  0%   29C    P8              21W / 300W |      0MiB / 46068MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A40                     Off | 00000000:0C:00.0 Off |                    0 |
|  0%   29C    P8              21W / 300W |      0MiB / 46068MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   2  NVIDIA A40                     Off | 00000000:0D:00.0 Off |                    0 |
|  0%   29C    P8              21W / 300W |      0MiB / 46068MiB |      0%      Default |
|                                         |                      |                  N/A |
+---------

In [2]:
!pip install -r requirements.txt

Collecting torch==2.5.1 (from -r requirements.txt (line 1))
  Downloading torch-2.5.1-cp312-cp312-manylinux1_x86_64.whl.metadata (28 kB)
Collecting torchao==0.6.1 (from -r requirements.txt (line 2))
  Downloading torchao-0.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting torchvision==0.20.1 (from -r requirements.txt (line 3))
  Downloading torchvision-0.20.1-cp312-cp312-manylinux1_x86_64.whl.metadata (6.1 kB)
Collecting torchtune==0.4.0 (from -r requirements.txt (line 4))
  Downloading torchtune-0.4.0-py3-none-any.whl.metadata (19 kB)
Collecting wandb==0.19.0 (from -r requirements.txt (line 5))
  Downloading wandb-0.19.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting python-dotenv==1.0.1 (from -r requirements.txt (line 6))
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting filelock (from torch==2.5.1->-r requirements.txt (line 1))
  Downloading filelock-3.16.1-py3-none-any.whl.met

In [3]:
from IPython import get_ipython
from IPython.core.magic import register_cell_magic

ipython = get_ipython()
@register_cell_magic
def pybash(line, cell):
    ipython.run_cell_magic('bash', '', cell.format(**globals()))

In [9]:
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


## Configuration

In [8]:
import os

In [17]:
## CONFIG
NUM_GPUS = 4
HF_TOKEN = os.environ["HF_TOKEN_R"]
IGNORE_PATTERNS = "original/consolidated*"
CONFIG_FILE = "llama_3_1_8b_lora_distributed_resume.yaml"

## MODEL
FT_MODEL_REPO = "multimodalai"
BASE_MODEL_HF_ID = "meta-llama/Llama-3.1-8B"
CLIENT = "resume-critique"
MODEL = "llama3_1_8b"
MODEL_NUMBER = "2"
REV_N = "1"
FT_METHOD = "tt_lora"
MODEL_TYPE = "adapter"

MDATA_ID = f"model_{MODEL_NUMBER}_20k"
REV = f"rev_{REV_N}"
FT_MODEL_NAME = f"{CLIENT}-{MODEL}-{FT_METHOD}-{MDATA_ID}-{MODEL_TYPE}-{REV}"
FT_MODEL_HF_ID = f"multimodalai/{FT_MODEL_NAME}"

## DATASET
TRAINING_DATA = "resume_critique_model_2.jsonl"

## PATH
BASE_MODEL_PATH = "base_model/"
TOKENIZER_PATH = f"{BASE_MODEL_PATH}/original/tokenizer.model"
OUTPUT_MODEL_PATH = f"checkpoint/{FT_MODEL_REPO}/{FT_MODEL_NAME}"
TRAINING_DATA_PATH = f"data/{TRAINING_DATA}"
CONFIG_FILE_PATH = f"config/{CONFIG_FILE}"

## TRACKING
WANDB_GROUP_NAME = CLIENT
RUN_WANDB_NAME = f"run-{FT_MODEL_NAME}"
LOGS_PATH = "logs/"

In [11]:
!mkdir -p {OUTPUT_MODEL_PATH}
!mkdir -p {LOGS_PATH}

## Download Base Model

In [21]:
%%pybash
tune download {BASE_MODEL_HF_ID} --output-dir {BASE_MODEL_PATH} --ignore-patterns {IGNORE_PATTERNS} --hf-token {HF_TOKEN}

Ignoring files matching the following patterns: original/consolidated*


Fetching 16 files: 100%|██████████| 16/16 [01:59<00:00,  7.50s/it]


Successfully downloaded model repo and wrote to the following locations:
/home/ubuntu/Development/interntal-fine-tune-foundry/resume-critique/model_2_rev_1/base_model/special_tokens_map.json
/home/ubuntu/Development/interntal-fine-tune-foundry/resume-critique/model_2_rev_1/base_model/.gitattributes
/home/ubuntu/Development/interntal-fine-tune-foundry/resume-critique/model_2_rev_1/base_model/tokenizer.json
/home/ubuntu/Development/interntal-fine-tune-foundry/resume-critique/model_2_rev_1/base_model/LICENSE
/home/ubuntu/Development/interntal-fine-tune-foundry/resume-critique/model_2_rev_1/base_model/model.safetensors.index.json
/home/ubuntu/Development/interntal-fine-tune-foundry/resume-critique/model_2_rev_1/base_model/tokenizer_config.json
/home/ubuntu/Development/interntal-fine-tune-foundry/resume-critique/model_2_rev_1/base_model/model-00004-of-00004.safetensors
/home/ubuntu/Development/interntal-fine-tune-foundry/resume-critique/model_2_rev_1/base_model/README.md
/home/ubuntu/Develo

## Fine-Tune

In [15]:
%%pybash
tune run \
    --nproc_per_node {NUM_GPUS} \
    lora_finetune_distributed \
    --config {CONFIG_FILE_PATH} \
    tokenizer.path={TOKENIZER_PATH} \
    checkpointer.checkpoint_dir={BASE_MODEL_PATH} \
    checkpointer.output_dir={OUTPUT_MODEL_PATH} \
    dataset.data_files={TRAINING_DATA_PATH} \
    metric_logger.group={WANDB_GROUP_NAME} \
    metric_logger.name={RUN_WANDB_NAME} \
	output_dir={OUTPUT_MODEL_PATH} \
	metric_logger.log_dir={LOGS_PATH}

Running with torchrun...


W1213 01:25:26.200000 11482 site-packages/torch/distributed/run.py:793] 
W1213 01:25:26.200000 11482 site-packages/torch/distributed/run.py:793] *****************************************
W1213 01:25:26.200000 11482 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W1213 01:25:26.200000 11482 site-packages/torch/distributed/run.py:793] *****************************************
Traceback (most recent call last):
  File "/home/ubuntu/miniconda3/bin/tune", line 8, in <module>
    sys.exit(main())
             ^^^^^^
  File "/home/ubuntu/miniconda3/lib/python3.12/site-packages/torchtune/_cli/tune.py", line 49, in main
    parser.run(args)
  File "/home/ubuntu/miniconda3/lib/python3.12/site-packages/torchtune/_cli/tune.py", line 43, in run
    args.func(args)
  File "/home/ubuntu/miniconda3/

CalledProcessError: Command 'b'tune run \\\n    --nproc_per_node 4 \\\n    lora_finetune_distributed \\\n    --config config/llama_3_1_8b_lora_distributed.yaml \\\n    tokenizer.path=base_model//original/tokenizer.model \\\n    checkpointer.checkpoint_dir=base_model/ \\\n    checkpointer.output_dir=checkpoint/multimodalai/resume-critique-llama3_1_8b-tt_lora-model_2_20k-adapter-rev_1 \\\n    dataset.data_files=data/resume_critique_model_2.jsonl \\\n    metric_logger.group=resume-critique \\\n    metric_logger.name=run-resume-critique-llama3_1_8b-tt_lora-model_2_20k-adapter-rev_1 \\\n\toutput_dir=checkpoint/multimodalai/resume-critique-llama3_1_8b-tt_lora-model_2_20k-adapter-rev_1 \\\n\tmetric_logger.log_dir=logs/\n'' returned non-zero exit status 1.

In [6]:
OUTPUT_MODEL_PATH

NameError: name 'OUTPUT_MODEL_PATH' is not defined

In [12]:
%%pybash
tune download {FT_MODEL_HF_ID} --output-dir {OUTPUT_MODEL_PATH} --hf-token {HF_TOKEN}

Ignoring files matching the following patterns: *.safetensors


Fetching 10 files: 100%|██████████| 10/10 [02:00<00:00, 12.01s/it]


Successfully downloaded model repo and wrote to the following locations:
/home/ubuntu/Development/interntal-fine-tune-foundry/resume-critique/model_2_rev_1/checkpoint/multimodalai/resume-critique-llama3_1_8b-tt_lora-model_2_20k-adapter-rev_1/hf_model_0001_0.pt
/home/ubuntu/Development/interntal-fine-tune-foundry/resume-critique/model_2_rev_1/checkpoint/multimodalai/resume-critique-llama3_1_8b-tt_lora-model_2_20k-adapter-rev_1/.gitattributes
/home/ubuntu/Development/interntal-fine-tune-foundry/resume-critique/model_2_rev_1/checkpoint/multimodalai/resume-critique-llama3_1_8b-tt_lora-model_2_20k-adapter-rev_1/adapter_0.pt
/home/ubuntu/Development/interntal-fine-tune-foundry/resume-critique/model_2_rev_1/checkpoint/multimodalai/resume-critique-llama3_1_8b-tt_lora-model_2_20k-adapter-rev_1/hf_model_0003_0.pt
/home/ubuntu/Development/interntal-fine-tune-foundry/resume-critique/model_2_rev_1/checkpoint/multimodalai/resume-critique-llama3_1_8b-tt_lora-model_2_20k-adapter-rev_1/adapter_model.bi

In [18]:
CONFIG_FILE_PATH

'config/llama_3_1_8b_lora_distributed_resume.yaml'

In [None]:
%%pybash
tune run \
    --nproc_per_node {NUM_GPUS} \
    lora_finetune_distributed \
    --config {CONFIG_FILE_PATH} \
    tokenizer.path={TOKENIZER_PATH} \
    checkpointer.checkpoint_dir={OUTPUT_MODEL_PATH} \
    checkpointer.output_dir={OUTPUT_MODEL_PATH} \
    dataset.data_files={TRAINING_DATA_PATH} \
    metric_logger.group={WANDB_GROUP_NAME} \
    metric_logger.name={RUN_WANDB_NAME} \
	output_dir={OUTPUT_MODEL_PATH} \
	metric_logger.log_dir={LOGS_PATH}

Running with torchrun...


W1213 01:31:45.150000 14054 site-packages/torch/distributed/run.py:793] 
W1213 01:31:45.150000 14054 site-packages/torch/distributed/run.py:793] *****************************************
W1213 01:31:45.150000 14054 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W1213 01:31:45.150000 14054 site-packages/torch/distributed/run.py:793] *****************************************
INFO:torchtune.utils._logging:Running LoRAFinetuneRecipeDistributed with resolved config:

batch_size: 4
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  adapter_checkpoint: adapter_0.pt
  checkpoint_dir: checkpoint/multimodalai/resume-critique-llama3_1_8b-tt_lora-model_2_20k-adapter-rev_1
  checkpoint_files:
  - hf_model_0001_0.pt
  - hf_model_0002_0.pt
  - hf_model_0003_0.pt
  - hf_model