## Setup

In [1]:
# List the NVIDIA GPUs visible on this host (driver/CUDA versions, memory use, utilization).
!nvidia-smi

Fri Feb 14 06:26:36 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off | 00000000:0A:00.0 Off |                    0 |
| N/A   30C    P0              54W / 500W |      0MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM4-80GB          Off | 00000000:0B:00.0 Off |  

In [2]:
!pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable


Collecting torch==2.5.1
  Downloading torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl (906.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m906.4/906.4 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting torchao==0.6.1
  Downloading torchao-0.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m104.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.20.1
  Downloading torchvision-0.20.1-cp310-cp310-manylinux1_x86_64.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m137.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting torchtune==0.4.0
  Downloading torchtune-0.4.0-py3-none-any.whl (686 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m686.9/686.9 KB[0m [31m95.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb==0.19.0
  Downloading wandb-0.19.0-py3-non

In [3]:
from IPython import get_ipython
from IPython.core.magic import register_cell_magic

ipython = get_ipython()


@register_cell_magic
def pybash(line, cell):
    """Run the cell body as a bash script after filling in ``{NAME}`` placeholders.

    The cell text is passed through ``str.format`` with this module's global
    namespace as keyword arguments, so ``{NAME}`` is replaced by the current
    value of the global ``NAME``. Because ``str.format`` is used, a literal
    brace in the bash script must be doubled (``{{`` / ``}}``), and a
    placeholder naming an undefined global raises ``KeyError``.

    Args:
        line: Text following ``%%pybash`` on the magic line; not used.
        cell: The body of the cell, i.e. the bash script template.
    """
    namespace = globals()
    rendered_script = cell.format(**namespace)
    # Delegate execution to the built-in %%bash cell magic, with no magic-line arguments.
    ipython.run_cell_magic('bash', '', rendered_script)

In [4]:
# Load the `dotenv` IPython extension and run its %dotenv magic.
# NOTE(review): presumably this reads a .env file from the working directory and
# exports its entries into os.environ (the HF_TOKEN_* variables read below) — confirm.
%load_ext dotenv
%dotenv

## Configuration

In [5]:
import os

In [6]:
## CONFIG
# Number of processes handed to `tune run --nproc_per_node` in the fine-tune cell.
NUM_GPUS = 4
# Hugging Face token taken from the HF_TOKEN_R environment variable; raises KeyError if it is unset.
HF_TOKEN = os.environ["HF_TOKEN_R"]
# Glob handed to `tune download --ignore-patterns` in the download cell.
IGNORE_PATTERNS = "original/consolidated*"
# File name of the recipe config; CONFIG_FILE_PATH below prefixes it with config/.
CONFIG_FILE = "llama_3_1_8b_lora_distributed.yaml"

## MODEL
# Hugging Face organization that owns the fine-tuned model repo.
ORGANIZATION = "multimodalai"
# Hub id of the base model that is downloaded and fine-tuned.
BASE_MODEL_HF_ID = "meta-llama/Llama-3.1-8B"
# The remaining fields are the components of the fine-tuned model's name.
CLIENT = "resume-critique"
MODEL = "llama3_1_8b"
MODEL_NUMBER = "3"
REV_N = "3"
FT_METHOD = "tt_lora"
MODEL_TYPE = "adapter"

## DATA
DATA_SIZE = "2k"
TRAINING_DATA = "resume_critique_model_3_v2.jsonl"

# Derived identifiers: <client>-<model>-<method>-model_<n>_<size>-<type>-rev_<n>
MDATA_ID = f"model_{MODEL_NUMBER}_{DATA_SIZE}"
REV = f"rev_{REV_N}"
FT_MODEL_NAME = f"{CLIENT}-{MODEL}-{FT_METHOD}-{MDATA_ID}-{MODEL_TYPE}-{REV}"
# Build the hub id from ORGANIZATION instead of repeating the literal, so the repo that
# is created (--organization ORGANIZATION) and the repo that is uploaded to cannot diverge.
FT_MODEL_HF_ID = f"{ORGANIZATION}/{FT_MODEL_NAME}"

## PATH
# Directory the base model is downloaded into (also used as the checkpointer's checkpoint_dir).
BASE_MODEL_PATH = "base_model/"
# Joined with os.path.join so the trailing slash on BASE_MODEL_PATH does not yield
# "base_model//original/tokenizer.model".
TOKENIZER_PATH = os.path.join(BASE_MODEL_PATH, "original", "tokenizer.model")
# Where the fine-tuning run writes its checkpoints; later uploaded to the Hub.
OUTPUT_MODEL_PATH = f"checkpoint/{ORGANIZATION}/{FT_MODEL_NAME}"
TRAINING_DATA_PATH = f"data/{TRAINING_DATA}"
CONFIG_FILE_PATH = f"config/{CONFIG_FILE}"

## TRACKING
# Values passed to the recipe's metric_logger (group, run name, log directory).
WANDB_GROUP_NAME = CLIENT
RUN_WANDB_NAME = f"run-{FT_MODEL_NAME}"
LOGS_PATH = "logs/"

In [7]:
# Create the checkpoint and log directories (including parents); a no-op if they already
# exist. Done in Python rather than via `!mkdir` so the paths are never parsed by a shell.
os.makedirs(OUTPUT_MODEL_PATH, exist_ok=True)
os.makedirs(LOGS_PATH, exist_ok=True)

## Download Base Model

In [8]:
%%pybash
# Download the base model into the local base-model directory.
# The token is expanded by bash from the HF_TOKEN_R environment variable instead of being
# formatted into the script from the Python global HF_TOKEN, so this cell uses the same
# token regardless of which cells ran before it.
# The ignore pattern is quoted so bash passes the glob through unexpanded.
# (Keep literal braces out of this cell: the text goes through str.format first.)
tune download "{BASE_MODEL_HF_ID}" \
    --output-dir "{BASE_MODEL_PATH}" \
    --ignore-patterns "{IGNORE_PATTERNS}" \
    --hf-token "$HF_TOKEN_R"

Ignoring files matching the following patterns: original/consolidated*


Fetching 16 files: 100%|██████████| 16/16 [01:58<00:00,  7.41s/it]


Successfully downloaded model repo and wrote to the following locations:
/home/ubuntu/internal-foundry-fine-tune/resume-critique/model_3_rev_3/base_model/special_tokens_map.json
/home/ubuntu/internal-foundry-fine-tune/resume-critique/model_3_rev_3/base_model/.gitattributes
/home/ubuntu/internal-foundry-fine-tune/resume-critique/model_3_rev_3/base_model/tokenizer.json
/home/ubuntu/internal-foundry-fine-tune/resume-critique/model_3_rev_3/base_model/LICENSE
/home/ubuntu/internal-foundry-fine-tune/resume-critique/model_3_rev_3/base_model/model.safetensors.index.json
/home/ubuntu/internal-foundry-fine-tune/resume-critique/model_3_rev_3/base_model/tokenizer_config.json
/home/ubuntu/internal-foundry-fine-tune/resume-critique/model_3_rev_3/base_model/model-00004-of-00004.safetensors
/home/ubuntu/internal-foundry-fine-tune/resume-critique/model_3_rev_3/base_model/README.md
/home/ubuntu/internal-foundry-fine-tune/resume-critique/model_3_rev_3/base_model/original
/home/ubuntu/internal-foundry-fin

## Fine-Tune

In [9]:
%%pybash
# Launch the distributed LoRA fine-tuning recipe, overriding the YAML config with the
# paths and tracking names defined in the configuration cell.
# Interpolated values are quoted so a path containing spaces or glob characters stays a
# single argument; continuation lines use spaces only.
# (Keep literal braces out of this cell: the text goes through str.format first.)
tune run \
    --nproc_per_node "{NUM_GPUS}" \
    lora_finetune_distributed \
    --config "{CONFIG_FILE_PATH}" \
    tokenizer.path="{TOKENIZER_PATH}" \
    checkpointer.checkpoint_dir="{BASE_MODEL_PATH}" \
    checkpointer.output_dir="{OUTPUT_MODEL_PATH}" \
    dataset.data_files="{TRAINING_DATA_PATH}" \
    metric_logger.group="{WANDB_GROUP_NAME}" \
    metric_logger.name="{RUN_WANDB_NAME}" \
    output_dir="{OUTPUT_MODEL_PATH}" \
    metric_logger.log_dir="{LOGS_PATH}"

Running with torchrun...


W0214 06:30:45.825000 9921 torch/distributed/run.py:793] 
W0214 06:30:45.825000 9921 torch/distributed/run.py:793] *****************************************
W0214 06:30:45.825000 9921 torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W0214 06:30:45.825000 9921 torch/distributed/run.py:793] *****************************************
INFO:torchtune.utils._logging:Running LoRAFinetuneRecipeDistributed with resolved config:

batch_size: 8
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: base_model/
  checkpoint_files:
  - model-00001-of-00004.safetensors
  - model-00002-of-00004.safetensors
  - model-00003-of-00004.safetensors
  - model-00004-of-00004.safetensors
  model_type: LLAMA3
  output_dir: checkpoint/multimodalai/resume-critique-llama3_1_8b-tt_lora-model_3

## Save Model HF

In [10]:
# NOTE(review): this rebinds HF_TOKEN, which the configuration cell set from HF_TOKEN_R.
# Any cell executed after this one sees the HF_TOKEN_W value instead, so re-running
# earlier cells out of order will use a different token. Raises KeyError if HF_TOKEN_W is unset.
HF_TOKEN = os.environ["HF_TOKEN_W"]

In [11]:
%%pybash
# Log in to the Hugging Face Hub, create the target model repo, and upload the
# fine-tuning output directory to it.
# The token is expanded by bash from the HF_TOKEN_W environment variable, so this cell
# does not rely on an earlier cell having rebound the Python global HF_TOKEN.
# Interpolated names and paths are quoted so each stays a single argument.
# (Keep literal braces out of this cell: the text goes through str.format first.)
huggingface-cli login --token "$HF_TOKEN_W"
huggingface-cli repo create -y --organization "{ORGANIZATION}" "{FT_MODEL_NAME}"
huggingface-cli upload "{FT_MODEL_HF_ID}" "{OUTPUT_MODEL_PATH}"

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `leo-mm-write-token-2` has been saved to /home/ubuntu/.cache/huggingface/stored_tokens
Your token has been saved to /home/ubuntu/.cache/huggingface/token
Login successful.
The current active token is: `leo-mm-write-token-2`


[90mgit version 2.34.1[0m
[1m[31mLooks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once).[0m

You are about to create [1mmultimodalai/resume-critique-llama3_1_8b-tt_lora-model_3_2k-adapter-rev_3[0m

Your repo now lives at:
  [1mhttps://huggingface.co/multimodalai/resume-critique-llama3_1_8b-tt_lora-model_3_2k-adapter-rev_3[0m

You can clone it locally with the command below, and commit/push as usual.

  git clone https://huggingface.co/multimodalai/resume-critique-llama3_1_8b-tt_lora-model_3_2k-adapter-rev_3



Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
Start hashing 19 files.


Finished hashing 19 files.
adapter_0.pt:   0%|          | 0.00/40.0M [00:00<?, ?B/s]
adapter_1.pt:   0%|          | 0.00/40.0M [00:00<?, ?B/s][A

adapter_2.pt:   0%|          | 0.00/40.0M [00:00<?, ?B/s][A[A


adapter_model.bin:   0%|          | 0.00/40.0M [00:00<?, ?B/s][A[A[A



hf_model_0001_0.pt:   0%|          | 0.00/4.98G [00:00<?, ?B/s][A[A[A[A




adapter_0.pt:  13%|█▎        | 5.18M/40.0M [00:00<00:00, 51.7MB/s][A[A[A
adapter_1.pt:  14%|█▎        | 5.47M/40.0M [00:00<00:00, 54.7MB/s][A


adapter_model.bin:  15%|█▍        | 5.87M/40.0M [00:00<00:00, 58.6MB/s][A[A[A



hf_model_0001_0.pt:   0%|          | 6.24M/4.98G [00:00<01:19, 62.4MB/s][A[A[A[A

adapter_0.pt:  40%|████      | 16.0M/40.0M [00:00<00:00, 38.8MB/s][A[A


adapter_model.bin:  40%|████      | 16.0M/40.0M [00:00<00:00, 31.0MB/s][A[A[A

adapter_2.pt:  25%|██▌       | 10.0M/40.0M [00:00<00:01, 17.0MB/s][A[A



hf_model_0001_0.pt:   0%|          | 16.0M/4.98G [00:00<03:26, 24.0MB/s][A[A[A

https://huggingface.co/multimodalai/resume-critique-llama3_1_8b-tt_lora-model_3_2k-adapter-rev_3/tree/main/.
