## Setup

In [18]:
!nvidia-smi

Fri Feb 14 18:53:24 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off | 00000000:0A:00.0 Off |                    0 |
| N/A   29C    P0              57W / 500W |      0MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM4-80GB          Off | 00000000:0B:00.0 Off |  

In [21]:
!pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable


Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.8/374.8 KB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting transformers
  Downloading transformers-4.48.3-py3-none-any.whl (9.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting accelerate>=0.21.0
  Downloading accelerate-1.3.0-py3-none-any.whl (336 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.6/336.6 KB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m83.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Installing collected packages: tokenizers, transformers, accelerate, peft
Successfully installed 

In [19]:
from IPython import get_ipython
from IPython.core.magic import register_cell_magic

ipython = get_ipython()
@register_cell_magic
def pybash(line, cell):
    ipython.run_cell_magic('bash', '', cell.format(**globals()))

In [20]:
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


## Configuration

In [9]:
import os

In [39]:
## CONFIG
NUM_GPUS = 4
HF_TOKEN = os.environ["HF_TOKEN_R"]
IGNORE_PATTERNS = "original/consolidated*"
CONFIG_FILE = "llama_3_1_8b_lora_distributed.yaml"

## MODEL
ORGANIZATION = "multimodalai"
BASE_MODEL_HF_ID = "meta-llama/Llama-3.1-8B"
CLIENT = "resume-critique"
MODEL = "llama3_1_8b"
MODEL_NUMBER = "4"
REV_N = "3"
FT_METHOD = "tt_lora"
MODEL_TYPE = "adapter"

MDATA_ID = f"model_{MODEL_NUMBER}_2k"
REV = f"rev_{REV_N}"
FT_MODEL_NAME = f"{CLIENT}-{MODEL}-{FT_METHOD}-{MDATA_ID}-{MODEL_TYPE}-{REV}"
FT_MODEL_HF_ID = f"multimodalai/{FT_MODEL_NAME}"

## DATASET
TRAINING_DATA = "resume_critique_model_4_v3.jsonl"
TEST_DATA = "resume_critique_model_4_test.jsonl"

## PATH
BASE_MODEL_PATH = "base_model/"
TOKENIZER_PATH = f"{BASE_MODEL_PATH}/original/tokenizer.model"
OUTPUT_MODEL_PATH = f"checkpoint/{ORGANIZATION}/{FT_MODEL_NAME}"
TRAINING_DATA_PATH = f"data/{TRAINING_DATA}"
TEST_DATA_PATH = f"data/{TEST_DATA}"
CONFIG_FILE_PATH = f"config/{CONFIG_FILE}"

## TRACKING
WANDB_GROUP_NAME = CLIENT
RUN_WANDB_NAME = f"run-{FT_MODEL_NAME}"
LOGS_PATH = "logs/"

In [11]:
!mkdir -p {OUTPUT_MODEL_PATH}
!mkdir -p {LOGS_PATH}

## Download Base Model

In [12]:
%%pybash
tune download {BASE_MODEL_HF_ID} --output-dir {BASE_MODEL_PATH} --ignore-patterns {IGNORE_PATTERNS} --hf-token {HF_TOKEN}

Ignoring files matching the following patterns: original/consolidated*


Fetching 16 files: 100%|██████████| 16/16 [01:59<00:00,  7.44s/it]


Successfully downloaded model repo and wrote to the following locations:
/home/ubuntu/internal-foundry-fine-tune/resume-critique/model_4_rev_3/base_model/special_tokens_map.json
/home/ubuntu/internal-foundry-fine-tune/resume-critique/model_4_rev_3/base_model/.gitattributes
/home/ubuntu/internal-foundry-fine-tune/resume-critique/model_4_rev_3/base_model/tokenizer.json
/home/ubuntu/internal-foundry-fine-tune/resume-critique/model_4_rev_3/base_model/LICENSE
/home/ubuntu/internal-foundry-fine-tune/resume-critique/model_4_rev_3/base_model/model.safetensors.index.json
/home/ubuntu/internal-foundry-fine-tune/resume-critique/model_4_rev_3/base_model/tokenizer_config.json
/home/ubuntu/internal-foundry-fine-tune/resume-critique/model_4_rev_3/base_model/model-00004-of-00004.safetensors
/home/ubuntu/internal-foundry-fine-tune/resume-critique/model_4_rev_3/base_model/README.md
/home/ubuntu/internal-foundry-fine-tune/resume-critique/model_4_rev_3/base_model/original
/home/ubuntu/internal-foundry-fin

## Fine-Tune

In [13]:
%%pybash
tune run \
    --nproc_per_node {NUM_GPUS} \
    lora_finetune_distributed \
    --config {CONFIG_FILE_PATH} \
    tokenizer.path={TOKENIZER_PATH} \
    checkpointer.checkpoint_dir={BASE_MODEL_PATH} \
    checkpointer.output_dir={OUTPUT_MODEL_PATH} \
    dataset.data_files={TRAINING_DATA_PATH} \
    metric_logger.group={WANDB_GROUP_NAME} \
    metric_logger.name={RUN_WANDB_NAME} \
	output_dir={OUTPUT_MODEL_PATH} \
	metric_logger.log_dir={LOGS_PATH}

Running with torchrun...


W0214 16:29:03.540000 10538 torch/distributed/run.py:793] 
W0214 16:29:03.540000 10538 torch/distributed/run.py:793] *****************************************
W0214 16:29:03.540000 10538 torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W0214 16:29:03.540000 10538 torch/distributed/run.py:793] *****************************************
INFO:torchtune.utils._logging:Running LoRAFinetuneRecipeDistributed with resolved config:

batch_size: 8
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: base_model/
  checkpoint_files:
  - model-00001-of-00004.safetensors
  - model-00002-of-00004.safetensors
  - model-00003-of-00004.safetensors
  - model-00004-of-00004.safetensors
  model_type: LLAMA3
  output_dir: checkpoint/multimodalai/resume-critique-llama3_1_8b-tt_lora-mod

## Save the Model

In [15]:
HF_TOKEN = os.environ["HF_TOKEN_W"]

In [16]:
%%pybash
huggingface-cli login --token {HF_TOKEN}
huggingface-cli repo create -y --organization {ORGANIZATION} {FT_MODEL_NAME}
huggingface-cli upload {FT_MODEL_HF_ID} {OUTPUT_MODEL_PATH}

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `leo-mm-write-token-2` has been saved to /home/ubuntu/.cache/huggingface/stored_tokens
Your token has been saved to /home/ubuntu/.cache/huggingface/token
Login successful.
The current active token is: `leo-mm-write-token-2`


[90mgit version 2.34.1[0m
[1m[31mLooks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once).[0m

You are about to create [1mmultimodalai/resume-critique-llama3_1_8b-tt_lora-model_4_2k-adapter-rev_3[0m

Your repo now lives at:
  [1mhttps://huggingface.co/multimodalai/resume-critique-llama3_1_8b-tt_lora-model_4_2k-adapter-rev_3[0m

You can clone it locally with the command below, and commit/push as usual.

  git clone https://huggingface.co/multimodalai/resume-critique-llama3_1_8b-tt_lora-model_4_2k-adapter-rev_3



Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
Start hashing 19 files.
Finished hashing 19 files.
adapter_0.pt:   0%|          | 0.00/40.0M [00:00<?, ?B/s]
adapter_1.pt:   0%|          | 0.00/40.0M [00:00<?, ?B/s][A

adapter_2.pt:   0%|          | 0.00/40.0M [00:00<?, ?B/s][A[A



Upload 17 LFS files:   0%|          | 0/17 [00:00<?, ?it/s][A[A[A[A


adapter_model.bin:   0%|          | 0.00/40.0M [00:00<?, ?B/s][A[A[A




adapter_0.pt:   8%|▊         | 3.19M/40.0M [00:00<00:01, 31.9MB/s][A[A[A[A
adapter_1.pt:  21%|██        | 8.36M/40.0M [00:00<00:00, 83.4MB/s][A

adapter_2.pt:  10%|▉         | 3.85M/40.0M [00:00<00:00, 38.5MB/s][A[A


adapter_model.bin:  22%|██▏       | 8.68M/40.0M [00:00<00:00, 86.7MB/s][A[A[A




adapter_0.pt:  22%|██▏       | 8.60M/40.0M [00:00<00:01, 27.7MB/s]7MB/s][A[A[A[A[A


adapter_0.pt:  32%|███▏      | 12.7M/40.0M

https://huggingface.co/multimodalai/resume-critique-llama3_1_8b-tt_lora-model_4_2k-adapter-rev_3/tree/main/.


## Evaluation

In [69]:
import torch
import pandas as pd
from huggingface_hub import login
from peft import PeftModel, PeftModelForCausalLM
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM 
from collections import defaultdict
from tqdm import tqdm
from pathlib import Path
import pickle

In [40]:
TEST_DATA_PATH

'data/resume_critique_model_4_test.jsonl'

In [41]:
df = pd.read_json(TEST_DATA_PATH, orient="records", lines=True)

In [48]:
len(df)

30

In [42]:
OUTPUT_MODEL_PATH

'checkpoint/multimodalai/resume-critique-llama3_1_8b-tt_lora-model_4_2k-adapter-rev_3'

In [43]:
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_PATH, torch_dtype=torch.float16, device_map="cuda:0")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.08it/s]


In [44]:
peft_model = PeftModelForCausalLM.from_pretrained(model, OUTPUT_MODEL_PATH)

In [45]:
tokenizer = AutoTokenizer.from_pretrained(
	BASE_MODEL_PATH,
	padding_side = "right",
	add_eos_token = True
)
tokenizer.pad_token = tokenizer.eos_token

In [51]:
gen_params = {
	"temperature": 0.3,
	"max_new_tokens": 300,
    "top_p": 0.95
}

In [61]:
# eos_list = ["\n\n", ""]

def generate_llm(prompt, gen_params):
	prompt = prompt.split("### Response:\n")[0] + "### Response:\n"
	inputs = tokenizer(prompt, return_tensors = "pt").to("cuda:0")
	generate_ids = model.generate(inputs.input_ids, **gen_params)
	returned = tokenizer.batch_decode(generate_ids, skip_special_tokens = False)[0]
	generated_text = returned.split("### Response:\n")[1]
	generated_text = generated_text.rsplit(".", 1)[0] + "."
	return generated_text

In [62]:
prompt_ex = df.iloc[0].text
res = generate_llm(prompt_ex, gen_params)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [63]:
print(res)

'Introduction': Your work history section contains substantial content, which is a positive aspect of your CV. However, it is essential to present this information in an engaging and impactful way to truly capture the attention of potential employers. This section should not only list job titles and companies but also provide insights into your responsibilities and achievements.
'Achiever': Your work history does include substantive content, but it could benefit from a stronger emphasis on accomplishments. Striking a balance between listing tasks and showcasing achievements is crucial; this will help you appear as an 'achiever' rather than just a 'do-er.' By rewriting your duties to highlight measurable outcomes, you can significantly enhance the impact of your CV.
'Achievements': ['Developed and implemented comprehensive records management policies and procedures, ensuring compliance and efficiency.', 'Led multiple high-profile projects, resulting in the successful digitization and ar

In [64]:
responses = defaultdict(list)

In [65]:
prompts = df.text.tolist()

In [66]:
gen_params = {
	"temperature": 0.45,
	"max_new_tokens": 300,
    "top_p": 0.90
}

prompts = df.text.tolist()
results = []
for prompt in tqdm(prompts):
	res = generate_llm(prompt, gen_params)
	results.append(res)
responses["0.45"] += results

  0%|          | 0/30 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 1/30 [00:09<04:41,  9.71s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  7%|▋         | 2/30 [00:19<04:30,  9.66s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 10%|█         | 3/30 [00:28<04:20,  9.65s/it]The attention mask and the pad token id were not set. As a consequen

In [67]:
gen_params = {
	"temperature": 0.65,
	"max_new_tokens": 300,
    "top_p": 0.95
}

prompts = df.text.tolist()
results = []
for prompt in tqdm(prompts):
	res = generate_llm(prompt, gen_params)
	results.append(res)
responses["0.60"] += results

  0%|          | 0/30 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 1/30 [00:09<04:44,  9.81s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  7%|▋         | 2/30 [00:19<04:35,  9.83s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 10%|█         | 3/30 [00:29<04:24,  9.80s/it]The attention mask and the pad token id were not set. As a consequen

In [68]:
gen_params = {
	"temperature": 0.75,
	"max_new_tokens": 300,
    "top_p": 0.97
}

prompts = df.text.tolist()
results = []
for prompt in tqdm(prompts):
	res = generate_llm(prompt, gen_params)
	results.append(res)
responses["0.70"] += results

  0%|          | 0/30 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 1/30 [00:09<04:47,  9.92s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  7%|▋         | 2/30 [00:19<04:36,  9.87s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 10%|█         | 3/30 [00:29<04:25,  9.85s/it]The attention mask and the pad token id were not set. As a consequen

In [71]:
Path("data/results_model_4_rev_3.pkl").write_bytes(pickle.dumps(responses))

144710