In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM

In [1]:
!pip install pandas \
    torch\
    transformers==4.43.1 \
    scikit-learn\
    accelerate \
    openpyxl

Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting transformers==4.43.1
  Downloading transformers-4.43.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers==4.43.1)
  Downloading huggingface_hub-0.30.1-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers==4.43.1)
  Downloading regex-2024.11.

In [3]:
from huggingface_hub import login

login(token="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")

In [4]:
import torch
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,  # bfloat16
    device_map="auto"
)
model.resize_token_embeddings(len(tokenizer))
#padding token
model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
print("Tokenizer vocab_size:", tokenizer.vocab_size)
print("Model embedding size:", model.get_input_embeddings().num_embeddings)
print(len(tokenizer))

Tokenizer vocab_size: 128000
Model embedding size: 128257
128257


In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.16-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Downloading aiosignal-1.3.2-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting frozenlist>=1.1.1 (f

In [3]:
 !pip install peft==0.5.0

Collecting peft==0.5.0
  Downloading peft-0.5.0-py3-none-any.whl.metadata (22 kB)
Downloading peft-0.5.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.5.0
[0m

In [8]:
import os
import json
import torch
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

In [9]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 3,407,872 || all params: 8,033,677,312 || trainable%: 0.04241982678230828


In [10]:
data_files = {"train": "./llama_medical_instruction_data.json"}
dataset = load_dataset("json", data_files=data_files)

In [11]:
def preprocess_function(examples):
    prompts = []
    for instruction, inp, output in zip(examples["instruction"], examples["input"], examples["output"]):
        if isinstance(inp, list):
            inp = " ".join(inp)
        prompt = f"Instruction: {instruction}\n"
        if inp and isinstance(inp, str) and inp.strip():
            prompt += f"Input: {inp}\n"
        prompt += f"Output: {output}"
        prompts.append(prompt)
    tokenized = tokenizer(prompts, padding="max_length", truncation=True, max_length=512)
    # tokenized["labels"] = tokenized["input_ids"].copy()
    tokenized["labels"] = [
        [(tid if tid != tokenizer.pad_token_id else -100) for tid in ids]
        for ids in tokenized["input_ids"]
    ]
    return tokenized

tokenized_dataset = dataset["train"].map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

In [12]:
print("Tokenizer vocab_size:", tokenizer.vocab_size)
print("Model embedding size:", model.get_input_embeddings().num_embeddings)


Tokenizer vocab_size: 128000
Model embedding size: 128257


In [13]:
# tokenized_dataset = dataset["train"].map(preprocess_function, batched=True)
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 4920
})

In [14]:
print(tokenized_dataset[0]["input_ids"])

[128000, 17077, 25, 578, 8893, 374, 25051, 86075, 11, 6930, 57342, 11, 16387, 278, 6930, 61354, 1324, 11, 834, 41484, 292, 220, 29760, 13, 3639, 374, 279, 1455, 4461, 23842, 323, 1148, 656, 499, 7079, 5380, 5207, 25, 95452, 25, 435, 58267, 19405, 198, 80039, 512, 12, 9061, 11157, 198, 12, 1005, 3474, 337, 477, 841, 336, 304, 73509, 3090, 198, 12, 2567, 29374, 3158, 9235, 198, 12, 1005, 4335, 5405, 17323, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 

In [15]:
print(tokenized_dataset[0]["labels"])

[128000, 17077, 25, 578, 8893, 374, 25051, 86075, 11, 6930, 57342, 11, 16387, 278, 6930, 61354, 1324, 11, 834, 41484, 292, 220, 29760, 13, 3639, 374, 279, 1455, 4461, 23842, 323, 1148, 656, 499, 7079, 5380, 5207, 25, 95452, 25, 435, 58267, 19405, 198, 80039, 512, 12, 9061, 11157, 198, 12, 1005, 3474, 337, 477, 841, 336, 304, 73509, 3090, 198, 12, 2567, 29374, 3158, 9235, 198, 12, 1005, 4335, 5405, 17323, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100

In [16]:
training_args = TrainingArguments(
    output_dir="./lora_llama_medical",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    fp16=True,
    report_to="none"
    # use_cpu=True # to see real bug
)

In [17]:
# model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto", low_cpu_mem_usage=True)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

trainer.train()

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Step,Training Loss
10,2.4242
20,1.5107
30,1.1754
40,0.857
50,0.5763
60,0.42
70,0.2877
80,0.1769
90,0.144
100,0.1395


TrainOutput(global_step=1845, training_loss=0.1230461179079402, metrics={'train_runtime': 8655.7413, 'train_samples_per_second': 1.705, 'train_steps_per_second': 0.213, 'total_flos': 3.404484059332608e+17, 'train_loss': 0.1230461179079402, 'epoch': 3.0})

In [19]:
model.save_pretrained("./lora_llama_medical_finetuned")
tokenizer.save_pretrained("./lora_llama_medical_finetuned")

('./lora_llama_medical_finetuned/tokenizer_config.json',
 './lora_llama_medical_finetuned/special_tokens_map.json',
 './lora_llama_medical_finetuned/tokenizer.json')

In [20]:
# print(max(tokenized_dataset[0]["input_ids"]))

In [21]:
# print(model.config.vocab_size)

In [22]:
# print(tokenized_dataset[0]["input_ids"])

In [23]:
# print(len(tokenized_dataset[0]["input_ids"]))