In [1]:
from unsloth import FastLanguageModel
import torch, gc, os
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [2]:
max_seq_length = 2048
dtype = None # None for auto detection
load_in_4bit = True # Use 4bit quantization to reduce memory usage.


In [3]:
gc.collect()
torch.cuda.empty_cache()

In [4]:
# We will use TinyLlama for this demo, but you can choose any other model
MODEL = "meta-llama/Meta-Llama-3-8B"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.41 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Use Lora adapters to update only 1 - 10 % of params
model = FastLanguageModel.get_peft_model(
    model,
    r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [6]:
# Get the data to train
dataset = load_dataset("wikimedia/wikipedia", "20231101.hi", split = "train",)
# Use only 10% of data
dataset = dataset.train_test_split(train_size = 0.1)["train"]

In [7]:
dataset

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 16309
})

# Continued Pre-training

This is continued pre-training to instill more knowledge into already trained base model. We just need a dataset with text field containing data we need to train on.

In [8]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing=False,

    args = TrainingArguments(
        per_device_train_batch_size = 1,
        # gradient_accumulation_steps = 8,

        # Use warmup_ratio and num_train_epochs for longer runs!
        max_steps = 240,
        warmup_steps = 20,
        # warmup_ratio = 0.1,
        # num_train_epochs = 1,

        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        learning_rate = 5e-5,
        # embedding_learning_rate = 1e-5,

        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 20,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        # lr_scheduler_type = "cosine",
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

max_steps is given, it will override any value given in num_train_epochs


In [9]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4090. Max memory = 23.41 GB.
6.699 GB of memory reserved.


In [10]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 16,309 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 1
\        /    Total batch size = 1 | Total steps = 240
 "-____-"     Number of trainable parameters = 335,544,320


Step,Training Loss
20,1.66
40,1.2365
60,1.2722
80,1.3738
100,1.4262
120,1.3188
140,1.4255
160,1.1966
180,1.1423
200,1.1584


In [11]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

84.8644 seconds used for training.
1.41 minutes used for training.
Peak reserved memory = 9.645 GB.
Peak reserved memory for training = 2.946 GB.
Peak reserved memory % of max memory = 41.2 %.
Peak reserved memory for training % of max memory = 12.584 %.


In [12]:
model.save_pretrained_merged("lora_unsloth_unopt_full", tokenizer, save_method="merged_16bit")
tokenizer.save_pretrained("lora_unsloth_unopt_full")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 81.49 out of 125.5 RAM for saving.


 75%|████████████████████████████████▎          | 24/32 [00:00<00:00, 56.83it/s]We will save to Disk and not RAM now.
100%|███████████████████████████████████████████| 32/32 [00:02<00:00, 11.49it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


('lora_unsloth_unopt_full/tokenizer_config.json',
 'lora_unsloth_unopt_full/special_tokens_map.json',
 'lora_unsloth_unopt_full/tokenizer.json')

# Instruction Finetuning

We now use alpaca dataset in hindi for instruction fine-tuning.

In [None]:
gc.collect()
torch.cuda.empty_cache()

In [4]:
from datasets import load_dataset
alpaca_dataset = load_dataset("FreedomIntelligence/alpaca-gpt4-hindi", split = "train")

Repo card metadata block was not found. Setting CardData to empty.


In [5]:
print(alpaca_dataset[0])

{'conversations': [{'from': 'human', 'value': 'कुछ एक रीसाइक्लिंग अभियान के लिए एक नारा सुझाव दें।\n'}, {'from': 'gpt', 'value': '1. "ग्रीन भविष्य के लिए एक साथ: कम करें, पुन: उपयोग करें, रीसाइकल करें।"\n2. "एक बेहतर कल के लिए आज ही रीसाइकल करें।"\n3. "अपने कचरे को खजाना बनाएं - रीसाइकल करें!"\n4. "जीवन के चक्र के लिए रीसाइकल करें।"\n5. "संसाधन बचाएं, अधिक रीसाइकल करें।"'}], 'id': '23712'}


In [6]:
alpaca_hindi_prompt="""नीचे एक निर्देश है जो किसी कार्य का वर्णन करता है, जिसे एक इनपुट के साथ जोड़ा गया है जो आगे का संदर्भ प्रदान करता है। ऐसा उत्तर लिखें जो अनुरोध को उचित रूप से पूरा करता हो।

### निर्देश:
{}

### इनपुट:
{}

### प्रतिक्रिया:
{}"""

In [8]:
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(conversations):
    texts = []
    conversations = conversations["conversations"]
    for convo in conversations:
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        # Careful Aya Dataset does not have an input!
        text = alpaca_hindi_prompt.format(convo[0]["value"], "", convo[1]["value"]) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

alpaca_dataset = alpaca_dataset.map(formatting_prompts_func, batched = True,)

In [7]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "lora_unsloth_unopt_full", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.41 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# Use Lora adapters to update only 1 - 10 % of params
model = FastLanguageModel.get_peft_model(
    model,
    r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [10]:
model.print_trainable_parameters()

trainable params: 335,544,320 || all params: 8,365,805,568 || trainable%: 4.0109


In [11]:
trainer_instrc = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = alpaca_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,

    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,

        # Use num_train_epochs and warmup_ratio for longer runs!
        max_steps = 240,
        warmup_steps = 20,
        # warmup_ratio = 0.1,
        # num_train_epochs = 1,

        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        learning_rate = 5e-5,
        # embedding_learning_rate = 1e-5,

        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 20,
        optim = "adamw_8bit",
        weight_decay = 0.00,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs_instrc",
    ),
)

max_steps is given, it will override any value given in num_train_epochs


In [12]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4090. Max memory = 23.41 GB.
6.719 GB of memory reserved.


In [13]:
trainer_instrc_stats = trainer_instrc.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 49,969 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 240
 "-____-"     Number of trainable parameters = 335,544,320


Step,Training Loss
20,1.2692
40,0.9388
60,0.8217
80,0.8756
100,0.8327
120,0.8041
140,0.8559
160,0.7816
180,0.8307
200,0.7761


In [15]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_instrc_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_instrc_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

596.7715 seconds used for training.
9.95 minutes used for training.
Peak reserved memory = 11.332 GB.
Peak reserved memory for training = 4.613 GB.
Peak reserved memory % of max memory = 48.407 %.
Peak reserved memory for training % of max memory = 19.705 %.


In [16]:
# Save final model
model.save_pretrained("lora_unsloth_instrc_unopt")
tokenizer.save_pretrained("lora_unsloth_instrc_unopt")

('lora_unsloth_instrc_unopt/tokenizer_config.json',
 'lora_unsloth_instrc_unopt/special_tokens_map.json',
 'lora_unsloth_instrc_unopt/tokenizer.json')

In [17]:
# model.push_to_hub_merged("saucam/lora-llama-3-8B-unsloth-unopt", tokenizer, save_method = "merged_16bit", token = "")

Unsloth: You are pushing to hub, but you passed your HF username = saucam.
We shall truncate saucam/lora-llama-3-8B-unsloth-unopt to lora-llama-3-8B-unsloth-unopt


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 76.58 out of 125.5 RAM for saving.


 62%|██████████████████████████▉                | 20/32 [00:00<00:00, 58.32it/s]We will save to Disk and not RAM now.
100%|███████████████████████████████████████████| 32/32 [00:02<00:00, 11.52it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...


model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

Done.
Saved merged model to https://huggingface.co/saucam/lora-llama-3-8B-unsloth-unopt


# Inference

In [18]:
gc.collect()
torch.cuda.empty_cache()

In [19]:
# from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "lora_unsloth_instrc_unopt",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!


==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.41 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [21]:
inputs = tokenizer(
[
    alpaca_hindi_prompt.format(
        # "Describe the planet Earth extensively.", # instruction
        "पृथ्वी ग्रह का विस्तृत वर्णन करें",
        "", # input
        "", # output - leave this blank for generation!
    ),
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 256, use_cache = True, repetition_penalty = 0.9, temperature = 0.7)
tokenizer.batch_decode(outputs)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['<|begin_of_text|>नीचे एक निर्देश है जो किसी कार्य का वर्णन करता है, जिसे एक इनपुट के साथ जोड़ा गया है जो आगे का संदर्भ प्रदान करता है। ऐसा उत्तर लिखें जो अनुरोध को उचित रूप से पूरा करता हो।\n\n### निर्देश:\nपृथ्वी ग्रह का विस्तृत वर्णन करें\n\n### इनपुट:\n\n\n### प्रतिक्रिया:\nपृथ्वी ग्रह एक ग्रह है जो सूर्य की कक्षा में सूर्य से कुछ 150 मिलियन किलोमीटर की दूरी पर एक विशेष स्थान पर स्थित है। पृथ्वी ग्रह का नाम प्राचीन ग्रीक देवी गा के नाम पर रखा गया है, जो किसी प्राथमिक देवता के रूप में मानी जाती है। पृथ्वी ग्रह का व्यास 12,742 किलोमीटर है और यह सूर्य के पास कक्षा के प्रत्येक स्थान पर किसी ग्रह के लिए विशेष रूप से स्थान है। पृथ्वी ग्रह की विशेषता है कि यह एक पर्यावरण में जीवन के लिए अनुकूल है, जो सूर्य के पास स्थित ग्रह के लिए एक विशेष स्थान है। पृथ्व']