In [14]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

# Install necessary libraries such as Unsloth, Transformers, and Datasets for fine-tuning

Unsloth is the key library that provides faster fine-tuning.

Transformers and Datasets from Hugging Face help us work with LLMs and datasets.

Torch and bitsandbytes are for efficient model training and quantization.


In [5]:
!pip install bitsandbytes accelerate transformers datasets torch





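FastLanguageModel from Unsloth loads both the model and its tokenizer and handles faster fine-tuning; it is what the next cell uses.
For reference, in plain Transformers AutoModelForCausalLM is what loads causal language models like Llama (the kind of model we will fine-tune),
and AutoTokenizer is what tokenizes input text.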

In [16]:
from unsloth import FastLanguageModel
import torch
# Loader settings. They are module-level because later cells (trainer setup,
# optional reload from disk) read the same names.
max_seq_length = 2048  # Sequence-length limit handed to the loader
dtype = None           # Forwarded as-is to from_pretrained; presumably None means auto-detect (TODO confirm)
load_in_4bit = True    # Load 4-bit quantized weights to reduce memory usage

# Catalogue of pre-quantized 4-bit checkpoints supported by Unsloth.
# Informational only: the loading call below does not read this list.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",           # Mistral v3
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",                # Llama-3 (15 trillion training tokens)
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",             # Phi-3
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",                  # Gemma
]

# Load the 4-bit Llama 3 (8B) base checkpoint together with its tokenizer.
# Going through FastLanguageModel (rather than plain Transformers) applies
# Unsloth's patches for faster fine-tuning.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

In [17]:
# Layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
    "gate_proj", "up_proj", "down_proj",      # MLP projections
]

# Wrap the base model with LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,                                  # LoRA rank: any number > 0, suggested 8, 16, 32, 64, 128
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,                        # Any value is supported, but 0 is the optimized setting
    bias="none",                           # Any value is supported, but "none" is the optimized setting
    use_gradient_checkpointing="unsloth",  # True or "unsloth"; "unsloth" uses less VRAM for very long context
    random_state=3407,
    use_rslora=False,                      # Rank-stabilized LoRA is supported but off here
    loftq_config=None,                     # LoftQ is supported but off here
)

Unsloth 2024.9.post4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.



# Data Preparation
We will be using the Alpaca dataset provided by yahma, which is a refined version of 52K examples from the original Alpaca dataset. Feel free to swap out this dataset with your own for customization.

Important: If you wish to train solely on the completions, excluding the user's input, refer to the "Train on completions only" section of the TRL SFTTrainer documentation for more information.

Reminder: Ensure you add the EOS_TOKEN to the tokenized outputs, as failure to do so may result in infinite text generation.



In [19]:
# Alpaca-style prompt template. It has three positional `{}` slots that are
# filled, in order, with the instruction, the input context and the response.
# For inference the response slot is filled with an empty string so the model
# generates it.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token taken from the tokenizer. It must be appended to every
# training example; otherwise generation may never terminate.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Render a batch of Alpaca records into full training strings.

    Args:
        examples: Mapping with parallel sequences under the keys
            "instruction", "input" and "output" (one entry per record).

    Returns:
        A dict with the single key "text" holding one string per record:
        the filled-in `alpaca_prompt` followed by `EOS_TOKEN`.
    """
    records = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            # EOS is appended so each example carries an explicit end marker.
            alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
            for instruction, context, response in records
        ],
    }

from datasets import load_dataset
# Fetch the train split of the cleaned Alpaca dataset and run every batch
# through formatting_prompts_func, which produces the "text" field.
dataset = load_dataset("yahma/alpaca-cleaned", split="train").map(
    formatting_prompts_func,
    batched=True,
)

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

# Train the model

In [38]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning setup using SFTTrainer from TRL (Transformer
# Reinforcement Learning). The hyperparameters are gathered in a
# TrainingArguments object, which is then passed to the trainer together with
# the model, tokenizer and dataset.

# Precision is chosen once: bf16 when the hardware supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=2,   # Examples per device per forward/backward pass
    gradient_accumulation_steps=4,   # Accumulate gradients over 4 passes before each optimizer step
    warmup_steps=5,                  # Learning-rate warmup steps
    max_steps=60,                    # Stop after 60 optimizer steps
    learning_rate=2e-4,              # Initial learning rate
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,                 # Log the loss at every step
    optim="adamw_8bit",              # AdamW with 8-bit optimizer state to save memory
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,                       # Fixed seed for reproducibility
    output_dir="outputs",            # Where training outputs are written
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",       # Dataset field that holds the formatted prompts
    max_seq_length=max_seq_length,
    dataset_num_proc=2,              # Worker processes for dataset preprocessing
    packing=False,                   # Packing short sequences can speed up training; disabled here
    args=training_args,
)

Map (num_proc=2):   0%|          | 0/51760 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [39]:
# Capture the GPU memory baseline before training starts. The statistics cell
# that runs after training reads `start_gpu_memory` and `max_memory`, and
# nothing else in the notebook assigns them.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)  # GB reserved so far
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)                  # Total GPU memory in GB

# Run the fine-tuning loop; the returned object carries the run metrics
# (e.g. `train_runtime`) used by the statistics cell.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.6133
2,2.6523
3,2.5273
4,2.7734
5,2.5
6,2.7793
7,2.5312
8,2.7617
9,2.5059
10,2.8203


This block of code calculates and displays the memory usage and training time statistics after the fine-tuning process.
It provides insights into how much GPU memory was used overall, how much was specifically used for LoRA fine-tuning,
and how long the training took in both seconds and minutes.

# Show final memory and time stats

In [40]:

# Post-training report: peak reserved GPU memory and training time.
# NOTE: `start_gpu_memory` and `max_memory` are read here but not assigned in
# this cell; they must already exist in the kernel (captured before training).
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)  # Peak reserved, in GB
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)                # Growth over the pre-training baseline
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_seconds = trainer_stats.metrics['train_runtime']

report_lines = (
    # Time statistics
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    # Memory statistics
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
)
for report_line in report_lines:
    print(report_line)

300.1179 seconds used for training.
5.0 minutes used for training.
Peak reserved memory = 12.172 GB.
Peak reserved memory for training = 0.889 GB.
Peak reserved memory % of max memory = 82.533 %.
Peak reserved memory for training % of max memory = 6.028 %.


# Inference

In [41]:
# Switch the model to Unsloth's faster native inference mode.
FastLanguageModel.for_inference(model)

# Build an Alpaca-format prompt asking the model to continue the sequence.
fibonacci_prompt = alpaca_prompt.format(
    "Continue the fibonnaci sequence.",  # instruction
    "1, 1, 2, 3, 5, 8",                  # input
    "",                                  # output - left blank so the model generates it
)

# Tokenize into PyTorch tensors and move them to the GPU.
inputs = tokenizer([fibonacci_prompt], return_tensors="pt").to("cuda")

# Generate at most 64 new tokens; use_cache=True reuses attention key/value
# pairs between steps for faster generation.
outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
tokenizer.batch_decode(outputs)

['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nContinue the fibonnaci sequence.\n\n### Input:\n1, 1, 2, 3, 5, 8\n\n### Response:\n13\n<|end_of_text|>']

In [48]:
# Streaming inference example; `alpaca_prompt` is the template defined earlier.
FastLanguageModel.for_inference(model)

# Alpaca-style prompt with an instruction to continue the prime number sequence
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "Continue the prime sequence till 100",  # Instruction to the model
            "3,5,7",  # Input prime numbers
            "",  # Output is left blank for generation
        )
    ], return_tensors="pt"  # Convert the input to PyTorch tensors
).to("cuda")  # Move the inputs to GPU for faster inference

from transformers import TextStreamer

# TextStreamer prints the decoded tokens as they are produced instead of
# waiting for generation to finish.
text_streamer = TextStreamer(tokenizer)

# Generate up to 128 new tokens, streaming them through `text_streamer`.
# The return value is discarded because the text is already printed.
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Continue the prime sequence till 100

### Input:
3,5,7

### Response:
11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97
<|end_of_text|>


In [43]:
# Save the fine-tuned model and the tokenizer side by side in one local
# directory so they can be reloaded together later.
lora_output_dir = "lora_model"
model.save_pretrained(lora_output_dir)
tokenizer.save_pretrained(lora_output_dir)

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

# Saving, loading finetuned models

In [44]:
# Reloading the fine-tuned LoRA model from disk for inference.
# The block below is disabled (`if False`) because the fine-tuned model is
# already in memory; enable it when starting from the saved "lora_model"
# directory instead.
if False:
    from unsloth import FastLanguageModel

    # Same loader settings as the original load, pointed at the saved adapters.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="lora_model",         # Directory written by the local save step
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

    # Switch to Unsloth's faster native inference mode.
    FastLanguageModel.for_inference(model)

# Ask the fine-tuned model a general-knowledge question about a landmark in
# Paris, using the same Alpaca prompt template as during training.
tower_prompt = alpaca_prompt.format(
    "What is a famous tall tower in Paris?",  # instruction
    "",  # input: this question needs no extra context
    "",  # output: left blank for the model to generate
)

# Tokenize into PyTorch tensors and move them to the GPU.
inputs = tokenizer([tower_prompt], return_tensors="pt").to("cuda")

# Generate at most 64 new tokens, with the cache enabled to speed up generation.
outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

# Decode the generated token ids back into text; as the last expression this
# is what the cell displays.
tokenizer.batch_decode(outputs)


['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat is a famous tall tower in Paris?\n\n### Input:\n\n\n### Response:\nThe Eiffel Tower is a famous tall tower in Paris.<|end_of_text|>']

In [45]:
# Reference only: loading the saved LoRA model with PEFT / Transformers
# directly instead of through Unsloth. Disabled on purpose.
if False:
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer

    # Model: read from the "lora_model" directory, loaded with the notebook's
    # 4-bit setting to save memory.
    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model",
        load_in_4bit=load_in_4bit,
    )

    # Tokenizer: read from the same directory.
    tokenizer = AutoTokenizer.from_pretrained("lora_model")


# Saving to float16 for VLLM

Explanation:
 This section shows how to save and upload the fine-tuned model in different formats: 16-bit, 4-bit, and just LoRA adapters.

 Depending on the precision and size requirements, you can either save the merged full model or just the LoRA adapters.

None of these blocks are set to run (if False) and can be enabled depending on your needs.


In [46]:
# Export options. Every block is disabled; change `if False:` to `if True:` on
# the ones you need. Local saves write to the "model" directory; hub pushes
# upload to the "hf/model" repository and need a Hugging Face token in `token`
# (left empty here; avoid committing a real token to the notebook).

# --- Merged model, 16-bit precision -----------------------------------------
# Keeps higher precision than 4-bit while being smaller than a 32-bit model.
if False:
    model.save_pretrained_merged(
        "model", tokenizer, save_method="merged_16bit"
    )

if False:
    model.push_to_hub_merged(
        "hf/model", tokenizer, save_method="merged_16bit", token=""
    )

# --- Merged model, 4-bit precision ------------------------------------------
# Much more memory-efficient; intended for machines with limited GPU resources.
if False:
    model.save_pretrained_merged(
        "model", tokenizer, save_method="merged_4bit"
    )

if False:
    model.push_to_hub_merged(
        "hf/model", tokenizer, save_method="merged_4bit", token=""
    )

# --- LoRA adapters only -----------------------------------------------------
# Stores just the fine-tuned adapters; the base model stays separate and can
# be combined with them later for inference or further fine-tuning.
if False:
    model.save_pretrained_merged(
        "model", tokenizer, save_method="lora"
    )

if False:
    model.push_to_hub_merged(
        "hf/model", tokenizer, save_method="lora", token=""
    )


# GGUF / llama.cpp Conversion

In [47]:

# GGUF exports for llama.cpp. All disabled; enable the variant you need.
# Each variant has a local save (to "model") and a hub push (to "hf/model",
# which needs a Hugging Face token in `token`).

# 8-bit Q8_0: no quantization_method is passed for this variant.
if False:
    model.save_pretrained_gguf("model", tokenizer,)
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, token="")

# 16-bit GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method="f16")
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method="f16", token="")

# q4_k_m GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method="q4_k_m")
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method="q4_k_m", token="")