In [None]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install xformers

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "meta-llama/Llama-3.1-8B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 8.9. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

meta-llama/Llama-3.1-8B-Instruct does not have a padding token! Will use pad_token = <|finetune_right_pad_id|>.


In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.11.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


<a name="Data"></a>
### Data Prep
We now use the Alpaca dataset from [yahma](https://huggingface.co/datasets/yahma/alpaca-cleaned), which is a filtered version of 52K of the original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html). You can replace this code section with your own data prep.

**[NOTE]** To train only on completions (ignoring the user's input) read TRL's docs [here](https://huggingface.co/docs/trl/sft_trainer#train-on-completions-only).

**[NOTE]** Remember to add the **EOS_TOKEN** to the tokenized output!! Otherwise you'll get infinite generations!

If you want to use the `llama-3` template for ShareGPT datasets, try our conversational [notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing).

For text completions like novel writing, try this [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing).

def generate_prompt(data_point): return f""" Bestem om følgende tekst handler om identitetspolitik eller ikke identitetspolitik. text: {data_point["Text"]} label: {data_point["Output"]}{EOS_TOKEN}""".strip() def generate_test_prompt(data_point): return f""" Bestem om følgende tekst handler om identitetspolitik eller ikke identitetspolitik. text: {data_point["Text"]} label: {EOS_TOKEN}""".strip() # Add EOS token after the label field

In [None]:
from google.colab import drive
import pandas as pd

# Mount Google Drive
drive.mount('/content/drive')

# Load the data from the Excel file
df = pd.read_excel('/content/drive/MyDrive/Speciale/small_dataset512.xlsx', header=0)
print(df.columns)

# Select relevant columns and fill any missing values in the Output column with 0
df = df[['text', 100]]
df = df.fillna(0)

# Rename columns for consistency
df.rename(columns={"text": "Text", 100: "Output"}, inplace=True)

# Ensure the Output column is in binary format (1 for "identitetspolitik" and 0 for "ikke identitetspolitik")
# The column is already in binary format, so no need to modify further.

# Shuffle the DataFrame and limit to 265 rows for training/testing
df = df.sample(frac=1, random_state=85).reset_index(drop=True).head(512)

# Define split sizes for train, evaluation, and test sets
train_size = 0.8
eval_size = 0.1

# Calculate the split indices
train_end = int(train_size * len(df))
eval_end = train_end + int(eval_size * len(df))

# Split the data
X_train = df[:train_end]
X_eval = df[train_end:eval_end]
X_test = df[eval_end:]

# Extract true labels for evaluation and test sets
y_true = X_test['Output']

# Set the EOS token
EOS_TOKEN = tokenizer.eos_token

# Define prompt generation functions with explicit binary label meanings
def generate_prompt(data_point):
    return f"""
    Du er en klassifikationsmodel, der skal afgøre, om følgende tekst indeholder identitetspolitisk indhold eller ej.

    Klassifikation:
    - Klassificér teksten som identitetspolitik, hvis den relaterer til rettigheder, repræsentation eller ligestilling af historisk marginaliserede grupper (f.eks., kvinder, LGBTQ+, kønsminoriteter, etniske og religiøse minoriteter).
    - Klassificér teksten som ikke identitetspolitik, hvis den ikke indeholder sådanne elementer.

    Læs teksten grundigt og afgør klassifikationen.

    Text: {data_point["Text"]}
    Svar: {data_point["Output"]}{EOS_TOKEN}
    """.strip()

def generate_test_prompt(data_point):
    return f"""
    Du er en klassifikationsmodel, der skal afgøre, om følgende tekst indeholder identitetspolitisk indhold eller ej.

    Klassifikation:
    - Klassificér teksten som identitetspolitik, hvis den relaterer til rettigheder, repræsentation eller ligestilling af historisk marginaliserede grupper (f.eks., kvinder, LGBTQ+, kønsminoriteter, etniske og religiøse minoriteter).
    - Klassificér teksten som ikke identitetspolitik, hvis den ikke indeholder sådanne elementer.

    Læs teksten grundigt og afgør klassifikationen.

    Text: {data_point["Text"]}
    Svar: {EOS_TOKEN}
    """.strip()

# Generate prompts for training and evaluation data with binary labels
X_train['text'] = X_train.apply(generate_prompt, axis=1)
X_eval['text'] = X_eval.apply(generate_prompt, axis=1)

# Generate test prompts without adding the expected output (for pure evaluation)
X_test_prompts = X_test.apply(generate_test_prompt, axis=1)

# Convert X_test_prompts to a DataFrame for inference
X_test = pd.DataFrame(X_test_prompts, columns=["text"])

# Display the first few rows to verify that prompts are generated correctly
X_train.head()


Mounted at /content/drive
Index(['text', 100], dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['text'] = X_train.apply(generate_prompt, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_eval['text'] = X_eval.apply(generate_prompt, axis=1)


Unnamed: 0,Text,Output,text
0,Det korte svar er ja. Vi er ikke i gang med at...,0.0,"Du er en klassifikationsmodel, der skal afgøre..."
1,Jeg har som formentlig alle her i salen et sto...,1.0,"Du er en klassifikationsmodel, der skal afgøre..."
2,Jeg takker meget for lige at få det sidste spø...,0.0,"Du er en klassifikationsmodel, der skal afgøre..."
3,"Men en af de ting, der er så fantastisk ved ju...",1.0,"Du er en klassifikationsmodel, der skal afgøre..."
4,"Det er jo sådan, at der i grundloven står besk...",1.0,"Du er en klassifikationsmodel, der skal afgøre..."


In [None]:
from datasets import Dataset
from tqdm import tqdm

# Convert to datasets
train_data = Dataset.from_pandas(X_train[["text"]])
eval_data = Dataset.from_pandas(X_eval[["text"]])

In [None]:
train_data

In [None]:
## This is meant for inspiration on how to design

# alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

# ### Instruction:
# {}

# ### Input:
# {}

# ### Response:
# {}"""

# EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
# def formatting_prompts_func(examples):
#     instructions = examples["instruction"]
#     inputs       = examples["input"]
#     outputs      = examples["output"]
#     texts = []
#     for instruction, input, output in zip(instructions, inputs, outputs):
#         # Must add EOS_TOKEN, otherwise your generation will go on forever!
#         text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
#         texts.append(text)
#     return { "text" : texts, }
# pass

# from datasets import load_dataset
# dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
# dataset = dataset.map(formatting_prompts_func, batched = True,)

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
import random
import numpy as np

# Set a fixed seed
seed = 3407

# Set seeds for various libraries to ensure reproducibility
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Ensure deterministic operations in PyTorch (may slow down the training a bit)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

dataset_text_field = "Text"

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_data,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 1,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 9, # Set this for 1 full training run. #3 epoch virker.
        max_steps = -1,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = seed,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# pip install --upgrade --no-cache-dir --no-deps unsloth transformers git+https://github.com/huggingface/trl.git

In [None]:
# from numba import cuda
# device = cuda.get_current_device()
# device.reset()

In [None]:
## IMPORTANT train loss should converge -> 0 (get smaller)
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 408 | Num Epochs = 9
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 459
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.1742
2,2.1488
3,2.1697
4,2.0632
5,1.8635
6,1.68
7,1.5586
8,1.3258
9,1.2084
10,1.1532


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

**[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct [here](https://colab.research.google.com/drive/1T-YBVfnphoVc8E2E854qF3jdia2Ll2W2?usp=sharing)**

In [None]:
classification_prompt = """
Du er en klassifikationsmodel, der skal afgøre, om følgende tekst indeholder identitetspolitik eller ej.
Tekst: {text}
Svar:
"""

#Du er en klassifikationsmodel, der skal afgøre, om følgende tekst indeholder identitetspolitisk indhold eller ej.
# Bestem om følgende tekst handler om identitetspolitik eller ikke identitetspolitik.

classification_prompt1 = """
Tekst: {text}
Svar:
"""
classification_prompt2 = """
Du er en klassifikationsmodel, der skal afgøre, om følgende tekst indeholder identitetspolitisk indhold eller ej.

Klassifikation:
- Klassificér teksten som identitetspolitik, hvis den relaterer til rettigheder, repræsentation eller ligestilling af historisk marginaliserede grupper (f.eks., kvinder, LGBTQ+, kønsminoriteter, etniske og religiøse minoriteter).
- Klassificér teksten som ikke identitetspolitik, hvis den ikke indeholder sådanne elementer.

Læs teksten grundigt og afgør klassifikationen.

Tekst: {text}
Svar:
"""


In [None]:
from transformers import TextStreamer
import re

FastLanguageModel.for_inference(model)

def classify_text(text, max_length=1000):
    # Truncate text to avoid cutting off the generated answer¨
    truncated_text = text[:max_length]

    # Format the prompt
    prompt = classification_prompt.format(text=truncated_text)

    # Tokenize the input prompt
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    # Set deterministic inference parameters
    temperature = 0.1
    top_k = 1
    top_p = 0.1

    # Generate the output with increased max_new_tokens
    output = model.generate(**inputs,
                            max_new_tokens=6,
                            temperature=temperature,
                            top_k=top_k,
                            top_p=top_p)

    # Decode the output to get the generated text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True).strip().lower()

    # Print generated text for debugging
    print(f"Generated text: {generated_text}")

    # Find the position where "svar:" appears in the generated text
    svar_index = generated_text.find("svar:")

    if svar_index != -1:  # If "svar:" is found
        # Extract the part of the text that comes after "svar:"
        answer = generated_text[svar_index + len("Svar:"):].strip()

        # Classify based on the presence of "1" in the answer
        if "1" in answer:
            return 1  # "identitetspolitik"
        else:
            return 0  # "ikke identitetspolitik"
    else:
        # If no "svar:" is found, return unknown
        return "unknown"


# Apply classification on your test dataset
X_test['Predicted_Label'] = X_test['text'].apply(classify_text)


Generated text: du er en klassifikationsmodel, der skal afgøre, om følgende tekst indeholder identitetspolitik eller ej.
tekst: du er en klassifikationsmodel, der skal afgøre, om følgende tekst indeholder identitetspolitisk indhold eller ej.

    klassifikation:
    - klassificér teksten som identitetspolitik, hvis den relaterer til rettigheder, repræsentation eller ligestilling af historisk marginaliserede grupper (f.eks., kvinder, lgbtq+, kønsminoriteter, etniske og religiøse minoriteter).
    - klassificér teksten som ikke identitetspolitik, hvis den ikke indeholder sådanne elementer.

    læs teksten grundigt og afgør klassifikationen.

    text: det danske sprog fylder rigtig, rigtig meget i det grønlandske samfund. det er, som om nu hvor vi skal til at tale mere engelsk, betyder det, at man skærer alt, hvad der er dansk, fra i skolen. det er det jo ikke. man har på ingen måde bestemt, at der ikke skal undervises i dansk i grønland.
    svar: 
svar:
    - 1.0
Generated text: du er

In [None]:
 X_test['Predicted_Label']

Unnamed: 0,Predicted_Label
459,1
460,0
461,0
462,1
463,0
464,0
465,1
466,1
467,0
468,0


In [None]:
 from sklearn.metrics import classification_report

# Print the classification report to evaluate performance
print(classification_report(y_true, X_test['Predicted_Label']))


              precision    recall  f1-score   support

         0.0       0.84      0.94      0.89        33
         1.0       0.87      0.68      0.76        19

    accuracy                           0.85        52
   macro avg       0.85      0.81      0.83        52
weighted avg       0.85      0.85      0.84        52



In [None]:
# Opret en ny dataframe med den sande værdi og forudsigelsen
X_test['True_Label'] = y_true

# Find de rækker hvor den forudsigte label er forkert
fejl = X_test[X_test['Predicted_Label'] != X_test['True_Label']]

# Vis tekst, True_Label og Predicted_Label for fejlene
pd.set_option('display.max_colwidth', None)  # Sikrer visning af hele teksten
print(fejl[['text', 'True_Label', 'Predicted_Label']])

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [None]:
# Antag, at 'True_Label' er den faktiske label, og 'Predicted_Label' er modelens forudsigelse
incorrect_predictions = X_test[X_test['True_Label'] != X_test['Predicted_Label']]
num_incorrect = incorrect_predictions.shape[0]

print(f"Antal forkerte forudsigelser: {num_incorrect}")


Antal forkerte forudsigelser: 8


In [None]:
import pandas as pd

# Filtrer de forkerte forudsigelser
incorrect_predictions = X_test[X_test['True_Label'] != X_test['Predicted_Label']]

# Vælg kun de nødvendige kolonner (True_Label, Predicted_Label og Text)
incorrect_predictions_df = incorrect_predictions[['text', 'True_Label', 'Predicted_Label']]

# Gem de forkerte forudsigelser i en Excel-fil
#Gem filen i Google Drive-mappen
incorrect_predictions_df.to_excel("/content/drive/MyDrive/Speciale/Forkerte_forudsigelser/Eksperiment_ny.xlsx", index=False)

print("Forkerte forudsigelser er gemt i filen '/content/drive/MyDrive/Speciale/Forkerte_forudsigelser/Eksperiment_ny.xlsx'")



Forkerte forudsigelser er gemt i filen '/content/drive/MyDrive/Speciale/Forkerte_forudsigelser/Eksperiment_ny.xlsx'


In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    classification_prompt.format(
        "Continue the fibonnaci sequence.", # instruction
        "1, 1, 2, 3, 5, 8", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

KeyError: 'text'

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
import os
model.save_pretrained("lora_model_tuned")
tokenizer.save_pretrained("lora_model_tuned")


('lora_model_tuned/tokenizer_config.json',
 'lora_model_tuned/special_tokens_map.json',
 'lora_model_tuned/tokenizer.json')

In [None]:
from google.colab import userdata

# Retrieve the Hugging Face token from the secrets manager
hf_token = userdata.get('HF_TOKEN')

# Push the model and tokenizer to the Hugging Face Hub
model.push_to_hub("KaroKruse/lora_model_tuned_1", use_auth_token=hf_token)
tokenizer.push_to_hub("KaroKruse/lora_model_tuned_1", use_auth_token=hf_token)

README.md:   0%|          | 0.00/586 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/KaroKruse/lora_model_tuned_1


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [None]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!

inputs = tokenizer(
[
    alpaca_prompt.format(
        "What is a famous tall tower in Paris?", # instruction
        "", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

You can also use Hugging Face's `AutoModelForPeftCausalLM`. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [None]:
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer
    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit = load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

[**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/drive/1WZDi7APtQ9VsvOrQSSC5DDtxq159j8iZ?usp=sharing)

In [None]:
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# Save to multiple GGUF options - much faster if you want multiple!
if False:
    model.push_to_hub_gguf(
        "hf/model", # Change hf to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = "",
    )

Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in `llama.cpp` or a UI based system like `GPT4All`. You can install GPT4All by going [here](https://gpt4all.io/index.html).

**[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct [here](https://colab.research.google.com/drive/1T-YBVfnphoVc8E2E854qF3jdia2Ll2W2?usp=sharing)**

And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/u54VK8m8tk) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!

Some other links:
1. Zephyr DPO 2x faster [free Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing)
2. Llama 7b 2x faster [free Colab](https://colab.research.google.com/drive/1lBzz5KeZJKXjvivbYvmGarix9Ao6Wxe5?usp=sharing)
3. TinyLlama 4x faster full Alpaca 52K in 1 hour [free Colab](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing)
4. CodeLlama 34b 2x faster [A100 on Colab](https://colab.research.google.com/drive/1y7A0AxE3y8gdj4AVkl2aZX47Xu3P1wJT?usp=sharing)
5. Mistral 7b [free Kaggle version](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook)
6. We also did a [blog](https://huggingface.co/blog/unsloth-trl) with 🤗 HuggingFace, and we're in the TRL [docs](https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth)!
7. `ChatML` for ShareGPT datasets, [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing)
8. Text completions like novel writing [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing)
9. [**NEW**] We make Phi-3 Medium / Mini **2x faster**! See our [Phi-3 Medium notebook](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing)
10. [**NEW**] We make Gemma-2 9b / 27b **2x faster**! See our [Gemma-2 9b notebook](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing)
11. [**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/drive/1WZDi7APtQ9VsvOrQSSC5DDtxq159j8iZ?usp=sharing)
12. [**NEW**] We make Mistral NeMo 12B 2x faster and fit in under 12GB of VRAM! [Mistral NeMo notebook](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing)

<div class="align-center">
  <a href="https://github.com/unslothai/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
  <a href="https://discord.gg/u54VK8m8tk"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord.png" width="145"></a>
  <a href="https://ko-fi.com/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Kofi button.png" width="145"></a></a> Support our work if you can! Thanks!
</div>