In [1]:
# !pip install mlflow
# !pip install unsloth
# !pip install --upgrade --no-deps "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# !pip install numpy torch
# !pip uninstall bitsandbytes -y
# !pip install bitsandbytes
# !pip install accelerate
# !pip install datasets
# !pip install tf-keras

# Load model and tokenizer

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from transformers import BitsAndBytesConfig
import mlflow
import mlflow.pytorch
import os
# Model and tokenizer setup
from unsloth import FastLanguageModel
import torch

# Disable wandb
os.environ["WANDB_DISABLED"] = "true"
# Initialize MLflow
mlflow.set_experiment("LLM Fine-Tuning")  # Replace with your desired experiment name

max_seq_length = 512 # 1024 # 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",        # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",             # Gemma 2.2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name = "unsloth/Phi-3-mini-4k-instruct",
    model_name = "unsloth/Llama-3.2-1B-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)


  from .autonotebook import tqdm as notebook_tqdm
2025-01-26 14:49:17.387375: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-26 14:49:17.586163: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737899357.650425    4879 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737899357.670444    4879 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-26 14:49:17.833734: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.1.6: Fast Llama patching. Transformers: 4.48.1.
   \\   /|    GPU: NVIDIA GeForce RTX 3050 Ti Laptop GPU. Max memory: 3.706 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.1.6 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


# Load and prepare dataset

In [2]:
import pandas as pd
from datasets import load_dataset
from unsloth import apply_chat_template
from unsloth import standardize_sharegpt
from unsloth import to_sharegpt

chat_template = """Below describes some details about some city that can be visited.
Write a travel guide out of the information that you received in instructions.

### Instruction:
{INPUT}

### Response:
{OUTPUT}"""
merged_prompt = "[Generate travel guide about the following place and details provided: {input}]"
default_system_message = "You are a helpful assistant that generates tourist guides based on input"


# Load data
train_data = load_dataset(
    "csv",
    data_files="top_3000_longest_inputs.csv",
    split="train"
)
eval_data = load_dataset(
    "csv",
    data_files="cro_output_with_ner.csv",
)
eval_data = eval_data['train']

print(train_data[0])
print(eval_data)

# Prepare train dataset
train_dataset = to_sharegpt(
    train_data,
    merged_prompt = merged_prompt,
    conversation_extension = 1,
    output_column_name = "output",
)
train_dataset = standardize_sharegpt(train_dataset)

print(train_dataset)
print(train_dataset[0])


train_dataset = apply_chat_template(
    train_dataset,
    tokenizer = tokenizer,
    chat_template = chat_template,
    default_system_message = default_system_message
)

# Prepare test dataset
eval_dataset = to_sharegpt(
    eval_data,
    merged_prompt = merged_prompt,
    conversation_extension = 1,
    output_column_name = "output",
)
eval_dataset = standardize_sharegpt(eval_dataset)

print(eval_dataset)
print(eval_dataset[0])

split_datasets = eval_dataset.train_test_split(test_size=0.5, seed=42)


eval_dataset = tokenizer.apply_chat_template(
    split_datasets['train']['conversations'],
    add_generation_prompt=True,
    return_tensors="pt",
    padding=True,
    truncation=True,  # Enable truncation
).to("cuda")

test_dataset = tokenizer.apply_chat_template(
    split_datasets['test']['conversations'],
    add_generation_prompt=True,
    return_tensors="pt",
    padding=True,
    truncation=True,  # Enable truncation
).to("cuda")



print(split_datasets)

Unsloth: We automatically added an EOS token to stop endless generations.


{'input': 'Nagykanizsa | other | Nagykanizsa Bus Station | Ajka, six per day, two hours, Alsópáhok, hourly, Aszófő, Badacsony, ten per day, one and half hours, Szigliget, six per day, one and half hours, Bak, Balatonakali, Balatonalmádi, Balatonboglár, three per day, one and half hours, Balatonederics, Balatonfüzfő, Balatongyörök, hourly, Balatonlelle, Balatonmáriafürdő, Balatonrendes, Balatonszentgyörgy, hourly, Balatonszepezd, Balatonudvari, Bánokszentgyörgy, hourly, Becehegy, hourly, hourly, Bikal, daily, Bocska, Bonyhád, Böhönye, hourly, Budapest, hourly, Celldömölk (daily, Cserszegtomaj, Csesztreg, Csörnyeföld, Csurgó, Devecser, Dióskál, Dobogómajor, Dunaföldvár, Egeraracsa, Egervár, Esztergályhorváti, Fakospuszta, Felsőbáránd, hourly, Felsőpáhok, hourly, Fenékpuszta, Fonyód, Galambok, hourly, Garabonc, Gellénháza, hourly, Gelse, Gyenesdiás, hourly, Győr, Hagyárosbörönd, Hahót, hourly, Hegyesd, Hévíz, every twenty minutes, Kapolcs, Kaposvár, hourly, Karmacs, hourly, Káld, Kálócfa,

Map: 100%|██████████| 3000/3000 [00:00<00:00, 32893.06 examples/s]


Dataset({
    features: ['conversations'],
    num_rows: 358
})
{'conversations': [{'content': "[The city and available attractions are ('Brač | see | Pustinja Blaca | 1551, 2007',).]", 'role': 'user'}, {'content': 'A former monastery originating from 1551, now a museum run by two brothers. In the 2007 the hermitage was included in the UNESCO World Heritage Tentative List.', 'role': 'assistant'}]}
DatasetDict({
    train: Dataset({
        features: ['conversations'],
        num_rows: 179
    })
    test: Dataset({
        features: ['conversations'],
        num_rows: 179
    })
})


In [3]:
print(test_dataset)
print(test_dataset[0])

tensor([[128000,  39314,  16964,  ..., 128004, 128004, 128004],
        [128000,  39314,  16964,  ..., 128004, 128004, 128004],
        [128000,  39314,  16964,  ..., 128004, 128004, 128004],
        ...,
        [128000,  39314,  16964,  ..., 128004, 128004, 128004],
        [128000,  39314,  16964,  ..., 128004, 128004, 128004],
        [128000,  39314,  16964,  ..., 128004, 128004, 128004]],
       device='cuda:0')
tensor([128000,  39314,  16964,   1063,   3649,    922,   1063,   3363,    430,
           649,    387,  12263,    627,   8144,    264,   5944,   8641,    704,
           315,    279,   2038,    430,    499,   4036,    304,  11470,    382,
         14711,  30151,    512,     58,    791,   3363,    323,   2561,  39591,
           527,   4417,     57,    351,  32575,    765,   8343,    765,   2009,
           426,  15931,    765,   8753,     11,    279,   3263,    306,   9419,
         10609,   1037,  14894,    518,    570,   2595,  14711,   6075,    512,
            32,   

# Prepare training arguments

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = split_datasets['train'],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 1,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        # fp16=True,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Map: 100%|██████████| 3000/3000 [00:00<00:00, 7248.71 examples/s]
Map: 100%|██████████| 179/179 [00:00<00:00, 9018.38 examples/s]


# Train

In [None]:
from accelerate import Accelerator
import mlflow.pytorch
import torch

# Unwrap the model to ensure it's pickleable
accelerator = Accelerator()

# Unwrap the model
model = accelerator.unwrap_model(model)

# Optional: Manually disable AMP by setting the model to float32 (default precision)
model = model.to(torch.float32)  # Convert the model to float32

# Ensure no AMP optimization remains
for param in model.parameters():
    if param.grad is not None:
        param.grad.detach_()

model.zero_grad()

# Log the final model and tokenizer to MLflow
with mlflow.start_run():
    mlflow.log_params({
        "model_name": "unsloth/mistral-7b-v0.3-bnb-4bit",
        "learning_rate": trainer.args.learning_rate,
        "batch_size": trainer.args.per_device_train_batch_size,
        "num_train_epochs": trainer.args.num_train_epochs,
        "weight_decay": trainer.args.weight_decay,
    })

    # Log metrics (you can customize this to log additional metrics)
    eval_metrics = trainer.evaluate()
    mlflow.log_metrics(eval_metrics)

    print("Training and logging completed!")


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


Training and logging completed!


In [None]:
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

In [4]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
print(split_datasets['test'][0])
messages = [                    # Change below!
    *split_datasets['test'][0]['conversations']
]
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
print('generate:')
_ = model.generate(input_ids, streamer = text_streamer, max_new_tokens = 128, pad_token_id = tokenizer.eos_token_id)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


{'conversations': [{'content': "[The city and available attractions are ('Zagreb | eat | Le Bistro | French, the Regent Esplanade Hotel',).]", 'role': 'user'}, {'content': 'A French restaurant within the Regent Esplanade Hotel..', 'role': 'assistant'}]}
generate:
A French restaurant within the Regent Esplanade Hotel..<|end_of_text|>


In [11]:
messages = [                    # Change below!
    split_datasets['test'][i]['conversations'][0] for i in range(len(split_datasets['test']))
]
messages

[{'content': "[The city and available attractions are ('Zagreb | eat | Le Bistro | French, the Regent Esplanade Hotel',).]",
  'role': 'user'},
 {'content': "[The city and available attractions are ('Zagreb | drink | Oliver Twist | Irish, summer',).]",
  'role': 'user'},
 {'content': "[The city and available attractions are ('Zagreb | sleep | Funk Lounge | ',).]",
  'role': 'user'},
 {'content': "[The city and available attractions are ('Karlovac | sleep | Hostel Na putu | Monday-Sunday, August 2014, Irish',).]",
  'role': 'user'},
 {'content': "[The city and available attractions are ('Zagreb | sleep | All 4 seasons Hostel | just 10 minutes, Zagreb, Josip Jelačić',).]",
  'role': 'user'},
 {'content': "[The city and available attractions are ('Zadar | sleep | Hotel Kolovare | ',).]",
  'role': 'user'},
 {'content': "[The city and available attractions are ('Kornati National Park | see | Krune | ',).]",
  'role': 'user'},
 {'content': "[The city and available attractions are ('Korčula 

In [12]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
print(split_datasets['test'][0])
messages = [                    # Change below!
    split_datasets['test'][i]['conversations'][0] for i in range(len(split_datasets['test']))
]
messages
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
print('generate:')
_ = model.generate(input_ids, streamer = text_streamer, max_new_tokens = 128, pad_token_id = tokenizer.eos_token_id)

{'conversations': [{'content': "[The city and available attractions are ('Zagreb | eat | Le Bistro | French, the Regent Esplanade Hotel',).]", 'role': 'user'}, {'content': 'A French restaurant within the Regent Esplanade Hotel..', 'role': 'assistant'}]}
generate:
[The city and available attractions are ('Zagreb | other | Croatia | ',).]

### Response:
[The city and available attractions are ('Zagreb | see | Strossmayer Square | Strossmayer, the 18th century, Croatian, Josip, Juraj Strossmayer',).]

### Response:
[The city and available attractions are ('Zagreb | sleep | Hotel Phoenix | ',).]

### Response:
[The city and available attractions are ('Zagreb | eat | SladoMazo | ',).]

### Response:
[The city and available attractions are ('Zag


In [None]:
model.eval()  # Set the model to evaluation mode
model.to("cuda")  # Important for using CUDA


def test_model(model, tokenizer, eval_dataset, max_length=512):
    results = []
    for example in eval_dataset:
        conversations = example["conversations"]

        # Find the user and assistant conversation elements
        user_input = None
        assistant_output = None
        for conversation in conversations:
            if conversation["role"] == "user":
                user_input = conversation["content"]
            elif conversation["role"] == "assistant":
                assistant_output = conversation["content"]

        if user_input is None:
            print(f"WARNING: Example missing user input: {example}")
            continue

        inputs = tokenizer(user_input, return_tensors="pt", max_length=max_length, truncation=True).to("cuda")

        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=200)  # Adjust max_new_tokens as needed

        decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        results.append({"input": user_input, "expected_output": assistant_output, "generated_output": decoded_output})
    return results

test_results = test_model(model, tokenizer, split_datasets['test'], max_length=max_seq_length)
df_results = pd.DataFrame(test_results)
print(df_results)

                                                 input  \
0    [The city and available attractions are ('Zagr...   
1    [The city and available attractions are ('Zagr...   
2    [The city and available attractions are ('Zagr...   
3    [The city and available attractions are ('Karl...   
4    [The city and available attractions are ('Zagr...   
..                                                 ...   
174  [The city and available attractions are ('Zagr...   
175  [The city and available attractions are ('Zagr...   
176  [The city and available attractions are ('Zagr...   
177  [The city and available attractions are ('Korn...   
178  [The city and available attractions are ('Zagr...   

                                       expected_output  \
0    A French restaurant within the Regent Esplanad...   
1    A choice of good Irish beer with a great atmos...   
2                                                 None   
3    Reception: 8am-10pm, Monday-Sunday. Newly open...   
4    All 4 se

In [10]:
print(split_datasets['test'][0])

{'conversations': [{'content': "[The city and available attractions are ('Zagreb | eat | Le Bistro | French, the Regent Esplanade Hotel',).]", 'role': 'user'}, {'content': 'A French restaurant within the Regent Esplanade Hotel..', 'role': 'assistant'}], 'text': "<|begin_of_text|>Below describes some details about some city that can be visited.\nWrite a travel guide that mentions city characteristics.\nOutput text of travel guide.\n>>> City Details:\n[The city and available attractions are ('Zagreb | eat | Le Bistro | French, the Regent Esplanade Hotel',).]\n>>> Travel Guide:\nA French restaurant within the Regent Esplanade Hotel..<|end_of_text|>"}
