# Load and process data to ChatML format

In [1]:
from tools.prepare_data import prepare_dialogue_dataset, print_sample_conversation

# Load and prepare the dataset via the project helper. Judging by the `chatml_data`
# display further down, the result is a dict keyed by conversation id
# (e.g. 'synthetic_0') whose values hold a 'messages' list of role/content dicts.
chatml_data = prepare_dialogue_dataset()


In [2]:

# Print one sample conversation (question id, language and message turns) as a sanity check
print_sample_conversation(chatml_data)

Sample conversation:

Question ID: synthetic_1
Language: Danish

Messages:
user: Hej, jeg har en stor samling af gamle vinyl-plader, som jeg gerne vil digitalisere, så jeg kan lytte til dem på min telefon eller computer. Er det muligt at overføre dem til en digital form, og hvordan gør jeg det i så fald?

assistant: Ja, det er absolut muligt! Du har flere muligheder. Du kan købe en USB-vinylafspiller, som kan tilsluttes din computer og overføre musikken direkte til en digital fil. Alternativt kan du også bruge en almindelig vinylafspiller og en lydkort til at indspille musikken på din computer.

user: En USB-vinylafspiller lyder som en god løsning. Er de dyre, og hvilke mærker kan du anbefale?

assistant: Priserne varierer, men du kan finde en god USB-vinylafspiller for mellem 500-1500 kroner. Mærker som Audio-Technica, Pro-Ject og U-Turn er alle godt kendte for deres kvalitet og pris. Du skal også være opmærksom på, at nogle af dem kommer med software, som kan hjælpe dig med at redige

In [3]:
# Preview a single conversation instead of dumping the whole dict, which would
# flood the notebook output with every conversation in the dataset.
dict(list(chatml_data.items())[:1])

{'synthetic_0': {'messages': [{'role': 'user',
    'content': 'Hej, jeg har et problem når jeg skal indspille vokal til en eksisterende indspilning af et band. Jeg har allerede lagt en masse effekter på instrumenterne, men når jeg skal indspille vokal, giver disse effekter en masse forsinkelse. Er der en måde at slå alle disse effekter fra på én gang, så jeg ikke skal gennemgå hver enkelt plug-in og slå dem fra manuelt?'},
   {'role': 'assistant',
    'content': 'Ja, det er et ret almindeligt problem! I de fleste DAW\'er (digital audio workstation) kan du oprette en såkaldt "Bypass-gruppe" eller "Bypass-alle-effekter"-funktion. Dette giver dig mulighed for at slå alle effekter fra på én gang med et enkelt klik.'},
   {'role': 'user',
    'content': 'Det lyder fantastisk! Jeg bruger Logic Pro X - ved du hvordan jeg kan gøre det i denne DAW?'},
   {'role': 'assistant',
    'content': 'Ja, i Logic Pro X kan du gå til "Mix"-vinduet og klikke på "Bypass All Plug-ins" i toppen af vinduet. De

In [4]:
from datasets import Dataset
import pandas as pd

def prepare_sft_dataset(chatml_data):
    """
    Build a HuggingFace ``Dataset`` from ChatML-style conversations.

    Every conversation becomes one row that carries its message list
    unchanged, together with its id and its language.

    Args:
        chatml_data: Mapping of conversation id -> dict providing a
            'messages' list and a 'language' value.

    Returns:
        Dataset: One row per conversation with the columns 'messages',
        'conversation_id' and 'language'.

    Raises:
        KeyError: If a conversation lacks 'messages' or 'language'.
    """
    # One record per conversation; the message list is passed through as-is
    records = [
        {
            "messages": conversation["messages"],
            "conversation_id": conversation_id,
            "language": conversation["language"],
        }
        for conversation_id, conversation in chatml_data.items()
    ]

    # Build a DataFrame first, then hand it over to HuggingFace datasets
    return Dataset.from_pandas(pd.DataFrame(records))

# Build the SFT dataset from the conversations loaded above
sft_dataset = prepare_sft_dataset(chatml_data)

# Show the inferred schema and the first row to verify the format
print("\nDataset format:", sft_dataset.features, sep="\n")
print("\nSample entry:", sft_dataset[0], sep="\n")


Dataset format:
{'messages': [{'content': Value(dtype='string', id=None), 'role': Value(dtype='string', id=None)}], 'conversation_id': Value(dtype='string', id=None), 'language': Value(dtype='string', id=None)}

Sample entry:
{'messages': [{'content': 'Hej, jeg har et problem når jeg skal indspille vokal til en eksisterende indspilning af et band. Jeg har allerede lagt en masse effekter på instrumenterne, men når jeg skal indspille vokal, giver disse effekter en masse forsinkelse. Er der en måde at slå alle disse effekter fra på én gang, så jeg ikke skal gennemgå hver enkelt plug-in og slå dem fra manuelt?', 'role': 'user'}, {'content': 'Ja, det er et ret almindeligt problem! I de fleste DAW\'er (digital audio workstation) kan du oprette en såkaldt "Bypass-gruppe" eller "Bypass-alle-effekter"-funktion. Dette giver dig mulighed for at slå alle effekter fra på én gang med et enkelt klik.', 'role': 'assistant'}, {'content': 'Det lyder fantastisk! Jeg bruger Logic Pro X - ved du hvordan j

# Fine-tune

### Hyperparameters from DFM

In [5]:
# Model and training configuration settings.
# NOTE(review): CONFIG is bound again in the "Training code" cell further down with
# different values (epochs, batch size, warmup, scheduler, ...). In the visible notebook
# this first dictionary only feeds the summary dictionaries and prints in this cell.
CONFIG = {
    # Model settings
    "model": {
        "name": "google/gemma-2b-it",
        "max_seq_length": 2048,
        "load_in_4bit": True,
        "use_flash_attention": True,
    },
    
    # LoRA parameters (from the tutorial)
    "lora": {
        "r": 16,                     # LoRA attention dimension (rank)
        "alpha": 16,                 # LoRA alpha; mapped to "lora_alpha" in peft_config below
        "dropout": 0.0,             # Dropout probability for LoRA layers
        "target_modules": [          # Which modules to apply LoRA to
            "q_proj",
            "k_proj", 
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        "bias": "none",             # LoRA bias mode; "none" leaves bias terms untrained
        "use_gradient_checkpointing": True,  # Enable gradient checkpointing
    },
    
    # Training hyperparameters (from the tutorial)
    "training": {
        "num_train_epochs": 1,      # Number of training epochs
        "per_device_train_batch_size": 8,  # Batch size per GPU for training
        "gradient_accumulation_steps": 1,  # Steps to accumulate gradients over before each optimizer update
        "learning_rate": 2e-4,      # Initial learning rate
        "max_grad_norm": 0.3,       # Max gradient norm for gradient clipping
        "weight_decay": 0.01,       # Weight decay to apply
        "adam_beta1": 0.9,          # Adam beta1 parameter
        "adam_beta2": 0.999,        # Adam beta2 parameter
        "adam_epsilon": 1e-8,       # Adam epsilon parameter
        "warmup_steps": 5,          # Number of warmup steps for learning rate scheduler
        "lr_scheduler_type": "linear",  # Learning rate scheduler type
    },
    
    # Mixed precision settings
    "mixed_precision": {
        "bf16": True,               # Use bfloat16 mixed precision
        "fp16": False,              # Don't use float16 mixed precision
    },
    
    # Dataset settings
    # NOTE(review): neither key below is read anywhere in the visible cells — confirm they are still needed
    "dataset": {
        "max_samples": 1000,        # Maximum number of training samples to use
        "num_proc": 4,             # Number of processes for dataset preprocessing
    },
    
    # Other settings
    "seed": 42,                    # Random seed for reproducibility
    "logging_steps": 3,            # Number of steps between logging updates
    "save_strategy": "epoch",      # When to save checkpoints
    "output_dir": "outputs",       # Directory to save outputs
}

# Keyword arguments mirroring the Hugging Face Trainer settings.
# The "training" and "mixed_precision" sections are unpacked wholesale: their keys
# already carry the Trainer argument names, and unpacking keeps their insertion
# order, so the resulting dictionary has the same keys, order and values.
training_args = {
    "output_dir": CONFIG["output_dir"],
    **CONFIG["training"],
    **CONFIG["mixed_precision"],
    "logging_steps": CONFIG["logging_steps"],
    "save_strategy": CONFIG["save_strategy"],
    "seed": CONFIG["seed"],
    "optim": "adamw_8bit",
}

# PEFT (Parameter Efficient Fine-Tuning) settings: the short keys of CONFIG["lora"]
# are mapped onto lora_* names, and the global seed is reused as random_state.
peft_config = dict(
    r=CONFIG["lora"]["r"],
    lora_alpha=CONFIG["lora"]["alpha"],
    lora_dropout=CONFIG["lora"]["dropout"],
    target_modules=CONFIG["lora"]["target_modules"],
    bias=CONFIG["lora"]["bias"],
    use_gradient_checkpointing=CONFIG["lora"]["use_gradient_checkpointing"],
    use_rslora=False,
    loftq_config=None,
    random_state=CONFIG["seed"],
)

# Sampling settings intended for inference-time generation.
# NOTE(review): use_cache=False presumably disables key/value caching while
# decoding, which would slow generation down — confirm this is intentional.
generation_config = dict(
    max_new_tokens=256,
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.2,
    top_k=50,
    top_p=0.95,
    use_cache=False,
)

# Echo the three derived dictionaries, one aligned "name = value" line per entry
print("Configuration loaded with the following settings:")
for title, section in (
    ("Training Arguments", training_args),
    ("PEFT Configuration", peft_config),
    ("Generation Configuration", generation_config),
):
    print(f"\n{title}:")
    for k, v in section.items():
        print(f"{k:25} = {v}")

Configuration loaded with the following settings:

Training Arguments:
output_dir                = outputs
num_train_epochs          = 1
per_device_train_batch_size = 8
gradient_accumulation_steps = 1
learning_rate             = 0.0002
max_grad_norm             = 0.3
weight_decay              = 0.01
adam_beta1                = 0.9
adam_beta2                = 0.999
adam_epsilon              = 1e-08
warmup_steps              = 5
lr_scheduler_type         = linear
bf16                      = True
fp16                      = False
logging_steps             = 3
save_strategy             = epoch
seed                      = 42
optim                     = adamw_8bit

PEFT Configuration:
r                         = 16
lora_alpha                = 16
lora_dropout              = 0.0
target_modules            = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
bias                      = none
use_gradient_checkpointing = True
use_rslora                = False
loftq_confi

# Training code

In [6]:
import torch
from datasets import load_dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer, setup_chat_format
from transformers import TrainingArguments, AutoTokenizer
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp import (
    MixedPrecision,
    ShardingStrategy,
)

# Model and training configuration settings.
# This rebinds CONFIG, replacing the dictionary defined in the hyperparameter cell
# above; the functions and cells below read these values.
CONFIG = {
    # Model settings
    "model": {
        "name": "google/gemma-2b-it",
        "max_seq_length": 2048,  # Reduced from 4096 for single GPU
        "load_in_4bit": True,    # Keep 4-bit quantization for memory efficiency
        # NOTE(review): the Unsloth banner of the recorded run reports "FA2 = False" —
        # confirm flash-attention is actually available in this environment
        "use_flash_attention": True,
    },
    
    # LoRA parameters; this section is forwarded verbatim as keyword arguments
    # to FastLanguageModel.get_peft_model, so the key names must match its signature
    "lora": {
        "r": 16,  # Reduced from 32 for single GPU
        "lora_alpha": 16,
        "lora_dropout": 0.0,  # Set to 0 for unsloth optimization
        "target_modules": [
            "q_proj",
            "k_proj", 
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        "bias": "none",
        "use_gradient_checkpointing": True,
        "use_rslora": False,
        "loftq_config": None,
    },
    
    # Training hyperparameters for single GPU
    "training": {
        "num_train_epochs": 3,
        "per_device_train_batch_size": 4,  # Reduced for single GPU
        "gradient_accumulation_steps": 4,  # Increased to compensate for smaller batch size
        "learning_rate": 2e-4,
        "max_grad_norm": 0.3,
        "weight_decay": 0.01,
        "adam_beta1": 0.9,
        "adam_beta2": 0.999,
        "adam_epsilon": 1e-8,
        # NOTE(review): the recorded run below reports only 9 total steps, so a
        # 100-step warmup never completes there — confirm this value is intended
        "warmup_steps": 100,
        "lr_scheduler_type": "cosine",
    },
    
    # Mixed precision settings
    "mixed_precision": {
        "bf16": True,  # RTX 4090 supports bf16
        "fp16": False,
    },
    
    # Other settings
    "seed": 42,
    # NOTE(review): logging_steps (10) and save_steps (200) both exceed the 9 total
    # steps of the recorded run, which would explain its empty loss table — verify
    "logging_steps": 10,
    "save_strategy": "steps",
    "save_steps": 200,
    "output_dir": "outputs",
}

def get_training_arguments():
    """Build the ``TrainingArguments`` for the single-GPU run from ``CONFIG``.

    Returns:
        TrainingArguments: Populated from CONFIG's "training" and
        "mixed_precision" sections plus the top-level output, logging,
        checkpointing and seed entries; the optimizer is fixed to
        "adamw_8bit".
    """
    hyper = CONFIG["training"]
    precision = CONFIG["mixed_precision"]

    return TrainingArguments(
        output_dir=CONFIG["output_dir"],
        # Optimisation schedule
        num_train_epochs=hyper["num_train_epochs"],
        per_device_train_batch_size=hyper["per_device_train_batch_size"],
        gradient_accumulation_steps=hyper["gradient_accumulation_steps"],
        learning_rate=hyper["learning_rate"],
        max_grad_norm=hyper["max_grad_norm"],
        weight_decay=hyper["weight_decay"],
        adam_beta1=hyper["adam_beta1"],
        adam_beta2=hyper["adam_beta2"],
        adam_epsilon=hyper["adam_epsilon"],
        warmup_steps=hyper["warmup_steps"],
        lr_scheduler_type=hyper["lr_scheduler_type"],
        # Mixed precision
        bf16=precision["bf16"],
        fp16=precision["fp16"],
        # Logging, checkpointing and reproducibility
        logging_steps=CONFIG["logging_steps"],
        save_strategy=CONFIG["save_strategy"],
        save_steps=CONFIG["save_steps"],
        seed=CONFIG["seed"],
        optim="adamw_8bit",
    )

# Chat template installed on the tokenizer: user turns are rendered as
# "<start_of_turn>user ... <end_of_turn>", assistant turns as
# "<start_of_turn>model ... <end_of_turn>"; messages with any other role are
# skipped. The text is whitespace-sensitive, so it stays flush with column 0.
GEMMA_CHAT_TEMPLATE = """{% for message in messages %}{% if message['role'] == 'user' %}
<start_of_turn>user
{{ message['content'] }}<end_of_turn>
{% elif message['role'] == 'assistant' %}
<start_of_turn>model
{{ message['content'] }}<end_of_turn>
{% endif %}{% endfor %}
{% if add_generation_prompt %}
<start_of_turn>model
{% endif %}"""


def prepare_model_and_tokenizer():
    """Load the base model and tokenizer, set the chat template, attach LoRA.

    All settings are read from the module-level ``CONFIG``:

    * ``CONFIG["model"]`` supplies the checkpoint name, the maximum sequence
      length, the 4-bit loading flag and whether "flash_attention_2" or
      "sdpa" is requested as attention implementation.
    * ``CONFIG["lora"]`` is forwarded verbatim as keyword arguments to
      ``FastLanguageModel.get_peft_model``.

    Returns:
        tuple: ``(model, tokenizer)`` — the LoRA-wrapped model and the
        tokenizer whose ``chat_template`` has been replaced by
        ``GEMMA_CHAT_TEMPLATE``.
    """
    model_cfg = CONFIG["model"]
    attention = "flash_attention_2" if model_cfg["use_flash_attention"] else "sdpa"

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_cfg["name"],
        max_seq_length=model_cfg["max_seq_length"],
        load_in_4bit=model_cfg["load_in_4bit"],
        attn_implementation=attention,
    )

    # Override the tokenizer's template so training and inference share one format
    tokenizer.chat_template = GEMMA_CHAT_TEMPLATE

    # Apply LoRA configuration
    model = FastLanguageModel.get_peft_model(model, **CONFIG["lora"])

    return model, tokenizer


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 02-21 23:46:21 __init__.py:190] Automatically detected platform cuda.


In [7]:
# Training setup: load the base model and tokenizer (settings come from CONFIG)
# and attach the LoRA adapters
model, tokenizer = prepare_model_and_tokenizer()

==((====))==  Unsloth 2025.2.15: Fast Gemma patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.513 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.2.15 patched 18 layers with 18 QKV layers, 18 O layers and 18 MLP layers.


In [8]:
# Show the template currently set on the tokenizer. This is the string assigned
# inside prepare_model_and_tokenizer(), not necessarily the one shipped with the checkpoint.
print("Gemma's chat template:")
print(tokenizer.chat_template)

Gemma's chat template:
{% for message in messages %}{% if message['role'] == 'user' %}
<start_of_turn>user
{{ message['content'] }}<end_of_turn>
{% elif message['role'] == 'assistant' %}
<start_of_turn>model
{{ message['content'] }}<end_of_turn>
{% endif %}{% endfor %}
{% if add_generation_prompt %}
<start_of_turn>model
{% endif %}


In [9]:
# Number of CUDA devices visible to PyTorch (1 in the recorded run)
number_of_gpus = torch.cuda.device_count()
print(f"Number of GPUs: {number_of_gpus}")

# Setup trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=sft_dataset,
    max_seq_length=CONFIG["model"]["max_seq_length"],
    # NOTE(review): dataset_num_proc is tied to the GPU count here; it presumably sets the
    # number of dataset preprocessing worker processes, which is unrelated to GPUs — confirm
    dataset_num_proc=number_of_gpus,
    args=get_training_arguments(),
)

Number of GPUs: 1


Converting train dataset to ChatML:   0%|          | 0/60 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/60 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/60 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/60 [00:00<?, ? examples/s]

In [10]:

# Train: run the fine-tuning loop. The returned TrainOutput (global step,
# training loss and metrics) is displayed as the cell result.
trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 60 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 9
 "-____-"     Number of trainable parameters = 19,611,648


Step,Training Loss


TrainOutput(global_step=9, training_loss=2.576289070977105, metrics={'train_runtime': 15.7326, 'train_samples_per_second': 11.441, 'train_steps_per_second': 0.572, 'total_flos': 767613770465280.0, 'train_loss': 2.576289070977105})

In [None]:

# Save the model and the tokenizer to the configured output directory
# (CONFIG["output_dir"], i.e. "outputs").
# NOTE(review): this cell has no execution count in the saved notebook, so it was
# apparently not run — confirm the artifacts were actually written.
trainer.save_model(CONFIG["output_dir"])
tokenizer.save_pretrained(CONFIG["output_dir"])