In [1]:
%%capture
# Install required packages
!pip install datasets>=2.17.0 huggingface-hub>=0.21.0 pandas>=2.1.0 matplotlib>=3.8.0 seaborn>=0.13.0
!pip install python-dotenv>=1.0.0 tqdm>=4.66.0 plotly>=5.18.0 googletrans==4.0.0-rc1 langdetect>=1.0.9
# Fine-tuning dependencies
!pip install torch>=2.0.0
!pip install transformers>=4.34.0 peft>=0.5.0 accelerate>=0.23.0 bitsandbytes>=0.40.0 einops>=0.6.1
!pip install unsloth>=2025.1.1 sentencepiece>=0.1.99 optimum>=1.12.0 tensorboard>=2.14.0
!pip install evaluate>=0.4.0 rouge-score>=0.1.2 safetensors>=0.3.3 nvidia-ml-py>=12.0.0

In [2]:
# Clone the repository if you want all files
!git clone https://github.com/KacperJanowski98/pllum-function-calling.git

Cloning into 'pllum-function-calling'...
remote: Enumerating objects: 147, done.[K
remote: Counting objects: 100% (74/74), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 147 (delta 46), reused 55 (delta 36), pack-reused 73 (from 1)[K
Receiving objects: 100% (147/147), 331.66 KiB | 3.35 MiB/s, done.
Resolving deltas: 100% (78/78), done.


In [3]:
# Import necessary libraries
import os
import sys
import json
import torch
import random
import numpy as np
from pathlib import Path
import logging
from datetime import datetime
from google.colab import files
from tqdm.auto import tqdm
from huggingface_hub import login

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [4]:
# Set a random seed for reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

In [5]:
# Create project directory for local storage
project_dir = '/content/pllum-function-calling-output'
os.makedirs(project_dir, exist_ok=True)
models_dir = f"{project_dir}/models"
data_dir = f"{project_dir}/data"
os.makedirs(models_dir, exist_ok=True)
os.makedirs(data_dir, exist_ok=True)

print(f"Project directory created at: {project_dir}")
print(f"Models will be saved to: {models_dir}")
print(f"Data will be stored in: {data_dir}")

# Add the repository to the path
if os.path.exists('/content/pllum-function-calling'):
  sys.path.append('/content/pllum-function-calling')
else:
  # If repo wasn't cloned, we'll manually create necessary files
  print("Repository not found. We'll create the necessary modules manually.")

Project directory created at: /content/pllum-function-calling-output
Models will be saved to: /content/pllum-function-calling-output/models
Data will be stored in: /content/pllum-function-calling-output/data


In [6]:
try:
    from google.colab import userdata
    hf_token = userdata.get('HF_TOKEN')
    if hf_token:
        login(token=hf_token)
        print("✅ Successfully logged in to Hugging Face using stored token")
    else:
        print("⚠️ No Hugging Face token found in Colab secrets")
        print("Please add your token by:")
        print("1. Click on the 🔑 icon in the left sidebar")
        print("2. Add a new secret with name 'HF_TOKEN' and your token as the value")
        print("3. Run this cell again")
except Exception as e:
    print(f"Error accessing Colab secrets: {e}")
    print("You can manually enter your token if needed:")

    # Fallback for manual input if secrets fail
    manual_token = input("Enter your Hugging Face token (or press Enter to skip): ")
    if manual_token:
        login(token=manual_token)
        print("✅ Successfully logged in to Hugging Face with manual token")
    else:
        print("⚠️ No token provided. Some operations may fail.")

✅ Successfully logged in to Hugging Face using stored token


In [7]:
# Define a function to check CUDA compatibility
def check_cuda_compatibility():
    """
    Check CUDA compatibility and print detailed information about the setup.

    Returns:
        Dictionary with CUDA information
    """
    cuda_info = {
        "cuda_available": torch.cuda.is_available(),
        "cuda_version": torch.version.cuda if torch.cuda.is_available() else None,
        "device_count": torch.cuda.device_count() if torch.cuda.is_available() else 0,
        "devices": [],
    }

    if cuda_info["cuda_available"]:
        # Get details for each CUDA device
        for i in range(cuda_info["device_count"]):
            props = torch.cuda.get_device_properties(i)
            cuda_info["devices"].append({
                "name": props.name,
                "compute_capability": f"{props.major}.{props.minor}",
                "total_memory_gb": props.total_memory / 1e9,
                "multi_processor_count": props.multi_processor_count,
            })

        # Try to get more detailed info with pynvml
        try:
            import pynvml
            pynvml.nvmlInit()

            cuda_info["driver_version"] = pynvml.nvmlSystemGetDriverVersion()

            for i in range(cuda_info["device_count"]):
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                util_info = pynvml.nvmlDeviceGetUtilizationRates(handle)

                cuda_info["devices"][i].update({
                    "memory_free_gb": mem_info.free / 1e9,
                    "memory_used_gb": mem_info.used / 1e9,
                    "gpu_utilization": util_info.gpu,
                    "memory_utilization": util_info.memory,
                })

            pynvml.nvmlShutdown()
        except (ImportError, Exception) as e:
            cuda_info["pynvml_error"] = str(e)

    # Print the CUDA information
    if cuda_info["cuda_available"]:
        logger.info(f"CUDA is available - Version: {cuda_info['cuda_version']}")
        logger.info(f"Found {cuda_info['device_count']} CUDA device(s)")

        for i, device in enumerate(cuda_info["devices"]):
            logger.info(f"Device {i}: {device['name']}")
            logger.info(f"  Compute capability: {device['compute_capability']}")
            logger.info(f"  Total memory: {device['total_memory_gb']:.2f} GB")

            if "memory_free_gb" in device:
                logger.info(f"  Free memory: {device['memory_free_gb']:.2f} GB")
                logger.info(f"  Used memory: {device['memory_used_gb']:.2f} GB")
                logger.info(f"  GPU utilization: {device['gpu_utilization']}%")
                logger.info(f"  Memory utilization: {device['memory_utilization']}%")
    else:
        logger.warning("CUDA is not available. Training will be very slow on CPU.")

    return cuda_info

In [8]:
# Run CUDA compatibility check
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

cuda_info = check_cuda_compatibility()

if not cuda_info['cuda_available']:
    print("WARNING: CUDA is not available! Fine-tuning will be extremely slow on CPU.")
    print("Please make sure you have an NVIDIA GPU and have installed PyTorch with CUDA support.")
    print("You can install PyTorch with CUDA using: pip install torch --index-url https://download.pytorch.org/whl/cu118")
else:
    print(f"\n✅ CUDA is available with version {cuda_info['cuda_version']}")
    for i, device in enumerate(cuda_info['devices']):
        print(f"\nGPU {i}: {device['name']}")
        print(f"  Memory: {device['total_memory_gb']:.2f} GB total")

        if 'memory_free_gb' in device:
            print(f"  Free memory: {device['memory_free_gb']:.2f} GB")
            print(f"  Used memory: {device['memory_used_gb']:.2f} GB")

            # Check if there's enough free memory (at least 6GB recommended for 8B model with QLoRA)
            if device['memory_free_gb'] < 6.0:
                print(f"⚠️ Warning: Only {device['memory_free_gb']:.2f} GB free memory detected.")
                print("   You may encounter out-of-memory errors during fine-tuning.")
                print("   Consider reducing batch size, sequence length, or closing other applications.")
            else:
                print(f"✅ Sufficient free memory detected ({device['memory_free_gb']:.2f} GB)")

# Verify PyTorch was installed with CUDA support
if torch.cuda.is_available():
    # Run a simple test to verify CUDA is working
    try:
        print("Running CUDA test...")
        x = torch.rand(10, 10).cuda()
        y = torch.rand(10, 10).cuda()
        z = x @ y  # Matrix multiplication
        print(f"CUDA test result shape: {z.shape}")
        print("✅ CUDA test passed!")
    except Exception as e:
        print(f"❌ CUDA test failed: {str(e)}")
        print("This may indicate a problem with your CUDA installation.")


✅ CUDA is available with version 12.4

GPU 0: NVIDIA A100-SXM4-40GB
  Memory: 42.47 GB total
  Free memory: 42.47 GB
  Used memory: 0.48 GB
✅ Sufficient free memory detected (42.47 GB)
Running CUDA test...
CUDA test result shape: torch.Size([10, 10])
✅ CUDA test passed!


In [9]:
if os.path.exists('/content/pllum-function-calling/src/fine_tuning.py'):
    # If repository was cloned, import from there
    from src.fine_tuning import (
        PLLuMFineTuningConfig,
        setup_model_and_tokenizer,
        prepare_dataset,
        train_model,
        format_function_calling_prompt,
        generate_function_call,
    )
    print("Imported fine-tuning functions from cloned repository.")
else:
    # Otherwise, create the necessary functions manually
    print("Creating fine-tuning functions manually...")
    # We'll need to copy the necessary code from src/fine_tuning.py here

    # Custom PyTorch Dataset for function calling
    from torch.utils.data import Dataset
    from dataclasses import dataclass
    from transformers import (
        TrainingArguments,
        Trainer,
        AutoTokenizer,
        AutoModelForCausalLM,
        BitsAndBytesConfig,
    )
    from peft import (
        LoraConfig,
        PeftModel,
        prepare_model_for_kbit_training,
        get_peft_model,
    )
    from unsloth import FastLanguageModel

    class FunctionCallingDataset(Dataset):
        """Dataset for function calling fine-tuning."""

        def __init__(self, input_ids, attention_mask, labels):
            self.input_ids = input_ids
            self.attention_mask = attention_mask
            self.labels = labels
            self.num_examples = len(input_ids)

        def __len__(self):
            return self.num_examples

        def __getitem__(self, idx):
            return {
                "input_ids": self.input_ids[idx],
                "attention_mask": self.attention_mask[idx],
                "labels": self.labels[idx]
            }


    @dataclass
    class PLLuMFineTuningConfig:
        """Configuration for fine-tuning PLLuM model."""

        # Model configuration
        model_name_or_path: str = "CYFRAGOVPL/Llama-PLLuM-8B-instruct"
        output_dir: str = "models/pllum-function-calling"

        # QLoRA parameters
        lora_r: int = 8
        lora_alpha: int = 16
        lora_dropout: float = 0.05
        use_4bit: bool = True
        bnb_4bit_compute_dtype: str = "float16"
        bnb_4bit_quant_type: str = "nf4"
        use_nested_quant: bool = False

        # Training parameters
        num_train_epochs: int = 3
        per_device_train_batch_size: int = 4
        gradient_accumulation_steps: int = 2
        learning_rate: float = 2e-4
        weight_decay: float = 0.01
        max_grad_norm: float = 0.3
        max_steps: int = -1
        warmup_ratio: float = 0.03
        lr_scheduler_type: str = "cosine"

        # Logging & Saving
        logging_steps: int = 10
        save_steps: int = 200
        save_total_limit: int = 3

        # Dataset parameters
        max_seq_length: int = 1024
        dataset_path: str = "data/translated_dataset.json"

        # Tokenization
        padding: str = "max_length"
        pad_to_multiple_of: int = 8

        # CUDA and device settings
        use_cuda: bool = True  # Default to using CUDA if available
        device_map: str = "auto"  # Let HF decide on device mapping


    def format_function_calling_prompt(example):
        """
        Format a function calling example into a prompt suitable for PLLuM instruction format.

        Args:
            example: A dictionary containing 'query', 'tools', and 'answers' fields

        Returns:
            Formatted prompt string
        """
        query = example["query"]
        tools = example["tools"]
        answers = example["answers"]

        # Handle case where tools and answers are stored as JSON strings
        if isinstance(tools, str):
            try:
                tools = json.loads(tools)
                logger.info("Parsed tools from JSON string")
            except json.JSONDecodeError as e:
                logger.error(f"Error parsing tools JSON: {e}")
                # Provide a fallback if parsing fails
                tools = [{"name": "error", "description": "Error parsing tools JSON"}]

        if isinstance(answers, str):
            try:
                answers = json.loads(answers)
                logger.info("Parsed answers from JSON string")
            except json.JSONDecodeError as e:
                logger.error(f"Error parsing answers JSON: {e}")
                # Provide a fallback if parsing fails
                answers = [{"name": "error", "arguments": {}}]

        # Format tools as a string
        tools_str = json.dumps(tools, indent=2, ensure_ascii=False)

        # Format expected answers as a string
        answers_str = json.dumps(answers, indent=2, ensure_ascii=False)

        # Create instruction in PLLuM format
        prompt = f"""<|im_start|>user
Poniżej znajduje się zapytanie i lista dostępnych narzędzi.
Proszę wywołać odpowiednie narzędzie, aby odpowiedzieć na zapytanie użytkownika.

Zapytanie: {query}

Dostępne narzędzia:
{tools_str}
<|im_end|>
<|im_start|>assistant
{answers_str}
<|im_end|>"""

        return prompt


    def prepare_dataset(
        dataset_path,
        tokenizer,
        max_length=1024,
        custom_format_func=None
    ):
        """
        Prepare the dataset for fine-tuning.

        Args:
            dataset_path: Path to the JSON dataset
            tokenizer: The tokenizer to use
            max_length: Maximum sequence length
            custom_format_func: Optional custom formatting function

        Returns:
            A PyTorch Dataset for training
        """
        # Default to format_function_calling_prompt if no custom function is provided
        format_func = custom_format_func or format_function_calling_prompt

        # Load the dataset
        with open(dataset_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        num_examples = len(data)
        logger.info(f"Loaded {num_examples} examples from {dataset_path}")

        # Format the examples
        formatted_examples = [format_func(example) for example in data]
        logger.info(f"Formatted {len(formatted_examples)} examples")

        # Tokenize the examples
        tokenized = tokenizer(
            formatted_examples,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )

        # Create dataset
        dataset = FunctionCallingDataset(
            tokenized["input_ids"],
            tokenized["attention_mask"],
            tokenized["input_ids"].clone()  # Labels are the same as input_ids for causal LM
        )

        logger.info(f"Created dataset with {len(dataset)} examples")
        return dataset


    def setup_model_and_tokenizer(config):
        """
        Setup the PLLuM model and tokenizer with QLoRA 4-bit quantization using Unsloth.

        Args:
            config: Fine-tuning configuration

        Returns:
            Tuple of (model, tokenizer)
        """
        # Ensure CUDA if required and available
        if config.use_cuda and not torch.cuda.is_available():
            logger.warning("CUDA requested but not available. Falling back to CPU.")
            config.use_cuda = False

        # Use cuda if available and requested
        device = "cuda" if config.use_cuda and torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {device}")

        # Log CUDA memory before loading model
        if device == "cuda":
            logger.info(f"CUDA memory before loading model: {torch.cuda.memory_allocated() / 1e9:.2f}GB allocated")
            logger.info(f"CUDA memory reserved: {torch.cuda.memory_reserved() / 1e9:.2f}GB")

        # Define quantization config for BitsAndBytes
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=config.use_4bit,
            bnb_4bit_quant_type=config.bnb_4bit_quant_type,
            bnb_4bit_compute_dtype=getattr(torch, config.bnb_4bit_compute_dtype),
            bnb_4bit_use_double_quant=config.use_nested_quant,
        )

        # Load PLLuM model and tokenizer using Unsloth's optimized loader
        logger.info(f"Loading model from {config.model_name_or_path}")
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=config.model_name_or_path,
            max_seq_length=config.max_seq_length,
            dtype=None,  # Automatically decide based on GPU availability
            quantization_config=bnb_config if device == "cuda" else None,
            device_map=config.device_map if device == "cuda" else None,
        )

        # Log CUDA memory after loading model
        if device == "cuda":
            logger.info(f"CUDA memory after loading model: {torch.cuda.memory_allocated() / 1e9:.2f}GB allocated")
            logger.info(f"CUDA memory reserved: {torch.cuda.memory_reserved() / 1e9:.2f}GB")

        # Define target modules for LoRA
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

        # Apply LoRA adapters using Unsloth
        model = FastLanguageModel.get_peft_model(
            model,
            r=config.lora_r,
            target_modules=target_modules,
            lora_alpha=config.lora_alpha,
            lora_dropout=config.lora_dropout,
            bias="none",
            use_gradient_checkpointing=True,
        )

        # Make sure tokenizer has padding token
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        return model, tokenizer


    def train_model(
        model,
        tokenizer,
        train_dataset,
        config,
    ):
        """
        Train the PLLuM model with the prepared dataset.

        Args:
            model: The model to fine-tune
            tokenizer: The tokenizer
            train_dataset: The prepared Dataset for training
            config: Fine-tuning configuration

        Returns:
            Fine-tuned model
        """
        # Create training arguments
        training_args = TrainingArguments(
            output_dir=config.output_dir,
            num_train_epochs=config.num_train_epochs,
            per_device_train_batch_size=config.per_device_train_batch_size,
            gradient_accumulation_steps=config.gradient_accumulation_steps,
            learning_rate=config.learning_rate,
            weight_decay=config.weight_decay,
            max_grad_norm=config.max_grad_norm,
            max_steps=config.max_steps,
            warmup_ratio=config.warmup_ratio,
            lr_scheduler_type=config.lr_scheduler_type,
            logging_steps=config.logging_steps,
            save_steps=config.save_steps,
            save_total_limit=config.save_total_limit,
            fp16=torch.cuda.is_available(),  # Mixed precision training if CUDA is available
            bf16=False,  # Use FP16 instead of BF16
            report_to="tensorboard",
            optim="adamw_torch",
            ddp_find_unused_parameters=False,
            dataloader_pin_memory=torch.cuda.is_available(),  # Pin memory if CUDA available
            remove_unused_columns=False,  # Important for custom datasets
        )

        # Initialize Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            tokenizer=tokenizer,
        )

        # Start training
        logger.info("Starting model training...")

        # Log CUDA memory before training
        if torch.cuda.is_available():
            logger.info(f"CUDA memory before training: {torch.cuda.memory_allocated() / 1e9:.2f}GB allocated")
            logger.info(f"CUDA memory reserved: {torch.cuda.memory_reserved() / 1e9:.2f}GB")

        trainer.train()

        # Log CUDA memory after training
        if torch.cuda.is_available():
            logger.info(f"CUDA memory after training: {torch.cuda.memory_allocated() / 1e9:.2f}GB allocated")
            logger.info(f"CUDA memory reserved: {torch.cuda.memory_reserved() / 1e9:.2f}GB")

        # Save the model
        model.save_pretrained(config.output_dir)
        tokenizer.save_pretrained(config.output_dir)

        logger.info(f"Model saved to {config.output_dir}")
        return model


    def generate_function_call(
        model,
        tokenizer,
        query,
        tools,
        max_new_tokens=512,
        temperature=0.1,
        top_p=0.9,
    ):
        """
        Generate a function call using the fine-tuned model.

        Args:
            model: The fine-tuned model
            tokenizer: The tokenizer
            query: User query
            tools: Available tools
            max_new_tokens: Maximum new tokens to generate
            temperature: Sampling temperature
            top_p: Top-p sampling parameter

        Returns:
            Generated function call as a dictionary
        """
        # Handle case where tools might be a string (should be a list of dicts)
        if isinstance(tools, str):
            try:
                tools = json.loads(tools)
                logger.info("Parsed tools from JSON string for generation")
            except json.JSONDecodeError as e:
                logger.error(f"Error parsing tools JSON for generation: {e}")
                # Provide a fallback if parsing fails
                tools = [{"name": "error", "description": "Error parsing tools JSON"}]

        # Format the tools as a JSON string
        tools_str = json.dumps(tools, indent=2, ensure_ascii=False)

        # Create prompt
        prompt = f"""<|im_start|>user
Poniżej znajduje się zapytanie i lista dostępnych narzędzi.
Proszę wywołać odpowiednie narzędzie, aby odpowiedzieć na zapytanie użytkownika.

Zapytanie: {query}

Dostępne narzędzia:
{tools_str}
<|im_end|>
<|im_start|>assistant
"""

        # Tokenize the prompt
        inputs = tokenizer(prompt, return_tensors="pt")

        # Move inputs to the model's device
        device = next(model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=(temperature > 0),
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Extract generated text
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

        # Extract only the assistant's response
        assistant_response = generated_text.split("<|im_start|>assistant")[1]
        assistant_response = assistant_response.split("<|im_end|>")[0].strip()

        # Try to parse the JSON response
        try:
            function_call = json.loads(assistant_response)
            return function_call
        except json.JSONDecodeError:
            # If parsing fails, return the raw response
            return {"raw_response": assistant_response}

    print("✅ Fine-tuning functions created successfully.")


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: OpenAI failed to import - ignoring for now.
🦥 Unsloth Zoo will now patch everything to make training faster!
Imported fine-tuning functions from cloned repository.


In [10]:
print("Please upload your dataset file (JSON format)")
uploaded = files.upload()

# Path to save the dataset
dataset_path = f"{data_dir}/xlam_function_calling_pl.json"

if uploaded:
    # Get the filename of the uploaded file
    upload_filename = list(uploaded.keys())[0]
    print(f"Uploaded: {upload_filename}")

    # Copy the uploaded file to the dataset path
    with open(upload_filename, 'rb') as f_src:
        with open(dataset_path, 'wb') as f_dst:
            f_dst.write(f_src.read())

    print(f"Dataset copied to: {dataset_path}")
else:
    # If no upload, provide instructions
    print("⚠️ No dataset file uploaded. Please upload your dataset to proceed.")
    print("If you need to create a sample dataset, run the next cell.")

Please upload your dataset file (JSON format)


Saving xlam_function_calling_pl_5k.json to xlam_function_calling_pl_5k.json
Uploaded: xlam_function_calling_pl_5k.json
Dataset copied to: /content/pllum-function-calling-output/data/xlam_function_calling_pl.json


In [12]:
# Create a timestamped output directory
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
MODEL_OUTPUT_DIR = f"{models_dir}/pllum-function-calling-{timestamp}"

# Create the output directory
os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)

# Configure fine-tuning parameters
batch_size = 8 # @param {type:"slider", min:1, max:8, step:1}
epochs = 4 # @param {type:"slider", min:1, max:5, step:1}
lora_r = 16 # @param {type:"slider", min:4, max:32, step:4}
lora_alpha = 32 # @param {type:"slider", min:8, max:64, step:8}
learning_rate = 0.0001 # @param {type:"number"}
gradient_accumulation_steps = 4 # @param {type:"slider", min:1, max:16, step:1}

config = PLLuMFineTuningConfig(
    model_name_or_path="CYFRAGOVPL/Llama-PLLuM-8B-instruct",
    output_dir=MODEL_OUTPUT_DIR,

    # QLoRA settings
    lora_r=lora_r,  # LoRA rank
    lora_alpha=lora_alpha,  # LoRA alpha
    lora_dropout=0.1,
    use_4bit=True,  # Use 4-bit quantization for memory efficiency
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",  # Normal Float 4-bit quantization
    use_nested_quant=False,

    # Training parameters
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    weight_decay=0.01,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",

    # Logging & Saving
    logging_steps=25,
    save_steps=100,
    save_total_limit=1,  # Keep only the last checkpoint to save space

    # Dataset parameters
    max_seq_length=1024,  # Maximum sequence length
    dataset_path=dataset_path,

    # CUDA settings
    use_cuda=torch.cuda.is_available(),
    device_map="auto",
)

# Adjust batch size if we detect limited GPU memory
if torch.cuda.is_available() and 'devices' in cuda_info and len(cuda_info['devices']) > 0:
    if 'memory_free_gb' in cuda_info['devices'][0] and cuda_info['devices'][0]['memory_free_gb'] < 12.0:
        original_batch_size = config.per_device_train_batch_size
        config.per_device_train_batch_size = 1  # Reduce batch size for low memory GPUs
        config.gradient_accumulation_steps = 16  # Increase gradient accumulation
        print(f"⚠️ Limited GPU memory detected. Reducing batch size from {original_batch_size} to {config.per_device_train_batch_size}")
        print(f"   and increasing gradient accumulation steps to {config.gradient_accumulation_steps}")

# Save the configuration to the model directory for future reference
config_dict = {k: str(v) if isinstance(v, Path) else v for k, v in vars(config).items()}
with open(os.path.join(MODEL_OUTPUT_DIR, "config.json"), 'w', encoding='utf-8') as f:
    json.dump(config_dict, f, indent=2)

print(f"Fine-tuning configuration saved to {os.path.join(MODEL_OUTPUT_DIR, 'config.json')}")

Fine-tuning configuration saved to /content/pllum-function-calling-output/models/pllum-function-calling-20250330_071532/config.json


In [13]:
# Load model and tokenizer
print("Loading model and tokenizer...")
model, tokenizer = setup_model_and_tokenizer(config)
print("Model and tokenizer loaded successfully.")

# Check the model
print(f"Model loaded: {config.model_name_or_path}")
print(f"Model device: {next(model.parameters()).device}")
print(f"Using 4-bit quantization: {config.use_4bit}")

Loading model and tokenizer...
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.0.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.1k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/525 [00:00<?, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.3.19 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Model and tokenizer loaded successfully.
Model loaded: CYFRAGOVPL/Llama-PLLuM-8B-instruct
Model device: cuda:0
Using 4-bit quantization: True


In [14]:
# Prepare the dataset
print("Preparing dataset...")
train_dataset = prepare_dataset(
    dataset_path=config.dataset_path,
    tokenizer=tokenizer,
    max_length=config.max_seq_length
)
print(f"Dataset prepared with {len(train_dataset)} examples.")

Preparing dataset...
Dataset prepared with 5000 examples.


In [15]:
# Run the training
print(f"Starting fine-tuning process. Model will be saved to {config.output_dir}")
print("This may take several hours depending on your hardware.")

# Start training
trained_model = train_model(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    config=config
)

print("Fine-tuning completed successfully!")

Starting fine-tuning process. Model will be saved to /content/pllum-function-calling-output/models/pllum-function-calling-20250330_071532
This may take several hours depending on your hardware.


  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 5,000 | Num Epochs = 4 | Total steps = 624
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
25,5.5725
50,3.1976
75,3.2641
100,3.1057
125,3.11
150,3.092
175,3.1004
200,3.1363
225,3.0575
250,3.1141


Fine-tuning completed successfully!


In [16]:
# Test on a sample query
test_query = "Jaka jest pogoda w Warszawie w stopniach celsjusza?"
test_tools = [
    {
        "name": "get_weather",
        "description": "Get the current weather for a location",
        "parameters": {
            "location": {
                "type": "string",
                "description": "The city and state or country",
                "required": True
            },
            "unit": {
                "type": "string",
                "description": "Unit of temperature: 'celsius' or 'fahrenheit'",
                "required": False
            }
        }
    }
]

print("Testing the model with a sample query...")
print(f"Query: {test_query}")
print("Available tools:")
print(json.dumps(test_tools, indent=2, ensure_ascii=False))

generated = generate_function_call(
    model=trained_model,
    tokenizer=tokenizer,
    query=test_query,
    tools=test_tools,
    temperature=0.1
)

print("\nGenerated function call:")
print(json.dumps(generated, indent=2, ensure_ascii=False))

Testing the model with a sample query...
Query: Jaka jest pogoda w Warszawie w stopniach celsjusza?
Available tools:
[
  {
    "name": "get_weather",
    "description": "Get the current weather for a location",
    "parameters": {
      "location": {
        "type": "string",
        "description": "The city and state or country",
        "required": true
      },
      "unit": {
        "type": "string",
        "description": "Unit of temperature: 'celsius' or 'fahrenheit'",
        "required": false
      }
    }
  }
]

Generated function call:
[
  {
    "name": "get_weather",
    "arguments": {
      "location": "Warsaw, Poland",
      "unit": "celsius"
    }
  }
]


In [17]:
# Create a zip file of the model directory
model_zip_path = f"{MODEL_OUTPUT_DIR}.zip"
!zip -r "{model_zip_path}" "{MODEL_OUTPUT_DIR}"

# Download the zip file
print(f"Downloading the model to your local machine...")
files.download(model_zip_path)

print("""
Fine-tuning complete! The model has been downloaded as a zip file.

To use this model in your local project:
1. Unzip the downloaded file in your project's models directory
2. In your code, load the model using:

```python
from src.fine_tuning import load_fine_tuned_model, generate_function_call

# Path to the unzipped model directory
model_path = "models/pllum-function-calling-{timestamp}"

# Load the model
model, tokenizer = load_fine_tuned_model(model_path)

# Use the model to generate function calls
function_call = generate_function_call(
    model=model,
    tokenizer=tokenizer,
    query="Jaka jest pogoda w Warszawie?",
    tools=[...your tools here...],
    temperature=0.1
)
```
""".format(timestamp=timestamp))

  adding: content/pllum-function-calling-output/models/pllum-function-calling-20250330_071532/ (stored 0%)
  adding: content/pllum-function-calling-output/models/pllum-function-calling-20250330_071532/adapter_model.safetensors (deflated 7%)
  adding: content/pllum-function-calling-output/models/pllum-function-calling-20250330_071532/special_tokens_map.json (deflated 68%)
  adding: content/pllum-function-calling-output/models/pllum-function-calling-20250330_071532/tokenizer_config.json (deflated 95%)
  adding: content/pllum-function-calling-output/models/pllum-function-calling-20250330_071532/runs/ (stored 0%)
  adding: content/pllum-function-calling-output/models/pllum-function-calling-20250330_071532/runs/Mar30_07-17-10_5bc089c92779/ (stored 0%)
  adding: content/pllum-function-calling-output/models/pllum-function-calling-20250330_071532/runs/Mar30_07-17-10_5bc089c92779/events.out.tfevents.1743319032.5bc089c92779.231.0 (deflated 63%)
  adding: content/pllum-function-calling-output/mod

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Fine-tuning complete! The model has been downloaded as a zip file.

To use this model in your local project:
1. Unzip the downloaded file in your project's models directory
2. In your code, load the model using:

```python
from src.fine_tuning import load_fine_tuned_model, generate_function_call

# Path to the unzipped model directory
model_path = "models/pllum-function-calling-20250330_071532"

# Load the model
model, tokenizer = load_fine_tuned_model(model_path)

# Use the model to generate function calls
function_call = generate_function_call(
    model=model,
    tokenizer=tokenizer,
    query="Jaka jest pogoda w Warszawie?",
    tools=[...your tools here...],
    temperature=0.1
)
```

