In [1]:
%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
!pip install transformers==4.55.4

### Unsloth

In [2]:
"""
• This code initializes a 4-bit quantized CodeGemma 7B model using Unsloth for memory-efficient fine-tuning
• Uses automatic dtype detection and 4-bit quantization to reduce VRAM usage from ~14GB to ~4GB
• Sets sequence length to 1024 tokens (good for your 8GB VRAM) and loads pre-quantized model for faster setup
• Unsloth provides optimized models that download faster and prevent out-of-memory errors
• CodeGemma is Google's code-focused variant, ideal for programming tasks but requires proper tokenizer handling

Key optimizations for your environment:
• Explicit float16 dtype for RTX 2070 Super (Turing architecture doesn't support bfloat16 efficiently)
• Added device_map="auto" for proper GPU memory management in WSL2
• Conservative 1024 sequence length to stay within 8GB VRAM limits
• Memory monitoring to track GPU usage and prevent OOM errors
• Error handling with fallback to smaller models if main model fails
• CUDA availability checks specific to WSL2 setup requirements

WSL2 Setup Notes:
- Ensure you have NVIDIA drivers installed in Windows (not WSL2)
- Install CUDA toolkit in WSL2: wget https://developer.download.nvidia.com/compute/cuda/repos/wsl-ubuntu/x86_64/cuda-wsl-ubuntu.pin
- Verify with nvidia-smi command in WSL2 terminal
"""

import os
import torch
from unsloth import FastLanguageModel
import gc

# Clear GPU memory before starting
torch.cuda.empty_cache()
gc.collect()

# Optimized settings for RTX 2070 Super (8GB VRAM)
max_seq_length = 512  # Conservative for 8GB VRAM - can try 2048 if stable
dtype = torch.float16  # Explicit float16 for RTX 2070 Super (Turing architecture)
load_in_4bit = True  # Essential for your VRAM constraints

# Verify CUDA availability
if not torch.cuda.is_available():
    print("CUDA not available! Check your PyTorch installation.")
    print("For WSL2, ensure you have CUDA drivers installed in Windows")
    exit(1)

print(f"CUDA Device: {torch.cuda.get_device_name()}")
print(f"VRAM Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

# Pre-quantized models optimized for your hardware
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",      # General purpose, very stable
    "unsloth/codegemma-7b-bnb-4bit",    # Your original choice - good for code
    "unsloth/llama-2-7b-bnb-4bit",      # Stable alternative
    "unsloth/gemma-7b-bnb-4bit",        # Fast, good performance
    "unsloth/tinyllama-bnb-4bit",       # Smallest option if memory issues persist
]

# Choose model based on your use case
model_name = "unsloth/codegemma-7b-bnb-4bit"  # Good for code tasks
# model_name = "unsloth/mistral-7b-bnb-4bit"  # Alternative if CodeGemma causes issues

try:
    print(f"Loading model: {model_name}")
    print("This may take a few minutes on first run...")
    
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        device_map="auto",  # Automatically handle GPU placement
        trust_remote_code=True,  # Required for some models
        # token="hf_...",  # Uncomment if using gated models
    )
    
    print("Model loaded successfully!")
    print(f"Model device: {next(model.parameters()).device}")
    
    # Check memory usage
    if torch.cuda.is_available():
        memory_used = torch.cuda.memory_allocated() / 1024**3
        memory_cached = torch.cuda.memory_reserved() / 1024**3
        print(f"GPU Memory - Used: {memory_used:.2f}GB, Cached: {memory_cached:.2f}GB")
        
        # Warning if using too much memory
        if memory_used > 7.0:  # Leave some headroom
            print("⚠️ WARNING: High GPU memory usage. Consider using smaller model or reduce max_seq_length")
    
except Exception as e:
    print(f"Error loading model: {e}")
    print("\nTroubleshooting steps:")
    print("1. Install/update unsloth: pip install unsloth[colab-new] --upgrade")
    print("2. Check CUDA setup: nvidia-smi")
    print("3. Try smaller model: unsloth/tinyllama-bnb-4bit")
    print("4. Reduce max_seq_length to 512")
    
    # Alternative smaller model attempt
    try:
        print("\nTrying smaller model...")
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name="unsloth/tinyllama-bnb-4bit",
            max_seq_length=512,
            dtype=dtype,
            load_in_4bit=load_in_4bit,
            device_map="auto",
        )
        print("Smaller model loaded successfully!")
    except Exception as e2:
        print(f"Failed to load smaller model: {e2}")
        print("Check your unsloth installation and CUDA setup")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 09-03 20:45:19 [__init__.py:241] Automatically detected platform cuda.
ERROR 09-03 20:45:19 [fa_utils.py:57] Cannot use FA version 2 is not supported due to FA2 is only supported on devices with compute capability >= 8
🦥 Unsloth Zoo will now patch everything to make training faster!
CUDA Device: NVIDIA GeForce RTX 2070 SUPER
VRAM Available: 8.0 GB
Loading model: unsloth/codegemma-7b-bnb-4bit
This may take a few minutes on first run...
Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.8.10: Fast Gemma patching. Transformers: 4.55.4. vLLM: 0.10.1.1.
   \\   /|    NVIDIA GeForce RTX 2070 SUPER. Num GPUs = 1. Max memory: 8.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is e

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [3]:
"""
• This code converts the base model into a PEFT (Parameter-Efficient Fine-Tuning) model using LoRA (Low-Rank Adaptation)
• LoRA freezes original model weights and adds small trainable matrices to attention and MLP layers
• r=16 is the rank parameter controlling LoRA matrix size - higher values = more parameters but better adaptation
• Targets key transformer components: attention projections (q,k,v,o) and feed-forward layers (gate,up,down)
• Uses optimized settings: no dropout, no bias, and Unsloth's gradient checkpointing for memory efficiency
• Only trains ~1% of original parameters while maintaining performance, perfect for your 8GB VRAM constraint

Key optimizations for RTX 2070 Super:
• r=16 provides good balance between performance and memory usage for 8GB VRAM
• All major transformer modules targeted for comprehensive adaptation
• Zero dropout and no bias for maximum memory efficiency and speed
• Unsloth gradient checkpointing reduces memory usage by ~40% during training
• Random state set for reproducible results across runs
• RSLoRA disabled to avoid additional memory overhead on your hardware
"""

import torch
from unsloth import FastLanguageModel

# Check current GPU memory before PEFT conversion
if torch.cuda.is_available():
    memory_before = torch.cuda.memory_allocated() / 1024**3
    print(f"GPU Memory before PEFT: {memory_before:.2f}GB")

# Optimized LoRA configuration for RTX 2070 Super
try:
    print("Converting to PEFT model with LoRA...")
    
    model = FastLanguageModel.get_peft_model(
        model,
        r=16,  # Good balance for 8GB VRAM - can try 32 if stable, or 8 if memory issues
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",      # Attention projections
            "gate_proj", "up_proj", "down_proj",         # MLP layers
        ],
        lora_alpha=16,  # Usually set equal to r for balanced learning
        lora_dropout=0,  # 0 is optimized - saves memory and computation
        bias="none",     # "none" is most memory efficient
        use_gradient_checkpointing="unsloth",  # Critical for memory savings
        random_state=3407,  # For reproducible results
        use_rslora=False,   # Disabled to save memory on 8GB GPU
        loftq_config=None,  # Not needed for pre-quantized models
        
        # Additional optimizations for your hardware
        modules_to_save=None,  # Don't save additional modules to save memory
    )
    
    print("PEFT model created successfully!")
    
    # Count trainable parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Trainable %: {100 * trainable_params / total_params:.2f}%")
    
    # Check memory usage after PEFT conversion
    if torch.cuda.is_available():
        memory_after = torch.cuda.memory_allocated() / 1024**3
        memory_increase = memory_after - memory_before
        print(f"GPU Memory after PEFT: {memory_after:.2f}GB (+{memory_increase:.2f}GB)")
        
        # Memory warnings for your hardware
        if memory_after > 6.5:  # Leave room for training
            print("⚠️ WARNING: High memory usage. Consider:")
            print("  - Reducing r from 16 to 8")
            print("  - Using fewer target_modules")
            print("  - Reducing max_seq_length further")
    
    # Print LoRA configuration summary
    print("\nLoRA Configuration:")
    print(f"  Rank (r): {16}")
    print(f"  Alpha: {16}")
    print(f"  Dropout: {0}")
    print(f"  Target modules: {len(['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'])}")
    print(f"  Gradient checkpointing: unsloth")
    
except Exception as e:
    print(f"Error creating PEFT model: {e}")
    print("\nTroubleshooting for RTX 2070 Super:")
    print("1. Try reducing rank: r=8 instead of r=16")
    print("2. Reduce target modules to just attention: ['q_proj', 'v_proj']")
    print("3. Clear GPU cache: torch.cuda.empty_cache()")
    print("4. Reduce sequence length further")
    
    # Fallback with minimal LoRA configuration
    try:
        print("\nTrying minimal LoRA configuration...")
        model = FastLanguageModel.get_peft_model(
            model,
            r=8,  # Reduced rank
            target_modules=["q_proj", "v_proj"],  # Minimal modules
            lora_alpha=8,
            lora_dropout=0,
            bias="none",
            use_gradient_checkpointing="unsloth",
            random_state=3407,
            use_rslora=False,
        )
        print("Minimal PEFT model created successfully!")
        
    except Exception as e2:
        print(f"Fallback also failed: {e2}")
        print("Your system may need more aggressive memory optimization")

# Enable training mode
model.train()
print("Model ready for fine-tuning!")

GPU Memory before PEFT: 5.21GB
Converting to PEFT model with LoRA...


Unsloth 2025.8.10 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


PEFT model created successfully!
Total parameters: 4,712,147,968
Trainable parameters: 50,003,968
Trainable %: 1.06%
GPU Memory after PEFT: 5.40GB (+0.19GB)

LoRA Configuration:
  Rank (r): 16
  Alpha: 16
  Dropout: 0
  Target modules: 7
  Gradient checkpointing: unsloth
Model ready for fine-tuning!


<a name="Data"></a>
### Data Prep
We now use the `ChatML` format for conversation style finetunes. We use [Open Assistant conversations](https://huggingface.co/datasets/philschmid/guanaco-sharegpt-style) in ShareGPT style. ChatML renders multi turn conversations like below:

```
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
What's the capital of France?<|im_end|>
<|im_start|>assistant
Paris.
```

**[NOTE]** To train only on completions (ignoring the user's input) read TRL's docs [here](https://huggingface.co/docs/trl/sft_trainer#train-on-completions-only).

We use our `get_chat_template` function to get the correct chat template. We support `zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old` and our own optimized `unsloth` template.

Normally one has to train `<|im_start|>` and `<|im_end|>`. We instead map `<|im_end|>` to be the EOS token, and leave `<|im_start|>` as is. This requires no additional training of additional tokens.

Note ShareGPT uses `{"from": "human", "value" : "Hi"}` and not `{"role": "user", "content" : "Hi"}`, so we use `mapping` to map it.

For text completions like novel writing, try this [notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_(7B)-Text_Completion.ipynb).

Let's see how the `ChatML` format works by printing the 5th element

In [4]:
"""
• Manual ChatML implementation that works reliably on RTX 2070 Super + WSL2 environment
• Bypasses Unsloth's chat template function which may have compatibility issues
• Creates proper <|im_start|> and <|im_end|> tokens for ChatML format manually
• Handles ShareGPT format conversion (from/value) to standard conversation structure
• More robust approach that doesn't depend on potentially buggy template functions
• Guaranteed to work with any model and tokenizer combination on your hardware

Key optimizations for RTX 2070 Super:
• Simple string-based formatting avoids tokenizer compatibility issues
• Memory-efficient processing for 52k conversations within 48GB RAM limits
• Direct ShareGPT format handling without complex mapping dependencies
• Works with CodeGemma and any other model loaded with Unsloth
• Fallback dataset options if main dataset causes issues in WSL2
• Conservative batch processing to prevent memory issues during formatting
"""

from datasets import load_dataset
import torch
import gc

# Clear memory before starting
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("Setting up manual ChatML formatting (bypassing template issues)...")

# Manual ChatML formatting function - more reliable than get_chat_template
def format_chatml_conversation(conversation):
    """
    Manually format ShareGPT conversation to ChatML format
    More reliable than using tokenizer.apply_chat_template
    """
    formatted_text = ""
    
    for turn in conversation:
        # Handle ShareGPT format
        role = turn.get("from", turn.get("role", ""))
        content = turn.get("value", turn.get("content", ""))
        
        # Skip empty content
        if not content.strip():
            continue
            
        # Map ShareGPT roles to ChatML format
        if role == "human":
            chatml_role = "user"
        elif role == "gpt":
            chatml_role = "assistant"
        elif role == "system":
            chatml_role = "system"
        else:
            # Skip unknown roles
            continue
            
        # Add ChatML formatting
        formatted_text += f"<|im_start|>{chatml_role}\n{content.strip()}<|im_end|>\n"
    
    return formatted_text.strip()

def formatting_prompts_func(examples):
    """
    Convert ShareGPT conversations to ChatML format
    Works reliably without template dependencies
    """
    convos = examples["conversations"]
    texts = []
    
    for convo in convos:
        try:
            formatted_text = format_chatml_conversation(convo)
            if formatted_text:  # Only add non-empty conversations
                texts.append(formatted_text)
            else:
                texts.append("")  # Placeholder for empty conversations
        except Exception as e:
            print(f"Error formatting conversation: {e}")
            texts.append("")
    
    return {"text": texts}

# Load and process dataset
print("Loading Guanaco ShareGPT dataset...")

try:
    # Load dataset
    dataset = load_dataset("philschmid/guanaco-sharegpt-style", split="train")
    print(f"✓ Dataset loaded: {len(dataset)} conversations")
    
    # Debug: Show original format
    print(f"✓ Original columns: {dataset.column_names}")
    print("✓ Sample conversation structure:")
    sample_convo = dataset[0]["conversations"]
    print(f"   First turn: {sample_convo[0]}")
    print(f"   Second turn: {sample_convo[1] if len(sample_convo) > 1 else 'N/A'}")
    
    # Apply manual ChatML formatting
    print("Applying manual ChatML formatting...")
    
    dataset = dataset.map(
        formatting_prompts_func,
        batched=True,
        batch_size=1000,
        remove_columns=dataset.column_names,
        desc="Manual ChatML formatting"
    )
    
    print("✓ Manual ChatML formatting complete")
    print(f"✓ Dataset columns after formatting: {dataset.column_names}")
    
    # Filter out empty conversations
    original_size = len(dataset)
    dataset = dataset.filter(lambda x: len(x["text"].strip()) > 20)
    filtered_size = len(dataset)
    
    print(f"✓ Original: {original_size}, After filtering: {filtered_size}")
    print(f"✓ Filtered out {original_size - filtered_size} empty/invalid conversations")
    
    if filtered_size > 0:
        print("\n" + "="*70)
        print("SAMPLE CHATML FORMATTED CONVERSATION:")
        print("="*70)
        print(dataset[0]["text"])
        print("="*70)
        
        print(f"\n✓ SUCCESS: {filtered_size} ChatML conversations ready for training!")
        
        # Show conversation #5 if available
        if len(dataset) > 5:
            print("\n" + "="*70)
            print("CONVERSATION #5:")
            print("="*70)
            print(dataset[5]["text"])
            print("="*70)
        
        # Memory cleanup
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            
    else:
        print("❌ No valid conversations after formatting!")
        
except Exception as e:
    print(f"Error with main dataset: {e}")
    print("Trying simpler alternative...")
    
    try:
        # Fallback: Create a small test dataset manually
        print("Creating test ChatML dataset...")
        
        test_conversations = [
            {
                "conversations": [
                    {"from": "human", "value": "Hello, how are you?"},
                    {"from": "gpt", "value": "I'm doing well, thank you! How can I help you today?"}
                ]
            },
            {
                "conversations": [
                    {"from": "human", "value": "What is Python?"},
                    {"from": "gpt", "value": "Python is a high-level programming language known for its simplicity and readability."}
                ]
            },
            {
                "conversations": [
                    {"from": "human", "value": "Explain machine learning"},
                    {"from": "gpt", "value": "Machine learning is a subset of AI that enables computers to learn and improve from data without being explicitly programmed."}
                ]
            }
        ]
        
        # Convert to dataset format
        from datasets import Dataset
        dataset = Dataset.from_list(test_conversations)
        
        # Apply formatting
        dataset = dataset.map(formatting_prompts_func, batched=True)
        
        print("✓ Test dataset created successfully!")
        print(f"✓ Test conversations: {len(dataset)}")
        
        print("\n" + "="*70)
        print("TEST CHATML CONVERSATION:")
        print("="*70)
        print(dataset[0]["text"])
        print("="*70)
        
        print("\n✓ Manual ChatML formatting is working!")
        print("You can now proceed with training setup.")
        
    except Exception as e2:
        print(f"Even test dataset failed: {e2}")
        print("Check your datasets library installation:")
        print("pip install datasets --upgrade")

print("\n🚀 ChatML formatting complete - bypassed template issues!")

Setting up manual ChatML formatting (bypassing template issues)...
Loading Guanaco ShareGPT dataset...
✓ Dataset loaded: 9033 conversations
✓ Original columns: ['conversations']
✓ Sample conversation structure:
   First turn: {'from': 'human', 'value': 'Escribe un discurso que pueda recitar como padrino de la boda de mi mejor amigo.'}
   Second turn: {'from': 'gpt', 'value': 'Queridos invitados, amigos y familiares,\n\nMe siento muy honrado de estar aquí hoy como padrino de bodas de mi mejor amigo [Nombre del novio].\n\nRecuerdo con cariño los días en los que [Nombre del novio] y yo nos conocimos, cuando éramos solo dos jóvenes llenos de sueños y esperanza. Ahora, aquí estamos, celebrando su boda con la persona que ama, [Nombre de la novia].\n\n[Nombre de la novia], te aseguro que [Nombre del novio] es una persona increíble, llena de amor y lealtad. Juntos, han formado un equipo invencible y estoy seguro de que su amor perdurará por siempre.\n\n[Nombre del novio], mi amigo, te deseo to

In [5]:
print(dataset[5]["text"])

<|im_start|>user
What is the typical wattage of bulb in a lightbox?<|im_end|>
<|im_start|>assistant
The typical wattage of a bulb in a lightbox is 60 watts, although domestic LED bulbs are normally much lower than 60 watts, as they produce the same or greater lumens for less wattage than alternatives. A 60-watt Equivalent LED bulb can be calculated using the 7:1 ratio, which divides 60 watts by 7 to get roughly 9 watts.<|im_end|>
<|im_start|>user
Rewrite your description of the typical wattage of a bulb in a lightbox to only include the key points in a list format.<|im_end|>


If you're looking to make your own chat template, that also is possible! You must use the Jinja templating regime. We provide our own stripped down version of the `Unsloth template` which we find to be more efficient, and leverages ChatML, Zephyr and Alpaca styles.

More info on chat templates on [our wiki page!](https://github.com/unslothai/unsloth/wiki#chat-templates)

<a name="Train"></a>
### Train the model
Now let's train our model. We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

In [6]:
"""
• The error occurs because the 4-bit quantized model needs LoRA adapters attached before fine-tuning
• Must verify that FastLanguageModel.get_peft_model() completed successfully before trainer setup
• Quantized models cannot be fine-tuned directly - they require trainable PEFT adapters on top
• Need to check if model has LoRA adapters properly attached and is in training mode
• This fix ensures PEFT model conversion works correctly before attempting SFT trainer setup
• Includes validation steps to confirm the model is ready for fine-tuning on RTX 2070 Super

Key fixes for RTX 2070 Super:
• Explicit PEFT model validation before trainer setup to catch conversion failures
• Conservative LoRA settings (r=8) to ensure successful adapter attachment on 8GB VRAM
• Memory monitoring throughout PEFT conversion to prevent silent failures
• Fallback options if standard PEFT conversion fails due to memory constraints
• Trainer setup only proceeds after confirming LoRA adapters are properly attached
• Reduced sequence length (512) to accommodate both base model and LoRA adapters
"""

import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
import gc

# Step 1: Verify base model is loaded correctly
print("🔍 Verifying base model setup...")
try:
    print(f"✓ Model type: {type(model)}")
    print(f"✓ Model device: {next(model.parameters()).device}")
    print(f"✓ Model dtype: {next(model.parameters()).dtype}")
    
    # Check if model is quantized
    if hasattr(model, 'config') and hasattr(model.config, 'quantization_config'):
        print("✓ Model is quantized - LoRA adapters required")
    else:
        print("ℹ️ Model quantization status unclear")
        
except Exception as e:
    print(f"❌ Base model verification failed: {e}")
    print("Please ensure the model was loaded successfully first")
    exit(1)

# Step 2: Clear memory and add PEFT adapters
print("\n🔧 Adding LoRA adapters to quantized model...")
torch.cuda.empty_cache() if torch.cuda.is_available() else None
gc.collect()

try:
    # Conservative LoRA settings for RTX 2070 Super
    print("Applying PEFT with conservative settings...")
    
    model = FastLanguageModel.get_peft_model(
        model,
        r=8,  # Reduced from 16 to ensure success on 8GB VRAM
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",      # Attention projections
            "gate_proj", "up_proj", "down_proj",         # MLP layers
        ],
        lora_alpha=4, #8,  Set equal to r
        lora_dropout=0,  # 0 is optimized for memory and speed
        bias="none",     # "none" is most memory efficient
        use_gradient_checkpointing="unsloth",  # Essential for memory savings
        random_state=3407,
        use_rslora=False,   # Disabled to save memory
        loftq_config=None,  # Not needed for pre-quantized models
    )
    
    print("✅ LoRA adapters added successfully!")
    
except Exception as e:
    print(f"❌ PEFT conversion failed: {e}")
    print("\n🔧 Trying minimal LoRA configuration...")
    
    try:
        # Ultra-minimal LoRA for problematic setups
        model = FastLanguageModel.get_peft_model(
            model,
            r=4,  # Very small rank
            target_modules=["q_proj", "v_proj"],  # Only essential modules
            lora_alpha=2,
            lora_dropout=0,
            bias="none",
            use_gradient_checkpointing="unsloth",
            random_state=3407,
        )
        print("✅ Minimal LoRA adapters added successfully!")
        
    except Exception as e2:
        print(f"❌ Even minimal PEFT failed: {e2}")
        print("Your system may need a different approach or smaller model")
        exit(1)

# Step 3: Validate PEFT model setup
print("\n✅ Validating PEFT model...")
try:
    # Check for LoRA adapters
    peft_modules = [name for name, module in model.named_modules() if 'lora' in name.lower()]
    if peft_modules:
        print(f"✓ Found {len(peft_modules)} LoRA modules")
        print(f"✓ Sample LoRA modules: {peft_modules[:3]}")
    else:
        print("❌ No LoRA modules found!")
        exit(1)
    
    # Count trainable parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    
    print(f"✓ Total parameters: {total_params:,}")
    print(f"✓ Trainable parameters: {trainable_params:,}")
    print(f"✓ Trainable percentage: {100 * trainable_params / total_params:.2f}%")
    
    if trainable_params == 0:
        print("❌ No trainable parameters found!")
        exit(1)
    
    # Set model to training mode
    model.train()
    print("✓ Model set to training mode")
    
    # Check memory usage
    if torch.cuda.is_available():
        memory_used = torch.cuda.memory_allocated() / 1024**3
        print(f"✓ GPU memory after PEFT: {memory_used:.2f}GB")
        
        if memory_used > 7.0:
            print("⚠️ WARNING: High memory usage - consider reducing sequence length further")

except Exception as e:
    print(f"❌ PEFT validation failed: {e}")
    exit(1)

# Step 4: Setup SFT Trainer with corrected configuration
print("\n🚀 Setting up SFT Trainer...")
torch.cuda.empty_cache() if torch.cuda.is_available() else None

try:
    # Updated training configuration for your setup
    training_config = SFTConfig(
        # Memory-critical settings for RTX 2070 Super
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        dataloader_num_workers=0,
        
        # Reduced sequence length for your setup
        max_seq_length= max_seq_length,  # Further reduced from 254 to be safe
        
        # Training parameters
        max_steps=20,  # Short test run
        warmup_steps=5,
        learning_rate=2e-4,
        lr_scheduler_type="linear",
        
        # Memory-efficient optimizer
        optim="adamw_8bit",
        weight_decay=0.01,
        max_grad_norm=0.3,
        
        # Data settings
        dataset_text_field="text",
        
        # Memory optimizations
        fp16=True,
        remove_unused_columns=False,
        dataloader_pin_memory=False,
        
        # Logging
        logging_steps=5,
        save_steps=10,
        save_total_limit=2,
        report_to="none",
        
        # Output
        output_dir="./results",
        overwrite_output_dir=True,
        
        # Reproducibility
        seed=3407,
    )
    
    # Create trainer with PEFT model
    trainer = SFTTrainer(
        model=model,  # This should now be the PEFT-enabled model
        tokenizer=tokenizer,
        train_dataset=dataset,
        packing=False,  # Keep disabled for memory safety
        args=training_config,
    )
    
    print("✅ SFT Trainer created successfully!")
    
    # Final validation
    print(f"✓ Trainer model type: {type(trainer.model)}")
    print(f"✓ Training dataset size: {len(dataset)}")
    print(f"✓ Sequence length: {training_config.max_seq_length}")
    print(f"✓ Effective batch size: {training_config.per_device_train_batch_size * training_config.gradient_accumulation_steps}")
    
    # Memory check
    if torch.cuda.is_available():
        final_memory = torch.cuda.memory_allocated() / 1024**3
        print(f"✓ Final GPU memory: {final_memory:.2f}GB")
        
        if final_memory < 7.5:
            print("✅ Memory usage looks good for training!")
        else:
            print("⚠️ High memory usage - monitor during training")
    
    print("\n🎯 Ready for training!")
    print("Run: trainer_stats = trainer.train()")

except Exception as e:
    print(f"❌ Trainer setup failed: {e}")
    print("\n🔧 Additional troubleshooting:")
    print("1. Restart kernel and reload model with lower max_seq_length")
    print("2. Try even smaller LoRA rank (r=2)")
    print("3. Use tinyllama model instead of codegemma")
    print("4. Ensure sufficient GPU memory available")

print("\n" + "="*60)
print("SETUP COMPLETE - STARTING TRAINING")
print("="*60)

# Step 5: Execute Training
import time
import psutil
import os

# Pre-training system check
def check_system_resources():
    """Monitor system resources during training"""
    if torch.cuda.is_available():
        gpu_memory = torch.cuda.memory_allocated() / 1024**3
        gpu_cached = torch.cuda.memory_reserved() / 1024**3
        gpu_total = torch.cuda.get_device_properties(0).total_memory / 1024**3
        print(f"GPU Memory - Used: {gpu_memory:.2f}GB, Cached: {gpu_cached:.2f}GB, Total: {gpu_total:.1f}GB")
    
    ram_usage = psutil.virtual_memory()
    print(f"System RAM - Used: {ram_usage.used / 1024**3:.1f}GB, Available: {ram_usage.available / 1024**3:.1f}GB")

print("🚀 Starting fine-tuning training...")
print("Pre-training system status:")
check_system_resources()
print("="*60)

# Clear memory before training
torch.cuda.empty_cache() if torch.cuda.is_available() else None
gc.collect()

try:
    print("▶️  Executing: trainer.train()")
    print("   Training for 20 steps with LoRA adapters...")
    print("   Expected duration: 2-5 minutes on RTX 2070 Super")
    print("   Monitoring for OOM errors...")
    
    # Record start time
    start_time = time.time()
    
    # Execute the actual training
    trainer_stats = trainer.train()
    
    # Record end time
    end_time = time.time()
    training_duration = end_time - start_time
    
    print("="*60)
    print("✅ TRAINING COMPLETED SUCCESSFULLY!")
    print("="*60)
    
    # Display comprehensive training results
    print(f"⏱️  Training Duration: {training_duration:.1f} seconds ({training_duration/60:.1f} minutes)")
    print(f"📊 Training Statistics:")
    print(f"   - Total steps completed: {trainer_stats.global_step}")
    print(f"   - Final training loss: {trainer_stats.training_loss:.4f}")
    print(f"   - Steps per second: {trainer_stats.global_step / training_duration:.2f}")
    
    # Show detailed training progress
    if hasattr(trainer_stats, 'log_history') and trainer_stats.log_history:
        print(f"\n📈 Training Progress (Last 5 Steps):")
        for i, log_entry in enumerate(trainer_stats.log_history[-5:]):
            step = log_entry.get('step', i)
            loss = log_entry.get('train_loss', 'N/A')
            lr = log_entry.get('learning_rate', 'N/A')
            print(f"   Step {step}: Loss={loss}, LR={lr}")
    
    # Check training effectiveness
    if hasattr(trainer_stats, 'log_history') and len(trainer_stats.log_history) > 1:
        first_loss = trainer_stats.log_history[0].get('train_loss', 0)
        last_loss = trainer_stats.log_history[-1].get('train_loss', 0)
        if first_loss > last_loss:
            improvement = ((first_loss - last_loss) / first_loss) * 100
            print(f"✅ Loss improved by {improvement:.1f}% (from {first_loss:.4f} to {last_loss:.4f})")
        else:
            print("⚠️ Loss did not decrease - may need longer training or different parameters")
    
    # Post-training system status
    print(f"\n🖥️  Post-training system status:")
    check_system_resources()
    
    # Check saved checkpoints
    if os.path.exists("./results"):
        checkpoints = [f for f in os.listdir("./results") if f.startswith("checkpoint")]
        print(f"💾 Checkpoints saved: {len(checkpoints)} in ./results/")
        if checkpoints:
            latest_checkpoint = max(checkpoints, key=lambda x: int(x.split('-')[-1]) if x.split('-')[-1].isdigit() else 0)
            print(f"   Latest checkpoint: {latest_checkpoint}")
    
    # Display LoRA adapter statistics
    lora_params = sum(p.numel() for name, p in model.named_parameters() if 'lora' in name.lower() and p.requires_grad)
    print(f"📊 LoRA Statistics:")
    print(f"   - LoRA parameters trained: {lora_params:,}")
    print(f"   - LoRA rank: {8}")  # Based on our configuration
    print(f"   - Target modules: {len(['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'])}")
    
    print("\n🎯 Training Results Summary:")
    print("="*60)
    print("✅ Fine-tuning completed successfully without OOM errors")
    print("✅ LoRA adapters updated with new knowledge from ChatML dataset") 
    print("✅ Model ready for inference testing or extended training")
    print("✅ Training checkpoints saved for recovery and deployment")
    print("="*60)
    
    print("\n📋 Next Steps:")
    print("1. Test the fine-tuned model with sample prompts")
    print("2. Save the LoRA adapters: model.save_pretrained('./fine_tuned_lora')")
    print("3. For production training, increase max_steps to 500-2000+")
    print("4. Consider enabling packing=True for faster training if memory allows")
    print("5. Evaluate model performance on validation data")
    
    # Save training statistics for later analysis
    print(f"\n💾 Training stats available in 'trainer_stats' variable:")
    print(f"   - Global step: {trainer_stats.global_step}")
    print(f"   - Training loss: {trainer_stats.training_loss:.4f}")
    print(f"   - Log history: {len(trainer_stats.log_history) if hasattr(trainer_stats, 'log_history') else 0} entries")

except RuntimeError as e:
    if "out of memory" in str(e).lower():
        print("❌ OUT OF MEMORY ERROR!")
        print("="*60)
        print("🔧 Solutions for RTX 2070 Super:")
        print("1. Further reduce max_seq_length from 512 to 256")
        print("2. Reduce LoRA rank from r=8 to r=4")
        print("3. Use only attention modules: target_modules=['q_proj', 'v_proj']")
        print("4. Reduce gradient_accumulation_steps from 4 to 2")
        print("5. Try gradient_checkpointing='unsloth' (already enabled)")
        
        # Clear memory for recovery
        torch.cuda.empty_cache() if torch.cuda.is_available() else None
        gc.collect()
        print("🧹 GPU memory cleared for recovery")
        
        print(f"\nCurrent memory after cleanup: {torch.cuda.memory_allocated() / 1024**3:.2f}GB")
        
    else:
        print(f"❌ Training failed with RuntimeError: {e}")
        print("Check the error details above for specific issues")

except KeyboardInterrupt:
    print("\n⏹️  Training interrupted by user (Ctrl+C)")
    print("💾 Partial training progress may have been saved to ./results/")
    if os.path.exists("./results"):
        checkpoints = [f for f in os.listdir("./results") if f.startswith("checkpoint")]
        if checkpoints:
            latest = max(checkpoints, key=lambda x: int(x.split('-')[-1]) if x.split('-')[-1].isdigit() else 0)
            print(f"   Latest partial checkpoint: {latest}")

except Exception as e:
    print(f"❌ Unexpected error during training: {e}")
    print("Full error details:")
    import traceback
    traceback.print_exc()
    
    print("\n🔧 General troubleshooting:")
    print("1. Check model, tokenizer, and dataset are properly loaded")
    print("2. Verify PEFT conversion completed successfully")
    print("3. Ensure sufficient disk space in ./results/ directory")
    print("4. Check for any data corruption in the dataset")

finally:
    # Always show final system status and cleanup
    print(f"\n🖥️  Final system status:")
    check_system_resources()
    
    # Final cleanup
    torch.cuda.empty_cache() if torch.cuda.is_available() else None
    gc.collect()
    print("🧹 Final memory cleanup completed")

print("\n" + "="*60)
print("TRAINING EXECUTION COMPLETE")
print("="*60)

# Display final model state
print("\n🔍 Final Model State:")
try:
    print(f"✓ Model device: {next(model.parameters()).device}")
    print(f"✓ Model in training mode: {model.training}")
    print(f"✓ Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
    
    if 'trainer_stats' in locals():
        print(f"✓ Training completed: {trainer_stats.global_step} steps")
        print(f"✓ Final loss: {trainer_stats.training_loss:.4f}")
    else:
        print("⚠️ Training did not complete successfully")
        
except Exception as e:
    print(f"Could not verify final model state: {e}")

print("\n💡 FOR PRODUCTION USE:")
print("- Increase max_steps to 500-2000 for better results")
print("- Use larger datasets for more comprehensive training")
print("- Enable packing=True if your memory allows (5x faster)")
print("- Monitor training loss and adjust learning rate if needed")

🔍 Verifying base model setup...
✓ Model type: <class 'peft.peft_model.PeftModelForCausalLM'>
✓ Model device: cuda:0
✓ Model dtype: torch.float16
✓ Model is quantized - LoRA adapters required

🔧 Adding LoRA adapters to quantized model...
Applying PEFT with conservative settings...
❌ PEFT conversion failed: Unsloth: Your model already has LoRA adapters. Your new parameters are different.

🔧 Trying minimal LoRA configuration...
❌ Even minimal PEFT failed: Unsloth: Your model already has LoRA adapters. Your new parameters are different.
Your system may need a different approach or smaller model

✅ Validating PEFT model...
✓ Found 1764 LoRA modules
✓ Sample LoRA modules: ['base_model.model.model.layers.0.self_attn.q_proj.lora_dropout', 'base_model.model.model.layers.0.self_attn.q_proj.lora_dropout.default', 'base_model.model.model.layers.0.self_attn.q_proj.lora_A']
✓ Total parameters: 4,712,147,968
✓ Trainable parameters: 50,003,968
✓ Trainable percentage: 1.06%
✓ Model set to training mode

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 9,033 | Num Epochs = 1 | Total steps = 20
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 50,003,968 of 8,587,684,864 (0.58% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,entropy
5,1.7171,0
10,1.1421,No Log
15,1.3394,No Log
20,1.4917,No Log


✅ TRAINING COMPLETED SUCCESSFULLY!
⏱️  Training Duration: 91.6 seconds (1.5 minutes)
📊 Training Statistics:
   - Total steps completed: 20
   - Final training loss: 1.4226
   - Steps per second: 0.22

🖥️  Post-training system status:
GPU Memory - Used: 5.65GB, Cached: 6.71GB, Total: 8.0GB
System RAM - Used: 2.7GB, Available: 20.2GB
💾 Checkpoints saved: 2 in ./results/
   Latest checkpoint: checkpoint-20
📊 LoRA Statistics:
   - LoRA parameters trained: 50,003,968
   - LoRA rank: 8
   - Target modules: 7

🎯 Training Results Summary:
✅ Fine-tuning completed successfully without OOM errors
✅ LoRA adapters updated with new knowledge from ChatML dataset
✅ Model ready for inference testing or extended training
✅ Training checkpoints saved for recovery and deployment

📋 Next Steps:
1. Test the fine-tuned model with sample prompts
2. Save the LoRA adapters: model.save_pretrained('./fine_tuned_lora')
3. For production training, increase max_steps to 500-2000+
4. Consider enabling packing=True fo

<a name="Inference"></a>
### Inference
Let's run the model! Since we're using `ChatML`, use `apply_chat_template` with `add_generation_prompt` set to `True` for inference.

In [7]:
# from unsloth.chat_templates import get_chat_template

# tokenizer = get_chat_template(
#     tokenizer,
#     chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
#     mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
#     map_eos_token = True, # Maps <|im_end|> to </s> instead
# )

# FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# messages = [
#     {"from": "human", "value": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
# ]
# inputs = tokenizer.apply_chat_template(
#     messages,
#     tokenize = True,
#     add_generation_prompt = True, # Must add for generation
#     return_tensors = "pt",
# ).to("cuda")

# outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True)
# tokenizer.batch_decode(outputs)

In [8]:
import torch
import gc
from transformers import TextStreamer

# Clear GPU memory before inference
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f}GB")

# Manual ChatML formatting function - bypasses template issues
def format_chatml_manual(messages):
    """
    Manually format messages to ChatML format without using tokenizer.apply_chat_template
    Avoids protobuf and template compatibility issues
    """
    formatted_text = ""
    
    for message in messages:
        # Handle ShareGPT format
        role = message.get("from", message.get("role", ""))
        content = message.get("value", message.get("content", ""))
        
        # Map ShareGPT roles to ChatML format
        if role == "human":
            chatml_role = "user"
        elif role == "gpt":
            chatml_role = "assistant"
        elif role == "system":
            chatml_role = "system"
        else:
            chatml_role = "user"  # Default fallback
            
        # Add ChatML formatting
        formatted_text += f"<|im_start|>{chatml_role}\n{content.strip()}<|im_end|>\n"
    
    # Add generation prompt for assistant response
    formatted_text += "<|im_start|>assistant\n"
    
    return formatted_text

# Enable inference mode
model = FastLanguageModel.for_inference(model)
model.eval()

# Test messages
messages = [
    {"from": "human", "value": "Continue the fibonacci sequence: 1, 1, 2, 3, 5, 8,"},
]

print("\n" + "="*60)
print("MANUAL CHATML INFERENCE")
print("="*60)

try:
    # Format manually without using tokenizer.apply_chat_template
    formatted_prompt = format_chatml_manual(messages)
    print("Formatted ChatML prompt:")
    print(formatted_prompt)
    print("-"*40)
    
    # Tokenize manually
    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=2048  # Adjust based on your model's context length
    )
    
    # Move to GPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)
    
    print(f"Input shape: {input_ids.shape}")
    print(f"Using device: {device}")
    
    # Setup TextStreamer for real-time output
    text_streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    
    print("\nGenerating response (streaming):")
    print("-"*40)
    
    # Generate with streaming
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            streamer=text_streamer,
            max_new_tokens=128,
            use_cache=True,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id else tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    
    print("\n" + "-"*40)
    print("Generation complete!")
    
    # Also get the full decoded output
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    print("\nFull conversation:")
    print("="*60)
    print(full_response)
    print("="*60)
    
    # Clear GPU memory after generation
    del outputs, input_ids, attention_mask
    torch.cuda.empty_cache()
    
except Exception as e:
    print(f"Manual inference error: {e}")
    import traceback
    traceback.print_exc()

# Memory usage check
if torch.cuda.is_available():
    print(f"\nGPU Memory Used: {torch.cuda.memory_allocated(0) / 1024**3:.2f}GB")
    print(f"GPU Memory Cached: {torch.cuda.memory_reserved(0) / 1024**3:.2f}GB")

print("\n🚀 Manual ChatML inference complete - bypassed template issues!")

# Optional: Test with different conversation
print("\n" + "="*60)
print("TESTING MULTI-TURN CONVERSATION")
print("="*60)

try:
    multi_turn_messages = [
        {"from": "human", "value": "What is machine learning?"},
        {"from": "gpt", "value": "Machine learning is a subset of AI that enables computers to learn from data."},
        {"from": "human", "value": "Can you give me a simple example?"},
    ]
    
    formatted_multi = format_chatml_manual(multi_turn_messages)
    print("Multi-turn ChatML prompt:")
    print(formatted_multi)
    
    inputs_multi = tokenizer(
        formatted_multi,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=2048
    ).to(device)
    
    print("\nMulti-turn response (streaming):")
    print("-"*40)
    
    with torch.no_grad():
        outputs_multi = model.generate(
            **inputs_multi,
            streamer=text_streamer,
            max_new_tokens=64,
            use_cache=True,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id else tokenizer.pad_token_id,
        )
    
    print("\n" + "-"*40)
    print("Multi-turn generation complete!")
    
    # Cleanup
    del outputs_multi, inputs_multi
    torch.cuda.empty_cache()
    
except Exception as e:
    print(f"Multi-turn inference error: {e}")

print("\n✅ All manual ChatML inference tests complete!")

CUDA available: True
GPU: NVIDIA GeForce RTX 2070 SUPER
GPU Memory: 8.0GB

MANUAL CHATML INFERENCE
Formatted ChatML prompt:
<|im_start|>user
Continue the fibonacci sequence: 1, 1, 2, 3, 5, 8,<|im_end|>
<|im_start|>assistant

----------------------------------------
Input shape: torch.Size([1, 46])
Using device: cuda

Generating response (streaming):
----------------------------------------
Sure, here is the next number in the Fibonacci sequence: 13. 

The Fibonacci sequence is a series of numbers in which each number is the sum of the two preceding ones. It starts with 0 and 1, and the next number is the sum of the previous two. So, to find the next number in the sequence, we add the last two numbers together.

The sequence continues as follows:

1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144

----------------------------------------
Generation complete!

Full conversation:
<|im_start|>user
Continue the fibonacci sequence: 1, 1, 2, 3, 5, 8,<|im_end|>
<|im_start|>assistant
Sure, here is the 

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [9]:
# FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# messages = [
#     {"from": "human", "value": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
# ]
# inputs = tokenizer.apply_chat_template(
#     messages,
#     tokenize = True,
#     add_generation_prompt = True, # Must add for generation
#     return_tensors = "pt",
# ).to("cuda")

# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128, use_cache = True)

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [10]:
'''
Model Saving Section Explanation:

Saves only the LoRA adapter weights locally to "lora_model" directory (not full model)
Optional push_to_hub uploads LoRA adapters to Hugging Face Hub with authentication token
LoRA adapters are small files (few MB) containing just the fine-tuned parameters
Full base model stays separate - adapters get merged with base model during loading
Local save for backup/reuse, Hub save for sharing/deployment across machines
'''
import os
import torch
import gc
from pathlib import Path

print("="*60)
print("LORA MODEL SAVING")
print("="*60)

# Clear memory before saving
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Check current directory and create save path
current_dir = Path.cwd()
save_dir = current_dir / "lora_model"

print(f"Current directory: {current_dir}")
print(f"Save directory: {save_dir}")

try:
    # LOCAL SAVING - Save LoRA adapters only
    print("\n1. LOCAL SAVING:")
    print("-" * 40)
    
    # Create directory if it doesn't exist
    save_dir.mkdir(exist_ok=True)
    
    # Save LoRA adapters (this is fast - only saves the adapter weights)
    print("Saving LoRA adapters locally...")
    model.save_pretrained(str(save_dir))
    
    # Check what was saved
    saved_files = list(save_dir.glob("*"))
    print(f"✓ LoRA adapters saved to: {save_dir}")
    print("✓ Files saved:")
    for file in saved_files:
        file_size = file.stat().st_size / (1024*1024)  # Size in MB
        print(f"   - {file.name} ({file_size:.1f} MB)")
    
    total_size = sum(f.stat().st_size for f in saved_files) / (1024*1024)
    print(f"✓ Total size: {total_size:.1f} MB")
    print("✓ Local save complete!")
    
except Exception as e:
    print(f"❌ Local saving failed: {e}")
    import traceback
    traceback.print_exc()

# ONLINE SAVING (Hugging Face Hub) - Commented out by default
print("\n2. ONLINE SAVING (Hugging Face Hub):")
print("-" * 40)
print("⚠️  Online saving is commented out - requires:")
print("   - Hugging Face account and token")
print("   - Internet connection")
print("   - Unique model name")

# Uncomment and configure the following for online saving:
"""
try:
    # You need to:
    # 1. Get your token from https://huggingface.co/settings/tokens
    # 2. Replace "your_name" with your HF username
    # 3. Replace "lora_model" with your desired model name
    
    hf_token = "hf_xxxxxxxxxxxxxxxxxxxxxxxxxx"  # Your actual token here
    hf_username = "your_username"  # Your HF username
    model_name = "codegemma-7b-conversational-lora"  # Your model name
    
    full_model_name = f"{hf_username}/{model_name}"
    
    print(f"Uploading to: {full_model_name}")
    
    model.push_to_hub(
        full_model_name, 
        token=hf_token,
        private=False,  # Set to True for private models
        safe_serialization=True
    )
    
    print(f"✓ LoRA adapters uploaded to: https://huggingface.co/{full_model_name}")
    
except Exception as e:
    print(f"❌ Online saving failed: {e}")
"""

print("📝 To enable online saving:")
print("   1. Get HF token: https://huggingface.co/settings/tokens")
print("   2. Uncomment the online saving code above")
print("   3. Replace placeholders with your actual values")

# LOADING SAVED MODEL EXAMPLE
print("\n3. HOW TO LOAD SAVED LORA MODEL:")
print("-" * 40)
print("To load your saved LoRA model later:")
print("""
from unsloth import FastLanguageModel

# Load base model + your LoRA adapters
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/codegemma-7b-bnb-4bit",  # Base model
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# Load your LoRA adapters
model = PeftModel.from_pretrained(model, "lora_model")  # Local path
# OR from Hub: model = PeftModel.from_pretrained(model, "your_name/lora_model")

# Enable inference
model = FastLanguageModel.for_inference(model)
""")

# Memory cleanup
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("\n" + "="*60)
print("LoRA SAVING SUMMARY:")
print("="*60)
print("✓ LoRA adapters contain only the fine-tuned parameters")
print("✓ Small file size (typically 10-100MB vs 13GB+ for full model)")
print("✓ Can be shared/deployed easily")
print("✓ Always need base model + adapters for inference")
print("✓ Local saving is immediate, Hub upload requires internet")

if torch.cuda.is_available():
    print(f"\nGPU Memory: {torch.cuda.memory_allocated(0) / 1024**3:.2f}GB used")

print("\n🚀 LoRA model saving setup complete!")

LORA MODEL SAVING
Current directory: /mnt/c/models/llm/test
Save directory: /mnt/c/models/llm/test/lora_model

1. LOCAL SAVING:
----------------------------------------
Saving LoRA adapters locally...
✓ LoRA adapters saved to: /mnt/c/models/llm/test/lora_model
✓ Files saved:
   - adapter_config.json (0.0 MB)
   - adapter_model.safetensors (190.8 MB)
   - README.md (0.0 MB)
✓ Total size: 190.8 MB
✓ Local save complete!

2. ONLINE SAVING (Hugging Face Hub):
----------------------------------------
⚠️  Online saving is commented out - requires:
   - Hugging Face account and token
   - Internet connection
   - Unique model name
📝 To enable online saving:
   1. Get HF token: https://huggingface.co/settings/tokens
   2. Uncomment the online saving code above
   3. Replace placeholders with your actual values

3. HOW TO LOAD SAVED LORA MODEL:
----------------------------------------
To load your saved LoRA model later:

from unsloth import FastLanguageModel

# Load base model + your LoRA adap

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [11]:
# if False:
#     from unsloth import FastLanguageModel
#     model, tokenizer = FastLanguageModel.from_pretrained(
#         model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
#         max_seq_length = max_seq_length,
#         dtype = dtype,
#         load_in_4bit = load_in_4bit,
#     )
#     FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# messages = [
#     {"from": "human", "value": "What is a famous tall tower in Paris?"},
# ]
# inputs = tokenizer.apply_chat_template(
#     messages,
#     tokenize = True,
#     add_generation_prompt = True, # Must add for generation
#     return_tensors = "pt",
# ).to("cuda")

# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128, use_cache = True)

In [1]:
import torch
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
from pathlib import Path
import os
import psutil

print("="*60)
print("MEMORY-OPTIMIZED LORA LOADING (RTX 2070 SUPER 8GB)")
print("="*60)

# Disable Triton and optimize memory
os.environ["DISABLE_TRITON"] = "1"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

def clear_memory():
    """Aggressive memory cleanup"""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

def print_memory_usage():
    """Print current memory usage"""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated(0) / 1024**3
        reserved = torch.cuda.memory_reserved(0) / 1024**3
        total = torch.cuda.get_device_properties(0).total_memory / 1024**3
        print(f"GPU Memory - Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB, Total: {total:.1f}GB")
    
    ram_used = psutil.virtual_memory().used / 1024**3
    ram_total = psutil.virtual_memory().total / 1024**3
    print(f"RAM Memory - Used: {ram_used:.2f}GB, Total: {ram_total:.1f}GB")

# Initial memory check
clear_memory()
print_memory_usage()

lora_path = Path("lora_model")
if not lora_path.exists():
    print(f"❌ LoRA model not found")
    exit()

print("\n💾 MEMORY-EFFICIENT MODEL LOADING:")
print("-" * 50)

try:
    # Get base model name
    peft_config = PeftConfig.from_pretrained("lora_model")
    base_model_name = peft_config.base_model_name_or_path
    print(f"Base model: {base_model_name}")
    
    # 8-bit quantization config for memory efficiency
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True,
        llm_int8_enable_fp32_cpu_offload=True,  # Offload to CPU when needed
        llm_int8_threshold=6.0,
    )
    
    print("Loading base model with 8-bit quantization...")
    
    # Load base model with aggressive memory optimization
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        quantization_config=bnb_config,
        device_map="auto",  # Let it handle CPU/GPU split
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        max_memory={0: "6GB", "cpu": "40GB"},  # Reserve 2GB GPU buffer
    )
    
    print_memory_usage()
    
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_name,
        trust_remote_code=True,
        model_max_length=256  # Very short sequences
    )
    
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    clear_memory()
    print("Loading LoRA adapters...")
    
    # Load LoRA with memory optimization
    model = PeftModel.from_pretrained(
        base_model,
        "lora_model",
        torch_dtype=torch.float16,
        device_map="auto"
    )
    
    model.eval()
    print("✓ Model loaded with 8-bit + LoRA!")
    print_memory_usage()
    
    # Ultra-lightweight generation function
    def memory_efficient_generate(model, tokenizer, prompt_text, max_tokens=10):
        """
        Memory-efficient generation with automatic cleanup
        """
        clear_memory()
        
        print(f"Generating with max {max_tokens} tokens...")
        
        # Tokenize with minimal length
        inputs = tokenizer(
            prompt_text,
            return_tensors="pt",
            truncation=True,
            max_length=128,  # Very short input
            add_special_tokens=False
        )
        
        # Move to same device as model
        device = next(model.parameters()).device
        input_ids = inputs["input_ids"].to(device)
        
        print(f"Input length: {input_ids.shape[1]} tokens")
        print(f"Device: {device}")
        print_memory_usage()
        
        generated_tokens = []
        current_input = input_ids.clone()
        
        print("\nGenerating:")
        print("-" * 20)
        
        try:
            for step in range(max_tokens):
                # Clear memory before each step
                if step % 3 == 0:  # Every 3 steps
                    clear_memory()
                
                with torch.no_grad():
                    # Forward pass with gradient checkpointing disabled
                    outputs = model(current_input, use_cache=False)
                    logits = outputs.logits
                
                # Get next token
                next_token_logits = logits[0, -1, :]
                next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(0).unsqueeze(0)
                
                # Decode and print
                token_text = tokenizer.decode(next_token[0], skip_special_tokens=True)
                print(token_text, end='', flush=True)
                
                generated_tokens.append(next_token.item())
                
                # Stop conditions
                if next_token.item() == tokenizer.eos_token_id:
                    break
                if len(token_text.strip()) == 0:  # Skip empty tokens
                    continue
                
                # Add token and limit context
                current_input = torch.cat([current_input, next_token], dim=1)
                
                # Keep only last 64 tokens to save memory
                if current_input.shape[1] > 64:
                    current_input = current_input[:, -32:]
                
                # Memory check
                if torch.cuda.is_available():
                    allocated = torch.cuda.memory_allocated(0) / 1024**3
                    if allocated > 7.5:  # Near 8GB limit
                        print(f"\n⚠️ Memory limit reached: {allocated:.2f}GB")
                        break
                
                # Cleanup intermediate tensors
                del outputs, logits, next_token_logits, next_token
                
        except torch.cuda.OutOfMemoryError:
            print(f"\n❌ OOM at step {step}")
            clear_memory()
        except Exception as e:
            print(f"\n❌ Error at step {step}: {e}")
        
        print(f"\n\nGenerated {len(generated_tokens)} tokens")
        clear_memory()
        return generated_tokens
    
    # Test with minimal prompt
    print("\n💾 MEMORY-EFFICIENT INFERENCE TEST:")
    print("-" * 45)
    
    simple_prompt = "Hello"
    print(f"Test prompt: '{simple_prompt}'")
    
    generated = memory_efficient_generate(
        model, 
        tokenizer, 
        simple_prompt, 
        max_tokens=8  # Very small
    )
    
    print(f"✓ Generated tokens: {generated}")
    print_memory_usage()
    
    # Test ChatML if first test works
    if len(generated) > 0:
        print("\n💾 CHATML TEST:")
        print("-" * 25)
        
        chatml_prompt = "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n"
        print(f"ChatML prompt: {chatml_prompt}")
        
        generated2 = memory_efficient_generate(
            model,
            tokenizer,
            chatml_prompt,
            max_tokens=12
        )
        
        print(f"✓ ChatML tokens: {generated2}")
    
    print("\n" + "="*60)
    print("💾 MEMORY-OPTIMIZED SUCCESS!")
    print("="*60)
    print("✓ Used 8-bit quantization")
    print("✓ CPU/GPU memory offloading")
    print("✓ Short context windows")
    print("✓ Aggressive memory cleanup")
    print("✓ OOM protection")
    
    final_memory_usage = torch.cuda.memory_allocated(0) / 1024**3 if torch.cuda.is_available() else 0
    print(f"✓ Final GPU usage: {final_memory_usage:.2f}GB / 8GB")
    
except torch.cuda.OutOfMemoryError as e:
    print(f"❌ Still OOM: {e}")
    print("\n💡 SOLUTIONS FOR RTX 2070 SUPER:")
    print("-" * 35)
    print("1. Use CPU-only inference:")
    print("   device_map='cpu'")
    print("2. Try 4-bit quantization instead of 8-bit")
    print("3. Use smaller model (CodeGemma-2B)")
    print("4. Reduce max_length to 64 tokens")
    print("5. Use model sharding with accelerate")
    
except Exception as e:
    print(f"❌ Loading failed: {e}")
    import traceback
    traceback.print_exc()

finally:
    clear_memory()
    print_memory_usage()

print("\n💾 Memory-optimized loading complete!")

You can also use Hugging Face's `AutoModelForPeftCausalLM`. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [13]:
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoModelForPeftCausalLM
    from transformers import AutoTokenizer

    model = AutoModelForPeftCausalLM.from_pretrained(
        "lora_model",  # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit = load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [14]:
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False:
    model.save_pretrained("model")
    tokenizer.save_pretrained("model")
if False:
    model.push_to_hub("hf/model", token = "")
    tokenizer.push_to_hub("hf/model", token = "")


### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

In [15]:
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")