# SmolVLA Fine-tuning for SO101 Arm

This notebook fine-tunes SmolVLA on your SO101 dataset using Google Colab with an A100 GPU.

## Setup and Installation

## Check GPU Availability


In [None]:
# Check GPU availability.
# Reports whether PyTorch can see a CUDA device and, if so, which one.
import torch

print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Device index 0 is the first visible CUDA device.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
else:
    # The emoji was previously stored as mis-decoded bytes; it is restored here.
    print("⚠️ Warning: No GPU detected. Training will be very slow!")


## Install LeRobot and Dependencies


In [None]:
# Clone LeRobot repository
!git clone https://github.com/huggingface/lerobot.git
%cd lerobot

# Install LeRobot with SmolVLA dependencies
%pip install -e ".[smolvla]"

# Verify installation
!python -c "import lerobot; print('LeRobot installed successfully!')"

# ‚ö†Ô∏è IMPORTANT: If you see a warning about restarting the runtime, 
# you MUST restart the runtime (Runtime -> Restart runtime) before running the training cell.
# This is required for PyTorch and other dependencies to load correctly.


In [None]:
# Training configuration
# Adjust these parameters based on your needs and GPU memory.
# CONFIG is read by the training cell to build the command line.

CONFIG = {
    "policy_path": "lerobot/smolvla_base",  # Pretrained model from HuggingFace
    "dataset_repo_id": "HenryZhang/Group11_data_1763075740.884942",  # Your SO101 dataset
    "batch_size": 32,  # Start with 32 for A100, reduce if OOM errors occur
    "steps": 20000,  # Training steps (10% of full training budget as per blog)
}

# Echo the settings so the values used for a run are visible in the notebook output.
print("Training Configuration:")
print("=" * 50)
for key, value in CONFIG.items():
    print(f"  {key}: {value}")
print("=" * 50)
# The emoji was previously stored as mis-decoded bytes; it is restored here.
print("\n💡 Tip: If you encounter out-of-memory (OOM) errors, reduce batch_size to 16 or 8")


## Debug: Check File Structure

Run this cell to verify the directory structure and locate `train.py`:


In [None]:
# Debug script to find train.py and check directory structure.
# Purely diagnostic: it only prints what it finds under /content/lerobot and
# whether the lerobot package / training CLI are reachable from this kernel.
import os
import shutil
from pathlib import Path

print("=" * 60)
print("DIRECTORY STRUCTURE DEBUG")
print("=" * 60)

# Check current directory
print(f"\n1. Current working directory: {os.getcwd()}")

# Check if lerobot directory exists
lerobot_path = Path('/content/lerobot')
print(f"\n2. Does /content/lerobot exist? {lerobot_path.exists()}")

if lerobot_path.exists():
    # Search for train.py recursively
    print("\n3. Searching for train.py files:")
    train_py_paths = list(lerobot_path.rglob('train.py'))
    if train_py_paths:
        print(f"   Found {len(train_py_paths)} train.py file(s):")
        for path in train_py_paths:
            rel_path = path.relative_to(lerobot_path)
            print(f"   ✓ {path}")
            print(f"     Relative path: {rel_path}")
    else:
        print("   ✗ train.py not found!")

    # Check for scripts directory in various locations
    print("\n4. Checking for scripts directories:")
    scripts_dirs = [
        lerobot_path / 'scripts',
        lerobot_path / 'lerobot' / 'scripts',
        lerobot_path / 'src' / 'lerobot' / 'scripts',
    ]
    found_scripts_dir = False
    for scripts_dir in scripts_dirs:
        if scripts_dir.exists():
            found_scripts_dir = True
            print(f"   ✓ Found: {scripts_dir}")
            try:
                script_files = list(scripts_dir.glob('*.py'))
                if script_files:
                    print(f"     Contains {len(script_files)} Python files:")
                    # Cap the listing at 10 entries to keep the output short.
                    for script in sorted(script_files)[:10]:
                        print(f"       - {script.name}")
            except Exception as e:
                print(f"     Error listing: {e}")
    if not found_scripts_dir:
        # Previously this section printed nothing at all in this case.
        print("   ✗ No scripts directory found in the expected locations")

    # Check src/lerobot structure
    src_lerobot = lerobot_path / 'src' / 'lerobot'
    print("\n5. Checking src/lerobot structure:")
    if src_lerobot.exists():
        print("   ✓ src/lerobot exists")
        try:
            subdirs = [d for d in src_lerobot.iterdir() if d.is_dir()]
            print(f"   Subdirectories: {[d.name for d in subdirs]}")
            # Check for scripts or cli
            for subdir in subdirs:
                if 'script' in subdir.name.lower() or 'cli' in subdir.name.lower() or 'cmd' in subdir.name.lower():
                    print(f"   ⚠️  Found potential scripts directory: {subdir}")
        except Exception as e:
            print(f"   Error: {e}")

    # Try to find how to run training (check for CLI or entry points)
    print("\n6. Checking for CLI/entry points:")
    try:
        import lerobot
        print("   ✓ lerobot module is importable")
    except Exception as e:
        print(f"   Error importing lerobot: {e}")
        print("   (restart the runtime if you installed it in this session)")
    # Look for the console script on PATH. The earlier hasattr(lerobot, '__main__')
    # test could not detect a CLI: a package only has that attribute after the
    # submodule has been imported.
    train_cli = shutil.which('lerobot-train')
    if train_cli:
        print(f"   ✓ Found lerobot-train CLI: {train_cli}")
    else:
        print("   ✗ lerobot-train CLI not found on PATH")

print("\n" + "=" * 60)


## Fine-tune SmolVLA

This will fine-tune the pretrained SmolVLA model on your SO101 dataset.

**Note:** Run the debug cell above first to verify the file structure!

In [None]:
# Fine-tune the pretrained model
# Note: Adjust batch_size based on your GPU memory (A100 can handle 32-64)
# If you get OOM errors, reduce batch_size to 16 or 8

# Ensure we're in the lerobot directory
import os
if not os.getcwd().endswith('lerobot'):
    os.chdir('/content/lerobot')

# Try different ways to run training based on LeRobot structure
# Method 1: Try using lerobot CLI (if available)
# Method 2: Try using python -m lerobot.scripts.train
# Method 3: Try direct path to script

print("Attempting to run training...")
print(f"Current directory: {os.getcwd()}")

# First, try to find the actual training script
from pathlib import Path
lerobot_path = Path('/content/lerobot')

# Check if there's a lerobot CLI command
import subprocess
result = subprocess.run(['lerobot', '--help'], capture_output=True, text=True, cwd='/content/lerobot')
if result.returncode == 0:
    print("‚úì Found lerobot CLI command")
    # Run using lerobot CLI
    !lerobot train \
      --policy.path={CONFIG['policy_path']} \
      --dataset.repo_id={CONFIG['dataset_repo_id']} \
      --batch_size={CONFIG['batch_size']} \
      --steps={CONFIG['steps']}
else:
    # Try python -m lerobot.scripts.train
    print("Trying python -m lerobot.scripts.train...")
    !python -m lerobot.scripts.train \
      --policy.path={CONFIG['policy_path']} \
      --dataset.repo_id={CONFIG['dataset_repo_id']} \
      --batch_size={CONFIG['batch_size']} \
      --steps={CONFIG['steps']}


## (Optional) Load and Test the Fine-tuned Model

After training, you can load and test your fine-tuned model:


In [None]:
# Load the fine-tuned model
import os
from pathlib import Path

# Newer LeRobot releases expose policies under `lerobot.policies`; older ones
# used `lerobot.common.policies`. Try the current location first.
try:
    from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy
except ImportError:
    from lerobot.common.policies.smolvla.modeling_smolvla import SmolVLAPolicy

# Training ran from the repository root, so outputs/ is relative to it.
os.chdir('/content/lerobot')

# Load from checkpoint directory.
# The bare outputs/ directory is not itself a loadable model. Saved weights are
# expected in a `pretrained_model` directory somewhere below it (presumably
# outputs/.../checkpoints/<step>/pretrained_model — verify against your run).
# Pick the most recently modified one; set checkpoint_path by hand to load
# a specific checkpoint instead.
saved_models = sorted(
    (p for p in Path("outputs").rglob("pretrained_model") if p.is_dir()),
    key=lambda p: p.stat().st_mtime,
)
# Fall back to the old default so the error message below still reports a path.
checkpoint_path = str(saved_models[-1]) if saved_models else "outputs"

try:
    policy = SmolVLAPolicy.from_pretrained(checkpoint_path)
    print("✅ Model loaded successfully!")
    print(f"💡 Loaded from: {os.path.abspath(checkpoint_path)}")
except Exception as e:
    # Best-effort cell: report the problem with enough context to fix the path.
    print(f"❌ Error loading model: {e}")
    print("💡 Make sure training completed and checkpoint path is correct")
    print(f"💡 Current directory: {os.getcwd()}")
    print(f"💡 Looking for checkpoint at: {os.path.abspath(checkpoint_path)}")
