In [None]:
# Install required packages for Google Colab
%pip install huggingface_hub
%pip install --upgrade huggingface_hub


In [None]:
# Mount Google Drive at /content/drive for persistent storage.
# The download directory configured later in this notebook lives under this
# mount point, so this cell has to run before the download cell.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Login to Hugging Face (optional - only needed if dataset requires authentication)
from huggingface_hub import login

# Authentication is disabled by default.
# Uncomment the line below if you need to authenticate
# login()

# NOTE: this message is printed unconditionally; this cell itself neither mounts
# Drive nor logs in (the mount is done by the earlier drive.mount(...) cell).
print("Google Drive mounted and Hugging Face setup complete!")


In [None]:
import os
from huggingface_hub import snapshot_download

# Source repository on the Hugging Face Hub and the destination folder
REPO_ID = "nvidia/PhysicalAI-Robotics-GR00T-X-Embodiment-Sim"
LOCAL_DIR = "/content/drive/MyDrive/gr00t_dataset"  # Persistent storage in Google Drive

# Task folders to fetch; each name is used as the leading path component of
# the download patterns built below
TASKS = [
    "gr1_arms_waist.CanToDrawer",
    "gr1_arms_waist.CupToDrawer",
    "gr1_arms_waist.CuttingBoardToBasket",
    "gr1_arms_waist.CuttingBoardToCardboardBox",
    "gr1_arms_waist.CuttingBoardToPan",
    "gr1_arms_waist.CuttingBoardToPot",
    "gr1_arms_waist.PlaceBottleToCabinet",
    "gr1_arms_waist.PlaceMilkToMicrowave",
    "gr1_arms_waist.PlacematToBowl",
    "gr1_arms_waist.PotatoToMicrowave",
    "gr1_arms_waist.TrayToPot",
    "gr1_arms_waist.TrayToTieredShelf",
]

# Three glob patterns per task, in task order: the metadata folder, then the
# first data chunk, then the first video chunk
include_patterns = [
    pattern
    for task in TASKS
    for pattern in (
        f"{task}/meta/**",
        f"{task}/data/chunk-000/**",
        f"{task}/videos/chunk-000/**",
    )
]

# Summarize what is about to be downloaded (one line per item)
print(
    f"Will download {len(TASKS)} tasks to: {LOCAL_DIR}",
    f"Tasks: {', '.join(TASKS)}",
    f"Total include patterns: {len(include_patterns)}",
    sep="\n",
)


In [None]:
# Download the selected task folders from the Hugging Face Hub into LOCAL_DIR.
# Only files matching `include_patterns` are fetched.
print("Starting download...")
print("This may take several minutes depending on the dataset size and your connection speed.")

try:
    # `resume_download` is intentionally not passed: it is deprecated in
    # huggingface_hub (interrupted downloads are resumed by default), and this
    # notebook upgrades to the latest release, which may reject the argument.
    snapshot_download(
        repo_id=REPO_ID,
        repo_type="dataset",
        local_dir=LOCAL_DIR,
        allow_patterns=include_patterns,
    )
except Exception as e:
    print(f"❌ Download failed with error: {e}")
    print("Please check your internet connection and try again.")
    # Re-raise so the full traceback is shown and "Run all" stops here instead
    # of continuing to the verification/summary cells after a failed download.
    raise
else:
    print("\n✅ Download completed successfully!")
    print(f"Dataset saved to: {LOCAL_DIR}")


In [None]:
# Verify the download by checking, for every requested task, that the expected
# sub-folders exist and how many files each one contains.
print("Verifying download...")

if os.path.isdir(LOCAL_DIR):
    print(f"\n📁 Contents of {LOCAL_DIR}:")

    for task in TASKS:
        task_path = os.path.join(LOCAL_DIR, task)
        if not os.path.isdir(task_path):
            print(f"\n❌ {task}: Not found")
            continue

        print(f"\n✅ {task}:")

        # Check for meta, data, and videos directories
        for subdir in ["meta", "data/chunk-000", "videos/chunk-000"]:
            full_path = os.path.join(task_path, subdir)
            # isdir (not exists) so a stray file with this name is reported as
            # missing instead of crashing the directory listing.
            if os.path.isdir(full_path):
                # Count recursively: files may sit in nested folders (e.g.
                # per-camera folders under videos/chunk-000), which a listing
                # of direct children only would report as 0 files.
                file_count = sum(len(files) for _, _, files in os.walk(full_path))
                print(f"  📂 {subdir}: {file_count} files")
            else:
                print(f"  ❌ {subdir}: Not found")
else:
    print(f"❌ Dataset directory not found: {LOCAL_DIR}")


In [None]:
# Display dataset statistics: total file count and total size under LOCAL_DIR.
def human_readable_size(size_bytes):
    """Format a byte count as a string with one decimal and a B/KB/MB/GB/TB unit.

    The value is divided by 1024 per unit step; anything of 1024 GB or more is
    reported in TB.
    """
    for unit in ['B', 'KB', 'MB', 'GB']:
        if size_bytes < 1024.0:
            return f"{size_bytes:.1f} {unit}"
        size_bytes /= 1024.0
    return f"{size_bytes:.1f} TB"


print("Dataset Statistics:")
print("=" * 50)

total_size = 0
total_files = 0

if os.path.isdir(LOCAL_DIR):
    for root, dirs, files in os.walk(LOCAL_DIR):
        # snapshot_download(local_dir=...) keeps its own bookkeeping files in a
        # top-level ".cache" folder; prune it so the totals describe only the
        # dataset files themselves.
        if root == LOCAL_DIR and ".cache" in dirs:
            dirs.remove(".cache")
        for file in files:
            file_path = os.path.join(root, file)
            total_size += os.path.getsize(file_path)
            total_files += 1

    print(f"Total files: {total_files}")
    print(f"Total size: {human_readable_size(total_size)}")
    # Number of tasks requested in the configuration, not verified on disk here
    print(f"Number of tasks: {len(TASKS)}")
    print(f"Storage location: {LOCAL_DIR}")
else:
    print("No dataset found to analyze.")


In [None]:
# Optional: explore the downloaded data structure.
# The example below is disabled by wrapping it in a triple-quoted string, so
# running this cell as-is only prints the "ready" message at the bottom.
# To use it, remove the opening and closing triple quotes and run the cell.
# NOTE(review): the example depends on the external gr00t package (module
# paths, ModalityConfig arguments, EmbodimentTag.GR1_UNIFIED, modality key
# names) and on the install URL inside the string; none of these can be
# verified from this notebook - confirm against the GR00T repository first.

"""
# Example: Load and examine one task using GR00T's dataset loader
# Note: You'll need to install the gr00t package first

%pip install git+https://github.com/NVlabs/GR00T.git

from gr00t.data.dataset import LeRobotSingleDataset
from gr00t.data.dataset import ModalityConfig
from gr00t.data.schema import EmbodimentTag

# Example task to explore
EXAMPLE_TASK = "gr1_arms_waist.CanToDrawer"
TASK_PATH = os.path.join(LOCAL_DIR, EXAMPLE_TASK)

if os.path.exists(TASK_PATH):
    # Define modality configurations
    modality_configs = {
        "video": ModalityConfig(
            delta_indices=[0],
            modality_keys=["video.ego_view"],
        ),
        "state": ModalityConfig(
            delta_indices=[0],
            modality_keys=[
                "state.left_arm",
                "state.right_arm", 
                "state.left_hand",
                "state.right_hand",
                "state.waist",
            ],
        ),
        "action": ModalityConfig(
            delta_indices=[0],
            modality_keys=[
                "action.left_arm",
                "action.right_arm",
                "action.left_hand", 
                "action.right_hand",
                "action.waist",
            ],
        ),
        "language": ModalityConfig(
            delta_indices=[0],
            modality_keys=["annotation.human.action.task_description"],
        ),
    }
    
    # Load dataset
    dataset = LeRobotSingleDataset(
        TASK_PATH, 
        modality_configs, 
        embodiment_tag=EmbodimentTag.GR1_UNIFIED
    )
    
    print(f"Loaded dataset: {EXAMPLE_TASK}")
    print(f"Dataset length: {len(dataset)}")
    
    # Show first sample
    sample = dataset[0]
    print(f"Sample keys: {list(sample.keys())}")
    
else:
    print(f"Task not found: {EXAMPLE_TASK}")
"""

# Printed regardless of whether the example above has been enabled
print("Data exploration cell ready (uncomment to use)")


In [None]:
# Final summary. Success is reported only when every requested task folder is
# actually present under LOCAL_DIR; otherwise the missing tasks are listed so a
# failed or partial download is not announced as complete.
missing_tasks = [
    task for task in TASKS
    if not os.path.isdir(os.path.join(LOCAL_DIR, task))
]

if missing_tasks:
    print("⚠️ Dataset download is incomplete.")
    print(f"Missing {len(missing_tasks)} of {len(TASKS)} tasks under {LOCAL_DIR}:")
    for task in missing_tasks:
        print(f"- {task}")
    print("\nRe-run the download cell and check its output for errors.")
else:
    print("🎉 Dataset download complete!")
    print("\nNext steps:")
    print("1. The dataset is now available in:", LOCAL_DIR)
    print("2. You can use this data with GR00T's data loaders")
    print("3. Refer to other notebooks in this repository for training and inference examples")
    print("4. The downloaded tasks are ready for fine-tuning or evaluation")

    print("\nDownloaded tasks:")
    for i, task in enumerate(TASKS, 1):
        print(f"{i}. {task}")
