In [None]:
# === Shared Cache Bootstrap (English comments only) ===
import os, torch, platform, pathlib

# Root directory of the shared model cache; export AI_CACHE_ROOT to relocate it.
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "/mnt/ai/cache")

# Map each cache-related environment variable to its directory under the root.
paths = {
    _env_name: f"{AI_CACHE_ROOT}/{_subdir}"
    for _env_name, _subdir in (
        ("HF_HOME", "hf"),
        ("TRANSFORMERS_CACHE", "hf/transformers"),
        ("HF_DATASETS_CACHE", "hf/datasets"),
        ("HUGGINGFACE_HUB_CACHE", "hf/hub"),
        ("TORCH_HOME", "torch"),
    )
}

# Export every variable and make sure its directory exists.
for _env_var, _cache_path in paths.items():
    os.environ[_env_var] = _cache_path
    os.makedirs(_cache_path, exist_ok=True)

print("[Cache] Root:", AI_CACHE_ROOT)

# Report whether CUDA is usable and, if so, the name of device 0.
if torch.cuda.is_available():
    print("[GPU]", True, torch.cuda.get_device_name(0))
else:
    print("[GPU]", False, "CPU")

In [None]:
# ===================================================================
# Cell 2: Dependencies Installation & Configuration Check
# ===================================================================

# Install required packages (run once)
"""
pip install transformers accelerate bitsandbytes datasets torch torchvision torchaudio
pip install openai-whisper pillow librosa soundfile
"""

import warnings

warnings.filterwarnings("ignore")

import torch
import numpy as np
from PIL import Image
import requests
from io import BytesIO
import time
from typing import Dict, List, Optional, Union, Any
import json

# Check key libraries: report whether the transformers entry points used by
# the rest of the notebook can be imported.
try:
    from transformers import (
        AutoTokenizer,
        AutoModel,
        AutoModelForCausalLM,
        AutoModelForSequenceClassification,
        AutoProcessor,
        AutoModelForSpeechSeq2Seq,
        pipeline,
        BitsAndBytesConfig,
    )
except ImportError as e:
    print(f"❌ Transformers import failed: {e}")
else:
    print("✅ Transformers imported successfully")

# bitsandbytes is optional; without it the notebook runs unquantized.
try:
    import bitsandbytes as bnb
except ImportError:
    print("⚠️ BitsAndBytes not available - quantization disabled")
else:
    print("✅ BitsAndBytes available for quantization")

# Memory check: print total VRAM of device 0 and a model-size suggestion.
if not torch.cuda.is_available():
    print("💡 CPU-only mode: Use smaller models or quantized versions")
else:
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"[GPU Memory] Total: {gpu_memory:.1f} GB")

    # Suggest models based on VRAM (thresholds in GB)
    print(
        "💡 Recommended: GPT-2-large, BERT-large, CLIP-large models"
        if gpu_memory >= 16
        else "💡 Recommended: GPT-2-medium, BERT-base, CLIP-base with 8bit"
        if gpu_memory >= 8
        else "💡 Recommended: GPT-2-small, distil-bert with 4bit quantization"
    )

In [None]:
# ===================================================================
# Cell 3: Unified Model Loader Class Design
# ===================================================================


class HFModelLoader:
    """
    Unified Hugging Face model loader with low-VRAM optimizations
    Supports: text generation, classification, multimodal, speech models

    Every ``load_*`` method prints its progress, returns a dict describing the
    loaded model (or ``None`` when loading fails) and records that dict in
    ``self.loaded_models``.
    """

    def __init__(self, cache_dir: Optional[str] = None, device: str = "auto"):
        """
        Args:
            cache_dir: Directory passed to ``from_pretrained`` as ``cache_dir``.
                Falls back to the ``HF_HOME`` environment variable.
            device: Stored on the instance. The loaders do not consult it;
                they pass ``device_map="auto"`` whenever CUDA is available.
        """
        self.cache_dir = cache_dir or os.environ.get("HF_HOME")
        self.device = device
        self.loaded_models = {}

        # Default quantization config for low VRAM. Availability is probed by
        # importing the package: the notebook imports it as `bnb`, so looking
        # for the name "bitsandbytes" in globals() never matched and silently
        # disabled quantization.
        self.bnb_config = (
            BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
            )
            if self._bitsandbytes_available()
            else None
        )

    @staticmethod
    def _bitsandbytes_available() -> bool:
        """Return True when the ``bitsandbytes`` package can be imported."""
        try:
            import bitsandbytes  # noqa: F401
        except ImportError:
            return False
        return True

    def _build_load_kwargs(self, use_quantization: bool, **extra: Any) -> Dict[str, Any]:
        """Assemble the keyword arguments for ``<ModelClass>.from_pretrained``.

        ``device_map="auto"`` is added whenever CUDA is available; the 4-bit
        quantization config is added only if it was requested, CUDA is
        available and ``self.bnb_config`` exists.
        """
        load_kwargs = {"cache_dir": self.cache_dir, **extra}
        if torch.cuda.is_available():
            load_kwargs["device_map"] = "auto"
            if use_quantization and self.bnb_config is not None:
                load_kwargs["quantization_config"] = self.bnb_config
                print("  🔧 Using 4-bit quantization")
        return load_kwargs

    def _load(
        self,
        model_name: str,
        *,
        label: str,
        model_type: str,
        key_prefix: str,
        preprocessor_key: str,
        use_quantization: bool,
        resolve_classes,
        **model_kwargs: Any,
    ) -> Optional[Dict[str, Any]]:
        """Shared implementation behind every public ``load_*`` method.

        Args:
            model_name: Identifier passed to ``from_pretrained``.
            label: Human-readable model family used in the progress message.
            model_type: Value stored under ``"type"`` in the returned dict.
            key_prefix: Prefix of the key used in ``self.loaded_models``.
            preprocessor_key: Dict key for the tokenizer/processor
                (``"tokenizer"`` or ``"processor"``).
            use_quantization: Whether 4-bit quantization was requested.
            resolve_classes: Zero-argument callable returning
                ``(model_class, preprocessor_class)``. It is invoked inside
                the ``try`` so that a missing transformers import is reported
                as a load failure instead of escaping as ``NameError``.
            **model_kwargs: Extra keyword arguments for the model's
                ``from_pretrained`` call only.

        Returns:
            The model-info dict, or ``None`` if anything raised while loading.
        """
        print(f"🔄 Loading {label} model: {model_name}")
        load_kwargs = self._build_load_kwargs(use_quantization, **model_kwargs)

        try:
            model_cls, preprocessor_cls = resolve_classes()
            preprocessor = preprocessor_cls.from_pretrained(
                model_name, cache_dir=self.cache_dir
            )
            model = model_cls.from_pretrained(model_name, **load_kwargs)

            model_info = {
                "model": model,
                preprocessor_key: preprocessor,
                "type": model_type,
                "model_name": model_name,
                "memory_usage": (
                    self._get_model_memory(model)
                    if torch.cuda.is_available()
                    else "N/A"
                ),
            }

            self.loaded_models[f"{key_prefix}_{model_name}"] = model_info
            print(f"  ✅ Loaded successfully | Memory: {model_info['memory_usage']}")
            return model_info

        except Exception as e:
            # Deliberate best-effort: report the failure and let the caller
            # decide what to do with the ``None`` result.
            print(f"  ❌ Failed to load {model_name}: {e}")
            return None

    def load_text_generation_model(
        self,
        model_name: str = "gpt2",
        use_quantization: bool = True,
        trust_remote_code: bool = False,
    ) -> Optional[Dict[str, Any]]:
        """Load text generation model (GPT-2, GPT-J, etc.)

        Returns a dict with keys ``model``, ``tokenizer``, ``type``,
        ``model_name`` and ``memory_usage``, or ``None`` on failure.
        """
        model_info = self._load(
            model_name,
            label="text generation",
            model_type="text_generation",
            key_prefix="textgen",
            preprocessor_key="tokenizer",
            use_quantization=use_quantization,
            resolve_classes=lambda: (AutoModelForCausalLM, AutoTokenizer),
            trust_remote_code=trust_remote_code,
        )

        # Set pad token if missing (GPT-2 style tokenizers ship without one)
        if model_info is not None and model_info["tokenizer"].pad_token is None:
            model_info["tokenizer"].pad_token = model_info["tokenizer"].eos_token

        return model_info

    def load_classification_model(
        self, model_name: str = "bert-base-uncased", use_quantization: bool = False
    ) -> Optional[Dict[str, Any]]:
        """Load classification/encoding model (BERT, RoBERTa, etc.)

        Returns a dict with keys ``model``, ``tokenizer``, ``type``,
        ``model_name`` and ``memory_usage``, or ``None`` on failure.
        """
        return self._load(
            model_name,
            label="classification",
            model_type="classification",
            key_prefix="cls",
            preprocessor_key="tokenizer",
            use_quantization=use_quantization,
            resolve_classes=lambda: (AutoModel, AutoTokenizer),
        )

    def load_multimodal_model(
        self,
        model_name: str = "openai/clip-vit-base-patch32",
        use_quantization: bool = False,
    ) -> Optional[Dict[str, Any]]:
        """Load multimodal model (CLIP, BLIP, etc.)

        Returns a dict with keys ``model``, ``processor``, ``type``,
        ``model_name`` and ``memory_usage``, or ``None`` on failure.
        """
        return self._load(
            model_name,
            label="multimodal",
            model_type="multimodal",
            key_prefix="mm",
            preprocessor_key="processor",
            use_quantization=use_quantization,
            resolve_classes=lambda: (AutoModel, AutoProcessor),
        )

    def load_speech_model(
        self, model_name: str = "openai/whisper-base", use_quantization: bool = False
    ) -> Optional[Dict[str, Any]]:
        """Load speech recognition model (Whisper, Wav2Vec2, etc.)

        Returns a dict with keys ``model``, ``processor``, ``type``,
        ``model_name`` and ``memory_usage``, or ``None`` on failure.
        """
        return self._load(
            model_name,
            label="speech",
            model_type="speech",
            key_prefix="speech",
            preprocessor_key="processor",
            use_quantization=use_quantization,
            resolve_classes=lambda: (AutoModelForSpeechSeq2Seq, AutoProcessor),
        )

    def _get_model_memory(self, model) -> str:
        """Get approximate model memory usage as a human-readable string.

        Uses ``model.get_memory_footprint()`` when the model provides it;
        otherwise estimates from the parameter count assuming 4 bytes each.
        """
        if hasattr(model, "get_memory_footprint"):
            memory_mb = model.get_memory_footprint() / 1024**2
            return f"{memory_mb:.1f} MB"

        param_count = sum(p.numel() for p in model.parameters())
        estimated_mb = param_count * 4 / 1024**2  # Assume fp32
        return f"~{estimated_mb:.1f} MB"

    def list_loaded_models(self):
        """Display all loaded models (key, name, type and memory usage)."""
        if not self.loaded_models:
            print("📝 No models loaded yet")
            return

        print("📋 Loaded Models:")
        for key, info in self.loaded_models.items():
            print(
                f"  {key}: {info['model_name']} | {info['type']} | {info['memory_usage']}"
            )


# Initialize the notebook-wide loader. The demo functions in the following
# cells read this module-level `loader` rather than creating their own.
loader = HFModelLoader()
print("🚀 HFModelLoader initialized with shared cache")

In [None]:
# ===================================================================
# Cell 4: GPT-2 Text Generation Model Loading & Inference
# ===================================================================


def demo_text_generation():
    """Demonstrate GPT-2 text generation with various decoding strategies.

    Loads GPT-2 through the notebook-wide ``loader``, then generates a
    50-token continuation for each sample prompt under three decoding
    configurations (two sampling setups and beam search), printing the
    continuation and the elapsed time. Returns ``None``; if the model cannot
    be loaded the demo prints a message and returns early.
    """

    print("=" * 50)
    print("🤖 GPT-2 Text Generation Demo")
    print("=" * 50)

    # Load GPT-2 model (start with smaller version for compatibility)
    model_info = loader.load_text_generation_model(
        model_name="gpt2",  # or "gpt2-medium" if you have enough VRAM
        use_quantization=True,
    )

    if not model_info:
        print("❌ Failed to load GPT-2 model")
        return

    model = model_info["model"]
    tokenizer = model_info["tokenizer"]

    # Sample prompts
    prompts = [
        "The future of artificial intelligence is",
        "In a world where robots and humans coexist,",
        "The most important breakthrough in science was",
    ]

    # Generation parameters to test ("name" is a display label only)
    gen_configs = [
        {
            "do_sample": True,
            "top_k": 50,
            "top_p": 0.95,
            "temperature": 0.7,
            "name": "Creative",
        },
        {
            "do_sample": True,
            "top_k": 10,
            "top_p": 0.9,
            "temperature": 0.3,
            "name": "Focused",
        },
        {"do_sample": False, "num_beams": 3, "name": "Beam Search"},
    ]

    for prompt in prompts:
        print(f"\n💭 Prompt: '{prompt}'")

        # Tokenize and place the tensors on the model's own device. This is
        # correct for CPU, plain-GPU and quantized (device_map) models alike;
        # the previous check left inputs on the CPU for quantized models.
        encoded = tokenizer(prompt, return_tensors="pt", padding=True)
        inputs = {k: v.to(model.device) for k, v in encoded.items()}
        prompt_length = inputs["input_ids"].shape[1]

        for config in gen_configs:
            print(f"\n  🎯 Strategy: {config['name']}")

            # Prepare generation config
            gen_kwargs = {k: v for k, v in config.items() if k != "name"}
            gen_kwargs.update(
                {
                    "max_new_tokens": 50,
                    "pad_token_id": tokenizer.eos_token_id,
                    "attention_mask": inputs["attention_mask"],
                }
            )

            start_time = time.time()

            try:
                with torch.no_grad():
                    outputs = model.generate(inputs["input_ids"], **gen_kwargs)

                # Decode only the newly generated token ids. Slicing the
                # decoded string by len(prompt) breaks whenever decoding does
                # not reproduce the prompt text character for character.
                new_token_ids = outputs[0][prompt_length:]
                continuation = tokenizer.decode(
                    new_token_ids, skip_special_tokens=True
                ).strip()

                elapsed = time.time() - start_time
                print(f"    📝 Output: {continuation}")
                print(f"    ⏱️ Time: {elapsed:.2f}s")

            except Exception as e:
                print(f"    ❌ Generation failed: {e}")

    # Memory cleanup
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print("\n✅ GPT-2 demo completed")


# Run the demo: executes the GPT-2 generation walkthrough defined above,
# using the notebook-wide `loader` created in Cell 3.
demo_text_generation()

In [None]:
# ===================================================================
# Cell 5: BERT Classification Model Loading & Feature Extraction
# ===================================================================


def demo_bert_classification():
    """Demonstrate BERT for text classification and feature extraction.

    Loads ``bert-base-uncased`` through the notebook-wide ``loader``, encodes
    the sample sentences in one batch, takes the first-token ([CLS]) hidden
    state of each as its embedding, and prints the pairwise cosine-similarity
    matrix followed by all sentence pairs ranked from most to least similar.
    Returns ``None``; if the model cannot be loaded the demo returns early.
    """

    print("=" * 50)
    print("🧠 BERT Classification & Feature Extraction Demo")
    print("=" * 50)

    # Load BERT model
    model_info = loader.load_classification_model(
        model_name="bert-base-uncased",
        use_quantization=False,  # BERT typically doesn't need quantization
    )

    if not model_info:
        print("❌ Failed to load BERT model")
        return

    model = model_info["model"]
    tokenizer = model_info["tokenizer"]

    # Sample texts for classification/embedding
    texts = [
        "I love this movie! It's absolutely fantastic and entertaining.",
        "This product is terrible. I want my money back.",
        "The weather today is quite nice and sunny.",
        "Machine learning is revolutionizing many industries.",
    ]

    print("📝 Input Texts:")
    for i, text in enumerate(texts, 1):
        print(f"  {i}. {text}")

    # Extract features/embeddings
    print("\n🔍 Extracting BERT embeddings...")

    # Encode all texts as one padded batch; the attention mask keeps the
    # padding from influencing the real tokens.
    encoded = tokenizer(
        texts, return_tensors="pt", padding=True, truncation=True, max_length=512
    )
    # Follow the model's device instead of assuming CUDA.
    inputs = {k: v.to(model.device) for k, v in encoded.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Use [CLS] token embedding (first token) of every sequence
    embeddings = outputs.last_hidden_state[:, 0, :].float().cpu().numpy()
    print(f"  📊 Embeddings shape: {embeddings.shape}")

    # Cosine similarity with NumPy: L2-normalize the rows, then take all dot
    # products. This avoids a mid-function scikit-learn import that the
    # notebook's install list does not cover.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    unit_embeddings = embeddings / np.clip(norms, 1e-12, None)
    similarity_matrix = unit_embeddings @ unit_embeddings.T

    # Column headers are derived from the number of texts so the table stays
    # aligned if the sample list changes.
    column_names = [f"Text{i + 1}" for i in range(len(texts))]
    print("\n📈 Cosine Similarity Matrix:")
    print(" " * 7 + "".join(f"{name:>8s}" for name in column_names))
    for name, row in zip(column_names, similarity_matrix):
        row_label = f"{name}:"
        cells = "".join(f"{sim:8.3f}" for sim in row)
        print(f"{row_label:<7s}{cells}")

    # Rank every unordered pair by similarity, highest first, so the heading
    # "Most Similar" matches what is printed.
    ranked_pairs = sorted(
        (
            (float(similarity_matrix[i, j]), i, j)
            for i in range(len(texts))
            for j in range(i + 1, len(texts))
        ),
        reverse=True,
    )

    print("\n🔗 Most Similar Text Pairs:")
    for sim, i, j in ranked_pairs:
        print(f"  Text{i+1} ↔ Text{j+1}: {sim:.3f}")

    # Memory cleanup
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print("\n✅ BERT demo completed")


# Run the demo: executes the BERT embedding/similarity walkthrough defined
# above, using the notebook-wide `loader` created in Cell 3.
demo_bert_classification()

In [None]:
# ===================================================================
# Cell 6: CLIP Multimodal Model Loading & Image-Text Matching
# ===================================================================


def demo_clip_multimodal():
    """Demonstrate CLIP for image-text matching.

    Loads ``openai/clip-vit-base-patch32`` through the notebook-wide
    ``loader``, fetches two sample images (falling back to solid red/blue
    images if any download fails), scores every text description against
    every image with cosine similarity of the CLIP embeddings, and prints the
    score table plus the best-matching description per image. Returns
    ``None``; if the model cannot be loaded the demo returns early.
    """

    print("=" * 50)
    print("🖼️ CLIP Multimodal Image-Text Matching Demo")
    print("=" * 50)

    # Load CLIP model
    model_info = loader.load_multimodal_model(
        model_name="openai/clip-vit-base-patch32", use_quantization=False
    )

    if not model_info:
        print("❌ Failed to load CLIP model")
        return

    model = model_info["model"]
    processor = model_info["processor"]

    # Download sample images
    print("🔄 Downloading sample images...")

    image_urls = [
        "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/280px-PNG_transparency_demonstration_1.png",  # PNG transparency demonstration image
        "https://upload.wikimedia.org/wikipedia/commons/thumb/1/15/Cat_August_2010-4.jpg/272px-Cat_August_2010-4.jpg",  # Cat photo
        "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/481px-Cat03.jpg",  # Cat photo (spare; not downloaded)
    ]
    num_images = 2  # only the first two URLs are fetched

    # Identify the client explicitly; requests carrying only a generic
    # library User-Agent may be rejected by Wikimedia.
    request_headers = {
        "User-Agent": "hf-models-loading-notebook/1.0 (educational demo)"
    }

    # For demo purposes, create simple colored images if download fails
    try:
        images = []
        for url in image_urls[:num_images]:
            response = requests.get(url, headers=request_headers, timeout=10)
            # Surface HTTP errors (403/404/...) as such instead of letting
            # PIL fail later on an HTML error page.
            response.raise_for_status()
            image = Image.open(BytesIO(response.content)).convert("RGB")
            images.append(image)
        print(f"  ✅ Downloaded {len(images)} images")
    except Exception as e:
        print(f"  ⚠️ Download failed: {e}")
        print("  🎨 Creating synthetic images instead...")
        # Create simple colored images
        images = [
            Image.new("RGB", (224, 224), color="red"),
            Image.new("RGB", (224, 224), color="blue"),
        ]

    # Text descriptions to match
    text_descriptions = [
        "a red image",
        "a blue image",
        "a cat",
        "a dog",
        "a beautiful landscape",
        "a car on the road",
    ]

    print(f"\n📝 Text descriptions ({len(text_descriptions)}):")
    for i, desc in enumerate(text_descriptions):
        print(f"  {i+1}. {desc}")

    # Process images and texts
    print(f"\n🔄 Processing {len(images)} images and {len(text_descriptions)} texts...")

    try:
        # Prepare inputs
        encoded = processor(
            text=text_descriptions, images=images, return_tensors="pt", padding=True
        )
        # Follow the model's device instead of assuming CUDA.
        inputs = {k: v.to(model.device) for k, v in encoded.items()}

        # Get embeddings
        with torch.no_grad():
            outputs = model(**inputs)
            image_embeds = outputs.image_embeds
            text_embeds = outputs.text_embeds

            # Normalize embeddings
            image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
            text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)

            # Compute similarity scores: rows are texts, columns are images
            similarity_scores = torch.matmul(text_embeds, image_embeds.T)

        # Build the table from the actual number of images so header and
        # rows stay aligned.
        corner_label = "Text \\ Image"
        header = f"{corner_label:<22s}" + "".join(
            f"{f'Image{i + 1}':>8s}" for i in range(len(images))
        )
        print("\n📊 Image-Text Similarity Scores:")
        print(header)
        print("-" * len(header))

        for i, desc in enumerate(text_descriptions):
            scores = similarity_scores[i].float().cpu().numpy()
            score_str = "".join(f"{score:8.3f}" for score in scores)
            print(f"{desc:<22s}{score_str}")

        # Find best matches
        print("\n🎯 Best Matches:")
        for i in range(len(images)):
            best_text_idx = similarity_scores[:, i].argmax().item()
            best_score = similarity_scores[best_text_idx, i].item()
            print(
                f"  Image{i+1} ↔ '{text_descriptions[best_text_idx]}' (score: {best_score:.3f})"
            )

    except Exception as e:
        print(f"❌ CLIP processing failed: {e}")

    # Memory cleanup
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print("\n✅ CLIP demo completed")


# Run the demo: executes the CLIP image-text matching walkthrough defined
# above, using the notebook-wide `loader` created in Cell 3.
demo_clip_multimodal()

In [None]:
# ===================================================================
# Cell 7: Whisper Speech Recognition Model Loading & Transcription
# ===================================================================


def demo_whisper_speech():
    """Demonstrate Whisper for speech recognition (using synthetic audio).

    Loads ``openai/whisper-base`` through the notebook-wide ``loader``,
    synthesizes three seconds of tone-based audio at 16 kHz, transcribes it
    with the model directly and then again through the transformers
    ``pipeline`` helper. The audio is not real speech, so the transcription
    is not expected to be meaningful. Returns ``None``; if the model cannot
    be loaded the demo returns early.
    """

    print("=" * 50)
    print("🎤 Whisper Speech Recognition Demo")
    print("=" * 50)

    # Load Whisper model
    model_info = loader.load_speech_model(
        model_name="openai/whisper-base", use_quantization=False
    )

    if not model_info:
        print("❌ Failed to load Whisper model")
        return

    model = model_info["model"]
    processor = model_info["processor"]

    print("🎵 Creating synthetic speech audio...")

    # Create synthetic audio that resembles speech patterns
    # This is for demo purposes - in real use, you'd load actual audio files
    sample_rate = 16000
    duration = 3  # 3 seconds
    # Named `t` rather than `time` so the `time` module name is not shadowed.
    t = np.linspace(0, duration, int(sample_rate * duration))

    # Create multiple frequency components to simulate speech
    frequencies = [200, 400, 800, 1600]  # Typical speech frequency range
    synthetic_audio = np.zeros_like(t)

    for freq in frequencies:
        # Add some frequency modulation and amplitude variation
        modulated_freq = freq * (1 + 0.1 * np.sin(2 * np.pi * 5 * t))
        amplitude = 0.1 * (1 + np.sin(2 * np.pi * 2 * t)) / len(frequencies)
        synthetic_audio += amplitude * np.sin(2 * np.pi * modulated_freq * t)

    # Add some noise to make it more realistic; a seeded generator keeps the
    # audio identical from run to run.
    rng = np.random.default_rng(42)
    synthetic_audio += 0.01 * rng.standard_normal(len(t))

    # Normalize to a peak amplitude of 1.0
    synthetic_audio = synthetic_audio.astype(np.float32)
    synthetic_audio = synthetic_audio / np.max(np.abs(synthetic_audio))

    print(f"  📊 Audio shape: {synthetic_audio.shape}")
    print(f"  🎚️ Sample rate: {sample_rate} Hz")
    print(f"  ⏱️ Duration: {duration} seconds")

    try:
        # Process audio
        print("\n🔄 Processing audio with Whisper...")

        features = processor(
            synthetic_audio, sampling_rate=sample_rate, return_tensors="pt"
        )
        # Follow the model's device instead of assuming CUDA.
        inputs = {k: v.to(model.device) for k, v in features.items()}

        # Generate transcription
        with torch.no_grad():
            predicted_ids = model.generate(inputs["input_features"])
            transcription = processor.batch_decode(
                predicted_ids, skip_special_tokens=True
            )

        print("\n📝 Transcription Result:")
        print(f"  '{transcription[0]}'")

        # Note about synthetic audio
        print("\n💡 Note: This is synthetic audio, so the transcription")
        print("    may not be meaningful. In practice, use real speech audio.")

        # Demonstrate with pipeline for easier usage
        print("\n🔄 Alternative: Using Whisper pipeline...")
        try:
            # Create pipeline (more user-friendly). `cache_dir` is a
            # from_pretrained option, so it goes through `model_kwargs`;
            # passed as a direct keyword it is rejected by the pipeline.
            pipe = pipeline(
                "automatic-speech-recognition",
                model="openai/whisper-base",
                model_kwargs={"cache_dir": loader.cache_dir},
            )

            # Process the same audio. The sampling rate travels with the
            # waveform in a dict; it is not a call-time keyword argument.
            result = pipe({"raw": synthetic_audio, "sampling_rate": sample_rate})
            print(f"  Pipeline result: '{result['text']}'")

        except Exception as e:
            print(f"  ⚠️ Pipeline method failed: {e}")

    except Exception as e:
        print(f"❌ Whisper processing failed: {e}")

    # Memory cleanup
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print("\n✅ Whisper demo completed")


# Run the demo: executes the Whisper transcription walkthrough defined above,
# using the notebook-wide `loader` created in Cell 3.
demo_whisper_speech()

In [None]:
# ===================================================================
# Cell 8: Low-VRAM Optimization Strategies
# ===================================================================


def _measure_vram_delta_mb(load_model) -> float:
    """Return the VRAM (in MB) allocated by calling ``load_model()``.

    The baseline is taken immediately before loading, and the model is
    released again in a ``finally`` block, so a failure part-way through does
    not leave a model on the GPU to distort later measurements.
    """
    import gc

    gc.collect()
    torch.cuda.empty_cache()
    baseline_bytes = torch.cuda.memory_allocated()

    model = None
    try:
        model = load_model()
        return (torch.cuda.memory_allocated() - baseline_bytes) / 1024**2
    finally:
        del model
        gc.collect()
        torch.cuda.empty_cache()


def demo_low_vram_strategies():
    """Demonstrate various low-VRAM optimization techniques.

    Loads GPT-2 five different ways (standard, 8-bit, 4-bit, device-map
    offloading, gradient checkpointing), records how much VRAM each load
    allocates, prints a ranked comparison and finishes with recommendations
    based on the total VRAM of device 0. Returns ``None``; on a machine
    without CUDA it prints a notice and returns immediately.
    """
    import shutil

    print("=" * 50)
    print("⚡ Low-VRAM Optimization Strategies Demo")
    print("=" * 50)

    if not torch.cuda.is_available():
        print("💡 Running on CPU - VRAM optimizations not applicable")
        return

    # Check initial VRAM usage
    torch.cuda.empty_cache()
    initial_memory = torch.cuda.memory_allocated() / 1024**2
    print(f"🔍 Initial VRAM usage: {initial_memory:.1f} MB")

    cache_dir = loader.cache_dir
    offload_dir = "./offload_temp"

    # Quantized strategies need bitsandbytes. Trust the loader's config when
    # it has one, otherwise probe the import directly.
    bnb_available = loader.bnb_config is not None
    if not bnb_available:
        try:
            import bitsandbytes  # noqa: F401

            bnb_available = True
        except ImportError:
            bnb_available = False

    def load_standard():
        return AutoModelForCausalLM.from_pretrained("gpt2", cache_dir=cache_dir).cuda()

    def load_8bit():
        return AutoModelForCausalLM.from_pretrained(
            "gpt2",
            cache_dir=cache_dir,
            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
            device_map="auto",
        )

    def load_4bit():
        config_4bit = loader.bnb_config
        if config_4bit is None:
            config_4bit = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
            )
        return AutoModelForCausalLM.from_pretrained(
            "gpt2",
            cache_dir=cache_dir,
            quantization_config=config_4bit,
            device_map="auto",
        )

    def load_offload():
        return AutoModelForCausalLM.from_pretrained(
            "gpt2",
            cache_dir=cache_dir,
            device_map="auto",
            offload_folder=offload_dir,
            offload_state_dict=True,
        )

    def load_checkpointed():
        model = AutoModelForCausalLM.from_pretrained("gpt2", cache_dir=cache_dir).cuda()
        # Enable gradient checkpointing. NOTE: only the freshly loaded
        # weights are measured here; no forward/backward pass is run.
        model.gradient_checkpointing_enable()
        return model

    # (heading, summary label, failure label, loader, needs bitsandbytes, suffix)
    plan = [
        ("1️⃣ Strategy: Standard Loading", "Standard Loading",
         "Standard loading", load_standard, False, ""),
        ("2️⃣ Strategy: 8-bit Quantization", "8-bit Quantization",
         "8-bit loading", load_8bit, True, ""),
        ("3️⃣ Strategy: 4-bit Quantization", "4-bit Quantization",
         "4-bit loading", load_4bit, True, ""),
        ("4️⃣ Strategy: CPU Offloading", "CPU Offloading",
         "CPU offloading", load_offload, False, ""),
        ("5️⃣ Strategy: Gradient Checkpointing (Training Mode)", "Gradient Checkpointing",
         "Gradient checkpointing", load_checkpointed, False, " (training optimized)"),
    ]

    strategies = []
    for heading, label, failure_label, load_fn, needs_bnb, suffix in plan:
        print(f"\n{heading}")

        if needs_bnb and not bnb_available:
            print("   ⚠️ BitsAndBytes not available")
            continue

        try:
            memory_used = _measure_vram_delta_mb(load_fn)
        except Exception as e:
            print(f"   ❌ {failure_label} failed: {e}")
        else:
            strategies.append((label, memory_used))
            print(f"   📊 VRAM used: {memory_used:.1f} MB{suffix}")
        finally:
            # Clean up temporary offload folder whether or not loading worked
            if load_fn is load_offload:
                shutil.rmtree(offload_dir, ignore_errors=True)

    # Summary comparison
    print("\n📋 VRAM Usage Comparison:")
    print("-" * 40)
    strategies.sort(key=lambda x: x[1])

    # Size the label column to the longest strategy name so rows line up.
    label_width = max((len(name) for name, _ in strategies), default=20)
    for i, (strategy, memory) in enumerate(strategies, 1):
        # Traffic-light rating: under 200 MB, under 500 MB, or above.
        efficiency = "🟢" if memory < 200 else "🟡" if memory < 500 else "🔴"
        print(f"{i}. {strategy:{label_width}s}: {memory:6.1f} MB {efficiency}")

    if strategies:
        best_strategy = strategies[0]
        print(f"\n🏆 Most efficient: {best_strategy[0]} ({best_strategy[1]:.1f} MB)")

    # Recommendations based on VRAM
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"\n💡 Recommendations for {gpu_memory:.1f} GB VRAM:")

    if gpu_memory >= 16:
        print("   • Use standard loading for most models")
        print("   • Consider larger models (GPT-2-large, BERT-large)")
    elif gpu_memory >= 8:
        print("   • Use 8-bit quantization for larger models")
        print("   • Standard loading for base models")
    elif gpu_memory >= 4:
        print("   • Use 4-bit quantization for all models")
        print("   • CPU offloading for very large models")
    else:
        print("   • Use 4-bit quantization + CPU offloading")
        print("   • Consider smaller model variants")

    print("\n✅ Low-VRAM optimization demo completed")


# Run the demo: executes the VRAM comparison defined above. It returns
# immediately with a notice when no CUDA device is available.
demo_low_vram_strategies()

In [None]:
# === Smoke Test for nb08_hf_models_loading ===
def smoke_test():
    """Smoke test: load GPT-2 with a fresh loader and generate a few tokens.

    Raises:
        AssertionError: if the model fails to load or generation does not
            extend the prompt.
    """
    smoke_loader = HFModelLoader()
    model_info = smoke_loader.load_text_generation_model("gpt2", use_quantization=True)
    assert model_info is not None, "Model loading failed"

    model = model_info["model"]
    tokenizer = model_info["tokenizer"]

    # Put the inputs on the model's device; leaving them on the CPU while the
    # model sits on the GPU is a device mismatch.
    encoded = tokenizer("Test", return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in encoded.items()}

    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            pad_token_id=tokenizer.eos_token_id,
            max_new_tokens=3,
        )

    # Actually check the result instead of discarding it.
    assert (
        outputs.shape[-1] > inputs["input_ids"].shape[-1]
    ), "Generation produced no new tokens"
    print("🎉 Smoke test PASSED - Model loading system works!")


# Run the smoke test; an AssertionError here means model loading is broken.
smoke_test()