In [None]:
# nb01_tensor_autograd.ipynb
# PyTorch Tensor Operations & Autograd Fundamentals

# Cell 1: Shared Cache Bootstrap & Environment Setup
import os, pathlib, torch
import numpy as np

# Root of the shared cache; the AI_CACHE_ROOT environment variable overrides
# the built-in default location.
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "/mnt/ai/cache")

# (environment variable, directory) pairs, all located under the shared root.
# Each variable is exported into this process and its directory is created
# (including missing parents) when it does not exist yet.
for k, v in (
    ("HF_HOME", f"{AI_CACHE_ROOT}/hf"),
    ("TRANSFORMERS_CACHE", f"{AI_CACHE_ROOT}/hf/transformers"),
    ("HF_DATASETS_CACHE", f"{AI_CACHE_ROOT}/hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", f"{AI_CACHE_ROOT}/hf/hub"),
    ("TORCH_HOME", f"{AI_CACHE_ROOT}/torch"),
):
    os.environ[k] = v
    pathlib.Path(v).mkdir(parents=True, exist_ok=True)

# Environment report: cache root, CUDA availability, and (only when a GPU is
# present) the name and total memory of device 0, followed by the torch version.
print(
    f"[Cache] Root: {AI_CACHE_ROOT}",
    f"[GPU] Available: {torch.cuda.is_available()}",
    sep="\n",
)
if torch.cuda.is_available():
    print(
        f"[GPU] Device: {torch.cuda.get_device_name(0)}",
        f"[GPU] Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB",
        sep="\n",
    )
print(f"[PyTorch] Version: {torch.__version__}")

In [None]:
# Cell 2: Basic Tensor Creation & Operations
# 基本張量創建與操作

def demonstrate_tensor_basics():
    """Show several ways of building a tensor and inspect one tensor's metadata.

    Prints tensors created from a nested list, from a NumPy array and from
    factory functions, then the shape, dtype and device of the random tensor.

    Returns:
        torch.Tensor: the (2, 3) tensor sampled with ``torch.randn``.
    """
    print("=== Tensor Creation ===")

    nested = [[1, 2], [3, 4]]

    # Route 1: directly from nested Python lists, stored as float32.
    from_list = torch.tensor(nested, dtype=torch.float32)
    print(f"From list: {from_list}")

    # Route 2: via a NumPy array, converted to float afterwards.
    from_numpy = torch.from_numpy(np.array(nested)).float()
    print(f"From numpy: {from_numpy}")

    # Route 3: factory functions (constant fills and normally distributed noise).
    ones = torch.ones(2, 3)
    zeros = torch.zeros_like(ones)
    gaussian = torch.randn(2, 3)
    for label, tensor in (("Ones", ones), ("Zeros", zeros), ("Random", gaussian)):
        print(f"{label}: \n{tensor}")

    print("\n=== Tensor Properties ===")
    for label, value in (
        ("Shape", gaussian.shape),
        ("Dtype", gaussian.dtype),
        ("Device", gaussian.device),
    ):
        print(f"{label}: {value}")

    return gaussian


# Execute the walkthrough and keep the random tensor it returns.
sample_tensor = demonstrate_tensor_basics()

In [None]:
# Cell 3: Tensor Operations & Broadcasting
# 張量運算與廣播機制


def demonstrate_tensor_operations():
    """Print a tour of arithmetic, broadcasting, reductions and reshaping.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: the two fixed 2x2 float32 matrices
        (printed as ``A`` and ``B``) that the arithmetic examples operate on.
    """
    print("=== Arithmetic Operations ===")

    lhs = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32)
    rhs = torch.tensor([[5, 6], [7, 8]], dtype=torch.float32)

    print(f"A: \n{lhs}")
    print(f"B: \n{rhs}")
    # `*` multiplies entry by entry, whereas `@` is the matrix product.
    for caption, outcome in (
        ("A + B", lhs + rhs),
        ("A * B (element-wise)", lhs * rhs),
        ("A @ B (matrix multiplication)", lhs @ rhs),
    ):
        print(f"{caption}: \n{outcome}")

    print("\n=== Broadcasting ===")
    row = torch.tensor([1, 2])  # shape (2,), combined with the (2, 2) matrix
    print(f"A + c (broadcasting): \n{lhs + row}")

    print("\n=== Reductions ===")
    for caption, scalar in (
        ("Sum", lhs.sum()),
        ("Mean", lhs.mean()),
        ("Max", lhs.max()),
        ("Argmax", lhs.argmax()),
    ):
        print(f"{caption}: {scalar}")

    print("\n=== Reshaping ===")
    sequence = torch.arange(12).float()
    print(f"Original: {sequence}")
    print(f"Reshaped (3x4): \n{sequence.view(3, 4)}")
    print(f"Reshaped (2x6): \n{sequence.reshape(2, 6)}")

    return lhs, rhs


# Execute the walkthrough and keep both operand matrices.
tensor_a, tensor_b = demonstrate_tensor_operations()

In [None]:
# Cell 4: Autograd Fundamentals
# 自動微分基礎


def demonstrate_autograd_basics():
    """Differentiate z = x^2 + 2xy + y^2 at (x, y) = (2, 3) with autograd.

    Prints the inputs, the value of z, the partial derivatives autograd stored
    on ``x.grad`` / ``y.grad``, and the same derivatives worked out by hand
    (2x + 2y for both variables) so the two can be compared. Returns nothing.
    """
    print("=== Autograd Basics ===")

    # Scalar leaves that autograd should track.
    x = torch.tensor(2.0, requires_grad=True)
    y = torch.tensor(3.0, requires_grad=True)
    x_val, y_val = x.item(), y.item()

    z = x**2 + 2 * x * y + y**2

    print(f"x = {x_val}, y = {y_val}")
    print(f"z = x² + 2xy + y² = {z.item()}")

    # Backpropagate from z; the partials land on the leaves' .grad attributes.
    z.backward()

    # Both partials equal 2x + 2y = 4 + 6 = 10 at this point.
    print(f"∂z/∂x = {x.grad.item()}")
    print(f"∂z/∂y = {y.grad.item()}")

    # Closed-form check: dz/dx and dz/dy share the same expression.
    by_hand = 2 * x_val + 2 * y_val
    print(f"Manual ∂z/∂x = {by_hand}")
    print(f"Manual ∂z/∂y = {by_hand}")


# Execute the autograd walkthrough.
demonstrate_autograd_basics()

In [None]:
# Cell 5: Gradient Computation for Vectors/Matrices
# 向量/矩陣的梯度計算


def demonstrate_vector_gradients():
    """Show autograd on a vector input and on a matrix parameter.

    Part 1 differentiates f(x) = sum(x^2) for x = [1, 2, 3] and prints the
    gradient next to the expected 2x. Part 2 draws a random 2x3 matrix W and a
    random 3x1 input, forms the loss sum((W @ x)^2) and prints the shapes, the
    loss, and the shape and norm of W's gradient. Returns nothing.
    """
    print("=== Vector/Matrix Gradients ===")

    vec = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)

    # f(x) = x1^2 + x2^2 + x3^2, reduced to a scalar so backward() needs no argument.
    squared_norm = (vec**2).sum()

    print(f"x = {vec}")
    print(f"f(x) = sum(x²) = {squared_norm.item()}")

    squared_norm.backward()
    print(f"∇f = [∂f/∂x₁, ∂f/∂x₂, ∂f/∂x₃] = {vec.grad}")
    print(f"Expected: [2x₁, 2x₂, 2x₃] = {2*vec.detach()}")

    print("\n=== Matrix Gradients ===")
    weights = torch.randn(2, 3, requires_grad=True)  # only W is tracked
    column = torch.randn(3, 1)

    # Linear map followed by a quadratic scalar loss.
    projected = weights @ column
    quad_loss = (projected**2).sum()

    for label, tensor in (("W", weights), ("x", column), ("y", projected)):
        print(f"{label} shape: {tensor.shape}")
    print(f"Loss: {quad_loss.item():.4f}")

    quad_loss.backward()
    grad = weights.grad
    print(f"Gradient W.grad shape: {grad.shape}")
    print(f"Gradient norm: {torch.norm(grad).item():.4f}")


# Execute the vector/matrix gradient walkthrough.
demonstrate_vector_gradients()

In [None]:
# Cell 6: Gradient Flow in Neural Network Context
# 神經網路情境下的梯度流


def demonstrate_neural_network_gradients():
    """Run one forward/backward pass and one manual gradient step on a tiny MLP.

    The network maps 3 features -> 5 ReLU hidden units -> 1 linear output and
    is assembled from raw tensors: random inputs, targets and weights, with
    zero-initialised biases. Shapes, the loss and gradient statistics for both
    weight matrices are printed along the way.

    Returns:
        float: the mean-squared-error loss of the forward pass, i.e. the value
        computed before the parameter update is applied.
    """
    print("=== Simple Neural Network Gradients ===")

    # A batch of 4 samples with 3 features each, and one target per sample.
    inputs = torch.randn(4, 3)
    targets = torch.randn(4, 1)

    # Trainable parameters: input->hidden and hidden->output layers.
    w_hidden = torch.randn(3, 5, requires_grad=True)
    b_hidden = torch.zeros(5, requires_grad=True)
    w_out = torch.randn(5, 1, requires_grad=True)
    b_out = torch.zeros(1, requires_grad=True)

    print(f"Input shape: {inputs.shape}")
    print(f"Target shape: {targets.shape}")

    # Forward pass: affine -> ReLU -> affine, scored with mean squared error.
    activations = torch.relu(inputs @ w_hidden + b_hidden)
    prediction = activations @ w_out + b_out
    mse = ((prediction - targets) ** 2).mean()

    print(f"Hidden shape: {activations.shape}")
    print(f"Output shape: {prediction.shape}")
    print(f"Loss: {mse.item():.6f}")

    # Backward pass fills .grad on every tensor created with requires_grad=True.
    mse.backward()

    print("\n=== Gradient Information ===")
    for label, weight in (("W1", w_hidden), ("W2", w_out)):
        print(f"{label}.grad exists: {weight.grad is not None}")
        print(f"{label}.grad shape: {weight.grad.shape}")
        print(f"{label}.grad norm: {torch.norm(weight.grad).item():.6f}")

    # One plain gradient-descent step; no_grad keeps the in-place updates out
    # of the autograd graph.
    step_size = 0.01
    with torch.no_grad():
        for param in (w_hidden, w_out, b_hidden, b_out):
            param -= step_size * param.grad

    print(f"Applied gradient descent with lr={step_size}")

    return mse.item()


# Execute the demo; the returned loss was measured before the update step.
initial_loss = demonstrate_neural_network_gradients()

In [None]:
# Cell 7: Device Management (CPU/GPU)
# 設備管理 (CPU/GPU)


def demonstrate_device_management():
    """Show device selection and compare CPU vs GPU matmul timings.

    Always prints the selected device and the device of a freshly created
    1000x1000 CPU tensor. When CUDA is available the tensor is copied to the
    GPU and a single ``x @ x.T`` is timed on each device; otherwise a notice
    is printed and the function returns early. Returns nothing.

    Benchmark notes:
        * Both devices run one untimed warm-up matmul first, so one-time
          start-up work (CUDA context / library initialisation, allocator
          growth) is not billed to the measured run.
        * CUDA kernels are launched asynchronously, so the GPU is synchronised
          both before the timer starts and before it stops.
        * ``time.perf_counter`` is used because it is a monotonic,
          high-resolution clock intended for measuring short durations.
    """
    # Function-scope import: the notebook's import cell does not provide `time`.
    import time

    print("=== Device Management ===")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # torch.randn allocates on the CPU unless a device is passed explicitly.
    x_cpu = torch.randn(1000, 1000)
    print(f"CPU tensor device: {x_cpu.device}")

    if device.type != "cuda":
        print("CUDA not available, using CPU only")
        return

    x_gpu = x_cpu.to(device)
    print(f"GPU tensor device: {x_gpu.device}")

    # Untimed warm-up on both devices.
    _ = x_cpu @ x_cpu.T
    _ = x_gpu @ x_gpu.T
    torch.cuda.synchronize()

    # CPU timing.
    start = time.perf_counter()
    _ = x_cpu @ x_cpu.T
    cpu_time = time.perf_counter() - start

    # GPU timing: drain pending work, launch, then wait for completion.
    torch.cuda.synchronize()
    start = time.perf_counter()
    _ = x_gpu @ x_gpu.T
    torch.cuda.synchronize()
    gpu_time = time.perf_counter() - start

    print(f"CPU matmul time: {cpu_time:.4f}s")
    print(f"GPU matmul time: {gpu_time:.4f}s")
    # Guard the ratio so a zero reading can never raise ZeroDivisionError.
    speedup = cpu_time / gpu_time if gpu_time > 0 else float("inf")
    print(f"Speedup: {speedup:.2f}x")


# Execute the device-management walkthrough.
demonstrate_device_management()

In [None]:
# Cell 8: Smoke Test - Verification
# 驗收測試


def smoke_test():
    """Run quick assertions covering the notebook's core concepts.

    Checks, in order:
        1. Autograd on a vector: d/dx sum(x^2) equals 2x.
        2. Autograd through a matmul: for ``loss = sum(A @ B)`` the gradient
           w.r.t. ``A`` has A's shape *and* equals ``ones_like(A @ B) @ B.T``.
        3. A tensor can be created on the GPU (only when CUDA is available).
        4. The ``TORCH_HOME`` environment variable is set and points at an
           existing directory.

    Relies on ``os`` and ``torch`` being imported at module level (the
    notebook's first cell does this). Returns nothing.

    Raises:
        AssertionError: if any of the checks fails.
    """
    print("=== Smoke Test ===")

    # Test 1: basic autograd on a vector.
    x = torch.tensor([1, 2, 3], dtype=torch.float32, requires_grad=True)
    y = torch.sum(x**2)
    y.backward()

    expected_grad = torch.tensor([2, 4, 6], dtype=torch.float32)
    assert torch.allclose(x.grad, expected_grad), "Gradient computation failed"
    print("✓ Basic autograd works")

    # Test 2: gradient through a matrix product.
    A = torch.randn(2, 3, requires_grad=True)
    B = torch.randn(3, 2)
    C = A @ B
    loss = torch.sum(C)
    loss.backward()

    assert A.grad is not None, "Matrix gradient is None"
    assert A.grad.shape == A.shape, "Gradient shape mismatch"
    # A shape check alone would accept wrong values. Since dLoss/dC is all
    # ones, the chain rule gives dLoss/dA = ones_like(C) @ B^T.
    expected_A_grad = torch.ones_like(C) @ B.T
    assert torch.allclose(A.grad, expected_A_grad), "Matrix gradient values incorrect"
    print("✓ Matrix gradients work")

    # Test 3: device placement (only exercised when a GPU is present).
    if torch.cuda.is_available():
        x_gpu = torch.randn(10, device="cuda")
        assert x_gpu.device.type == "cuda", "GPU placement failed"
        print("✓ GPU placement works")
    else:
        print("✓ CPU-only mode works")

    # Test 4: shared cache configuration. isdir (not exists) so that a regular
    # file at that path is reported as a missing directory.
    torch_home = os.environ.get("TORCH_HOME")
    assert torch_home is not None, "TORCH_HOME not set"
    assert os.path.isdir(torch_home), "TORCH_HOME directory missing"
    print("✓ Shared cache configured")

    print("\n🎉 All smoke tests passed!")


# Run the smoke test; an AssertionError here means one of its checks failed.
# Its last check needs TORCH_HOME to be set and its directory to exist, which
# Cell 1 takes care of, so run Cell 1 before this cell.
smoke_test()

In [None]:
# Cell 9: Summary and Next Steps
# Prints a fixed, bilingual (Chinese/English) recap with four sections:
# completed items, core concepts, common pitfalls and next steps.
# The text is a static literal; nothing in it is computed from earlier cells.
print(
    """
=== 本章完成摘要 (Chapter Summary) ===

✅ 完成項目 (Completed Items):
- PyTorch 張量基本操作 (Basic tensor operations)
- 自動微分機制理解 (Autograd mechanism understanding)
- 梯度計算與反向傳播 (Gradient computation & backpropagation)
- 設備管理 (CPU/GPU) (Device management)
- 共享快取配置 (Shared cache setup)

🔑 核心概念 (Core Concepts):
- Tensor: PyTorch 的基本數據結構，支援 GPU 加速
- Autograd: 自動微分系統，追蹤運算圖並計算梯度
- requires_grad=True: 啟用梯度追蹤的關鍵參數
- .backward(): 執行反向傳播的方法
- Device placement: 在 CPU/GPU 間移動張量

⚠️ 常見陷阱 (Common Pitfalls):
- 忘記設定 requires_grad=True 導致無法計算梯度
- 在不同設備間操作張量會報錯
- 梯度會累積，需要手動清零 (.zero_grad())
- 計算圖在 .backward() 後會被釋放

🚀 下一步 (Next Steps):
- 學習 nn.Module 和自訂層 (Custom layers)
- 理解訓練迴圈結構 (Training loop structure)
- 掌握優化器使用 (Optimizer usage)
- 實作完整的神經網路訓練流程
"""
)



## 本章小結

### ✅ 完成項目
- **共享快取系統**：建立統一的模型和數據存放機制
- **PyTorch 基礎**：張量操作、廣播、重塑等核心概念
- **自動微分**：從簡單函數到神經網路的梯度計算
- **設備管理**：CPU/GPU 切換與矩陣乘法的效能比較
- **實用範例**：可直接運行的最小可行代碼

### 🔑 原理要點
- **計算圖 (Computation Graph)**：PyTorch 動態建構運算圖來追蹤梯度
- **反向傳播 (Backpropagation)**：通過鏈式法則自動計算所有參數的梯度
- **記憶體效率**：在不需要梯度的區塊（例如手動更新參數或純推論）使用 `with torch.no_grad()`，讓 PyTorch 不再為這些運算建立計算圖，以節省記憶體
- **設備一致性**：確保所有張量在相同設備上進行運算

### 🚀 下一步建議
1. **立即行動**：進入 `nb02_nn_module_training.ipynb` 學習模組化網路設計
2. **深化理解**：嘗試修改範例中的函數，觀察梯度變化
3. **效能優化**：實驗不同的資料類型（float16/float32）對速度的影響
4. **實際應用**：準備學習如何將這些基礎概念應用到實際的機器學習任務中

**準備好進入下一個 notebook：`nb02_nn_module_training.ipynb` 了嗎？我們將學習如何建構可重用的神經網路模組和完整的訓練流程。**