# CUDA编程模型 - 实践篇

本notebook通过实际代码帮助你理解CUDA的线程组织结构。

**学习目标：**
- 编写第一个CUDA Kernel
- 理解Grid/Block/Thread的索引计算
- 掌握1D/2D配置的使用方法


In [None]:
%load_ext nvcc4jupyter


## 1. Hello World - 第一个CUDA Kernel

每个线程打印自己的索引信息，观察线程的组织结构。


In [None]:
%%cuda
#include <stdio.h>

// Kernel: every thread prints its global index, its block index and its
// index inside that block.
// `__global__` marks a kernel function: it is called from the CPU and
// executes on the GPU.
// Launch layout: only the .x components of the built-in index variables are
// read, so a 1D grid of 1D blocks is expected.
__global__ void helloKernel() {
    // Global thread index = offset of this block + position inside the block.
    int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;

    // blockIdx.x and threadIdx.x are unsigned int, so they are printed with
    // %u; globalIdx is a signed int and keeps %d.
    printf("Hello from Thread %d (Block %u, Thread %u in block)\n",
           globalIdx, blockIdx.x, threadIdx.x);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char* what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA错误 [%s]: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Launches helloKernel with 2 blocks of 4 threads and waits for completion.
// Returns 0 on success, 1 when the launch or the execution reported an error.
int main() {
    printf("==========================================\n");
    printf("          CUDA Hello World\n");
    printf("==========================================\n\n");

    // Launch configuration: 2 blocks, 4 threads per block, 8 threads in total.
    printf("启动配置: <<<2, 4>>> (2 Blocks × 4 Threads = 8 Threads)\n\n");

    helloKernel<<<2, 4>>>();

    // A kernel launch has no return value; launch failures (bad configuration,
    // no usable device, ...) have to be queried explicitly.
    if (!cudaOk(cudaGetLastError(), "helloKernel launch")) {
        return 1;
    }

    // Wait for the GPU to finish; errors raised while the kernel was running
    // are reported by this call.
    if (!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
        return 1;
    }

    printf("\n注意: 线程的打印顺序可能是乱序的，因为它们是并行执行的！\n");

    return 0;
}


## 2. 向量加法 - 1D线程配置

这是CUDA编程的经典入门示例：两个向量相加。


In [None]:
%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

// Vector addition kernel: writes c[i] = a[i] + b[i] for every i in [0, n).
// Each thread handles at most one element; only the .x components of the
// built-in index variables are used.
__global__ void vectorAdd(float* a, float* b, float* c, int n) {
    // Element handled by this thread: block offset plus in-block position.
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds check: threads whose index falls outside the vector leave
    // without touching memory.
    if (i >= n) {
        return;
    }

    c[i] = a[i] + b[i];
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char* what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA错误 [%s]: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Adds two vectors of length n on the GPU and verifies the result on the host.
// Returns 0 when every step succeeded and the result verified, 1 otherwise.
int main() {
    printf("==========================================\n");
    printf("            向量加法示例\n");
    printf("==========================================\n\n");

    const int n = 1000;  // vector length
    const size_t size = n * sizeof(float);

    // Host buffers.
    float *h_a = (float*)malloc(size);
    float *h_b = (float*)malloc(size);
    float *h_c = (float*)malloc(size);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "Host内存分配失败\n");
        free(h_a); free(h_b); free(h_c);  // free(NULL) is a no-op
        return 1;
    }

    // Input data: a[i] = i, b[i] = 2 * i.
    for (int i = 0; i < n; i++) {
        h_a[i] = i;
        h_b[i] = i * 2;
    }

    // Device buffers. The pointers start as NULL so the cleanup at the end is
    // valid even when an allocation fails part-way through.
    float *d_a = NULL, *d_b = NULL, *d_c = NULL;
    bool ok = cudaOk(cudaMalloc(&d_a, size), "cudaMalloc(d_a)")
           && cudaOk(cudaMalloc(&d_b, size), "cudaMalloc(d_b)")
           && cudaOk(cudaMalloc(&d_c, size), "cudaMalloc(d_c)");

    // Copy the inputs to the GPU (skipped once an earlier step has failed).
    ok = ok
      && cudaOk(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)")
      && cudaOk(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_b)");

    // Grid size is the ceiling of n / threadsPerBlock so every element is
    // covered even when n is not a multiple of the block size.
    int threadsPerBlock = 256;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;

    printf("向量长度: %d\n", n);
    printf("每Block线程数: %d\n", threadsPerBlock);
    printf("Block数量: %d\n", blocksPerGrid);
    printf("总线程数: %d\n\n", blocksPerGrid * threadsPerBlock);

    if (ok) {
        vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);

        // A launch has no return value, so launch errors are queried
        // explicitly; the copy back to the host is checked as well.
        ok = cudaOk(cudaGetLastError(), "vectorAdd launch")
          && cudaOk(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost), "cudaMemcpy(h_c)");
    }

    // Verify against the host-side sum. h_c is only read when the copy back
    // succeeded, so uninitialised memory is never inspected.
    bool success = false;
    if (ok) {
        success = true;
        for (int i = 0; i < n; i++) {
            if (h_c[i] != h_a[i] + h_b[i]) {
                success = false;
                break;
            }
        }

        const int last = n - 1;  // derived from n instead of a hard-coded 999
        printf("验证结果: %s\n", success ? "通过 ✓" : "失败 ✗");
        printf("示例: c[0] = a[0] + b[0] = %.0f + %.0f = %.0f\n", h_a[0], h_b[0], h_c[0]);
        printf("示例: c[%d] = a[%d] + b[%d] = %.0f + %.0f = %.0f\n",
               last, last, last, h_a[last], h_b[last], h_c[last]);
    }

    // Cleanup.
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    free(h_a); free(h_b); free(h_c);

    return (ok && success) ? 0 : 1;
}


## 3. 2D索引 - 矩阵元素访问

对于2D数据（如图像、矩阵），使用2D线程配置更直观。


In [None]:
%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

// Matrix initialisation kernel: each thread handles one element of a
// row-major width x height matrix and stores row * 10 + col into it.
__global__ void matrixInit(float* matrix, int width, int height) {
    // 2D coordinate of this thread: x selects the column, y selects the row.
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Bounds check: threads outside the matrix do nothing.
    if (x >= width || y >= height) {
        return;
    }

    // Row-major linear index, then store the value encoding the coordinate.
    matrix[y * width + x] = y * 10 + x;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char* what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA错误 [%s]: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Initialises an 8 x 4 matrix on the GPU with a 2D launch and prints it.
// Returns 0 on success, 1 when an allocation or a CUDA call failed.
int main() {
    printf("==========================================\n");
    printf("          2D索引示例 - 矩阵初始化\n");
    printf("==========================================\n\n");

    const int width = 8;
    const int height = 4;
    const size_t size = width * height * sizeof(float);

    float *h_matrix = (float*)malloc(size);
    if (h_matrix == NULL) {
        fprintf(stderr, "Host内存分配失败\n");
        return 1;
    }

    float *d_matrix = NULL;
    if (!cudaOk(cudaMalloc(&d_matrix, size), "cudaMalloc(d_matrix)")) {
        free(h_matrix);
        return 1;
    }

    // 2D launch configuration. The host variables are called block/grid so
    // they are not confused with the device built-ins blockDim/gridDim.
    dim3 block(4, 2);  // 4 x 2 = 8 threads per block
    dim3 grid((width + block.x - 1) / block.x,    // ceiling division per axis
              (height + block.y - 1) / block.y);

    // dim3 members are unsigned int, hence %u.
    printf("矩阵大小: %d × %d\n", width, height);
    printf("Block大小: (%u, %u)\n", block.x, block.y);
    printf("Grid大小: (%u, %u)\n\n", grid.x, grid.y);

    matrixInit<<<grid, block>>>(d_matrix, width, height);

    // A launch has no return value, so launch errors are queried explicitly;
    // the copy back to the host is checked as well.
    const bool ok = cudaOk(cudaGetLastError(), "matrixInit launch")
                 && cudaOk(cudaMemcpy(h_matrix, d_matrix, size, cudaMemcpyDeviceToHost),
                           "cudaMemcpy(h_matrix)");

    // Print the matrix only when the data really came back from the GPU.
    if (ok) {
        printf("生成的矩阵 (值 = row*10 + col):\n");
        for (int row = 0; row < height; row++) {
            printf("  ");
            for (int col = 0; col < width; col++) {
                printf("%4.0f ", h_matrix[row * width + col]);
            }
            printf("\n");
        }
    }

    cudaFree(d_matrix);
    free(h_matrix);

    return ok ? 0 : 1;
}


## 4. 理解索引计算

可视化展示线程索引如何映射到数据元素。


In [None]:
%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

// Kernel: every thread prints its block index, its index inside the block,
// its 2D global coordinate and the linear index derived from that coordinate.
__global__ void visualizeIndex() {
    int globalX = blockIdx.x * blockDim.x + threadIdx.x;
    int globalY = blockIdx.y * blockDim.y + threadIdx.y;
    // Linear index: gridDim.x * blockDim.x is the number of threads along x
    // and acts as the row width.
    int globalIdx = globalY * (gridDim.x * blockDim.x) + globalX;

    // blockIdx / threadIdx components are unsigned int, so they use %u; the
    // derived values are signed ints and keep %d.
    printf("Block(%u,%u) Thread(%u,%u) -> Global(%d,%d) -> Linear %d\n",
           blockIdx.x, blockIdx.y,
           threadIdx.x, threadIdx.y,
           globalX, globalY,
           globalIdx);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char* what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA错误 [%s]: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Launches visualizeIndex on a 2 x 2 grid of 2 x 2 blocks and prints the
// index formulas afterwards.
// Returns 0 on success, 1 when the launch or the execution reported an error.
int main() {
    printf("==========================================\n");
    printf("          索引计算可视化\n");
    printf("==========================================\n\n");

    printf("配置: Grid(2,2) × Block(2,2) = 16个线程\n\n");

    // The host variables are called grid/block so they are not confused with
    // the device built-ins gridDim/blockDim read inside the kernel.
    dim3 grid(2, 2);
    dim3 block(2, 2);

    visualizeIndex<<<grid, block>>>();

    // A launch has no return value; launch failures are queried explicitly.
    if (!cudaOk(cudaGetLastError(), "visualizeIndex launch")) {
        return 1;
    }

    // Wait for the kernel; errors raised during execution are reported here.
    if (!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
        return 1;
    }

    printf("\n索引公式:\n");
    printf("  globalX = blockIdx.x * blockDim.x + threadIdx.x\n");
    printf("  globalY = blockIdx.y * blockDim.y + threadIdx.y\n");
    printf("  linearIdx = globalY * totalWidth + globalX\n");

    return 0;
}


## 总结

**关键概念：**

1. **Kernel函数** (`__global__`)
   - 在GPU上执行，由CPU调用
   - 语法: `kernel<<<gridDim, blockDim>>>(args)`

2. **线程索引**
   - `threadIdx`: 线程在Block内的索引
   - `blockIdx`: Block在Grid内的索引
   - `blockDim`/`gridDim`: Block/Grid的维度

3. **全局索引计算**
   - 1D: `idx = blockIdx.x * blockDim.x + threadIdx.x`
   - 2D: `col = blockIdx.x * blockDim.x + threadIdx.x`，`row = blockIdx.y * blockDim.y + threadIdx.y`

4. **常用Block大小**
   - 1D: 128, 256, 512
   - 2D: (16,16), (32,8), (8,32)

## 练习

1. 修改向量加法，尝试不同的Block大小，观察是否有性能差异（提示：示例代码没有计时，可用 `cudaEvent` 给Kernel计时，并把向量长度 `n` 调大，否则差异很难观察到）
2. 实现矩阵加法（两个矩阵相加）
3. 实现一个kernel，将矩阵每个元素乘以其行号
