# 异步内存拷贝 - 实践篇

本notebook演示CUDA Stream和异步内存拷贝的使用方法。

**学习目标：**
- 理解CUDA Stream的概念
- 掌握异步内存拷贝的用法
- 实现计算与数据传输的重叠


In [None]:
%load_ext nvcc4jupyter


## 1. CUDA Stream基础

对比同步拷贝和异步拷贝的执行时间。


In [None]:
%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

// Applies x <- sinf(x) * cosf(x) to one element of `data`, 100 times in a row.
// Launch layout: 1D grid of 1D blocks, one thread per element; threads whose
// flat index is >= n return without touching memory. No shared memory is used.
__global__ void simpleKernel(float* data, int n) {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= n) {
        return;  // surplus thread in the last block
    }

    float* cell = data + tid;
    for (int round = 0; round < 100; ++round) {
        *cell = sinf(*cell) * cosf(*cell);
    }
}

// Evaluates a CUDA runtime call; on failure prints the error with its source
// location and makes the enclosing function return 1. Only usable inside a
// function returning int (here: main). Resources are not released on this
// path; the process is about to exit anyway.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Sets every element of a host buffer to `value`.
static void fillHost(float* buf, int count, float value) {
    for (int i = 0; i < count; i++) buf[i] = value;
}

// Times the same H2D copy -> kernel -> D2H copy sequence twice: once with
// blocking cudaMemcpy on the default stream, once with cudaMemcpyAsync on an
// explicit stream, and prints both durations.
// Returns 0 on success, 1 if any CUDA call fails.
int main() {
    printf("==========================================\n");
    printf("      CUDA Stream 基础演示\n");
    printf("==========================================\n\n");

    int n = 1024 * 1024;
    size_t size = (size_t)n * sizeof(float);

    // Pinned (page-locked) host memory, allocated via cudaMallocHost and used
    // as the host side of the cudaMemcpyAsync calls below.
    float *h_data = NULL;
    CUDA_CHECK(cudaMallocHost(&h_data, size));

    float *d_data = NULL;
    CUDA_CHECK(cudaMalloc(&d_data, size));

    fillHost(h_data, n, 1.0f);

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    int threadsPerBlock = 256;
    int numBlocks = (n + threadsPerBlock - 1) / threadsPerBlock;

    // Warm-up (untimed): the first launch of a kernel pays one-time setup
    // cost that would otherwise be charged to whichever version runs first.
    CUDA_CHECK(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));
    simpleKernel<<<numBlocks, threadsPerBlock>>>(d_data, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Synchronous version: blocking copies on the default stream.
    CUDA_CHECK(cudaEventRecord(start));
    CUDA_CHECK(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));
    simpleKernel<<<numBlocks, threadsPerBlock>>>(d_data, n);
    CUDA_CHECK(cudaGetLastError());  // launch-configuration errors
    CUDA_CHECK(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float time_sync;
    CUDA_CHECK(cudaEventElapsedTime(&time_sync, start, stop));
    printf("同步版本: %.3f ms\n", time_sync);

    // The previous run overwrote h_data with results; restore the input so
    // both versions process identical data.
    fillHost(h_data, n, 1.0f);

    // Asynchronous version: everything, including the timing events, is
    // enqueued on `stream`, so the measured interval covers exactly the work
    // in that stream regardless of default-stream semantics.
    CUDA_CHECK(cudaEventRecord(start, stream));
    CUDA_CHECK(cudaMemcpyAsync(d_data, h_data, size, cudaMemcpyHostToDevice, stream));
    simpleKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(d_data, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(h_data, d_data, size, cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaEventRecord(stop, stream));
    // `stop` is the last item in the stream, so waiting on it also waits for
    // the copies and the kernel enqueued before it.
    CUDA_CHECK(cudaEventSynchronize(stop));

    float time_async;
    CUDA_CHECK(cudaEventElapsedTime(&time_async, start, stop));
    printf("异步版本: %.3f ms\n", time_async);

    printf("\n说明: 单Stream情况下时间相近，\n");
    printf("      异步的优势在于CPU可以同时做其他工作\n");

    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFreeHost(h_data));
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}


## 2. 多Stream并行

使用多个Stream，将数据分块后让不同块的数据传输与kernel计算重叠执行。


In [None]:
%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

// Applies x <- sinf(x) * cosf(x) to one element of `data`, 200 times in a row.
// Launch layout: 1D grid of 1D blocks, one thread per element; `data` points at
// the first element this launch is responsible for and `n` is the element
// count. Threads whose flat index is >= n return without touching memory.
__global__ void processChunk(float* data, int n) {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= n) {
        return;  // surplus thread in the last block
    }

    float* cell = data + tid;
    for (int round = 0; round < 200; ++round) {
        *cell = sinf(*cell) * cosf(*cell);
    }
}

// Evaluates a CUDA runtime call; on failure prints the error with its source
// location and makes the enclosing function return 1. Only usable inside a
// function returning int (here: main). Resources are not released on this
// path; the process is about to exit anyway.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Sets every element of a host buffer to `value`.
static void fillHost(float* buf, int count, float value) {
    for (int i = 0; i < count; i++) buf[i] = value;
}

// Processes one array two ways and prints both timings plus the ratio:
//   1. a single H2D copy -> kernel -> D2H copy on the default stream;
//   2. the array split into chunks, each chunk's copy/kernel/copy sequence
//      enqueued on its own stream.
// Returns 0 on success, 1 if any CUDA call fails.
int main() {
    printf("==========================================\n");
    printf("        多Stream并行演示\n");
    printf("==========================================\n\n");

    const int numStreams = 4;
    int n = 4 * 1024 * 1024;  // 4M elements
    // Ceil-division so that every element belongs to some chunk even when n
    // is not a multiple of numStreams; the last chunk may be shorter.
    int chunkSize = (n + numStreams - 1) / numStreams;
    size_t totalSize = (size_t)n * sizeof(float);

    // Pinned (page-locked) host memory, allocated via cudaMallocHost and used
    // as the host side of the cudaMemcpyAsync calls below.
    float *h_data = NULL;
    CUDA_CHECK(cudaMallocHost(&h_data, totalSize));

    float *d_data = NULL;
    CUDA_CHECK(cudaMalloc(&d_data, totalSize));

    fillHost(h_data, n, 1.0f);

    cudaStream_t streams[numStreams];
    for (int i = 0; i < numStreams; i++) {
        CUDA_CHECK(cudaStreamCreate(&streams[i]));
    }

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    int threadsPerBlock = 256;
    // Ceil-division: a floor here would leave the tail unprocessed whenever n
    // is not a multiple of threadsPerBlock.
    int blocksTotal = (n + threadsPerBlock - 1) / threadsPerBlock;

    printf("总数据量: %d 元素\n", n);
    printf("Stream数量: %d\n", numStreams);
    printf("每Stream处理: %d 元素\n\n", chunkSize);

    // Warm-up (untimed): the first launch of a kernel pays one-time setup
    // cost that would otherwise inflate the single-stream baseline and the
    // reported speedup.
    CUDA_CHECK(cudaMemcpy(d_data, h_data, totalSize, cudaMemcpyHostToDevice));
    processChunk<<<blocksTotal, threadsPerBlock>>>(d_data, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Single-stream version.
    CUDA_CHECK(cudaEventRecord(start));
    CUDA_CHECK(cudaMemcpy(d_data, h_data, totalSize, cudaMemcpyHostToDevice));
    processChunk<<<blocksTotal, threadsPerBlock>>>(d_data, n);
    CUDA_CHECK(cudaGetLastError());  // launch-configuration errors
    CUDA_CHECK(cudaMemcpy(h_data, d_data, totalSize, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float time_single;
    CUDA_CHECK(cudaEventElapsedTime(&time_single, start, stop));
    printf("单Stream: %.3f ms\n", time_single);

    // The previous run overwrote h_data with results; restore the input so
    // both versions process identical data.
    fillHost(h_data, n, 1.0f);

    // Multi-stream version: each chunk gets its own copy-in / kernel /
    // copy-out sequence on its own stream.
    CUDA_CHECK(cudaEventRecord(start));
    for (int i = 0; i < numStreams; i++) {
        int offset = i * chunkSize;
        if (offset >= n) break;  // more streams than remaining data
        int remaining = n - offset;
        int count = remaining < chunkSize ? remaining : chunkSize;
        size_t chunkBytes = (size_t)count * sizeof(float);
        int blocksPerChunk = (count + threadsPerBlock - 1) / threadsPerBlock;

        CUDA_CHECK(cudaMemcpyAsync(d_data + offset, h_data + offset, chunkBytes,
                                   cudaMemcpyHostToDevice, streams[i]));
        processChunk<<<blocksPerChunk, threadsPerBlock, 0, streams[i]>>>(
            d_data + offset, count);
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaMemcpyAsync(h_data + offset, d_data + offset, chunkBytes,
                                   cudaMemcpyDeviceToHost, streams[i]));
    }
    // Wait for every stream before recording `stop`, so the interval covers
    // all chunks.
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float time_multi;
    CUDA_CHECK(cudaEventElapsedTime(&time_multi, start, stop));
    printf("多Stream: %.3f ms\n", time_multi);
    printf("加速比:   %.2fx\n", time_single / time_multi);

    // Cleanup.
    for (int i = 0; i < numStreams; i++) {
        CUDA_CHECK(cudaStreamDestroy(streams[i]));
    }
    CUDA_CHECK(cudaFreeHost(h_data));
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}


## 总结

**关键要点：**

1. **CUDA Stream** - 命令队列，同一Stream内顺序执行，不同Stream可并行
2. **异步拷贝** - 使用cudaMemcpyAsync + Pinned Memory（主机内存若不是Pinned Memory，拷贝无法与kernel计算真正重叠）
3. **多Stream策略** - 将工作分块分配到不同Stream，实现传输与计算重叠

**练习：** 尝试不同的Stream数量，找出最优值；用双缓冲（double buffering）实现数据传输与kernel计算的流水线
