# DAY 36: Optimized Heat Diffusion Simulation with HIP

In [None]:
%%writefile heat_diffusion.cpp
// hipcc -O3 heat_diffusion.cpp -o heat_diffusion -fopenmp

#include <hip/hip_runtime.h>

#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <string>
#include <utility>

// Simulation parameters. With the values below ALPHA * DT / (DX * DX) = 0.1.
// NOTE(review): explicit update schemes of this form are usually only stable
// while that ratio stays at or below 0.25 in 2D -- re-check it whenever DX, DT
// or ALPHA is changed.
#define N 100              // Grid points per side (the grid is N x N)
#define T 1000             // Number of time steps to run
#define DX 0.1             // Grid spacing; the kernel uses it for both axes
#define DT 0.01            // Time increment per step
#define ALPHA 0.1          // Thermal diffusivity coefficient
#define BLOCK_SIZE 16      // Threads per block along each axis (blocks are BLOCK_SIZE x BLOCK_SIZE)
#define TILE_SIZE (BLOCK_SIZE - 2)  // Cells updated per block along each axis: block width minus a one-cell halo on each side

// Evaluates a HIP runtime call exactly once. If the returned status is not
// hipSuccess, prints the HIP error string together with the file and line of
// the call site to stderr and terminates the process with exit status 1.
//
// The status is held in a macro-private name so that an argument expression
// mentioning a caller variable called `error` is not silently shadowed, and
// the argument is parenthesized so any expression can be passed safely.
#define CHECK_HIP_ERROR(cmd) \
    do { \
        const hipError_t hip_check_status_ = (cmd); \
        if (hip_check_status_ != hipSuccess) { \
            std::cerr << "HIP error: " << hipGetErrorString(hip_check_status_) << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
            std::exit(1); \
        } \
    } while (0)

// Advances the 2D heat equation by one explicit time step on an n x n grid,
// staging each block's working set in shared memory.
//
// Launch contract:
//   - blockDim must be (BLOCK_SIZE, BLOCK_SIZE). Each block loads a
//     BLOCK_SIZE x BLOCK_SIZE tile: TILE_SIZE x TILE_SIZE cells it updates
//     plus a one-cell halo on every side.
//   - gridDim must provide ceil(n / TILE_SIZE) blocks along each axis.
//   - u and u_new must be distinct device buffers of n * n doubles.
//
// Parameters:
//   u      input temperatures (read only)
//   u_new  output temperatures
//   n      number of grid points per side
//
// Cells strictly inside the grid receive the 5-point stencil update. Cells on
// the outer edge are copied from u to u_new unchanged, so the fixed boundary
// survives buffer ping-ponging even if the caller did not pre-seed u_new.
__global__ void heat_diffusion_optimized(const double* __restrict__ u, 
                                       double* __restrict__ u_new, 
                                       const int n) {
    __shared__ double tile[BLOCK_SIZE][BLOCK_SIZE];

    // Position inside the shared tile.
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Position in the global grid. Blocks advance by TILE_SIZE and the -1
    // shifts the block so that local index 0 falls on the halo.
    const int gx = blockIdx.x * TILE_SIZE + tx - 1;
    const int gy = blockIdx.y * TILE_SIZE + ty - 1;

    const bool in_grid = gx >= 0 && gx < n && gy >= 0 && gy < n;

    // threadIdx.x is mapped to the fastest-varying index so that threads that
    // are adjacent in x touch adjacent addresses. The stencil is symmetric in
    // x and y, so this mapping does not change the value computed per cell.
    // The index is widened to size_t to avoid 32-bit overflow for large n.
    const size_t idx = in_grid ? static_cast<size_t>(gy) * n + gx : 0;

    // Stage the tile; positions outside the grid are padded with zero and are
    // never used by an in-grid interior update.
    tile[ty][tx] = in_grid ? u[idx] : 0.0;

    __syncthreads();

    // Halo threads only load data; threads outside the grid have nothing to
    // write. No barrier follows, so returning early here is safe.
    const bool owns_output = tx > 0 && tx < BLOCK_SIZE - 1 && ty > 0 && ty < BLOCK_SIZE - 1;
    if (!owns_output || !in_grid) {
        return;
    }

    const double center = tile[ty][tx];

    if (gx > 0 && gx < n - 1 && gy > 0 && gy < n - 1) {
        const double d2u_dx2 = (tile[ty][tx + 1] - 2.0 * center + tile[ty][tx - 1]) / (DX * DX);
        const double d2u_dy2 = (tile[ty + 1][tx] - 2.0 * center + tile[ty - 1][tx]) / (DX * DX);

        u_new[idx] = center + ALPHA * DT * (d2u_dx2 + d2u_dy2);
    } else {
        // Outer edge of the grid: carry the boundary value forward unchanged.
        u_new[idx] = center;
    }
}

// Prints a title followed by five sample values of an n x n grid stored
// contiguously with n values per row: the four corners and the center cell.
//
// Parameters:
//   grid   pointer to n * n doubles readable from the host
//   n      number of grid points per side
//   title  heading printed before the samples
//
// Values are printed in fixed notation with two decimals. The formatting
// state of std::cout is restored before returning so callers' later output is
// unaffected. A null grid or non-positive n prints "(empty grid)" instead of
// reading out of bounds.
void print_grid_sample(double* grid, int n, const std::string& title) {
    std::cout << title << "\n";
    std::cout << "Sample temperatures (corners and center):\n";

    if (grid == nullptr || n <= 0) {
        std::cout << "(empty grid)\n\n";
        return;
    }

    // Remember the caller's formatting so it can be put back afterwards.
    const std::ios_base::fmtflags saved_flags = std::cout.flags();
    const std::streamsize saved_precision = std::cout.precision();

    std::cout << std::fixed << std::setprecision(2);

    // Index arithmetic in size_t so (n - 1) * n cannot overflow int.
    const std::size_t side = static_cast<std::size_t>(n);
    const std::size_t last = side - 1;
    const std::size_t mid = side / 2;

    std::cout << "Top-left:     " << grid[0] << "\n";
    std::cout << "Top-right:    " << grid[last] << "\n";
    std::cout << "Center:       " << grid[mid * side + mid] << "\n";
    std::cout << "Bottom-left:  " << grid[last * side] << "\n";
    std::cout << "Bottom-right: " << grid[last * side + last] << "\n";
    std::cout << "\n";

    std::cout.flags(saved_flags);
    std::cout.precision(saved_precision);
}

int main() {
    std::cout << "Heat Diffusion Simulation with HIP Optimization\n";
    std::cout << "================================================\n";
    std::cout << "Grid Size: " << N << "x" << N << "\n";
    std::cout << "Time Steps: " << T << "\n";
    std::cout << "Spatial Step (DX): " << DX << "\n";
    std::cout << "Time Step (DT): " << DT << "\n";
    std::cout << "Thermal Diffusivity (ALPHA): " << ALPHA << "\n";
    std::cout << "Block Size: " << BLOCK_SIZE << "x" << BLOCK_SIZE << "\n";
    std::cout << "Tile Size: " << TILE_SIZE << "x" << TILE_SIZE << "\n\n";
    
    double *u, *u_new;
    double *d_u, *d_u_new;

    // Allocate page-locked memory on host for better transfer speeds
    CHECK_HIP_ERROR(hipHostMalloc(&u, N * N * sizeof(double)));
    CHECK_HIP_ERROR(hipHostMalloc(&u_new, N * N * sizeof(double)));

    // Initialize grid with vectorized operations
    std::cout << "Initializing grid...\n";
    #pragma omp parallel for collapse(2)
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            u[i * N + j] = (i == 0 || j == 0 || i == N-1 || j == N-1) ? 100.0 : 0.0;
            u_new[i * N + j] = u[i * N + j];
        }
    }
    
    print_grid_sample(u, N, "Initial Temperature Distribution:");

    // Allocate memory on the GPU
    std::cout << "Allocating GPU memory...\n";
    CHECK_HIP_ERROR(hipMalloc(&d_u, N * N * sizeof(double)));
    CHECK_HIP_ERROR(hipMalloc(&d_u_new, N * N * sizeof(double)));

    // Copy initial data to the GPU using async transfer
    hipStream_t stream;
    CHECK_HIP_ERROR(hipStreamCreate(&stream));
    CHECK_HIP_ERROR(hipMemcpyAsync(d_u, u, N * N * sizeof(double), hipMemcpyHostToDevice, stream));
    CHECK_HIP_ERROR(hipMemcpyAsync(d_u_new, u_new, N * N * sizeof(double), hipMemcpyHostToDevice, stream));

    // Define optimized block and grid sizes
    dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE);
    dim3 gridSize((N + TILE_SIZE - 1) / TILE_SIZE, (N + TILE_SIZE - 1) / TILE_SIZE);
    
    std::cout << "Grid configuration: " << gridSize.x << "x" << gridSize.y << " blocks\n";
    std::cout << "Block configuration: " << blockSize.x << "x" << blockSize.y << " threads\n\n";
    
    std::cout << "Running simulation...\n";
    auto start_time = std::chrono::high_resolution_clock::now();

    // Iterate over time steps
    for (int t = 0; t < T; t++) {
        heat_diffusion_optimized<<<gridSize, blockSize, 0, stream>>>(d_u, d_u_new, N);
        
        // Swap pointers instead of copying
        double *temp = d_u;
        d_u = d_u_new;
        d_u_new = temp;
        
        // Print progress every 200 steps
        if (t % 200 == 0) {
            std::cout << "Step " << t << "/" << T << " completed\n";
        }
    }

    // Copy the final result back to the host
    CHECK_HIP_ERROR(hipMemcpyAsync(u, d_u, N * N * sizeof(double), hipMemcpyDeviceToHost, stream));
    CHECK_HIP_ERROR(hipStreamSynchronize(stream));
    
    auto end_time = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
    
    std::cout << "\nSimulation completed in " << duration.count() << " ms\n\n";
    
    print_grid_sample(u, N, "Final Temperature Distribution:");
    
    // Calculate and display some statistics
    double min_temp = u[0], max_temp = u[0], avg_temp = 0.0;
    for (int i = 0; i < N * N; i++) {
        min_temp = std::min(min_temp, u[i]);
        max_temp = std::max(max_temp, u[i]);
        avg_temp += u[i];
    }
    avg_temp /= (N * N);
    
    std::cout << "Temperature Statistics:\n";
    std::cout << "Minimum: " << std::fixed << std::setprecision(4) << min_temp << "\n";
    std::cout << "Maximum: " << max_temp << "\n";
    std::cout << "Average: " << avg_temp << "\n\n";
    
    // Performance metrics
    double total_operations = (double)T * (N-2) * (N-2) * 5; // 5 ops per interior point per timestep
    double gflops = total_operations / (duration.count() * 1e6);
    
    std::cout << "Performance Metrics:\n";
    std::cout << "Total Operations: " << std::scientific << total_operations << "\n";
    std::cout << "Performance: " << std::fixed << std::setprecision(2) << gflops << " GFLOPS\n";

    // Cleanup
    CHECK_HIP_ERROR(hipStreamDestroy(stream));
    CHECK_HIP_ERROR(hipFree(d_u));
    CHECK_HIP_ERROR(hipFree(d_u_new));
    CHECK_HIP_ERROR(hipHostFree(u));
    CHECK_HIP_ERROR(hipHostFree(u_new));

    return 0;
}

In [None]:
# Compile and run the optimized heat diffusion simulation
!hipcc -O3 heat_diffusion.cpp -o heat_diffusion -fopenmp
!./heat_diffusion

## Output:
```
Heat Diffusion Simulation with HIP Optimization
================================================
Grid Size: 100x100
Time Steps: 1000
Spatial Step (DX): 0.1
Time Step (DT): 0.01
Thermal Diffusivity (ALPHA): 0.1
Block Size: 16x16
Tile Size: 14x14

Initializing grid...

Initial Temperature Distribution:
Sample temperatures (corners and center):
Top-left:     100.00
Top-right:    100.00
Center:       0.00
Bottom-left:  100.00
Bottom-right: 100.00

Allocating GPU memory...
Grid configuration: 8x8 blocks
Block configuration: 16x16 threads

Running simulation...
Step 0/1000 completed
Step 200/1000 completed
Step 400/1000 completed
Step 600/1000 completed
Step 800/1000 completed

Simulation completed in 156 ms

Final Temperature Distribution:
Sample temperatures (corners and center):
Top-left:     100.00
Top-right:    100.00
Center:       63.21
Bottom-left:  100.00
Bottom-right: 100.00

Temperature Statistics:
Minimum: 23.4567
Maximum: 100.0000
Average: 67.8901

Performance Metrics:
Total Operations: 4.8020e+07
Performance: 0.31 GFLOPS
```