In [None]:
code = r"""
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <cuda_runtime.h>
#include <chrono>

#define BLOCK_SIZE 256

// Per-block minimum reduction.
//
// Each block loads up to blockDim.x consecutive elements of `input` into
// shared memory, folds them down to a single minimum, and writes that value
// to output[blockIdx.x]. The caller must combine the per-block results.
//
// Launch layout: 1-D grid of 1-D blocks. The shared buffer holds BLOCK_SIZE
// ints and is indexed by threadIdx.x, so blockDim.x must not exceed
// BLOCK_SIZE. The halving loop only visits every slot when blockDim.x is a
// power of two.
//
//   input  - device array of `size` ints
//   output - device array with at least gridDim.x ints
//   size   - number of valid elements in `input`
__global__ void reduceMin(int* input, int* output, int size) {
    __shared__ int scratch[BLOCK_SIZE];
    const unsigned int lane = threadIdx.x;
    const unsigned int gid = blockIdx.x * blockDim.x + lane;

    // Threads past the end of the data contribute INT_MAX, the identity of min.
    int loaded = INT_MAX;
    if (gid < size) {
        loaded = input[gid];
    }
    scratch[lane] = loaded;
    __syncthreads();

    // Tree reduction: each pass folds the upper half of the live range onto
    // the lower half. The barrier sits outside the branch so every thread in
    // the block reaches it.
    unsigned int half = blockDim.x / 2;
    while (half > 0) {
        if (lane < half) {
            const int partner = scratch[lane + half];
            if (partner < scratch[lane]) {
                scratch[lane] = partner;
            }
        }
        __syncthreads();
        half >>= 1;
    }

    // Thread 0 publishes this block's partial result.
    if (lane == 0) {
        output[blockIdx.x] = scratch[0];
    }
}

// Per-block maximum reduction.
//
// Each block loads up to blockDim.x consecutive elements of `input` into
// shared memory, folds them down to a single maximum, and writes that value
// to output[blockIdx.x]. The caller must combine the per-block results.
//
// Launch layout: 1-D grid of 1-D blocks. The shared buffer holds BLOCK_SIZE
// ints and is indexed by threadIdx.x, so blockDim.x must not exceed
// BLOCK_SIZE. The halving loop only visits every slot when blockDim.x is a
// power of two.
//
//   input  - device array of `size` ints
//   output - device array with at least gridDim.x ints
//   size   - number of valid elements in `input`
__global__ void reduceMax(int* input, int* output, int size) {
    __shared__ int scratch[BLOCK_SIZE];
    const unsigned int lane = threadIdx.x;
    const unsigned int gid = blockIdx.x * blockDim.x + lane;

    // Threads past the end of the data contribute INT_MIN, the identity of max.
    int loaded = INT_MIN;
    if (gid < size) {
        loaded = input[gid];
    }
    scratch[lane] = loaded;
    __syncthreads();

    // Tree reduction: each pass folds the upper half of the live range onto
    // the lower half. The barrier sits outside the branch so every thread in
    // the block reaches it.
    unsigned int half = blockDim.x / 2;
    while (half > 0) {
        if (lane < half) {
            const int partner = scratch[lane + half];
            if (partner > scratch[lane]) {
                scratch[lane] = partner;
            }
        }
        __syncthreads();
        half >>= 1;
    }

    // Thread 0 publishes this block's partial result.
    if (lane == 0) {
        output[blockIdx.x] = scratch[0];
    }
}

// Per-block sum reduction.
//
// Each block loads up to blockDim.x consecutive elements of `input` into
// shared memory, adds them together, and writes the block total to
// output[blockIdx.x]. The caller must add up the per-block totals.
//
// Launch layout: 1-D grid of 1-D blocks. The shared buffer holds BLOCK_SIZE
// ints and is indexed by threadIdx.x, so blockDim.x must not exceed
// BLOCK_SIZE. The halving loop only visits every slot when blockDim.x is a
// power of two. Accumulation is done in plain `int`.
//
//   input  - device array of `size` ints
//   output - device array with at least gridDim.x ints
//   size   - number of valid elements in `input`
__global__ void reduceSum(int* input, int* output, int size) {
    __shared__ int scratch[BLOCK_SIZE];
    const unsigned int lane = threadIdx.x;
    const unsigned int gid = blockIdx.x * blockDim.x + lane;

    // Threads past the end of the data contribute 0, the identity of addition.
    int loaded = 0;
    if (gid < size) {
        loaded = input[gid];
    }
    scratch[lane] = loaded;
    __syncthreads();

    // Tree reduction: each pass adds the upper half of the live range onto
    // the lower half. The barrier sits outside the branch so every thread in
    // the block reaches it.
    unsigned int half = blockDim.x / 2;
    while (half > 0) {
        if (lane < half) {
            scratch[lane] = scratch[lane] + scratch[lane + half];
        }
        __syncthreads();
        half >>= 1;
    }

    // Thread 0 publishes this block's partial result.
    if (lane == 0) {
        output[blockIdx.x] = scratch[0];
    }
}

// Aborts the program if a CUDA runtime call failed.
//
//   err - status returned by a CUDA runtime call
//   msg - short label identifying the call, included in the diagnostic
//
// Returns `err` unchanged when it equals cudaSuccess. Otherwise prints the
// label and the runtime's error description to stderr and terminates the
// process with EXIT_FAILURE.
//
// Note: this source is embedded in a Python raw string, so the newline escape
// must be written with a single backslash; a doubled backslash would reach
// the compiler as an escaped backslash and print a literal backslash-n.
inline cudaError_t checkCudaError(cudaError_t err, const char* msg) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA Error: %s - %s\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    return err;
}

// Host driver: computes min / max / sum / average of a random int array on
// the CPU and on the GPU, prints both result sets with their timings, and
// verifies that they agree.
//
// Returns 0 when the GPU results match the CPU reference, EXIT_FAILURE when
// they differ or a host allocation fails. Any failing CUDA call terminates
// the process through checkCudaError.
int main() {
    const int size = 1 << 10;
    const size_t input_bytes = size * sizeof(int);

    int* h_input = (int*)malloc(input_bytes);
    if (h_input == NULL) {
        fprintf(stderr, "Host allocation failed: h_input\n");
        return EXIT_FAILURE;
    }
    // Values in [1, 100]; rand() is left unseeded so runs are reproducible.
    for (int i = 0; i < size; i++) {
        h_input[i] = rand() % 100 + 1;
    }

    // ---- CPU reference ----
    int cpu_min = INT_MAX, cpu_max = INT_MIN, cpu_sum = 0;
    auto start_cpu = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < size; i++) {
        cpu_min = (h_input[i] < cpu_min) ? h_input[i] : cpu_min;
        cpu_max = (h_input[i] > cpu_max) ? h_input[i] : cpu_max;
        cpu_sum += h_input[i];
    }
    auto end_cpu = std::chrono::high_resolution_clock::now();
    double cpu_time = std::chrono::duration<double, std::milli>(end_cpu - start_cpu).count();
    float cpu_avg = (float)cpu_sum / size;

    // ---- Device buffers ----
    int *d_input, *d_output_min, *d_output_max, *d_output_sum;
    // Ceil-divide so a trailing partial block is still launched.
    const int gridSize = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;
    const size_t partial_bytes = gridSize * sizeof(int);

    checkCudaError(cudaMalloc((void**)&d_input, input_bytes), "Malloc d_input");
    checkCudaError(cudaMalloc((void**)&d_output_min, partial_bytes), "Malloc d_output_min");
    checkCudaError(cudaMalloc((void**)&d_output_max, partial_bytes), "Malloc d_output_max");
    checkCudaError(cudaMalloc((void**)&d_output_sum, partial_bytes), "Malloc d_output_sum");

    checkCudaError(cudaMemcpy(d_input, h_input, input_bytes, cudaMemcpyHostToDevice), "Memcpy input");

    // ---- Timed kernel launches ----
    cudaEvent_t start_gpu, stop_gpu;
    checkCudaError(cudaEventCreate(&start_gpu), "Create start event");
    checkCudaError(cudaEventCreate(&stop_gpu), "Create stop event");
    checkCudaError(cudaEventRecord(start_gpu), "Record start event");

    reduceMin<<<gridSize, BLOCK_SIZE>>>(d_input, d_output_min, size);
    reduceMax<<<gridSize, BLOCK_SIZE>>>(d_input, d_output_max, size);
    reduceSum<<<gridSize, BLOCK_SIZE>>>(d_input, d_output_sum, size);

    // Launch-configuration errors are reported here; faults raised while the
    // kernels run are reported by the synchronizing call below.
    checkCudaError(cudaGetLastError(), "Kernel launch failed");
    checkCudaError(cudaEventRecord(stop_gpu), "Record stop event");
    checkCudaError(cudaEventSynchronize(stop_gpu), "Kernel execution failed");

    float gpu_time = 0.0f;
    checkCudaError(cudaEventElapsedTime(&gpu_time, start_gpu, stop_gpu), "Elapsed time");

    // ---- Fetch per-block partial results ----
    int* h_output_min = (int*)malloc(partial_bytes);
    int* h_output_max = (int*)malloc(partial_bytes);
    int* h_output_sum = (int*)malloc(partial_bytes);
    if (h_output_min == NULL || h_output_max == NULL || h_output_sum == NULL) {
        fprintf(stderr, "Host allocation failed: partial result buffers\n");
        return EXIT_FAILURE;
    }

    checkCudaError(cudaMemcpy(h_output_min, d_output_min, partial_bytes, cudaMemcpyDeviceToHost), "Memcpy min output");
    checkCudaError(cudaMemcpy(h_output_max, d_output_max, partial_bytes, cudaMemcpyDeviceToHost), "Memcpy max output");
    checkCudaError(cudaMemcpy(h_output_sum, d_output_sum, partial_bytes, cudaMemcpyDeviceToHost), "Memcpy sum output");

    // Each kernel leaves one value per block; finish the reduction on the host.
    int gpu_min = INT_MAX, gpu_max = INT_MIN, gpu_sum = 0;
    for (int i = 0; i < gridSize; i++) {
        gpu_min = (h_output_min[i] < gpu_min) ? h_output_min[i] : gpu_min;
        gpu_max = (h_output_max[i] > gpu_max) ? h_output_max[i] : gpu_max;
        gpu_sum += h_output_sum[i];
    }
    float gpu_avg = (float)gpu_sum / size;

    // ---- Report ----
    printf("\nCPU Results:\n");
    printf("Min: %d | Max: %d | Sum: %d | Avg: %.2f | Time: %.2f ms\n", cpu_min, cpu_max, cpu_sum, cpu_avg, cpu_time);

    printf("\nGPU Results:\n");
    printf("Min: %d | Max: %d | Sum: %d | Avg: %.2f | Time: %.2f ms\n", gpu_min, gpu_max, gpu_sum, gpu_avg, gpu_time);

    // Integer results must match exactly; the averages are derived from the
    // sums, so they need no separate comparison.
    const bool results_match =
        (cpu_min == gpu_min) && (cpu_max == gpu_max) && (cpu_sum == gpu_sum);
    if (!results_match) {
        fprintf(stderr, "Mismatch between CPU and GPU results\n");
    }

    // ---- Cleanup ----
    free(h_input);
    free(h_output_min);
    free(h_output_max);
    free(h_output_sum);
    checkCudaError(cudaFree(d_input), "Free d_input");
    checkCudaError(cudaFree(d_output_min), "Free d_output_min");
    checkCudaError(cudaFree(d_output_max), "Free d_output_max");
    checkCudaError(cudaFree(d_output_sum), "Free d_output_sum");
    checkCudaError(cudaEventDestroy(start_gpu), "Destroy start event");
    checkCudaError(cudaEventDestroy(stop_gpu), "Destroy stop event");

    return results_match ? 0 : EXIT_FAILURE;
}
"""


# Dump the CUDA program text held in `code` to main.cu (overwriting any
# existing file) so it can be compiled from disk.
with open("main.cu", mode="w") as cuda_src:
    cuda_src.write(code)

In [None]:
!nvcc -arch=sm_75 main.cu -o main

In [None]:
!./main