In [None]:
pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [None]:
%%writefile q1.cu
#include <iostream>
#include <cuda_runtime.h>
#include <cstdlib>
#include <ctime>
#include <chrono>

using namespace std;

// Element-wise vector addition kernel: C[k] = A[k] + B[k] for 0 <= k < N.
//
// Each thread handles at most one element, selected by its flat 1D global
// index (blockIdx.x * blockDim.x + threadIdx.x). Threads whose index falls
// at or beyond N exit without touching memory, so the grid may be rounded
// up past N.
//
// A, B : input arrays of at least N floats, dereferenced on the device.
// C    : output array of at least N floats, written on the device.
// N    : number of elements to add.
__global__ void vectorAddCUDA(const float* A, const float* B, float* C, int N) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Tail guard: surplus threads in the last block have nothing to do.
    if (gid >= N) {
        return;
    }

    const float lhs = A[gid];
    const float rhs = B[gid];
    C[gid] = lhs + rhs;
}

// Serial host reference for vector addition: C[k] = A[k] + B[k], 0 <= k < N.
//
// A, B : input arrays of at least N floats.
// C    : output array of at least N floats.
// N    : number of elements; nothing is written when N <= 0.
void vectorAddCPU(const float* A, const float* B, float* C, int N) {
    int k = 0;
    while (k < N) {
        const float sum = A[k] + B[k];
        C[k] = sum;
        ++k;
    }
}

// Terminates the program with a diagnostic if a CUDA runtime call failed.
// `what` names the operation so the message points at the failing step.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Benchmarks vector addition on the CPU against the CUDA kernel.
//
// Steps: fill two random host vectors, time the serial CPU add, copy the
// inputs to the device, time the kernel (launch + synchronize), copy the
// result back, verify it against the CPU result and print the speedup.
//
// Returns 0 on success, EXIT_FAILURE if host allocation fails or the GPU
// result disagrees with the CPU result. Any CUDA error terminates the
// program via checkCuda().
int main() {
    const int N = 10000000;
    const size_t size = static_cast<size_t>(N) * sizeof(float);

    float *h_A = static_cast<float*>(std::malloc(size));
    float *h_B = static_cast<float*>(std::malloc(size));
    float *h_C_cpu = static_cast<float*>(std::malloc(size)); // Result for CPU
    float *h_C_gpu = static_cast<float*>(std::malloc(size)); // Result for GPU
    if (!h_A || !h_B || !h_C_cpu || !h_C_gpu) {
        std::cerr << "Host allocation of " << size << " bytes failed" << std::endl;
        std::free(h_A);
        std::free(h_B);
        std::free(h_C_cpu);
        std::free(h_C_gpu);
        return EXIT_FAILURE;
    }

    std::srand(static_cast<unsigned>(std::time(0)));
    for (int i = 0; i < N; i++) {
        h_A[i] = static_cast<float>(std::rand()) / RAND_MAX;
        h_B[i] = static_cast<float>(std::rand()) / RAND_MAX;
    }

    // CPU (Serial) Execution
    auto start_cpu = std::chrono::high_resolution_clock::now();
    vectorAddCPU(h_A, h_B, h_C_cpu, N);
    auto end_cpu = std::chrono::high_resolution_clock::now();
    std::chrono::duration<float, std::milli> cpu_duration = end_cpu - start_cpu;
    std::cout << "CPU Execution Time: " << cpu_duration.count() << " ms" << std::endl;

    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    checkCuda(cudaMalloc((void**)&d_A, size), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void**)&d_B, size), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void**)&d_C, size), "cudaMalloc(d_C)");

    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy B to device");

    // Define block and grid sizes (ceil-div so the tail elements are covered)
    const int blockSize = 256;
    const int gridSize = (N + blockSize - 1) / blockSize;

    // Untimed warm-up launch so that any one-time cost of the first launch
    // is not attributed to the kernel itself.
    vectorAddCUDA<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
    checkCuda(cudaGetLastError(), "warm-up kernel launch");
    checkCuda(cudaDeviceSynchronize(), "warm-up kernel execution");

    // GPU (CUDA) Execution
    auto start_gpu = std::chrono::high_resolution_clock::now();
    vectorAddCUDA<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
    // A launch does not return an error itself; a failed launch would
    // otherwise be timed as an (implausibly fast) successful kernel.
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "kernel execution");
    auto end_gpu = std::chrono::high_resolution_clock::now();
    std::chrono::duration<float, std::milli> gpu_duration = end_gpu - start_gpu;
    std::cout << "GPU Execution Time: " << gpu_duration.count() << " ms" << std::endl;

    checkCuda(cudaMemcpy(h_C_gpu, d_C, size, cudaMemcpyDeviceToHost), "copy C to host");

    // Verify the GPU result against the CPU reference before reporting a
    // speedup; a timing for a wrong answer is meaningless.
    const float tolerance = 1e-5f;
    int mismatches = 0;
    for (int i = 0; i < N; i++) {
        float diff = h_C_cpu[i] - h_C_gpu[i];
        if (diff < 0.0f) {
            diff = -diff;
        }
        if (!(diff <= tolerance)) { // also counts NaN as a mismatch
            if (mismatches == 0) {
                std::cerr << "First mismatch at index " << i << ": CPU = "
                          << h_C_cpu[i] << ", GPU = " << h_C_gpu[i] << std::endl;
            }
            ++mismatches;
        }
    }

    int exit_code = 0;
    if (mismatches == 0) {
        std::cout << "Verification: PASSED" << std::endl;
        float speedup = cpu_duration.count() / gpu_duration.count();
        std::cout << "Speedup (CPU Time / GPU Time): " << speedup << std::endl;
    } else {
        std::cerr << "Verification: FAILED (" << mismatches << " of " << N
                  << " elements differ)" << std::endl;
        exit_code = EXIT_FAILURE;
    }

    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");
    std::free(h_A);
    std::free(h_B);
    std::free(h_C_cpu);
    std::free(h_C_gpu);

    return exit_code;
}


Overwriting q1.cu


In [None]:
! nvcc -o q1 q1.cu
!./q1

CPU Execution Time: 58.4208 ms
GPU Execution Time: 0.002466 ms
Speedup (CPU Time / GPU Time): 23690.5


In [None]:
%%writefile q2.cu
#include <iostream>
#include <cuda_runtime.h>
#include <cstdlib>
#include <ctime>
#include <chrono>

using namespace std;



// Element-wise matrix addition kernel: C = A + B for M x N matrices stored
// contiguously with N elements per row (element (r, c) lives at r * N + c).
//
// Each thread handles at most one element: the x dimension of the launch
// selects the column and the y dimension selects the row. Threads that land
// outside the M x N extent exit without touching memory.
//
// A, B : input matrices of at least M * N floats, dereferenced on the device.
// C    : output matrix of at least M * N floats, written on the device.
// M, N : number of rows and columns.
__global__ void matrixAddCUDA(const float* A, const float* B, float* C, int M, int N) {
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int c = blockIdx.x * blockDim.x + threadIdx.x;

    // Edge guard: blocks on the right/bottom border may overhang the matrix.
    if (r >= M || c >= N) {
        return;
    }

    const int flat = r * N + c;
    const float sum = A[flat] + B[flat];
    C[flat] = sum;
}

// Serial host reference for matrix addition: C = A + B for M x N matrices
// stored contiguously with N elements per row.
//
// A, B : input matrices of at least M * N floats.
// C    : output matrix of at least M * N floats.
// M, N : number of rows and columns; nothing is written if either is <= 0.
void matrixAddCPU(const float* A, const float* B, float* C, int M, int N) {
    int r = 0;
    while (r < M) {
        int c = 0;
        while (c < N) {
            const int flat = r * N + c;
            const float sum = A[flat] + B[flat];
            C[flat] = sum;
            ++c;
        }
        ++r;
    }
}

// Terminates the program with a diagnostic if a CUDA runtime call failed.
// `what` names the operation so the message points at the failing step.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Benchmarks M x N matrix addition on the CPU against the CUDA kernel.
//
// Steps: fill two random host matrices, time the serial CPU add, copy the
// inputs to the device, time the kernel (launch + synchronize), copy the
// result back, verify it against the CPU result and print the speedup.
//
// Returns 0 on success, EXIT_FAILURE if host allocation fails or the GPU
// result disagrees with the CPU result. Any CUDA error terminates the
// program via checkCuda().
int main() {
    const int M = 1000; // Number of rows
    const int N = 1000; // Number of columns
    const int count = M * N;
    // Widen before multiplying so the byte count is computed in size_t.
    const size_t size = static_cast<size_t>(M) * static_cast<size_t>(N) * sizeof(float);

    // Allocate host memory
    float *h_A = static_cast<float*>(std::malloc(size));
    float *h_B = static_cast<float*>(std::malloc(size));
    float *h_C_cpu = static_cast<float*>(std::malloc(size)); // Result for CPU
    float *h_C_gpu = static_cast<float*>(std::malloc(size)); // Result for GPU
    if (!h_A || !h_B || !h_C_cpu || !h_C_gpu) {
        std::cerr << "Host allocation of " << size << " bytes failed" << std::endl;
        std::free(h_A);
        std::free(h_B);
        std::free(h_C_cpu);
        std::free(h_C_gpu);
        return EXIT_FAILURE;
    }

    // Initialize matrices A and B with random values
    std::srand(static_cast<unsigned>(std::time(0)));
    for (int i = 0; i < count; i++) {
        h_A[i] = static_cast<float>(std::rand()) / RAND_MAX;
        h_B[i] = static_cast<float>(std::rand()) / RAND_MAX;
    }

    // CPU (Serial) Execution
    auto start_cpu = std::chrono::high_resolution_clock::now();
    matrixAddCPU(h_A, h_B, h_C_cpu, M, N);
    auto end_cpu = std::chrono::high_resolution_clock::now();
    std::chrono::duration<float, std::milli> cpu_duration = end_cpu - start_cpu;
    std::cout << "CPU Execution Time: " << cpu_duration.count() << " ms" << std::endl;

    // Allocate device memory
    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    checkCuda(cudaMalloc((void**)&d_A, size), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void**)&d_B, size), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void**)&d_C, size), "cudaMalloc(d_C)");

    // Copy data from host to device
    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy B to device");

    // Define block and grid sizes (ceil-div so border elements are covered)
    dim3 blockSize(16, 16); // 16x16 threads per block
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x, (M + blockSize.y - 1) / blockSize.y);

    // Untimed warm-up launch so that any one-time cost of the first launch
    // is not attributed to the kernel itself.
    matrixAddCUDA<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N);
    checkCuda(cudaGetLastError(), "warm-up kernel launch");
    checkCuda(cudaDeviceSynchronize(), "warm-up kernel execution");

    // GPU (CUDA) Execution
    auto start_gpu = std::chrono::high_resolution_clock::now();
    matrixAddCUDA<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N);
    // A launch does not return an error itself; a failed launch would
    // otherwise be timed as an (implausibly fast) successful kernel.
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "kernel execution"); // Ensure kernel has finished executing
    auto end_gpu = std::chrono::high_resolution_clock::now();

    std::chrono::duration<float, std::milli> gpu_duration = end_gpu - start_gpu;
    std::cout << "GPU Execution Time: " << gpu_duration.count() << " ms" << std::endl;

    // Copy result from device to host
    checkCuda(cudaMemcpy(h_C_gpu, d_C, size, cudaMemcpyDeviceToHost), "copy C to host");

    // Verify the GPU result against the CPU reference before reporting a
    // speedup; a timing for a wrong answer is meaningless.
    const float tolerance = 1e-5f;
    int mismatches = 0;
    for (int i = 0; i < count; i++) {
        float diff = h_C_cpu[i] - h_C_gpu[i];
        if (diff < 0.0f) {
            diff = -diff;
        }
        if (!(diff <= tolerance)) { // also counts NaN as a mismatch
            if (mismatches == 0) {
                std::cerr << "First mismatch at index " << i << ": CPU = "
                          << h_C_cpu[i] << ", GPU = " << h_C_gpu[i] << std::endl;
            }
            ++mismatches;
        }
    }

    int exit_code = 0;
    if (mismatches == 0) {
        std::cout << "Verification: PASSED" << std::endl;
        // Calculate speedup
        float speedup = cpu_duration.count() / gpu_duration.count();
        std::cout << "Speedup (CPU Time / GPU Time): " << speedup << std::endl;
    } else {
        std::cerr << "Verification: FAILED (" << mismatches << " of " << count
                  << " elements differ)" << std::endl;
        exit_code = EXIT_FAILURE;
    }

    // Free device and host memory
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");
    std::free(h_A);
    std::free(h_B);
    std::free(h_C_cpu);
    std::free(h_C_gpu);

    return exit_code;
}


Overwriting q2.cu


In [None]:
!nvcc -o q2 q2.cu
!./q2

CPU Execution Time: 11.8038 ms
GPU Execution Time: 0.003225 ms
Speedup (CPU Time / GPU Time): 3660.1
