## CPU vs GPU Performance with vector size

In [16]:
%%writefile 1.cu

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <chrono>

__global__ void sumArraysOnGPU(int* A, int *B, int *C, const int N){
	int tid = threadIdx.x + blockIdx.x * blockDim.x;

	if(tid < N)
		C[tid] = A[tid] + B[tid];
}

void sumArrayOnHost(int *A, int *B, int *C, const int N){
	for(int id = 0; id<N ; id++){
		C[id] = A[id]+B[id];
	}
}

int main(){
	int N = 1000;
	printf("vector size %d\n", N);

	size_t nBytes = N * sizeof(int);

	int *h_A, *h_B, *h_C, *h_result;
	int *d_A, *d_B, *d_C;

	h_A = (int*)malloc(nBytes);
	h_B = (int*)malloc(nBytes);
	h_C = (int*)malloc(nBytes);
	h_result = (int*)malloc(nBytes);

	cudaMalloc((int**)&d_A, nBytes);
	cudaMalloc((int**)&d_B, nBytes);
	cudaMalloc((int**)&d_C, nBytes);

	for(int i=0; i<N; i++){
		h_A[i]=i+1;
		h_B[i]=i+1;
		h_C[i]=0;
		h_result[i] = 0;
	}


	auto start_time = std::chrono::high_resolution_clock::now();
	sumArrayOnHost(h_A, h_B, h_C, N);
	auto end_time = std::chrono::high_resolution_clock::now();

	auto duration_ns = std::chrono::duration_cast < std::chrono::nanoseconds> (end_time - start_time).count();
	double seconds_cpu = duration_ns / 1000000000.0;

	std::cout<<"Time take by CPU: "<<seconds_cpu<<" seconds"<<std::endl;


	cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
	cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);
	cudaMemcpy(d_C, h_C, nBytes, cudaMemcpyHostToDevice);

	int threadsPerBlock = 1024;
	int blockPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

	cudaEventRecord(start);
	sumArraysOnGPU<<<blockPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
	cudaEventSynchronize(stop);
	cudaEventRecord(stop);

	float milli_seconds_device = 0;
	cudaEventElapsedTime(&milli_seconds_device, start, stop);
	printf("Time taken by device(GPU): %.0f seconds\n", milli_seconds_device/1000);

	cudaMemcpy(h_result, d_C, nBytes, cudaMemcpyDeviceToHost);

	printf("printing Result\n");
  for(int i=0; i<10; i++){
		printf("i = %d, sum = %d\n", i, h_result[i]);
	}

	cudaFree(d_A);
	cudaFree(d_B);
	cudaFree(d_C);

	free(h_A);
	free(h_B);
	free(h_C);
	free(h_result);

	return 0;

}


Overwriting 1.cu


In [17]:
!nvcc -o 1 1.cu

[01m[0m[01m1.cu(65)[0m: [01;31merror[0m: identifier "[01mstart[0m" is undefined
   cudaEventRecord(start);
                   ^

[01m[0m[01m1.cu(67)[0m: [01;31merror[0m: identifier "[01mstop[0m" is undefined
   cudaEventSynchronize(stop);
                        ^

2 errors detected in the compilation of "1.cu".


In [6]:
%%writefile 3.cu

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <chrono>

__global__ void sumArraysOnGPU(int* A, int* B, int* C, const int N) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N)
        C[tid] = A[tid] + B[tid];
}

void sumArrayOnHost(int* A, int* B, int* C, const int N) {
    for (int id = 0; id < N; id++) {
        C[id] = A[id] + B[id];
    }
}

int main() {
    int N = 1000;
    printf("Vector size %d\n", N);

    size_t nBytes = N * sizeof(int);

    int* h_A, * h_B, * h_C, * h_result;
    int* d_A, * d_B, * d_C;

    h_A = (int*)malloc(nBytes);
    h_B = (int*)malloc(nBytes);
    h_C = (int*)malloc(nBytes);
    h_result = (int*)malloc(nBytes);

    cudaMalloc((void**)&d_A, nBytes);
    cudaMalloc((void**)&d_B, nBytes);
    cudaMalloc((void**)&d_C, nBytes);

    for (int i = 0; i < N; i++) {
        h_A[i] = i + 1;
        h_B[i] = i + 1;
        h_C[i] = 0;
        h_result[i] = 0;
    }

    // Measure CPU time
    auto start_time = std::chrono::high_resolution_clock::now();
    sumArrayOnHost(h_A, h_B, h_C, N);
    auto end_time = std::chrono::high_resolution_clock::now();
    auto duration_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time).count();
    double seconds_cpu = duration_ns / 1000000000.0;
    std::cout << "Time taken by CPU: " << seconds_cpu << " seconds" << std::endl;

    // Copy data to device
    cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);

    int threadsPerBlock = 1024;
    int blockPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Measure GPU time
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    sumArraysOnGPU <<< blockPerGrid, threadsPerBlock >>> (d_A, d_B, d_C, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float milliseconds_device = 0;
    cudaEventElapsedTime(&milliseconds_device, start, stop);
    double seconds_gpu = milliseconds_device / 1000.0;
    std::cout << "Time taken by device (GPU): " << seconds_gpu << " seconds" << std::endl;

    cudaMemcpy(h_result, d_C, nBytes, cudaMemcpyDeviceToHost);

    // Print a sample of the results
    printf("Printing Result\n");
    for (int i = 0; i < 10; i++) {
        printf("i = %d, sum = %d\n", i, h_result[i]);
    }

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    free(h_A);
    free(h_B);
    free(h_C);
    free(h_result);

    return 0;
}


Writing 3.cu


In [7]:
!nvcc -o 3 3.cu

In [8]:
!./3

Vector size 1000
Time taken by CPU: 3.297e-06 seconds
Time taken by device (GPU): 0.000202784 seconds
Printing Result
i = 0, sum = 2
i = 1, sum = 4
i = 2, sum = 6
i = 3, sum = 8
i = 4, sum = 10
i = 5, sum = 12
i = 6, sum = 14
i = 7, sum = 16
i = 8, sum = 18
i = 9, sum = 20


In [33]:
%%writefile 4.cu

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <chrono>

__global__ void sumArraysOnGPU(int* A, int* B, int* C, const int N) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N)
        C[tid] = A[tid] + B[tid];
}

void sumArrayOnHost(int* A, int* B, int* C, const int N) {
    for (int id = 0; id < N; id++) {
        C[id] = A[id] + B[id];
    }
}

int main() {
    int N = 1000000;
    printf("Vector size %d\n", N);

    size_t nBytes = N * sizeof(int);

    int* h_A, * h_B, * h_C, * h_result;
    int* d_A, * d_B, * d_C;

    h_A = (int*)malloc(nBytes);
    h_B = (int*)malloc(nBytes);
    h_C = (int*)malloc(nBytes);
    h_result = (int*)malloc(nBytes);

    cudaMalloc((void**)&d_A, nBytes);
    cudaMalloc((void**)&d_B, nBytes);
    cudaMalloc((void**)&d_C, nBytes);

    for (int i = 0; i < N; i++) {
        h_A[i] = i + 1;
        h_B[i] = i + 1;
        h_C[i] = 0;
        h_result[i] = 0;
    }

    // Measure CPU time
    auto start_time = std::chrono::high_resolution_clock::now();
    sumArrayOnHost(h_A, h_B, h_C, N);
    auto end_time = std::chrono::high_resolution_clock::now();
    auto duration_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time).count();
    double seconds_cpu = duration_ns / 1000000000.0;
    std::cout << "Time taken by CPU: " << seconds_cpu << " seconds" << std::endl;

    cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);

    int threadsPerBlock = 1024;
    int blockPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Measure GPU time
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    sumArraysOnGPU <<< blockPerGrid, threadsPerBlock >>> (d_A, d_B, d_C, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float milliseconds_device = 0;
    cudaEventElapsedTime(&milliseconds_device, start, stop);
    double seconds_gpu = milliseconds_device / 1000.0;
    std::cout << "Time taken by device (GPU): " << seconds_gpu << " seconds" << std::endl;

    cudaMemcpy(h_result, d_C, nBytes, cudaMemcpyDeviceToHost);

    // Print a sample of the results
    printf("Printing Result\n");
    //for (int i = 0; i < 10; i++) {
        //printf("i = %d, sum = %d\n", i, h_result[i]);
    //}

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    free(h_A);
    free(h_B);
    free(h_C);
    free(h_result);

    return 0;
}


Overwriting 4.cu


In [34]:
!nvcc -o 4 4.cu

In [35]:
!./4

Vector size 1000000
Time taken by CPU: 0.00317881 seconds
Time taken by device (GPU): 0.000180736 seconds
Printing Result
