                                                                            Question 1

In [2]:
%%writefile vector_add.cu
#include <stdio.h>
#include <cuda_runtime.h>

#define N 1024

__global__ void vectorAdd(float *A, float *B, float *C)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        C[i] = A[i] + B[i];
}

int main()
{
    float *h_A, *h_B, *h_C;
    float *d_A, *d_B, *d_C;

    size_t size = N * sizeof(float);

    h_A = (float*)malloc(size);
    h_B = (float*)malloc(size);
    h_C = (float*)malloc(size);

    for(int i = 0; i < N; i++)
    {
        h_A[i] = i;
        h_B[i] = i * 2;
    }

    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_B, size);
    cudaMalloc((void**)&d_C, size);

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    vectorAdd<<<(N+255)/256, 256>>>(d_A, d_B, d_C);

    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    for(int i = 0; i < 10; i++)
        printf("%f\n", h_C[i]);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}


Writing vector_add.cu


In [3]:
!nvcc vector_add.cu -o vector_add

In [4]:
!./vector_add

0.000000
3.000000
6.000000
9.000000
12.000000
15.000000
18.000000
21.000000
24.000000
27.000000


                                                                            Question 2

In [7]:
%%writefile vector_add_thrust.cu
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>

#define N 1024

int main()
{
    thrust::host_vector<float> h_A(N);
    thrust::host_vector<float> h_B(N);

    for(int i = 0; i < N; i++)
    {
        h_A[i] = i;
        h_B[i] = i * 2;
    }

    thrust::device_vector<float> d_A = h_A;
    thrust::device_vector<float> d_B = h_B;
    thrust::device_vector<float> d_C(N);

    thrust::transform(d_A.begin(), d_A.end(),
                      d_B.begin(),
                      d_C.begin(),
                      thrust::plus<float>());

    thrust::host_vector<float> h_C = d_C;

    for(int i = 0; i < 10; i++)
        printf("%f\n", h_C[i]);

    return 0;
}


Overwriting vector_add_thrust.cu


In [8]:
!nvcc vector_add_thrust.cu -o vector_add_thrust

In [9]:
!./vector_add_thrust

0.000000
3.000000
6.000000
9.000000
12.000000
15.000000
18.000000
21.000000
24.000000
27.000000


                                                                            Question 3

In [13]:
%%writefile dot_product.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
#include <chrono>

#define N 1024

int main()
{
    float *h_A = (float*)malloc(N * sizeof(float));
    float *h_B = (float*)malloc(N * sizeof(float));

    for(int i = 0; i < N; i++)
    {
        h_A[i] = 1.0f;
        h_B[i] = 2.0f;
    }

    auto c1 = std::chrono::high_resolution_clock::now();
    float cpu = 0.0f;
    for(int i = 0; i < N; i++)
        cpu += h_A[i] * h_B[i];
    auto c2 = std::chrono::high_resolution_clock::now();
    double cpu_time = std::chrono::duration<double, std::milli>(c2 - c1).count();
    thrust::device_vector<float> d_A(h_A, h_A + N);
    thrust::device_vector<float> d_B(h_B, h_B + N);

    cudaDeviceSynchronize();
    auto g1 = std::chrono::high_resolution_clock::now();
    float gpu = thrust::inner_product(d_A.begin(), d_A.end(), d_B.begin(), 0.0f);
    cudaDeviceSynchronize();
    auto g2 = std::chrono::high_resolution_clock::now();
    double gpu_time = std::chrono::duration<double, std::milli>(g2 - g1).count();

    printf("CPU Result: %f\n", cpu);
    printf("GPU Result: %f\n", gpu);
    printf("CPU Time (ms): %f\n", cpu_time);
    printf("GPU Time (ms): %f\n", gpu_time);

    free(h_A);
    free(h_B);
    return 0;
}


Writing dot_product.cu


In [16]:
!nvcc dot_product.cu -o dot_product

In [17]:
!./dot_product

CPU Result: 2048.000000
GPU Result: 2048.000000
CPU Time (ms): 0.002465
GPU Time (ms): 238.646183


                                                                            Question 4

In [18]:
%%writefile matrix_mul.cu
#include <stdio.h>
#include <cuda_runtime.h>

#define N 16

__global__ void matMul(float *A, float *B, float *C)
{
    int row = threadIdx.y;
    int col = threadIdx.x;

    float sum = 0.0f;

    for(int k = 0; k < N; k++)
        sum += A[row * N + k] * B[k * N + col];

    C[row * N + col] = sum;
}

int main()
{
    float h_A[N*N], h_B[N*N], h_C[N*N];
    float *d_A, *d_B, *d_C;

    for(int i = 0; i < N*N; i++)
    {
        h_A[i] = 1.0f;
        h_B[i] = 1.0f;
    }

    cudaMalloc(&d_A, N*N*sizeof(float));
    cudaMalloc(&d_B, N*N*sizeof(float));
    cudaMalloc(&d_C, N*N*sizeof(float));

    cudaMemcpy(d_A, h_A, N*N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, N*N*sizeof(float), cudaMemcpyHostToDevice);

    dim3 threads(N, N);
    matMul<<<1, threads>>>(d_A, d_B, d_C);

    cudaMemcpy(h_C, d_C, N*N*sizeof(float), cudaMemcpyDeviceToHost);

    for(int i = 0; i < 5; i++)
        printf("%f\n", h_C[i]);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}


Writing matrix_mul.cu


In [19]:
!nvcc matrix_mul.cu -o matrix_mul

In [20]:
!./matrix_mul

16.000000
16.000000
16.000000
16.000000
16.000000


                                                                            Question 5

In [21]:
%%writefile q5_cpu.cu
#include <stdio.h>
#include <chrono>

#define N 5000000

int main()
{
    float *A = (float*)malloc(N*sizeof(float));
    float *B = (float*)malloc(N*sizeof(float));
    float *C = (float*)malloc(N*sizeof(float));

    for(int i = 0; i < N; i++)
    {
        A[i] = 1.0f;
        B[i] = 2.0f;
    }

    auto t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < N; i++)
        C[i] = A[i] + B[i];
    auto t2 = std::chrono::high_resolution_clock::now();

    double time = std::chrono::duration<double, std::milli>(t2 - t1).count();
    printf("CPU Time (ms): %f\n", time);

    free(A); free(B); free(C);
    return 0;
}


Writing q5_cpu.cu


In [22]:
!nvcc -std=c++14 q5_cpu.cu -o q5_cpu
!./q5_cpu

CPU Time (ms): 19.783716


In [23]:
%%writefile q5_cuda.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <chrono>

#define N 5000000

__global__ void add(float *A, float *B, float *C)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < N)
        C[i] = A[i] + B[i];
}

int main()
{
    float *h_A = (float*)malloc(N*sizeof(float));
    float *h_B = (float*)malloc(N*sizeof(float));
    float *h_C = (float*)malloc(N*sizeof(float));

    float *d_A, *d_B, *d_C;

    for(int i = 0; i < N; i++)
    {
        h_A[i] = 1.0f;
        h_B[i] = 2.0f;
    }

    cudaMalloc(&d_A, N*sizeof(float));
    cudaMalloc(&d_B, N*sizeof(float));
    cudaMalloc(&d_C, N*sizeof(float));

    cudaMemcpy(d_A, h_A, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, N*sizeof(float), cudaMemcpyHostToDevice);

    cudaDeviceSynchronize();
    auto t1 = std::chrono::high_resolution_clock::now();

    add<<<(N+255)/256, 256>>>(d_A, d_B, d_C);

    cudaDeviceSynchronize();
    auto t2 = std::chrono::high_resolution_clock::now();

    double time = std::chrono::duration<double, std::milli>(t2 - t1).count();
    printf("CUDA Time (ms): %f\n", time);

    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    free(h_A); free(h_B); free(h_C);

    return 0;
}


Writing q5_cuda.cu


In [24]:
!nvcc -std=c++14 q5_cuda.cu -o q5_cuda
!./q5_cuda

CUDA Time (ms): 9.590731


In [25]:
%%writefile q5_thrust.cu
#include <stdio.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <chrono>

#define N 5000000

int main()
{
    thrust::device_vector<float> A(N, 1.0f);
    thrust::device_vector<float> B(N, 2.0f);
    thrust::device_vector<float> C(N);

    cudaDeviceSynchronize();
    auto t1 = std::chrono::high_resolution_clock::now();

    thrust::transform(A.begin(), A.end(),
                      B.begin(),
                      C.begin(),
                      thrust::plus<float>());

    cudaDeviceSynchronize();
    auto t2 = std::chrono::high_resolution_clock::now();

    double time = std::chrono::duration<double, std::milli>(t2 - t1).count();
    printf("Thrust Time (ms): %f\n", time);

    return 0;
}

Writing q5_thrust.cu


In [26]:
!nvcc -std=c++14 q5_thrust.cu -o q5_thrust
!./q5_thrust

Thrust Time (ms): 0.065372


In [27]:
import cudf
import cupy as cp
import time

N = 5000000

A = cudf.Series(cp.ones(N))
B = cudf.Series(cp.full(N, 2))

cp.cuda.Stream.null.synchronize()
t1 = time.time()

C = A + B

cp.cuda.Stream.null.synchronize()
t2 = time.time()

print("RAPIDS Time (ms):", (t2 - t1) * 1000)

RAPIDS Time (ms): 2.4755001068115234


                                                                            Question 6

In [28]:
%%writefile q6_reduction.cu
#include <stdio.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <chrono>

#define N 5000000

int main()
{
    thrust::device_vector<float> A(N, 1.0f);

    cudaDeviceSynchronize();
    auto t1 = std::chrono::high_resolution_clock::now();

    float result = thrust::reduce(A.begin(), A.end(), 0.0f);

    cudaDeviceSynchronize();
    auto t2 = std::chrono::high_resolution_clock::now();

    double time = std::chrono::duration<double, std::milli>(t2 - t1).count();

    printf("Reduction Result: %f\n", result);
    printf("Reduction Time (ms): %f\n", time);

    return 0;
}


Writing q6_reduction.cu


In [29]:
!nvcc -std=c++14 q6_reduction.cu -o q6_reduction
!./q6_reduction

Reduction Result: 5000000.000000
Reduction Time (ms): 0.860924


                                                                            Question 7

In [32]:
%%writefile q7_prefix_sum.cu
#include <stdio.h>
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <chrono>

#define N 5000000

int main()
{
    thrust::device_vector<int> A(N, 1);

    cudaDeviceSynchronize();
    auto t1 = std::chrono::high_resolution_clock::now();

    thrust::inclusive_scan(A.begin(), A.end(), A.begin());

    cudaDeviceSynchronize();
    auto t2 = std::chrono::high_resolution_clock::now();

    int last = A[N-1];

    double time = std::chrono::duration<double, std::milli>(t2 - t1).count();

    printf("Last Element: %d\n", last);
    printf("Prefix Sum Time (ms): %f\n", time);

    return 0;
}


Overwriting q7_prefix_sum.cu


In [33]:
!nvcc -std=c++14 q7_prefix_sum.cu -o q7_prefix_sum
!./q7_prefix_sum

Last Element: 5000000
Prefix Sum Time (ms): 1.936666
