In [None]:
!nvcc --version

In [None]:
!nvidia-smi

In [13]:
%%writefile vector.cu
#include <iostream>
#include <vector>
#include <chrono>
#include <cuda_runtime.h>
using namespace std;

// CUDA Kernel for vector addition
__global__ void vecAdd(int* A, int* B, int* C, int size) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < size)
        C[tid] = A[tid] + B[tid];
}

// CUDA Kernel for matrix multiplication
__global__ void matMul(int* A, int* B, int* C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < N && col < N) {
        int sum = 0;
        for (int k = 0; k < N; ++k)
            sum += A[row * N + k] * B[k * N + col];
        C[row * N + col] = sum;
    }
}

// Sequential vector addition
void sequentialVecAdd(const vector<int>& A, const vector<int>& B, vector<int>& C) {
    for (int i = 0; i < A.size(); ++i)
        C[i] = A[i] + B[i];
}

// Sequential matrix multiplication
void sequentialMatMul(const vector<int>& A, const vector<int>& B, vector<int>& C, int N) {
    for (int row = 0; row < N; ++row) {
        for (int col = 0; col < N; ++col) {
            int sum = 0;
            for (int k = 0; k < N; ++k)
                sum += A[row * N + k] * B[k * N + col];
            C[row * N + col] = sum;
        }
    }
}

int main() {
    const int vecSize = 1000000;
    const int N = 512;

    // Initialize vectors
    vector<int> A(vecSize, 1), B(vecSize, 2), C_seq(vecSize), C_gpu(vecSize);

    // Sequential Vector Add
    auto start = chrono::high_resolution_clock::now();
    sequentialVecAdd(A, B, C_seq);
    auto end = chrono::high_resolution_clock::now();
    double cpu_vec_time = chrono::duration<double, milli>(end - start).count();

    // Parallel Vector Add
    int *d_A, *d_B, *d_C;
    size_t bytes = vecSize * sizeof(int);
    cudaMalloc(&d_A, bytes);
    cudaMalloc(&d_B, bytes);
    cudaMalloc(&d_C, bytes);
    cudaMemcpy(d_A, A.data(), bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B.data(), bytes, cudaMemcpyHostToDevice);

    cudaEvent_t startVec, stopVec;
    cudaEventCreate(&startVec);
    cudaEventCreate(&stopVec);
    cudaEventRecord(startVec);

    vecAdd<<<(vecSize + 255) / 256, 256>>>(d_A, d_B, d_C, vecSize);
    cudaDeviceSynchronize();
    cudaEventRecord(stopVec);
    cudaEventSynchronize(stopVec);
    float gpu_vec_time;
    cudaEventElapsedTime(&gpu_vec_time, startVec, stopVec);
    cudaMemcpy(C_gpu.data(), d_C, bytes, cudaMemcpyDeviceToHost);

    // Print first 5 elements
    cout << "\nVector Addition (first 5 results): ";
    for (int i = 0; i < 5; ++i)
        cout << C_gpu[i] << " ";
    cout << "\nVector Addition Time (ms): Seq = " << cpu_vec_time << ", GPU = " << gpu_vec_time << endl;

    // Free vector memory
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);

    // Initialize matrices
    vector<int> M1(N * N), M2(N * N), M_seq(N * N), M_gpu(N * N);
    for (int i = 0; i < N * N; ++i) {
        M1[i] = 1;
        M2[i] = 2;
    }

    // Sequential Matrix Mul
    start = chrono::high_resolution_clock::now();
    sequentialMatMul(M1, M2, M_seq, N);
    end = chrono::high_resolution_clock::now();
    double cpu_mat_time = chrono::duration<double, milli>(end - start).count();

    // Parallel Matrix Mul
    int *d_M1, *d_M2, *d_M3;
    size_t matBytes = N * N * sizeof(int);
    cudaMalloc(&d_M1, matBytes);
    cudaMalloc(&d_M2, matBytes);
    cudaMalloc(&d_M3, matBytes);
    cudaMemcpy(d_M1, M1.data(), matBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_M2, M2.data(), matBytes, cudaMemcpyHostToDevice);

    cudaEvent_t startMat, stopMat;
    cudaEventCreate(&startMat);
    cudaEventCreate(&stopMat);
    cudaEventRecord(startMat);

    dim3 threads(16, 16);
    dim3 blocks((N + 15) / 16, (N + 15) / 16);
    matMul<<<blocks, threads>>>(d_M1, d_M2, d_M3, N);
    cudaDeviceSynchronize();
    cudaEventRecord(stopMat);
    cudaEventSynchronize(stopMat);
    float gpu_mat_time;
    cudaEventElapsedTime(&gpu_mat_time, startMat, stopMat);
    cudaMemcpy(M_gpu.data(), d_M3, matBytes, cudaMemcpyDeviceToHost);

    // Print a few matrix values
    cout << "\nMatrix Multiplication (first 3x3 block):\n";
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j)
            cout << M_gpu[i * N + j] << " ";
        cout << endl;
    }

    cout << "Matrix Multiplication Time (ms): Seq = " << cpu_mat_time << ", GPU = " << gpu_mat_time << endl;

    // Free matrix memory
    cudaFree(d_M1); cudaFree(d_M2); cudaFree(d_M3);

    return 0;
}


Overwriting vector.cu


In [14]:
!nvcc -arch=sm_75 vector.cu -o vec

In [15]:
!./vec


Vector Addition (first 5 results): 3 3 3 3 3 
Vector Addition Time (ms): Seq = 8.91145, GPU = 0.109696

Matrix Multiplication (first 3x3 block):
1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024 1024