<a href="https://colab.research.google.com/github/Ibrahim170105/Warmup_assignment/blob/main/GPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%writefile matrix_add_cpu.cpp
#include <iostream>
#include <fstream>
#include <chrono>
using namespace std;

int main() {
    ifstream fin("input.txt");
    ofstream fout("output_cpu.txt");

    int n,m;
    fin >> n >> m;

    int size = n * m * sizeof(int);

    int* h_A = new int[n * m];
    int* h_B = new int[n * m];
    int* h_C_cpu = new int[n * m];

    for (int i = 0; i < n * m; i++) fin >> h_A[i];
    for (int i = 0; i < n * m; i++) fin >> h_B[i];

    auto cpu_start = chrono::high_resolution_clock::now();

    for (int i = 0; i < n * m; i++)
        h_C_cpu[i] = h_A[i] + h_B[i];

    auto cpu_end = chrono::high_resolution_clock::now();
    double cpu_time =
        chrono::duration<double, milli>(cpu_end - cpu_start).count();

    fout << "Resultant Matrix:\n";
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < m; j++)
            fout << h_C_cpu[i * m + j] << " ";
        fout << endl;
    }

    cout << "Matrix Size: " << n << " x " << m << endl;
    cout << "CPU Time (ms): " << cpu_time << endl;

    return 0;
}


Writing matrix_add_cpu.cpp


In [2]:
!g++ matrix_add_cpu.cpp -o matrix_cpu
!./matrix_cpu


Matrix Size: 0 x 0
CPU Time (ms): 9.3e-05


In [None]:

%%writefile matrix_add_gpu.cu
#include <iostream>
#include <fstream>
#include <chrono>
#include <cuda_runtime.h>
#inlclude "matrix_add_cpu.cpp"

using namespace std;

/* ================= CUDA KERNEL ================= */
__global__ void matrixAdd(int* A, int* B, int* C, int n, int m) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < n && col < m) {
        int idx = row * m + col;
        C[idx] = A[idx] + B[idx];
    }
}

/* ================= MAIN ================= */
int main() {
    ifstream fin("input.txt");

    int n, m;
    fin >> n >> m;

    int size = n * m * sizeof(int);

    // Host memory
    int* h_A = new int[n * m];
    int* h_B = new int[n * m];
    int* h_C_cpu = new int[n * m];
    int* h_C_gpu = new int[n * m];

    for (int i = 0; i < n * m; i++) fin >> h_A[i];
    for (int i = 0; i < n * m; i++) fin >> h_B[i];

    /* ================= GPU MEMORY ================= */
    int *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    /* ================= GPU TIMING ================= */
    cudaEventRecord(start);

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    dim3 block(16, 16);
    dim3 grid((m + 15) / 16, (n + 15) / 16);

    matrixAdd<<<grid, block>>>(d_A, d_B, d_C, n, m);

    cudaMemcpy(h_C_gpu, d_C, size, cudaMemcpyDeviceToHost);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float gpu_time;
    cudaEventElapsedTime(&gpu_time, start, stop);

    /* ================= OUTPUT ================= */
    cout << "Matrix Size: " << n << " x " << m << endl;
    cout << "GPU Time incl. transfer (ms): " << gpu_time << endl;

    ofstream fout("output_gpu.txt");
    fout << "Result Matrix:\n";
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < m; j++)
            fout << h_C_gpu[i * m + j] << " ";
        fout << endl;
    }

    /* ================= CLEANUP ================= */
    delete[] h_A;
    delete[] h_B;
    delete[] h_C_cpu;
    delete[] h_C_gpu;

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}


Writing matrix_add_gpu.cu


In [None]:
!nvcc matrix_add_gpu.cu -o matrix_add -arch=sm_75


In [None]:
!./matrix_add

Matrix Size: 1024 x 1024
GPU Time incl. transfer (ms): 5.60246
