<a href="https://colab.research.google.com/github/J-A-S-H-U/Hardware_for_AI_ML/blob/main/SAXPY.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
%%cuda
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>

__global__
void saxpy(int n, float a, float *x, float *y) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = a * x[i] + y[i];
}

int main() {
    int start_power = 15;
    int end_power = 25;
    int threadsPerBlock = 256;

    printf("N,TotalTime(ms),KernelTime(ms)\n");

    for (int p = start_power; p <= end_power; p++) {
        int N = 1 << p;
        float *x, *y, *d_x, *d_y;

        cudaEvent_t startTotal, stopTotal, startKernel, stopKernel;
        cudaEventCreate(&startTotal);
        cudaEventCreate(&stopTotal);
        cudaEventCreate(&startKernel);
        cudaEventCreate(&stopKernel);

        cudaEventRecord(startTotal);
        x = (float*)malloc(N * sizeof(float));
        y = (float*)malloc(N * sizeof(float));

        cudaMalloc(&d_x, N * sizeof(float));
        cudaMalloc(&d_y, N * sizeof(float));

        for (int i = 0; i < N; i++) {
            x[i] = 1.0f;
            y[i] = 2.0f;
        }

        cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);

        // Start kernel timing
        cudaEventRecord(startKernel);
        saxpy<<<(N + threadsPerBlock - 1) / threadsPerBlock, threadsPerBlock>>>(N, 2.0f, d_x, d_y);
        cudaEventRecord(stopKernel);

        cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);

        // Stop total timing
        cudaEventRecord(stopTotal);
        cudaEventSynchronize(stopTotal);
        cudaEventSynchronize(stopKernel);

        float totalTime = 0.0f, kernelTime = 0.0f;
        cudaEventElapsedTime(&totalTime, startTotal, stopTotal);
        cudaEventElapsedTime(&kernelTime, startKernel, stopKernel);

        printf("%d,%f,%f\n", N, totalTime, kernelTime);

        cudaFree(d_x);
        cudaFree(d_y);
        free(x);
        free(y);

        cudaEventDestroy(startTotal);
        cudaEventDestroy(stopTotal);
        cudaEventDestroy(startKernel);
        cudaEventDestroy(stopKernel);
    }

    return 0;
}


N,TotalTime(ms),KernelTime(ms)
32768,48.523457,47.230110
65536,0.573088,0.002592
131072,1.033024,0.003680
262144,1.932992,0.002688
524288,3.912384,0.002656
1048576,8.959840,0.002880
2097152,17.475103,0.002624
4194304,34.190880,0.003584
8388608,70.373985,0.002656
16777216,142.803391,0.003008
33554432,291.787323,0.003072



In [4]:
!python --version
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Python 3.11.12
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp0res6chf".
