In [None]:
!nvidia-smi

Thu Nov  7 13:41:46 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

In [None]:
%load_ext nvcc4jupyter

In [None]:
%%cuda

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>

// Debug kernel: every launched thread prints three lines -- its thread index
// within the block, its block index within the grid, and the dimensions of the
// grid it was launched in. Takes no arguments and writes no memory.
__global__ void printDimentaionLocations() {
    printf("ThreadIdx.X: %d ThreadIdx.Y: %d ThreadIdx.Z: %d \n",  threadIdx.x, threadIdx.y, threadIdx.z);
    printf("BlockIdx.X: %d BlockIdx.Y: %d BlockIdx.Z: %d \n", blockIdx.x, blockIdx.y, blockIdx.z);
    // gridDim is the extent of the grid (number of blocks per axis), not an
    // index, so the label says "GridDim".
    printf("GridDim.X: %d GridDim.Y: %d GridDim.Z: %d \n", gridDim.x, gridDim.y, gridDim.z);
}

// Host entry point: launches printDimentaionLocations over a 4x4x4 thread
// domain split into 2x2x2 blocks, waits for it to finish and resets the device.
// Returns 0 on success and 1 if the launch or the synchronization fails.
int main() {
    printf("Welcome to Cuda\n");
    int nx=4, ny=4, nz=4;

    dim3 block(2, 2, 2);
    dim3 grid(nx/block.x, ny/block.y, nz/block.z);

    printDimentaionLocations<<<grid, block>>>();

    // A kernel launch has no return value, so a bad launch configuration has
    // to be fetched explicitly.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Wait for the kernel; errors raised while it runs are reported here.
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }

    cudaDeviceReset();
    return status == cudaSuccess ? 0 : 1;
}

In [None]:
%%cuda

#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#include <stdio.h>
#include <stdlib.h>

// Prints, for each thread, its threadIdx.x and the element of `input` stored
// at that position. Only threadIdx.x is used for indexing, so threads that
// share a threadIdx.x (e.g. in different blocks) read the same element.
//   input: array read on the device at index threadIdx.x; no bounds check.
__global__ void unique_id_threadIdx(int* input) {
    const int lane = threadIdx.x;
    const int value = input[lane];
    printf("ThreadIdx = %d | Value = %d \n", lane, value);
}

// Prints a flat global index for a 1D grid of 1D blocks, together with the
// element of `input` at that index:
//     gid = blockIdx.x * blockDim.x + threadIdx.x
//   input: array read on the device at index gid; no bounds check.
//
// Example launch configuration for a 16-element array:
//     dim3 block(4,1,1);
//     dim3 grid(4,1,1);
__global__ void unique_gid(int* input) {
    const int local = threadIdx.x;
    // Index of the first thread of this block within the whole grid.
    const int block_base = blockDim.x * blockIdx.x;
    const int gid = block_base + local;
    printf("BlockIdx.x = %d | ThreadIdx = %d | Gid = %d | Value = %d \n",
           blockIdx.x, local, gid, input[gid]);
}

// Prints a flat global index for a 2D grid of 1D blocks, together with the
// element of `input` at that index:
//     gid = threadIdx.x
//         + blockIdx.x * blockDim.x                  (blocks to the left)
//         + blockIdx.y * (gridDim.x * blockDim.x)    (full rows of blocks above)
// threadIdx.y / threadIdx.z are not used, so the index is unique only when the
// block is one-dimensional.
//   input: array read on the device at index gid; no bounds check.
//
// Example launch configuration (24 threads):
//     dim3 block(4,1,1);
//     dim3 grid(3,2,1);
__global__ void unique_gid_2d(int* input) {
    int tid = threadIdx.x;
    int block_offset = blockIdx.x * blockDim.x;
    int row_offset = gridDim.x * blockDim.x * blockIdx.y;
    int gid = tid + block_offset + row_offset;
    // The "BlockIdx.Y" column has to show blockIdx.y, not blockIdx.x again.
    printf("BlockIdx.X = %d | BlockIdx.Y = %d | ThreadIdx.X = %d | Gid = %d | Value = %d \n",
           blockIdx.x, blockIdx.y, tid, gid, input[gid]);
}

// Prints a flat global index for a 2D grid of 2D blocks, together with the
// element of `input` at that index. All threads of one block get consecutive
// indices; blocks are numbered row by row across the grid:
//     tid = threadIdx.y * blockDim.x + threadIdx.x          (position in block)
//     gid = tid
//         + blockIdx.x * (blockDim.x * blockDim.y)          (blocks to the left)
//         + blockIdx.y * (blockDim.x * blockDim.y * gridDim.x)  (rows of blocks above)
//   input: array read on the device at index gid; no bounds check.
// Note: the value printed under the "ThreadIdx.X" label is the flattened
// in-block index `tid`, not the raw threadIdx.x.
__global__ void unique_gid_2d_2d(int* input) {
    // Row-major position inside the block: the row stride is the block width
    // (blockDim.x), not the block index.
    int tid = threadIdx.y * blockDim.x + threadIdx.x;

    int num_threads_in_a_block = blockDim.x * blockDim.y;
    int block_offset = blockIdx.x * num_threads_in_a_block;

    int num_threads_in_a_row = num_threads_in_a_block * gridDim.x;
    int row_offset = num_threads_in_a_row * blockIdx.y;

    int gid = tid + block_offset + row_offset;
    // The "BlockIdx.Y" column has to show blockIdx.y, not blockIdx.x again.
    printf("BlockIdx.X = %d | BlockIdx.Y = %d | ThreadIdx.X = %d | Gid = %d | Value = %d \n",
           blockIdx.x, blockIdx.y, tid, gid, input[gid]);
}

// Host entry point: prints a 24-element host array, copies it to the device
// and launches unique_gid_2d so that every element is printed once together
// with the global index of the thread that read it.
// Returns 0 on success and 1 if any CUDA call fails.
int main() {
    const int array_size { 24 };
    // size_t avoids the narrowing conversion that `int byte_size { sizeof... }`
    // would require inside a braced initializer.
    const size_t byte_size { sizeof(int) * array_size };
    int host_data[] = {12, 34, 342, 453 , 23, 56, 567, 789, 345, 578, 89, 44, 36, 67, 798, 57, 16, 57, 234, 56, 545, 67, 69, 24 };

    for(const int& i : host_data) {
        printf("%d ", i);
    }

    printf("\n\n");

    // Reports a failed CUDA call on stderr; returns true when the call succeeded.
    auto succeeded = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
            return false;
        }
        return true;
    };

    int* device_data = nullptr;
    if (!succeeded(cudaMalloc((void**)&device_data, byte_size), "cudaMalloc")) {
        return 1;
    }

    bool ok = succeeded(cudaMemcpy(device_data, host_data, byte_size, cudaMemcpyHostToDevice),
                        "cudaMemcpy");

    if (ok) {
        // unique_gid_2d indexes with threadIdx.x only, so it needs 1D blocks.
        // 4 threads per block on a 3x2 grid gives 24 threads: one per element.
        dim3 block(4,1,1);
        dim3 grid(3,2,1);

        unique_gid_2d <<<grid,block>>> (device_data);
        // The launch itself returns nothing; fetch launch errors first, then
        // wait for the kernel so that execution errors are reported as well.
        ok = succeeded(cudaGetLastError(), "kernel launch")
          && succeeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    }

    cudaFree(device_data);
    cudaDeviceReset();
    return ok ? 0 : 1;
}

In [None]:
%%cuda
// LESSON 1.14 1.15 1.16
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include <stdlib.h>
#include <cstring>


// GPU ASSERT -------
// Wraps a CUDA runtime call and forwards its status, together with the call
// site's file and line, to gpuAssert. The do/while(0) wrapper makes the macro
// expand to a single statement, so `if (x) gpuErrorCheck(call); else ...`
// parses as expected.
#define gpuErrorCheck(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA call on stderr and, by default, terminates the program.
//   errorCode: status returned by the CUDA call being checked
//   file/line: source location printed in the message
//   abort:     when true (default) the process exits with errorCode as its
//              exit status; when false the function only prints and returns
// Does nothing when errorCode is cudaSuccess.
inline void gpuAssert(cudaError_t errorCode, const char* file, int line, bool abort = true) {
    if(errorCode != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(errorCode), file, line);
        if(abort) exit(errorCode);
    }
   // else fprintf(stdout, "GPUNotAssert: %s %s %d\n", cudaGetErrorString(errorCode), file, line); // Delete later
}

// ------- GPU ASSERT

// Element-wise addition kernel: c[i] = a[i] + b[i].
// Each thread handles at most one element, at index
// blockIdx.x * blockDim.x + threadIdx.x; threads whose index is not below
// `size` return without touching memory.
//   a, b: input arrays, read at the thread's index
//   c:    output array, written at the thread's index
//   size: number of elements to process
__global__ void sum_array_gpu(int* a, int* b, int* c, int size) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard the tail: the grid may contain more threads than elements.
    if (idx >= size) {
        return;
    }

    c[idx] = a[idx] + b[idx];
}

// CPU reference for the element-wise sum: writes a[i] + b[i] into c[i] for
// every i in [0, size). Elements at index >= size are left untouched, and a
// size of zero or less does nothing.
void sum_array_cpu(int* a, int* b, int* c, int size) {
    int idx = 0;
    while (idx < size) {
        c[idx] = a[idx] + b[idx];
        ++idx;
    }
}

// Prints "<message> time : <seconds>" where <seconds> is the interval between
// two clock() readings converted with CLOCKS_PER_SEC.
//   start, end: clock() values taken before and after the measured section
//   message:    label printed in front of the timing; a null pointer is
//               printed as an empty label
void print_time(const clock_t& start, const clock_t& end, const char* message) {
    const double seconds = (double)(end - start) / CLOCKS_PER_SEC;
    // The label is passed as an argument, never as the format string, so any
    // '%' characters in it are printed literally instead of being interpreted.
    printf("%s time : %4.6f \n", message ? message : "", seconds);
}

// Host driver for the array-sum benchmark:
//   1. fills two host arrays with random values in [0, 255],
//   2. sums them on the CPU (reference result, timed),
//   3. copies them to the device, sums them with sum_array_gpu and copies the
//      result back (each phase timed with clock()),
//   4. checks the GPU result against the CPU reference,
//   5. prints the timings and releases all buffers.
// Every CUDA call goes through gpuErrorCheck, which prints the error and exits
// the program on failure.
// Returns 0 when the GPU result matches the CPU result, 1 otherwise (or when a
// host allocation fails).
int main() {
    const int size { 1000000 };
    const int block_size { 256 };

    const size_t NO_BYTES = size * sizeof(int);

    // Host buffers: two inputs, the result copied back from the GPU, and the
    // CPU reference result.
    int *h_a = (int*)malloc(NO_BYTES);
    int *h_b = (int*)malloc(NO_BYTES);
    int *gpu_result = (int*)malloc(NO_BYTES);
    int *h_c = (int*)malloc(NO_BYTES);

    if(!h_a || !h_b || !gpu_result || !h_c) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_c);
        free(gpu_result);
        free(h_b);
        free(h_a);
        return 1;
    }

    time_t t;

    srand((unsigned)time(&t));

    for(int i{}; i < size; i++) h_a[i] = (int)(rand() & 0xFF);
    for(int i{}; i < size; i++) h_b[i] = (int)(rand() & 0xFF);

    clock_t cpu_start{}, cpu_end{};
    cpu_start = clock();
    sum_array_cpu(h_a, h_b, h_c, size);
    cpu_end = clock();

    memset(gpu_result, 0, NO_BYTES);

    // Device buffers
    int *d_a{}, *d_b{}, *d_c{};

    gpuErrorCheck(cudaMalloc((void**)&d_a, NO_BYTES));
    gpuErrorCheck(cudaMalloc((void**)&d_b, NO_BYTES));
    gpuErrorCheck(cudaMalloc((void**)&d_c, NO_BYTES));

    clock_t htod_start{}, htod_end{};
    htod_start = clock();
    gpuErrorCheck(cudaMemcpy(d_a, h_a, NO_BYTES, cudaMemcpyHostToDevice));
    gpuErrorCheck(cudaMemcpy(d_b, h_b, NO_BYTES, cudaMemcpyHostToDevice));
    htod_end = clock();

    dim3 block(block_size);
    // Round the block count up so the elements left over by the integer
    // division still get a thread; the kernel's bounds check ignores the
    // surplus threads of the last block.
    dim3 grid((size + block.x - 1) / block.x);

    clock_t gpu_start{}, gpu_end{};
    gpu_start  = clock();
    sum_array_gpu<<<grid, block>>>(d_a, d_b, d_c, size);
    // The launch returns nothing: fetch launch errors explicitly, then wait
    // for the kernel so execution errors are reported and the timing covers
    // the whole run.
    gpuErrorCheck(cudaGetLastError());
    gpuErrorCheck(cudaDeviceSynchronize());
    gpu_end  = clock();

    clock_t dtoh_start{}, dtoh_end{};
    dtoh_start = clock();
    gpuErrorCheck(cudaMemcpy(gpu_result, d_c, NO_BYTES, cudaMemcpyDeviceToHost));
    dtoh_end = clock();

    // Validation check: the GPU result must equal the CPU reference exactly
    // (integer sums, so a byte-wise comparison is sufficient).
    const bool results_match = memcmp(h_c, gpu_result, NO_BYTES) == 0;
    if(!results_match) {
        fprintf(stderr, "Validation failed: GPU result differs from CPU result\n");
    }

    print_time(cpu_start, cpu_end, "Sum array CPU execution");
    print_time(gpu_start, gpu_end, "Sum array GPU execution");
    print_time(htod_start, htod_end, "HtoD mem transfer");
    print_time(dtoh_start, dtoh_end, "DtoH mem transfer");
    print_time(htod_start, dtoh_end, "Total execution");

    gpuErrorCheck(cudaFree(d_c));
    gpuErrorCheck(cudaFree(d_b));
    gpuErrorCheck(cudaFree(d_a));

    free(h_c);
    free(gpu_result);
    free(h_b);
    free(h_a);

    printf("It is done\n");
    return results_match ? 0 : 1;
}

Sum array CPU execution time : 0.472199 
Sum array GPU execution time : 0.006328 
HtoD mem transfer time : 0.262671 
DtoH mem transfer time : 0.085119 
Total execution time : 0.354122 
It is done



In [None]:
%%cuda

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include <stdlib.h>

// Prints a short description of CUDA device 0: name, multiprocessor count,
// clock rate, compute capability, global memory size and warp size.
// Returns 0 on success and -1 when no device is available or a CUDA query
// fails (the failure reason is written to stderr).
int device_query() {

    int device_count{};
    const cudaError_t count_status = cudaGetDeviceCount(&device_count);

    // The runtime may signal "no device" either through the status code or
    // through a zero count; both cases take the same path.
    if(count_status == cudaErrorNoDevice || device_count == 0){
        printf("There is no device which has supported by CUDA \n");
        return -1;
    }

    if(count_status != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(count_status));
        return -1;
    }

    int device_no{};
    cudaDeviceProp iProp{};
    const cudaError_t prop_status = cudaGetDeviceProperties(&iProp, device_no);

    // Without this check a failed query would print the zero-initialized
    // struct as if it described a real device.
    if(prop_status != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(prop_status));
        return -1;
    }

    printf("Device %d: %s\n", device_no, iProp.name);
    printf(" Number of multiprocessors  : %d\n", iProp.multiProcessorCount);
    printf(" Clock Rate                 : %d khz\n", iProp.clockRate);
    printf(" Compute Capability         : %d.%d\n", iProp.major, iProp.minor);
    printf(" Total amount of global mem : %4.2f GB\n", iProp.totalGlobalMem / (1024.0f * 1024.0f *1024.0f));
    printf(" Warp Size                  : %d\n", iProp.warpSize);

    return 0;
}

// Program entry point. The command-line arguments are not used; the value
// returned by device_query() is passed through as the return value of main.
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    const int status = device_query();
    return status;
}

Device 0: Tesla T4
 Number of multiprocessors  : 40
 Clock Rate                 : 1590000 khz
 Compute Capability         : 7.5
 Total amount of global mem : 14.75 GB
 Warp Size                  : 32

