## Atomic Functions
Atomic functions are used to avoid race conditions when many threads update the same memory location.

In [11]:
%%writefile 1.cu

#include <stdio.h>

// Kernel: every launched thread adds 1 to the integer that shared_counter points to.
//   shared_counter - pointer passed straight to atomicAdd; main() in this file
//                    hands in a cudaMalloc'd int that starts at 0.
__global__ void atomicKernel(int *shared_counter) {
  atomicAdd(shared_counter, 1);    // atomic read-modify-write: each thread's +1 is applied exactly once
  // *shared_counter++;
  // ^ NOT an equivalent non-atomic version: postfix ++ binds tighter than unary *,
  //   so this statement only advances the thread's local copy of the pointer and
  //   never changes the counter. The intended (*shared_counter)++ would be a plain
  //   (non-atomic) read-modify-write, so concurrent threads could overwrite each
  //   other's updates and the final count would be unpredictable.
}

// Evaluates a CUDA runtime call; on failure prints the error text with its
// source location to stderr and makes the enclosing main() return 1.
// The early return skips the cudaFree calls, which is acceptable here only
// because the process terminates immediately afterwards.
#define CUDA_CHECK(call)                                                   \
  do {                                                                     \
    cudaError_t err_ = (call);                                             \
    if (err_ != cudaSuccess) {                                             \
      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,     \
              cudaGetErrorString(err_));                                   \
      return 1;                                                            \
    }                                                                      \
  } while (0)

// Launches one block of 1024 threads that each atomically add 1 to a single
// device-side counter, then copies the counter back and prints it.
// Returns 0 on success, 1 if any CUDA call or the kernel launch fails.
int main() {
  int h_counter = 0;
  int *d_counter = NULL;

  CUDA_CHECK(cudaMalloc((void **)&d_counter, sizeof(int)));
  // Seed the device counter with the host's 0 before the kernel accumulates into it.
  CUDA_CHECK(cudaMemcpy(d_counter, &h_counter, sizeof(int), cudaMemcpyHostToDevice));

  atomicKernel<<<1, 1024>>>(d_counter);
  CUDA_CHECK(cudaGetLastError());       // launch-configuration errors surface here
  CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran surface here

  CUDA_CHECK(cudaMemcpy(&h_counter, d_counter, sizeof(int), cudaMemcpyDeviceToHost));

  printf("Final Counter value: %d\n", h_counter);

  CUDA_CHECK(cudaFree(d_counter));

  return 0;
}


Overwriting 1.cu


In [12]:
!nvcc -o 1 1.cu

In [13]:
!./1

Final Counter value: 1024


## Task1:
1) Create an array containing 1024 elements.
2) Initialize it with the values 1 to 1024 on the host.
3) In the kernel, add every element into a single total using `atomicAdd`.

In [17]:
%%writefile 2.cu

#include <stdio.h>

#define N 1024

// Kernel: accumulates the first N elements of d_input into the single int at d_output.
//   d_input  - array read at index threadIdx.x
//   d_output - accumulator; it is only added to here, never reset, so the caller
//              decides its starting value
// The element index is threadIdx.x alone (no block offset), so each block of a
// launch would read the same leading elements.
__global__ void AtomicAddition(int *d_input, int *d_output){

  const int tid = threadIdx.x;

  // Threads past the end of the array have nothing to contribute.
  if (tid >= N) {
    return;
  }

  // atomicAdd makes "*d_output += d_input[tid]" one indivisible step; a plain
  // "*d_output += ..." issued by many threads at once would race and lose updates.
  atomicAdd(d_output, d_input[tid]);
}

// Evaluates a CUDA runtime call; on failure prints the error text with its
// source location to stderr and makes the enclosing main() return 1.
// The early return skips the cudaFree calls, which is acceptable here only
// because the process terminates immediately afterwards.
#define CUDA_CHECK(call)                                                   \
  do {                                                                     \
    cudaError_t err_ = (call);                                             \
    if (err_ != cudaSuccess) {                                             \
      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,     \
              cudaGetErrorString(err_));                                   \
      return 1;                                                            \
    }                                                                      \
  } while (0)

// Fills a host array with 1..N, sums it on the device with AtomicAddition and
// prints the total.
// Returns 0 on success, 1 if any CUDA call or the kernel launch fails.
int main(){
  int h_input[N], h_output = 0;
  int *d_input = NULL, *d_output = NULL;

  CUDA_CHECK(cudaMalloc((void**)&d_input, N * sizeof(int)));
  CUDA_CHECK(cudaMalloc((void**)&d_output, sizeof(int)));

  for(int i = 0; i < N; i++){
    h_input[i] = i + 1;
  }

  CUDA_CHECK(cudaMemcpy(d_input, h_input, N * sizeof(int), cudaMemcpyHostToDevice));
  // The kernel only adds to *d_output, and cudaMalloc does not clear memory,
  // so the accumulator must be seeded with 0 before the launch.
  CUDA_CHECK(cudaMemcpy(d_output, &h_output, sizeof(int), cudaMemcpyHostToDevice));

  AtomicAddition<<<1, 1024>>>(d_input, d_output);
  CUDA_CHECK(cudaGetLastError());       // launch-configuration errors surface here
  CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran surface here

  CUDA_CHECK(cudaMemcpy(&h_output, d_output, sizeof(int), cudaMemcpyDeviceToHost));

  printf("%d", h_output);

  CUDA_CHECK(cudaFree(d_output));
  CUDA_CHECK(cudaFree(d_input));

  return 0;
}

Overwriting 2.cu


In [18]:
!nvcc -o 2 2.cu

In [19]:
!./2

524800

In [6]:
%%writefile 3.cu

#include <stdio.h>

#define N 1024

// Kernel: accumulates the first N elements of d_input into *d_output and, from
// thread 0 only, stores the closed-form value N*(N+1)/2 in *d_result.
//   d_input  - array read at index threadIdx.x
//   d_output - accumulator; only added to here, never reset
//   d_result - receives N*(N+1)/2, the sum of 1..N, for comparison by the caller
// The element index is threadIdx.x alone (no block offset).
__global__ void AtomicAddition(int *d_input, int *d_output, int *d_result){

  const int tid = threadIdx.x;

  // Threads past the end of the array do nothing at all.
  if (tid >= N) {
    return;
  }

  // Indivisible "*d_output += d_input[tid]"; a plain "+=" on the pointee from
  // many threads at once would race and lose updates.
  atomicAdd(d_output, d_input[tid]);

  // A single thread writes the reference value so it is stored exactly once.
  if (tid == 0) {
    *d_result = (N * (N + 1)) / 2;
  }
}

// Evaluates a CUDA runtime call; on failure prints the error text with its
// source location to stderr and makes the enclosing main() return 1.
// The early return skips the cudaFree calls, which is acceptable here only
// because the process terminates immediately afterwards.
#define CUDA_CHECK(call)                                                   \
  do {                                                                     \
    cudaError_t err_ = (call);                                             \
    if (err_ != cudaSuccess) {                                             \
      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,     \
              cudaGetErrorString(err_));                                   \
      return 1;                                                            \
    }                                                                      \
  } while (0)

// Fills a host array with 1..N, lets AtomicAddition compute both the atomic
// sum and the closed-form reference on the device, and prints the two values.
// Returns 0 on success, 1 if any CUDA call or the kernel launch fails.
int main(){
  int h_input[N], h_output = 0, h_result = 0;
  int *d_input = NULL, *d_output = NULL, *d_result = NULL;

  CUDA_CHECK(cudaMalloc((void**)&d_input, N * sizeof(int)));
  CUDA_CHECK(cudaMalloc((void**)&d_output, sizeof(int)));
  CUDA_CHECK(cudaMalloc((void**)&d_result, sizeof(int)));

  for(int i = 0; i < N; i++){
    h_input[i] = i + 1;
  }

  CUDA_CHECK(cudaMemcpy(d_input, h_input, N * sizeof(int), cudaMemcpyHostToDevice));
  // The kernel only adds to *d_output, and cudaMalloc does not clear memory,
  // so the accumulator must be seeded with 0 before the launch.
  // d_result needs no seeding: the kernel's thread 0 overwrites it.
  CUDA_CHECK(cudaMemcpy(d_output, &h_output, sizeof(int), cudaMemcpyHostToDevice));

  AtomicAddition<<<1, 1024>>>(d_input, d_output, d_result);
  CUDA_CHECK(cudaGetLastError());       // launch-configuration errors surface here
  CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran surface here

  CUDA_CHECK(cudaMemcpy(&h_output, d_output, sizeof(int), cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost));

  printf("sum = %d\n", h_output);
  printf("result = %d\n", h_result);

  CUDA_CHECK(cudaFree(d_output));
  CUDA_CHECK(cudaFree(d_input));
  CUDA_CHECK(cudaFree(d_result));

  return 0;
}

Overwriting 3.cu


In [7]:
!nvcc -o 3 3.cu

In [8]:
!./3

sum = 524800
result = 524800


## Finding Max & Min from Array

In [25]:
%%writefile 4.cu

#include <stdio.h>
#include <limits.h>

#define N 1024

// Kernel: over the first N elements of d_input, accumulates the sum into
// *d_output and tracks the largest and smallest element in *d_max / *d_min.
//   d_input  - array read at index threadIdx.x
//   d_output - sum accumulator; only added to here, never reset
//   d_max    - reset to INT_MIN by thread 0, then raised via atomicMax
//   d_min    - reset to INT_MAX by thread 0, then lowered via atomicMin
// The reset is ordered before the atomics by __syncthreads(), which is a
// block-level barrier, and the index is threadIdx.x alone, so this scheme is
// written for a single-block launch.
__global__ void AtomicAddition(int *d_input, int *d_output, int *d_max, int *d_min) {
  const int tid = threadIdx.x;

  // One thread seeds the extremes with the identity values for max and min.
  if (tid == 0) {
    *d_max = INT_MIN;
    *d_min = INT_MAX;
  }
  // Every thread in the block reaches this barrier (it sits before the bounds
  // guard), so no thread issues an atomic before the seeds are in place.
  __syncthreads();

  // Threads past the end of the array contribute nothing.
  if (tid >= N) {
    return;
  }

  atomicAdd(d_output, d_input[tid]);  // running total
  atomicMax(d_max, d_input[tid]);     // keep the larger of current max and this element
  atomicMin(d_min, d_input[tid]);     // keep the smaller of current min and this element
}

// Evaluates a CUDA runtime call; on failure prints the error text with its
// source location to stderr and makes the enclosing main() return 1.
// The early return skips the cudaFree calls, which is acceptable here only
// because the process terminates immediately afterwards.
#define CUDA_CHECK(call)                                                   \
  do {                                                                     \
    cudaError_t err_ = (call);                                             \
    if (err_ != cudaSuccess) {                                             \
      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,     \
              cudaGetErrorString(err_));                                   \
      return 1;                                                            \
    }                                                                      \
  } while (0)

// Fills a host array with 1..N, runs AtomicAddition to obtain its sum, maximum
// and minimum on the device, and prints the three results.
// Returns 0 on success, 1 if any CUDA call or the kernel launch fails.
int main() {
  // h_max / h_min are overwritten by the device-to-host copies below; they are
  // zero-initialised only so they never hold indeterminate values.
  int h_input[N], h_output = 0, h_max = 0, h_min = 0;
  int *d_input = NULL, *d_output = NULL, *d_max = NULL, *d_min = NULL;

  CUDA_CHECK(cudaMalloc((void**)&d_input, N * sizeof(int)));
  CUDA_CHECK(cudaMalloc((void**)&d_output, sizeof(int)));
  CUDA_CHECK(cudaMalloc((void**)&d_max, sizeof(int)));
  CUDA_CHECK(cudaMalloc((void**)&d_min, sizeof(int)));

  for (int i = 0; i < N; i++) {
    h_input[i] = i + 1;
  }

  CUDA_CHECK(cudaMemcpy(d_input, h_input, N * sizeof(int), cudaMemcpyHostToDevice));
  // Seed the sum accumulator with 0; d_max and d_min are seeded inside the kernel.
  CUDA_CHECK(cudaMemcpy(d_output, &h_output, sizeof(int), cudaMemcpyHostToDevice));

  AtomicAddition<<<1, 1024>>>(d_input, d_output, d_max, d_min);
  CUDA_CHECK(cudaGetLastError());       // launch-configuration errors surface here
  CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran surface here

  CUDA_CHECK(cudaMemcpy(&h_output, d_output, sizeof(int), cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(&h_max, d_max, sizeof(int), cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(&h_min, d_min, sizeof(int), cudaMemcpyDeviceToHost));

  printf("Sum: %d\n", h_output);
  printf("Max: %d\n", h_max);
  printf("Min: %d\n", h_min);

  CUDA_CHECK(cudaFree(d_output));
  CUDA_CHECK(cudaFree(d_input));
  CUDA_CHECK(cudaFree(d_max));
  CUDA_CHECK(cudaFree(d_min));

  return 0;
}


Overwriting 4.cu


In [26]:
!nvcc -o 4 4.cu

In [27]:
!./4

Sum: 524800
Max: 1024
Min: 1
