## Day5

In [None]:
%%writefile 1.cu

#include <stdio.h>
#define N 16

// Serially adds the first `blockSize` entries of `sharedData` and returns
// the total. Intended to be called by a single thread of the block.
__device__ int blockReduceSum(int *sharedData, int blockSize){
  int total = 0;
  int idx = 0;
  while(idx < blockSize){
    total += sharedData[idx];
    ++idx;
  }
  return total;
}

// Computes one partial sum per block: every thread stages one element of
// d_input in shared memory, then thread 0 of the block adds the staged values
// and stores the block total in d_output[blockIdx.x].
//
//   d_input  : input integers (at least n entries)
//   d_output : one slot per block, indexed by blockIdx.x
//   n        : number of valid elements in d_input
//
// The shared buffer has 4 slots, so the kernel must be launched with at most
// 4 threads per block.
__global__ void sumWithSharedMem(int *d_input, int *d_output, int n){
  __shared__ int sdata[4];

  int gid = blockIdx.x * blockDim.x + threadIdx.x;

  // Stage this thread's element. Threads past the end of the input store 0 so
  // the reduction never reads a shared slot that was not written.
  sdata[threadIdx.x] = (gid < n) ? d_input[gid] : 0;

  // Every store above must be visible before thread 0 reads the whole buffer.
  __syncthreads();

  if(threadIdx.x == 0){
    d_output[blockIdx.x] = blockReduceSum(sdata, blockDim.x);
  }
}

// Host driver: fills the input with 1..N, launches sumWithSharedMem with one
// 4-thread block per 4 inputs, adds the per-block partial sums on the host and
// prints the total. Returns 1 (after printing the CUDA error) if any runtime
// call or the kernel launch fails, 0 otherwise.
int main(){
  // The kernel's shared buffer holds 4 ints, so a block may not exceed 4 threads.
  const int threadsPerBlock = 4;
  // Ceil-division so a trailing partial block is still launched.
  const int numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
  const size_t inputBytes = N * sizeof(int);
  const size_t outputBytes = numBlocks * sizeof(int);

  int h_input[N], h_output[numBlocks], finalSum = 0;
  int *d_input = NULL, *d_output = NULL;

  for(int i=0; i<N; i++){
    h_input[i] = i+1;
  }

  // Each step runs only if everything before it succeeded; the first failure
  // is kept in `err` and reported once below.
  cudaError_t err = cudaMalloc((void **) &d_input, inputBytes);
  if(err == cudaSuccess){
    err = cudaMalloc((void **) &d_output, outputBytes);
  }
  if(err == cudaSuccess){
    err = cudaMemcpy(d_input, h_input, inputBytes, cudaMemcpyHostToDevice);
  }
  if(err == cudaSuccess){
    sumWithSharedMem<<<numBlocks, threadsPerBlock>>>(d_input, d_output, N);
    // A launch does not return a status; configuration errors show up here.
    err = cudaGetLastError();
  }
  if(err == cudaSuccess){
    // Blocking copy: also reports errors raised while the kernel was running.
    err = cudaMemcpy(h_output, d_output, outputBytes, cudaMemcpyDeviceToHost);
  }

  // Freeing a null pointer is a no-op, so this is safe on every path.
  cudaFree(d_input);
  cudaFree(d_output);

  if(err != cudaSuccess){
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    return 1;
  }

  for(int i=0; i<numBlocks; i++){
    finalSum += h_output[i];
  }

  printf("Final Sum: %d\n", finalSum);

  return 0;
}


Overwriting 1.cu


In [None]:
!nvcc -o 1 1.cu

In [None]:
%%writefile 1_1.cu

#include <stdio.h>
#define N 16

// Returns the sum of sharedData[0 .. blockSize-1], accumulated sequentially
// by the calling thread.
__device__ int blockReduceSum(int *sharedData, int blockSize) {
  int accumulated = 0;
  int *cursor = sharedData;
  for (int remaining = blockSize; remaining > 0; --remaining) {
    accumulated += *cursor;
    ++cursor;
  }
  return accumulated;
}


// Per-block partial sum. Each thread copies one input element (or 0 when its
// global index is past n) into a 4-slot shared buffer; after the barrier,
// thread 0 adds up blockDim.x slots and writes the result to
// d_output[blockIdx.x]. The 4-slot buffer means the launch must not use more
// than 4 threads per block.
__global__ void sumWithSharedMem(int *d_input, int *d_output, int n) {
  __shared__ int sdata[4];

  const int lane = threadIdx.x;
  const int globalIdx = lane + blockIdx.x * blockDim.x;

  // Stage the element, padding with zero beyond the end of the input.
  sdata[lane] = (globalIdx < n) ? d_input[globalIdx] : 0;

  // Wait until the whole block has finished writing shared memory.
  __syncthreads();

  // Only the first thread of the block performs the reduction.
  if (lane != 0) {
    return;
  }
  d_output[blockIdx.x] = blockReduceSum(sdata, blockDim.x);
}

// Host driver: uploads the values 1..N, runs sumWithSharedMem on 4 blocks of
// 4 threads, then adds the 4 partial sums on the host and prints the total.
int main() {
  const size_t inBytes = N * sizeof(int);
  const size_t outBytes = (N / 4) * sizeof(int);

  int h_input[N];
  int h_output[N / 4];
  int *d_input;
  int *d_output;

  // Host input is the sequence 1, 2, ..., N.
  int value = 1;
  for (int k = 0; k < N; ++k, ++value) {
    h_input[k] = value;
  }

  // Device buffers.
  cudaMalloc((void **)&d_input, inBytes);
  cudaMalloc((void **)&d_output, outBytes);

  // Upload, compute, download.
  cudaMemcpy(d_input, h_input, inBytes, cudaMemcpyHostToDevice);
  sumWithSharedMem<<<4, 4>>>(d_input, d_output, N);
  cudaMemcpy(h_output, d_output, outBytes, cudaMemcpyDeviceToHost);

  // Combine the per-block partial sums.
  int finalSum = 0;
  for (int b = 0; b < 4; ++b) {
    finalSum += h_output[b];
  }

  printf("Final Sum: %d\n", finalSum);

  cudaFree(d_input);
  cudaFree(d_output);

  return 0;
}


Writing 2.cu


In [None]:
!nvcc -o 1_1 1_1.cu

In [None]:
!./1_1

Final Sum: 136


## Square of the Array Elements

In [None]:
%%writefile 2.cu

#include <stdio.h>

// Returns x multiplied by itself.
__device__ int Square(int x){
  const int squared = x * x;
  return squared;
}

// Writes Square(d_input[i]) to d_output[i] for the first 5 elements; threads
// whose global index is 5 or more do nothing.
__global__ void squareKernel(int *d_input, int *d_output){
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;

  if(idx >= 5){
    return;
  }
  d_output[idx] = Square(d_input[idx]);
}

// Host driver: squares the five values {1,2,3,4,5} on the device with a
// single block of 5 threads and prints the results on one line.
int main(){
  const size_t bytes = 5 * sizeof(int);

  int h_input[] = {1,2,3,4,5};
  int h_output[5];
  int *d_input;
  int *d_output;

  cudaMalloc((void**)&d_input, bytes);
  cudaMalloc((void**)&d_output, bytes);

  // Upload, compute, download.
  cudaMemcpy(d_input, h_input, bytes, cudaMemcpyHostToDevice);
  squareKernel<<<1, 5>>>(d_input, d_output);
  cudaMemcpy(h_output, d_output, bytes, cudaMemcpyDeviceToHost);

  for(int k = 0; k < 5; ++k){
    printf("%d ", h_output[k]);
  }
  printf("\n");

  cudaFree(d_input);
  cudaFree(d_output);

  return 0;
}

Overwriting 3.cu


In [None]:
!nvcc -o 2 2.cu

In [None]:
!./2

1 4 9 16 25 


## Double the Square

In [None]:
%%writefile 3.cu

#include <stdio.h>

// Returns twice the value of x.
__device__ int Double(int x){
  const int doubled = 2 * x;
  return doubled;
}
// Returns x * x.
// The original also contained a Double(a) computation placed after the
// return statement; it could never execute and has been removed, so the
// result is unchanged.
__device__ int Square(int x){
  return x * x;
}

// For each of the first 5 elements, stores Square(input) in d_output and
// Double(input) in double_output. Threads with a global index of 5 or more
// do nothing.
__global__ void squareKernel(int *d_input, int *d_output, int *double_output){
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;

  if(idx >= 5){
    return;
  }

  const int value = d_input[idx];
  d_output[idx] = Square(value);
  double_output[idx] = Double(value);
}

// Host driver: runs squareKernel on {1,2,3,4,5} with one block of 5 threads,
// then prints the squared values on one line and the doubled values on the
// next.
int main(){
  int h_input[] = {1,2,3,4,5};
  int h_output[5], h_double_output[5];

  int *d_input, *d_output, *double_output;

  cudaMalloc((void**)&d_input, 5 * sizeof(int));
  cudaMalloc((void**)&d_output, 5 * sizeof(int));
  cudaMalloc((void**)&double_output, 5 * sizeof(int));

  cudaMemcpy(d_input, h_input, 5 * sizeof(int), cudaMemcpyHostToDevice);

  squareKernel<<<1, 5>>>(d_input, d_output, double_output);

  cudaMemcpy(h_output, d_output, 5 * sizeof(int), cudaMemcpyDeviceToHost);
  // This statement was missing its terminating ';', which stopped the file
  // from compiling.
  cudaMemcpy(h_double_output, double_output, 5 * sizeof(int), cudaMemcpyDeviceToHost);

  // Squared values.
  for(int i=0; i<5; i++){
    printf("%d ", h_output[i]);
  }
  printf("\n");

  // Doubled values (previously copied back but never shown).
  for(int i=0; i<5; i++){
    printf("%d ", h_double_output[i]);
  }
  printf("\n");

  cudaFree(d_input);
  cudaFree(d_output);
  cudaFree(double_output);

  return 0;
}

Overwriting 4.cu


In [None]:
!nvcc -o 3 3.cu  # Re-run this cell after fixing the compile error

[01m[0m[01m4.cu(44)[0m: [01;31merror[0m: expected a ";"
    for(int i=0; i<5; i++){
    ^

    printf("\n");
                ^


1 error detected in the compilation of "4.cu".


In [None]:
%%writefile 4_2.cu

#include <stdio.h>

// Returns the square of x.
__device__ int Square(int x) {
  return x * x;
}

// Returns twice the square of x (i.e. Square(x) * 2), not twice x itself.
__device__ int Double(int x) {
  const int squared = Square(x);
  const int result = squared * 2;
  return result;
}


// For each of the first 5 elements, stores Square(input) in d_output and
// Double(input) in double_output. Threads with a global index of 5 or more
// exit immediately.
__global__ void squareKernel(int *d_input, int *d_output, int *double_output) {
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;

  if (idx >= 5) {
    return;
  }

  d_output[idx] = Square(d_input[idx]);
  double_output[idx] = Double(d_input[idx]);
}

// Host driver: runs squareKernel on {1,2,3,4,5} with one block of 5 threads
// and prints, per element, the value written to each of the two output
// arrays.
int main() {
  const size_t bytes = 5 * sizeof(int);

  int h_input[] = {1, 2, 3, 4, 5};
  int h_output[5];
  int h_double_output[5];

  int *d_input;
  int *d_output;
  int *double_output;

  cudaMalloc((void**)&d_input, bytes);
  cudaMalloc((void**)&d_output, bytes);
  cudaMalloc((void**)&double_output, bytes);

  // Upload, compute, download both result arrays.
  cudaMemcpy(d_input, h_input, bytes, cudaMemcpyHostToDevice);
  squareKernel<<<1, 5>>>(d_input, d_output, double_output);
  cudaMemcpy(h_output, d_output, bytes, cudaMemcpyDeviceToHost);
  cudaMemcpy(h_double_output, double_output, bytes, cudaMemcpyDeviceToHost);

  printf("Squared and doubled values:\n");
  int k = 0;
  while (k < 5) {
    printf("Square[%d]: %d, Double[%d]: %d\n", k, h_output[k], k, h_double_output[k]);
    ++k;
  }
  printf("\n");

  cudaFree(d_input);
  cudaFree(d_output);
  cudaFree(double_output);

  return 0;
}


Overwriting 5.cu


In [None]:
!nvcc -o 4_2 4_2.cu

In [None]:
!./4_2

Squared and doubled values:
Square[0]: 1, Double[0]: 2
Square[1]: 4, Double[1]: 8
Square[2]: 9, Double[2]: 18
Square[3]: 16, Double[3]: 32
Square[4]: 25, Double[4]: 50



## Students Marks

In [None]:
%%writefile 6.cu
// Incomplete

#include <stdio.h>

// Returns the sum of the three marks stored at marks[0], marks[1], marks[2].
__device__ int TotalMarks(int *marks){
  int sum = 0;
  int subject = 0;
  while(subject < 3){
    sum += marks[subject];
    ++subject;
  }
  return sum;
}

// Returns the student's percentage out of a maximum of 300 marks (three
// marks, summed by TotalMarks), truncated toward zero to a whole number.
//
// The multiplication is done before the division: the previous form,
// (total / 300) * 100, performed integer division first and so produced 0
// for every total below 300.
__device__ int percentage(int *marks){
  int total = TotalMarks(marks);
  int perc = (total * 100) / 300;
  return perc;
}

// One thread per student: thread `id` reads that student's 3 consecutive
// marks from d_input and stores their total in d_total_marks[id] and their
// percentage in d_perc[id].
//
//   d_input       : marks laid out as 3 consecutive ints per student
//   d_total_marks : one total per student
//   d_perc        : one percentage per student
//   d_result      : currently unused by this kernel
//
// The guard matches the 3 students that main allocates buffers for. The
// previous bound of 4 would have let a fourth thread read and write past the
// end of those buffers.
__global__ void MarksofStd(int *d_input, int *d_total_marks, int *d_perc, int *d_result){
  const int kNumStudents = 3;
  const int kMarksPerStudent = 3;

  int id = blockDim.x * blockIdx.x + threadIdx.x;

  if(id < kNumStudents){
    int *studentMarks = &d_input[id * kMarksPerStudent];
    d_total_marks[id] = TotalMarks(studentMarks);
    d_perc[id] = percentage(studentMarks);
  }

}


// Host driver: uploads a 3x3 table of marks (3 students, 3 marks each), runs
// MarksofStd with one thread per student, and prints each student's total
// followed by their percentage, all on one line. Returns 1 (after printing
// the CUDA error) if the launch or a result copy fails, 0 otherwise.
int main(){
  int h_input[3][3] ={
    {40,60,70}, {35,70,90}, {22,66,56}
  } ;
  // The unused host variable h_result was removed (the compiler flagged it).
  int h_total_marks[3], h_perc[3];

  int *d_input, *d_total_marks, *d_perc, *d_result;

  cudaMalloc((void**)&d_input, 3*3 * sizeof(int));
  cudaMalloc((void**)&d_total_marks, 3 * sizeof(int));
  cudaMalloc((void**)&d_perc, 3 * sizeof(int));
  // d_result is passed to the kernel, which does not use it yet.
  cudaMalloc((void**)&d_result, 3 * sizeof(int));

  cudaMemcpy(d_input, h_input, 3*3 * sizeof(int), cudaMemcpyHostToDevice);

  MarksofStd<<<1, 3>>>(d_input, d_total_marks, d_perc, d_result);

  // A launch does not return a status; configuration errors show up here,
  // and the blocking copies below report errors raised during execution.
  cudaError_t err = cudaGetLastError();
  if(err == cudaSuccess){
    err = cudaMemcpy(h_total_marks, d_total_marks, 3 * sizeof(int), cudaMemcpyDeviceToHost);
  }
  if(err == cudaSuccess){
    err = cudaMemcpy(h_perc, d_perc, 3 * sizeof(int), cudaMemcpyDeviceToHost);
  }

  cudaFree(d_input);
  cudaFree(d_total_marks);
  cudaFree(d_perc);
  cudaFree(d_result);

  if(err != cudaSuccess){
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    return 1;
  }

  for(int i=0; i< 3; i++){
    printf("%d ", h_total_marks[i]);
    printf("%d ", h_perc[i]);
  }
  printf("\n");

  return 0;
}

Overwriting 6.cu


In [None]:
!nvcc -o 6 6.cu

    int h_total_marks[3], h_perc[3], h_result;
                                     ^




In [None]:

!./6

170 0 195 0 144 0 


Writing 7.cu
