<a href="https://colab.research.google.com/github/HeptaDecane/LP1_SEM7/blob/main/HPC/A01/parallel_reduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:09:46_PDT_2020
Cuda compilation tools, release 11.1, V11.1.105
Build cuda_11.1.TC455_06.29190527_0


In [2]:
code = """
#include<iostream>
#include<math.h>

#define n 8

using namespace std;

__global__ void minimum(int *input) {
    int tid = threadIdx.x;
    int step_size = 1;
    int number_of_threads = blockDim.x;

    printf("No of threads = %d", number_of_threads);

    while(number_of_threads>0) {
        if(tid < number_of_threads) {
            int first = tid*step_size*2;
            int second = first + step_size;
            if(input[second] < input[first])
                input[first] = input[second];
        }
        step_size <<= 1;
        number_of_threads >>= 1;
    }
}

__global__ void maximum(int *input) {
    int tid = threadIdx.x;
    int step_size = 1;
    int number_of_threads = blockDim.x;

    while(number_of_threads>0) {
        if(tid < number_of_threads) {
            int first = tid*step_size*2;
            int second = first + step_size;
            if(input[second] > input[first])
                input[first] = input[second];
        }
        step_size <<= 1;
        number_of_threads >>= 1;
    }
}

__global__ void sum(int *input) {
    const int tid = threadIdx.x;
    int step_size = 1;
    int number_of_threads = blockDim.x;

    while(number_of_threads > 0) {
        if(tid < number_of_threads) {
            int first = tid * step_size * 2;
            int second = first + step_size;

            input[first] += input[second];
        }
        step_size <<= 1;
        number_of_threads >>= 1;

    }
}

__global__ void mean_diff_sq(float *input, float mean) {
    input[threadIdx.x] -= mean;
    input[threadIdx.x] *= input[threadIdx.x];
}

__global__ void sum_floats(float *input) {
    int tid = threadIdx.x;
    int step_size = 1;
    int number_of_threads = blockDim.x;

    while(number_of_threads > 0) {
        if(tid < number_of_threads) {
            int first = tid * step_size * 2;
            int second = first + step_size;

            input[first] += input[second];
        }
        step_size <<= 1;
        number_of_threads >>= 1;

    }
}

void copy_int_to_float(float *dest, int *src, int size){
    for(int i=0; i<size; i++)
        dest[i] = float(src[i]);
}

void random_ints(int *input, int size) {
    cout<<"Input: ";
    for(int i=0; i<size; i++)  {
        input[i] = rand()%100;
        cout<<input[i]<<"  ";
    }
    cout<<endl;

}

int main() {
    //int n=8;
    int size = n*sizeof(int); //calculate no. of bytes for array

    int *arr;
    int *arr_d, result;

    arr = (int *)malloc(size);
    random_ints(arr, n);

    cudaMalloc((void **)&arr_d, size);

    //MIN
    cudaMemcpy(arr_d, arr, size, cudaMemcpyHostToDevice);

    minimum<<<1,n/2>>>(arr_d);

    cudaMemcpy(&result, arr_d, sizeof(int), cudaMemcpyDeviceToHost);

    cout<<"The minimum element is "<<result<<endl;


    //MAX
    cudaMemcpy(arr_d, arr, size, cudaMemcpyHostToDevice);

    maximum<<<1,n/2>>>(arr_d);

    cudaMemcpy(&result, arr_d, sizeof(int), cudaMemcpyDeviceToHost);

    cout<<"The maximum element is "<<result<<endl;

    //SUM
    cudaMemcpy(arr_d, arr, size, cudaMemcpyHostToDevice);

    sum<<<1,n/2>>>(arr_d);

    cudaMemcpy(&result, arr_d, sizeof(int), cudaMemcpyDeviceToHost);

    cout<<"The sum is "<<result<<endl;

    //AVERAGE

    float mean = float(result)/n;
    cout<<"The mean is "<<mean<<endl;

    //STANDARD DEVIATION
    float *arr_float;
    float *arr_std, stdValue;

    arr_float = (float *)malloc(n*sizeof(float));
    cudaMalloc((void **)&arr_std, n*sizeof(float));

    copy_int_to_float(arr_float, arr, n);

    cudaMemcpy(arr_std, arr_float, n*sizeof(float), cudaMemcpyHostToDevice);

    mean_diff_sq <<<1,n>>>(arr_std, mean);
    sum_floats<<<1,n/2>>>(arr_std);

    cudaMemcpy(&stdValue, arr_std, sizeof(float), cudaMemcpyDeviceToHost);


    stdValue = stdValue / n;
    cout<<"The variance is "<<stdValue<<endl;
    stdValue = sqrt(stdValue);

    cout<<"The standard deviation is "<<stdValue<<endl;

    cudaFree(arr_d);

    return 0;
}
"""

In [3]:
file = open("parallel_reduction.cu", "w")
file.write(code)
file.close()

In [4]:
!nvcc parallel_reduction.cu

In [5]:
!./a.out

Input: 83  86  77  15  93  35  86  92  
The minimum element is 83
The maximum element is 83
The sum is 83
The mean is 10.375
The variance is 10.375
The standard deviation is 3.22102


In [6]:
!nvprof ./a.out

Input: 83  86  77  15  93  35  86  92  
==453== NVPROF is profiling process 453, command: ./a.out
The minimum element is 83
The maximum element is 83
The sum is 83
The mean is 10.375
The variance is 10.375
The standard deviation is 3.22102
==453== Profiling application: ./a.out
==453== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   56.20%  8.9920us         4  2.2480us  2.0800us  2.4000us  [CUDA memcpy DtoH]
                   43.80%  7.0080us         4  1.7520us  1.5680us  2.3040us  [CUDA memcpy HtoD]
      API calls:   99.38%  178.78ms         2  89.391ms  10.057us  178.77ms  cudaMalloc
                    0.36%  646.97us         1  646.97us  646.97us  646.97us  cuDeviceTotalMem
                    0.13%  239.77us       101  2.3730us     159ns  115.79us  cuDeviceGetAttribute
                    0.09%  164.80us         8  20.599us  10.650us  36.186us  cudaMemcpy
                    0.02%  29.499us         1  29.499