In [20]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0


In [21]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-5yiwd3b5
  Running command git clone -q https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-5yiwd3b5


In [22]:
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [23]:
!nvidia-smi

Mon Oct 17 19:26:33 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [51]:
%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>

void reductionCudaImproved(float* result, const float* input, int SIZE);
__global__ void reductionKernelImproved(float* result, const float* input, int SIZE);
void reductionCPU(float* result, const float* input, int SIZE);

#define TILE 32
#define ILP 8
#define BLOCK_X_IMPR (TILE / ILP)
#define BLOCK_Y_IMPR 32
#define BLOCK_COUNT_X_IMPR 100

void reductionCPU(float* result, const float* input, int SIZE)
{
    for (int i = 0; i < SIZE; i++)
        *result += input[i];
}

__global__ void reductionKernelImproved(float* result, const float* input, int SIZE)
{
    int i;
    int col = (blockDim.x * blockIdx.x + threadIdx.x) * ILP;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int index = row * blockDim.x * gridDim.x * ILP + col;
    __shared__ float interResult;

    if (threadIdx.x == 0 && threadIdx.y == 0)
        interResult = 0.0;

    __syncthreads();

#pragma unroll 
    for (i = 0; i < ILP; i++)
    {
        if (index < SIZE)
        {
            atomicAdd(&interResult, input[index]);
            index++;
        }
    }

    __syncthreads();

    if (threadIdx.x == 0 && threadIdx.y == 0)
        atomicAdd(result, interResult);
}

void reductionCudaImproved(float* result, const float* input, int SIZE, float cpuTime)
{
    dim3 dim_grid, dim_block;

    float* dev_input = 0;
    float* dev_result = 0;
    cudaEvent_t start, stop;
    float elapsed = 0;
    double gpuBandwidth;

    dim_block.x = BLOCK_X_IMPR;
    dim_block.y = BLOCK_Y_IMPR;
    dim_block.z = 1;

    dim_grid.x = BLOCK_COUNT_X_IMPR;
    dim_grid.y = (int)ceil((float)SIZE / (float)(TILE * dim_block.y * BLOCK_COUNT_X_IMPR));
    dim_grid.z = 1;

    cudaSetDevice(0);

    cudaMalloc((void**)&dev_input, SIZE * sizeof(float));
    cudaMalloc((void**)&dev_result, sizeof(float));
    cudaMemcpy(dev_input, input, SIZE * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_result, result, sizeof(float), cudaMemcpyHostToDevice);

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    reductionKernelImproved << <dim_grid, dim_block >> > (dev_result, dev_input, SIZE);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    cudaEventElapsedTime(&elapsed, start, stop);

    printf("GPU Time (improved): %f ms\n", elapsed);
    printf("Acceleration factor: %f \n", cpuTime/elapsed);

    cudaDeviceSynchronize();

    cudaMemcpy(result, dev_result, sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(dev_input);
    cudaFree(dev_result);

    return;
}

int main()
{
    int i,j;
    float* input;
    float resultCPU, resultGPU;
    double cpuTime, cpuBandwidth;
    int SIZE;
    int a[] = {1000, 5000, 6000, 7000, 8000, 9000, 10000, 50000, 100000, 500000, 1000000};
  for (j = 0; j < 11; ++j) {
    SIZE = a[j];
    printf("Size : %d \n", SIZE);
    input = (float*)malloc(SIZE * sizeof(float));
    resultCPU = 0.0;
    resultGPU = 0.0;

    auto start = std::chrono::high_resolution_clock::now();
    auto end = std::chrono::high_resolution_clock::now();

    for (i = 0; i < SIZE; i++)
        input[i] = rand() % 10 - 5;

    start = std::chrono::high_resolution_clock::now();
    reductionCPU(&resultCPU, input, SIZE);
    end = std::chrono::high_resolution_clock::now();

    std::chrono::duration<double> diff = end - start;
    cpuTime = (diff.count() * 1000);
    printf("CPU Time: %f ms\n", cpuTime);

    reductionCudaImproved(&resultGPU, input, SIZE, cpuTime);
  }
    return 0;
 
}

Size : 1000 
CPU Time: 0.002889 ms
GPU Time (improved): 0.024640 ms
Acceleration factor: 0.117248 
Size : 5000 
CPU Time: 0.026103 ms
GPU Time (improved): 0.022048 ms
Acceleration factor: 1.183917 
Size : 6000 
CPU Time: 0.016935 ms
GPU Time (improved): 0.019936 ms
Acceleration factor: 0.849468 
Size : 7000 
CPU Time: 0.019786 ms
GPU Time (improved): 0.026528 ms
Acceleration factor: 0.745853 
Size : 8000 
CPU Time: 0.029911 ms
GPU Time (improved): 0.024576 ms
Acceleration factor: 1.217082 
Size : 9000 
CPU Time: 0.027104 ms
GPU Time (improved): 0.042272 ms
Acceleration factor: 0.641181 
Size : 10000 
CPU Time: 0.030123 ms
GPU Time (improved): 0.030752 ms
Acceleration factor: 0.979546 
Size : 50000 
CPU Time: 0.151009 ms
GPU Time (improved): 0.078592 ms
Acceleration factor: 1.921430 
Size : 100000 
CPU Time: 0.293632 ms
GPU Time (improved): 0.147712 ms
Acceleration factor: 1.987868 
Size : 500000 
CPU Time: 1.505501 ms
GPU Time (improved): 0.461504 ms
Acceleration factor: 3.262162 
Size

In [54]:
params = {
    'SIZE':            [1000, 5000, 6000, 7000, 8000, 9000, 10000, 50000, 100000, 500000, 1000000],
    'acceleration factor':[0.117248, 1.183917, 0.849468, 0.745853, 1.217082, 0.641181, 0.979546, 1.921430, 1.987868, 3.262162, 3.623168],
    'cpu_time':     [0.002889, 0.026103, 0.016935, 0.019786, 0.029911, 0.027104, 0.030123, 0.151009, 0.293632, 1.505501, 2.965433],
    'gpu_time':     [0.024640, 0.022048, 0.019936, 0.026528, 0.024576, 0.042272, 0.030752, 0.078592, 0.147712, 0.461504, 0.818464],
    'SIZE1':            [1000, 5000, 6000, 7000, 8000, 9000, 10000],
    'cpu_time1':     [0.002889, 0.026103, 0.016935, 0.019786, 0.029911, 0.027104, 0.030123],
    'gpu_time1':     [0.024640, 0.022048, 0.019936, 0.026528, 0.024576, 0.042272, 0.030752],
}

exp_count = len(params["SIZE"])

In [55]:
import plotly.graph_objects as go
import numpy as np
import matplotlib.pyplot as plt
data = []
for k in ['cpu_time', 'gpu_time']:
      plot = go.Scatter(x=params['SIZE'],
                          y=params[k],
                          mode="lines+markers",
                          name=k
                          )
      data.append(plot)
layout = go.Layout(title="Зависимость времени от размера вектора",
                   xaxis= dict(title= 'Размер вектора',ticklen= 20,zeroline= False), 
                   yaxis= dict(title= 'Время, миллисекунды',ticklen= 20,zeroline= False))
fig1 = go.Figure(data=data, layout = layout)
fig1.show()

data = []
plot = go.Scatter(x=params['SIZE'],
                      y=params['acceleration factor'],
                      mode="lines+markers"
                      )
layout = go.Layout(title="График зависимости ускорения от размера вектора",
                   xaxis= dict(title= 'Размер вектора',ticklen= 20,zeroline= False), 
                   yaxis= dict(title= 'Ускорение',ticklen= 20,zeroline= False))
data.append(plot)
fig2 = go.Figure(data=data, layout = layout)
fig2.show()

Рассмотрим график зависимости времени размера вектора приближенно.

In [56]:
data = []
for k in ['cpu_time1', 'gpu_time1']:
      plot = go.Scatter(x=params['SIZE1'],
                          y=params[k],
                          mode="lines+markers",
                          name=k
                          )
      data.append(plot)
layout = go.Layout(title="Зависимость времени от размера вектора",
                   xaxis= dict(title= 'Размер вектора',ticklen= 20,zeroline= False), 
                   yaxis= dict(title= 'Время, миллисекунды',ticklen= 20,zeroline= False))
fig1 = go.Figure(data=data, layout = layout)
fig1.show()


Хотелось найти такую граничную точку, при которой бы время вычисления на GPU стало бы меньше, но такую точку найти не удалось.

Вывод: как мы видим, при размере вектора более 50000 значений нужно использовать GPU. С увеличением размера вектора, данное утверждение становится очевидным.