# 3.2. Performance guidelines: data types

In [1]:
import math
import numpy as np
from numba import cuda, float32, int32
import cupy as cp

In [2]:
%%writefile 3_2_convolve_float32.py

import math
import numpy as np
from numba import cuda, float32, int32
from tests import benchmark_convolve

@cuda.jit
def convolve_gpu_kernel(y, x, h):
    """CUDA kernel: centered 1-D convolution, one output sample per thread.

    Each thread computes ``y[i] = sum_j x[i + offset - j] * h[j]`` where
    ``offset = ceil(len(h)/2) - 1``; terms whose input index falls outside
    ``x`` are skipped (i.e. treated as zero).

    The accumulator is created with ``float32(0.0)``, so the running sum
    stays in single precision when ``x`` and ``h`` are float32 arrays.

    Args:
        y: output array, written at index ``i`` by thread ``i``.
        x: input signal.
        h: filter coefficients.
    """
    out_idx = cuda.grid(1)
    n_samples = len(x)
    n_taps = len(h)
    center = int32(math.ceil(n_taps/2)-1)

    # Threads past the end of the output have nothing to do.
    if out_idx >= len(y):
        return

    acc = float32(0.0)

    for tap in range(n_taps):
        src = out_idx + center - tap
        # Skip taps that would read outside the input signal.
        if src < 0 or src >= n_samples:
            continue
        acc += x[src]*h[tap]

    y[out_idx] = acc
    
def convolve_gpu(y, x, h):
    """Launch ``convolve_gpu_kernel`` and return the result as a host array.

    Args:
        y: output buffer providing ``copy_to_host()``, or ``None`` to
            allocate a device array with the shape and dtype of ``x``.
        x: input signal, passed to the kernel as given.
        h: filter coefficients, passed to the kernel as given.

    Returns:
        The contents of ``y`` copied back to the host.
    """
    threads_per_block = 256
    if y is None:
        y = cuda.device_array(x.shape, dtype=x.dtype)
    # Enough blocks so that every output sample gets its own thread.
    n_blocks = math.ceil(len(y)/threads_per_block)
    launch = convolve_gpu_kernel[(n_blocks, ), (threads_per_block, )]
    launch(y, x, h)
    return y.copy_to_host()

# Run the benchmark helper imported from `tests`; passing y=None makes
# convolve_gpu allocate its own output device array on every call.
benchmark_convolve(lambda x, h: convolve_gpu(None, x, h))

Overwriting 3_2_convolve_float32.py


In [3]:
! nvprof --trace gpu python 3_2_convolve_float32.py

==10319== NVPROF is profiling process 10319, command: python 3_2_convolve_float32.py
Benchmark result: 
Average processing time: 0.0310 seconds (+/- 0.0530), median: 0.0230
==10319== Profiling application: python 3_2_convolve_float32.py
==10319== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   80.99%  1.89548s       100  18.955ms  17.090ms  29.145ms  cudapy::__main__::convolve_gpu_kernel$241(Array<float, int=1, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)
                   12.28%  287.32ms       300  957.74us  1.1520us  3.0243ms  [CUDA memcpy DtoH]
                    6.73%  157.48ms       200  787.39us     896ns  5.1937ms  [CUDA memcpy HtoD]
No API activities were profiled.


What if we just change `value = float32(0.0)` to `value = 0.0`?

In [4]:
%%writefile 3_2_convolve_value_float64.py

import math
import numpy as np
from numba import cuda, float32, int32
from tests import benchmark_convolve

@cuda.jit
def convolve_gpu_kernel(y, x, h):
    """CUDA kernel: centered 1-D convolution, one output sample per thread.

    Each thread computes ``y[i] = sum_j x[i + offset - j] * h[j]`` where
    ``offset = ceil(len(h)/2) - 1``; terms whose input index falls outside
    ``x`` are skipped (i.e. treated as zero).

    This variant deliberately initializes the accumulator with the plain
    literal ``0.0`` (a float64) instead of ``float32(0.0)``, to show the
    cost of mixing precisions inside the loop.

    Args:
        y: output array, written at index ``i`` by thread ``i``.
        x: input signal.
        h: filter coefficients.
    """
    out_idx = cuda.grid(1)
    n_samples = len(x)
    n_taps = len(h)
    center = int32(math.ceil(n_taps/2)-1)

    # Threads past the end of the output have nothing to do.
    if out_idx >= len(y):
        return

    # Intentionally an untyped literal: this is the float64 accumulator
    # under test, do not "fix" it to float32.
    acc = 0.0

    for tap in range(n_taps):
        src = out_idx + center - tap
        # Skip taps that would read outside the input signal.
        if src < 0 or src >= n_samples:
            continue
        acc += x[src]*h[tap]

    y[out_idx] = acc
    
def convolve_gpu(y, x, h):
    """Launch ``convolve_gpu_kernel`` and return the result as a host array.

    Args:
        y: output buffer providing ``copy_to_host()``, or ``None`` to
            allocate a device array with the shape and dtype of ``x``.
        x: input signal, passed to the kernel as given.
        h: filter coefficients, passed to the kernel as given.

    Returns:
        The contents of ``y`` copied back to the host.
    """
    threads_per_block = 256
    if y is None:
        y = cuda.device_array(x.shape, dtype=x.dtype)
    # Enough blocks so that every output sample gets its own thread.
    n_blocks = math.ceil(len(y)/threads_per_block)
    launch = convolve_gpu_kernel[(n_blocks, ), (threads_per_block, )]
    launch(y, x, h)
    return y.copy_to_host()

# Run the benchmark helper imported from `tests`; passing y=None makes
# convolve_gpu allocate its own output device array on every call.
benchmark_convolve(lambda x, h: convolve_gpu(None, x, h))

Overwriting 3_2_convolve_value_float64.py


In [5]:
! nvprof --trace gpu python 3_2_convolve_value_float64.py

==10347== NVPROF is profiling process 10347, command: python 3_2_convolve_value_float64.py
Benchmark result: 
Average processing time: 0.0393 seconds (+/- 0.0526), median: 0.0323
==10347== Profiling application: python 3_2_convolve_value_float64.py
==10347== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   86.83%  2.77792s       100  27.779ms  26.496ms  46.774ms  cudapy::__main__::convolve_gpu_kernel$241(Array<float, int=1, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>, Array<float, int=1, C, mutable, aligned>)
                    8.48%  271.40ms       300  904.68us  1.1200us  2.6458ms  [CUDA memcpy DtoH]
                    4.69%  150.03ms       200  750.16us     928ns  2.6278ms  [CUDA memcpy HtoD]
No API activities were profiled.


The processing time increases because `0.0` is a float64 literal: every accumulation step now involves a conversion from float32 to float64 (here: `value += x[k]*h[j]`), and the final result is converted from float64 back to float32 (here: `y[i] = value`).

Let's check what results we get when we change all the computations to float64.

In [7]:
%%writefile 3_2_convolve_float64.py

import math
import numpy as np
from numba import cuda, float64, int32
from tests import benchmark_convolve

@cuda.jit
def convolve_gpu_kernel(y, x, h):
    """CUDA kernel: centered 1-D convolution, one output sample per thread.

    Each thread computes ``y[i] = sum_j x[i + offset - j] * h[j]`` where
    ``offset = ceil(len(h)/2) - 1``; terms whose input index falls outside
    ``x`` are skipped (i.e. treated as zero).

    The accumulator is created with ``float64(0.0)``, so the running sum
    is kept in double precision.

    Args:
        y: output array, written at index ``i`` by thread ``i``.
        x: input signal.
        h: filter coefficients.
    """
    out_idx = cuda.grid(1)
    n_samples = len(x)
    n_taps = len(h)
    center = int32(math.ceil(n_taps/2)-1)

    # Threads past the end of the output have nothing to do.
    if out_idx >= len(y):
        return

    acc = float64(0.0)

    for tap in range(n_taps):
        src = out_idx + center - tap
        # Skip taps that would read outside the input signal.
        if src < 0 or src >= n_samples:
            continue
        acc += x[src]*h[tap]

    y[out_idx] = acc
    
def convolve_gpu(y, x, h):
    """Launch ``convolve_gpu_kernel`` and return the result as a host array.

    Args:
        y: output buffer providing ``copy_to_host()``, or ``None`` to
            allocate a device array with the shape and dtype of ``x``.
        x: input signal, passed to the kernel as given.
        h: filter coefficients, passed to the kernel as given.

    Returns:
        The contents of ``y`` copied back to the host.
    """
    threads_per_block = 256
    if y is None:
        y = cuda.device_array(x.shape, dtype=x.dtype)
    # Enough blocks so that every output sample gets its own thread.
    n_blocks = math.ceil(len(y)/threads_per_block)
    launch = convolve_gpu_kernel[(n_blocks, ), (threads_per_block, )]
    launch(y, x, h)
    return y.copy_to_host()

# Run the benchmark helper imported from `tests` with dtype=np.float64
# (presumably the dtype of the generated inputs -- confirm in tests.benchmark_convolve);
# passing y=None makes convolve_gpu allocate its own output device array.
benchmark_convolve(lambda x, h: convolve_gpu(None, x, h), dtype=np.float64)

Overwriting 3_2_convolve_float64.py


In [8]:
! nvprof --trace gpu python 3_2_convolve_float64.py

==10460== NVPROF is profiling process 10460, command: python 3_2_convolve_float64.py
Benchmark result: 
Average processing time: 0.0347 seconds (+/- 0.0605), median: 0.0276
==10460== Profiling application: python 3_2_convolve_float64.py
==10460== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   68.33%  1.81831s       100  18.183ms  17.710ms  27.042ms  cudapy::__main__::convolve_gpu_kernel$241(Array<double, int=1, C, mutable, aligned>, Array<double, int=1, C, mutable, aligned>, Array<double, int=1, C, mutable, aligned>)
                   20.28%  539.71ms       300  1.7990ms  1.1520us  5.1537ms  [CUDA memcpy DtoH]
                   11.39%  303.18ms       200  1.5159ms     992ns  3.7395ms  [CUDA memcpy HtoD]
No API activities were profiled.
