In [13]:
import numpy as np
import matplotlib as plt
import IPythonMagic
from Timer import Timer
import pycuda.driver as cuda_driver
import pycuda.compiler as cuda_compiler
from pycuda.gpuarray import GPUArray

In [14]:
%setup_logging

Global logger already initialized!


In [15]:
%cuda_context_handler context

Registering context in user workspace
Context already registered! Ignoring


In [22]:
kernel_src = """

__global__ void shmemReduction(float* output, float* input, int size, int _nt) {
    
    // Global memory -> maximum foe every thread
    // Block idx is always 0, since we use only one block
    
    int gid = blockIdx.x * blockIdx.x + threadIdx.x;
    float max_value = -9999999.99;
    // float max_value = 0.0;
    for (int i = threadIdx.x; i<size; i = i + blockDim.x) {
        max_value = fmax(max_value, input[i]);
    }
    
    // Check
    output[threadIdx.x] = max_value;
    
    // Find and store the local maximum in shared-memory
    int nt = _nt;
    __shared__ float max_shared[nt];
    max_shared[threadIdx.x] = max_value;
    
    // Sync
    __syncthreads();
    
    // Find the max
    /*
    if (threadIdx.x<32)
        max_shared[threadIdx.x] = fmaxf( max_shared[threadIdx.x], max_shared[threadIdx.x+32] );
    if (threadIdx.x<16)
        max_shared[threadIdx.x] = fmaxf( max_shared[threadIdx.x], max_shared[threadIdx.x+16] );
    if (threadIdx.x<8)
        max_shared[threadIdx.x] = fmaxf( max_shared[threadIdx.x], max_shared[threadIdx.x+8] );
    if (threadIdx.x<4)
        max_shared[threadIdx.x] = fmaxf( max_shared[threadIdx.x], max_shared[threadIdx.x+4] );
    if (threadIdx.x<2)
        max_shared[threadIdx.x] = fmaxf( max_shared[threadIdx.x], max_shared[threadIdx.x+2] );
    if (threadIdx.x<1)
        max_shared[threadIdx.x] = fmaxf( max_shared[threadIdx.x], max_shared[threadIdx.x+1] );
    */
    while ( nt>0 ) {
        if (threadIdx.x<nt)
            max_shared[threadIdx.x] = fmaxf( max_shared[threadIdx.x], max_shared[threadIdx.x+nt] );
        if ( nt>32 )
            __syncthreads();
        nt = nt/2;
    }
    
    // Write to output
    if (threadIdx.x==0)
        output[0] = max_shared[0];
    
}

"""

kernel_module = cuda_compiler.SourceModule(kernel_src)
kernel_function = kernel_module.get_function("shmemReduction")

CompileError: nvcc compilation of /tmp/tmpix5pdg7q/kernel.cu failed
[command: nvcc --cubin -arch sm_37 -I/home/ubuntu/.local/lib/python3.6/site-packages/pycuda/cuda kernel.cu]
[stderr:
kernel.cu(21): error: expression must have a constant value

kernel.cu(9): warning: variable "gid" was declared but never referenced

1 error detected in the compilation of "/tmp/tmpxft_00001420_00000000-6_kernel.cpp1.ii".
]

In [17]:
n = 128
a = np.random.rand(n).astype(np.float32)
print(a)

a_g = GPUArray(a.shape, a.dtype)
a_g.set(a)

num_threads = 64
b = np.empty((1,num_threads)).astype(np.float32)
b_g = GPUArray(b.shape, b.dtype)

[0.7848292  0.8035179  0.6661633  0.4362775  0.14559805 0.2237722
 0.8979147  0.24531668 0.6544041  0.20344712 0.7684155  0.64986336
 0.6732486  0.11934242 0.6662393  0.24418062 0.76820046 0.6371303
 0.5391755  0.07863975 0.85514474 0.30057132 0.3635121  0.9111666
 0.93807524 0.67226166 0.9067363  0.67060715 0.2590795  0.9183377
 0.44751853 0.25068313 0.47307548 0.38210386 0.5861356  0.6989991
 0.7308389  0.632069   0.1522887  0.5991435  0.2317969  0.13030311
 0.98944765 0.9419435  0.8709065  0.01420743 0.02860881 0.683816
 0.66840166 0.8669783  0.01202685 0.5459647  0.4260259  0.05549169
 0.86324567 0.95831513 0.13250278 0.20227909 0.9709796  0.12757729
 0.05081191 0.03339012 0.55195785 0.06025    0.41970348 0.4406396
 0.47934917 0.97619104 0.0918419  0.12877244 0.3438261  0.5032521
 0.4812024  0.7640697  0.57360566 0.7157317  0.20185104 0.7379211
 0.85921705 0.86202437 0.36600062 0.17110272 0.46811852 0.73640066
 0.8052173  0.1754387  0.3654525  0.35389277 0.9781277  0.67134196
 0.90

In [18]:
block_size = (num_threads, 1, 1)
grid_size = (1,1,1)

kernel_function(b_g, a_g, np.int32(n), grid=grid_size, block=block_size)

b_g.get(b)
print(a)
print(b)
print(np.max(a))

[0.7848292  0.8035179  0.6661633  0.4362775  0.14559805 0.2237722
 0.8979147  0.24531668 0.6544041  0.20344712 0.7684155  0.64986336
 0.6732486  0.11934242 0.6662393  0.24418062 0.76820046 0.6371303
 0.5391755  0.07863975 0.85514474 0.30057132 0.3635121  0.9111666
 0.93807524 0.67226166 0.9067363  0.67060715 0.2590795  0.9183377
 0.44751853 0.25068313 0.47307548 0.38210386 0.5861356  0.6989991
 0.7308389  0.632069   0.1522887  0.5991435  0.2317969  0.13030311
 0.98944765 0.9419435  0.8709065  0.01420743 0.02860881 0.683816
 0.66840166 0.8669783  0.01202685 0.5459647  0.4260259  0.05549169
 0.86324567 0.95831513 0.13250278 0.20227909 0.9709796  0.12757729
 0.05081191 0.03339012 0.55195785 0.06025    0.41970348 0.4406396
 0.47934917 0.97619104 0.0918419  0.12877244 0.3438261  0.5032521
 0.4812024  0.7640697  0.57360566 0.7157317  0.20185104 0.7379211
 0.85921705 0.86202437 0.36600062 0.17110272 0.46811852 0.73640066
 0.8052173  0.1754387  0.3654525  0.35389277 0.9781277  0.67134196
 0.90