In [1]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
import numpy as np

### Petit benchmark

Fais confiance aux temps affichés : ces benchmarks sont longs à relancer.

Pour vérifier la mémoire utilisée sur la carte graphique, utiliser `nvidia-smi`.

A priori, ElementwiseKernel est bien le plus rapide ; en plus, la mémoire est mieux gérée qu'en effectuant directement les opérations algébriques.

In [2]:
from pycuda.elementwise import ElementwiseKernel

# Element-wise kernel storing the squared difference of x and y into z.
error_kernel = ElementwiseKernel(
    arguments="const float *x, const float *y, float *z",
    operation="z[i] = (x[i] - y[i])*(x[i] - y[i])",
    name="error_kernel",
)

In [4]:
# Two 10000 x 1000 float32 host matrices drawn with randn (a first, then b).
a, b = (np.random.randn(10000, 1000).astype(np.float32) for _ in range(2))
# Device copies of both inputs, plus an uninitialised device buffer shaped like a_gpu
# that receives the kernel result.
a_gpu, b_gpu = map(gpuarray.to_gpu, (a, b))
diff_matrix = gpuarray.empty_like(a_gpu)

In [8]:
%%timeit
((a_gpu - b_gpu)*(a_gpu - b_gpu)) # En plus soucis de mémoire

8.89 ms ± 39.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [6]:
%%timeit
error_kernel(a_gpu , b_gpu, diff_matrix)

2.74 ms ± 2.85 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [9]:
%%timeit
k = (a-b)**2

15.9 ms ± 297 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [38]:
%%timeit
# Times fetching diff_matrix from the GPU with .get().
k = diff_matrix.get()

91.6 ms ± 544 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
# Small test matrices for computing each matrix's penalty (used to avoid overfitting).
a = np.arange(1, 11).astype(np.float32).reshape(5, 2)
a_gpu = gpuarray.to_gpu(a)
# Cast to float32 like `a`: np.arange alone yields an integer array, while the
# kernels in this notebook all take `float *` arguments.
b = np.arange(1, 10).astype(np.float32).reshape(3, 3)
b_gpu = gpuarray.to_gpu(b)

In [10]:
def create_from_template(template, function_name, v=False, **kwargs):
    """Fill in a CUDA source template, compile it and return one of its kernels.

    Args:
        template: CUDA source text containing ``%(name)s`` placeholders.
        function_name: Name of the kernel to fetch from the compiled module.
        v: When true, print the filled-in source before compiling it.
        **kwargs: Values substituted into the placeholders via ``template % kwargs``.

    Returns:
        Whatever ``SourceModule(...).get_function(function_name)`` returns.
    """
    source = template % kwargs
    if v:
        print(source)
    return SourceModule(source).get_function(function_name)
    


# CUDA template: one thread per column of a row-major nlines x ncols matrix `a`;
# each thread adds up its column and stores the total in b[column].
# Placeholders filled by `template % kwargs`: nlines, ncols.
# Threads whose index is not a valid column return immediately, so a launch with
# more threads than columns no longer writes past the end of b.
column_sum_template = """
                        __global__ void sumcol (const float *a, float *b)
                        {
                            const int nlines = %(nlines)s;
                            const int ncols = %(ncols)s;
                            const int block_start = blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y);
                            const int col = threadIdx.x + block_start;
                            // Surplus threads have no column to sum and must not touch b.
                            if (col >= ncols)
                                return;
                            float sum = 0;
                            for (int idx = col; idx < nlines * ncols; idx += ncols)
                                sum += a[idx];
                            b[col] = sum;
                        }"""

# CUDA template: one thread per row of a row-major matrix `a` with ncols columns;
# each thread adds up its row and stores the total in b[row].
# Placeholder filled by `template % kwargs`: ncols.
# The template has no row count, so the kernel cannot check its row index:
# launch it with exactly one thread per row.
line_sum_template = """
                        __global__ void sumline (const float *a, float *b)
                        {
                            const int ncols = %(ncols)s;
                            const int block_start = blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y);
                            const int row = block_start + threadIdx.x;
                            const int row_start = row * ncols;
                            float sum = 0;
                            for (int idx = row_start; idx < row_start + ncols; idx++)
                                sum += a[idx];
                            // One output per row: index b by the row, not by the row offset in a.
                            b[row] = sum;
                        }"""

In [11]:
# Compile the column-sum kernel with nlines=5 and ncols=2.
lsummodfunc = create_from_template(
    column_sum_template,
    "sumcol",
    nlines=5,
    ncols=2,
    d="%d",
)

In [8]:
# One output slot per column of the input matrix.
out_sum_columns = gpuarray.zeros((1, 2), dtype=np.float32)
# Launch exactly one thread per column (2 threads in a single block): with only two
# output slots, a thread with an index of 2 or more has nothing valid to write.
lsummodfunc(a_gpu, out_sum_columns, block=(2, 1, 1), grid=(1, 1))
out_sum_columns.get()

array([[ 25.,  30.]], dtype=float32)

In [9]:
# Fetch a_gpu with .get() to display its contents.
a_gpu.get()

array([[  1.,   2.],
       [  3.,   4.],
       [  5.,   6.],
       [  7.,   8.],
       [  9.,  10.]], dtype=float32)

In [None]:
def cost_function(real_matrix, predicted_matrix, l_u, l_v, p, q):
    """Return the squared prediction error plus norm penalties on both factors.

    Args:
        real_matrix: Observed matrix.
        predicted_matrix: Prediction, broadcast-compatible with ``real_matrix``.
        l_u: Weight of the penalty on ``p``.
        l_v: Weight of the penalty on ``q``.
        p: First factor matrix.
        q: Second factor matrix.

    Returns:
        The element-wise squared error with the scalar
        ``l_u * norm(p) + l_v * norm(q)`` added to every entry.
    """
    # Matrix of squared differences.
    squared_error = (real_matrix - predicted_matrix) ** 2
    # `norm` is not imported anywhere in the visible notebook; np.linalg.norm with
    # its default arguments is used for the cost of p and of q.
    # NOTE(review): the result keeps the shape of real_matrix; a scalar cost would
    # presumably need the squared error summed -- confirm the intent.
    return squared_error + l_u * np.linalg.norm(p) + l_v * np.linalg.norm(q)

# Création de la matrice principale

On va se concentrer sur un modèle qui fonctionne : on crée donc une matrice pour laquelle on sait qu'il existe une solution.
Pour cela, on crée deux matrices originales P_o et Q_o, que l'on multiplie.

On note que, contrairement à un véritable jeu de données, la matrice sera ici très dense, chaque cellule contenant une valeur. Dans la vraie vie (notes sur Netflix, Amazon, etc.), les matrices comprennent énormément de NaN, qu'on ne retrouvera pas ici.


In [11]:
# Build the two "true" factors P_o (100 x 1000) and Q_o (1000 x 100) that get multiplied
# together. Entries come from randn (standard normal), stored as float32.
p_o, q_o = (
    np.random.randn(*shape).astype(np.float32)
    for shape in ((100, 1000), (1000, 100))
)

On en profite pour faire un benchmark de la vitesse de multiplication de matrices par numpy et par cuda.
On remarque d'ailleurs que l'écart entre les deux augmente à mesure qu'on augmente la taille des matrices.

In [12]:
# CPU baseline: time NumPy's matrix product of the two factors.
%timeit np.matmul(p_o, q_o)

345 µs ± 27.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [182]:
# Upload both factors to the GPU.
p_gpu = gpuarray.to_gpu(p_o)
q_gpu = gpuarray.to_gpu(q_o)
# Kernel constants are read from the arrays so they stay correct when the matrix
# sizes are changed: nq = columns of q, ncom = rows of q (= columns of p).
# NOTE: matmul_template is defined in a later cell of this notebook; run that cell first.
matmulpq = create_from_template(
    matmul_template, "prodbyline", nq=q_o.shape[1], ncom=q_o.shape[0]
)
# Zero-filled output buffer: one row per row of p, one column per column of q.
res = gpuarray.zeros((p_o.shape[0], q_o.shape[1]), dtype=np.float32)
# Author's observation: slower than the cpu multiplication because of having to
# constantly go back and forth.


                  #include <stdio.h>
                  
                  __global__ void prodbyline (const float *p, const float *q, float *r)
                  {
                  const int nq = 100; // number of columns in q
                  const int ncom= 1000; // number of lines in q and of column in p
                  const int block_start = (blockIdx.x + gridDim.x * blockIdx.y) * blockDim.x; 
                  const int startp = (threadIdx.x + block_start) * ncom;
                  const int startr = (threadIdx.x + block_start) * nq;
                  
                  for (int linex = 0; linex < nq; linex++)
                      {
                      int idcell = linex + startr;
                      float sumcell = 0;
                      for (int idy = linex, idx = startp;
                           idx < startp + ncom;
                           idy += nq, idx++)
                             sumcell += p[idx] * q[idy];
                      r[idcell] = sumcell;
    

In [183]:
# One thread per row of the result: prodbyline has no bounds check, so any thread
# beyond the last row would read and write past the ends of the buffers.
matmulpq(p_gpu, q_gpu, res, block=(res.shape[0], 1, 1), grid=(1, 1))

In [184]:
# Fetch the GPU result with .get() for comparison against NumPy.
resmat = res.get()

In [187]:
# Element-wise gap between the GPU product and NumPy's product of the same factors.
diff = resmat - p_o @ q_o

In [224]:
# Some differences appear, probably because of precision problems.
# Show the entries whose absolute difference exceeds 1e-4.
diff[np.abs(diff) > 1e-4]

array([ 0.00011253,  0.00013733,  0.00018311,  0.00010681, -0.0001297 ,
        0.000103  , -0.00011444, -0.00010681, -0.00012207], dtype=float32)

In [193]:
# Top-left 10 x 10 corner of the difference matrix.
diff[:10, :10]

array([[  7.62939453e-06,  -2.86102295e-06,  -3.43322754e-05,
          1.14440918e-05,   5.72204590e-05,   5.72204590e-06,
         -1.90734863e-05,   1.90734863e-05,  -4.19616699e-05,
          3.81469727e-06],
       [ -2.00271606e-05,   7.62939453e-06,   1.90734863e-06,
         -4.76837158e-06,   3.81469727e-06,   3.62396240e-05,
          0.00000000e+00,   3.81469727e-06,   5.72204590e-06,
          3.43322754e-05],
       [  1.90734863e-05,   1.81198120e-05,   0.00000000e+00,
         -3.81469727e-06,   4.76837158e-06,   5.72204590e-06,
          9.53674316e-07,   1.90734863e-06,   0.00000000e+00,
         -1.76578760e-05],
       [  1.90734863e-06,   1.14440918e-05,  -1.14440918e-05,
         -9.53674316e-06,   1.90734863e-05,   2.47955322e-05,
          3.81469727e-05,  -2.09808350e-05,  -3.43322754e-05,
          1.90734863e-06],
       [ -6.19888306e-06,  -3.81469727e-06,   0.00000000e+00,
          1.23977661e-05,  -4.76837158e-07,   1.71661377e-05,
         -2.86102295e-05

In [119]:
# CUDA template for a matrix product r = p . q on row-major float arrays, one thread per row.
# Placeholders filled by `template % kwargs`: nq = number of columns of q,
# ncom = number of rows of q (= number of columns of p).
# Thread (threadIdx.x + block_start) handles one row of p: for each column of q it
# accumulates the dot product, walking q with a stride of nq, and stores it in r.
# NOTE(review): nothing compares the thread index with the number of rows of p, so the
# launch presumably has to provide exactly one thread per row -- confirm at the call sites.
matmul_template = """
                  __global__ void prodbyline (const float *p, const float *q, float *r)
                  {
                  const uint nq = %(nq)s; // number of columns in q
                  const uint ncom= %(ncom)s; // number of lines in q and of column in p
                  const uint block_start = (blockIdx.x + gridDim.x * blockIdx.y) * blockDim.x; 
                  const uint startp = (threadIdx.x + block_start) * ncom;
                  const uint startr = (threadIdx.x + block_start) * nq;
                                
                  for (int linex = 0; linex < nq; linex++)
                      {
                      int idcell = linex + startr;
                      float sumcell = 0;
                      for (int idy = linex, idx = startp;
                           idx < startp + ncom;
                           idy += nq, idx++)
                             sumcell += p[idx] * q[idy];
                      r[idcell] = sumcell;
                      }
                  }
                  """

# Compile the kernel for a q with 50 rows and 20 columns (nq=20, ncom=50).
matmul = create_from_template(matmul_template, "prodbyline", nq=20, ncom=50)

In [120]:
# 20 x 50 float32 test matrix holding -250.0, -249.5, ..., 249.5 in row-major order.
a = np.arange(-500, 500).astype(np.float32).reshape(20, 50) / 2
# Its transpose, copied so that it gets its own C-contiguous buffer.
b = np.ascontiguousarray(a.T)
a_gpu, b_gpu = map(gpuarray.to_gpu, (a, b))

# NumPy reference for the product computed on the GPU.
trueres = a @ b

In [121]:
# 20 x 20 result buffer; empty() leaves it uninitialised and the kernel fills it.
out_mat = gpuarray.empty((20, 20), dtype=np.float32)
# One thread per result row: prodbyline has no bounds check, so any thread beyond
# the last row would read and write past the ends of the buffers.
matmul(a_gpu, b_gpu, out_mat, block=(out_mat.shape[0], 1, 1), grid=(1, 1))
# Difference with the NumPy reference (all zeros when both agree).
trueres - out_mat.get()

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  

In [114]:
# CUDA template for a row-per-thread product where q is read row by row: entry
# (row, linex) of r is the dot product of row `row` of p with row `linex` of q
# (q is indexed from linex*ncom with a stride of 1).
# Placeholders filled by `template % kwargs`: nq = number of output columns,
# ncom = length of each dot product.
# NOTE(review): nothing compares the thread index with the number of rows of p, so the
# launch presumably has to provide exactly one thread per row -- confirm at the call sites.
matmul_t_template = """
                  #include <stdio.h>
                  
                  __global__ void prodbyline (const float *p, const float *q, float *r)
                  {
                  const uint nq = %(nq)s; // number of columns in q
                  const uint ncom= %(ncom)s; // number of lines in q and of column in p
                  const uint block_start = (blockIdx.x + gridDim.x * blockIdx.y) * blockDim.x; 
                  const uint startp = (threadIdx.x + block_start) * ncom;
                  const uint startr = (threadIdx.x + block_start) * nq;
                  
                  for (int linex = 0; linex < nq; linex++)
                      {
                      int idcell = linex + startr;
                      float sumcell = 0;
                      for (int idy = linex*ncom, idx = startp;
                           idx < startp + ncom;
                           idy++, idx++)
                             sumcell += p[idx] * q[idy];
                      r[idcell] = sumcell;
                      }
                  }
                  """

# Compile the kernel with nq=20 and ncom=50.
matmul_t = create_from_template(matmul_t_template, "prodbyline", nq=20, ncom=50)

In [115]:
# Zero-filled 20 x 20 result buffer.
out_matt = gpuarray.zeros((20, 20), dtype=np.float32)
# Two blocks of 10 threads cover the 20 result rows exactly. prodbyline has no bounds
# check, so any thread beyond the last row would read and write past the buffers.
matmul_t(a_gpu, a_gpu, out_matt, block=(10, 1, 1), grid=(1, 2))

In [116]:
%timeit matmul_t(a_gpu, a_gpu, out_matt, block=(64,1,1), grid=(1,1))

1.04 ms ± 15.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [117]:
%timeit matmul(a_gpu, b_gpu, out_mat, block=(64,1,1), grid=(1,1))

1.06 ms ± 47.4 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### Que faire quand + de lignes que de threads possible ?
- Modulo nb total de threads
- "maillage" en python

# Utilisation du grid

In [6]:
# Small experiment for locating threads: each thread prints its block and thread
# coordinates, computes idx = threadIdx.x + 5*threadIdx.y and stores it in a[0].
# Every thread writes the same element a[0], so which value remains depends on
# which write lands last.
gridmod = SourceModule("""
    #include <stdio.h>
    
    __global__ void reperage (float *a)
    {
    printf("__NEW__");
    const int idx = threadIdx.x + 5*threadIdx.y;
    a[0] = idx;
    printf("Block is (%d, %d), thread is (%d, %d):\\n", blockIdx.x, blockIdx.y, threadIdx.x, threadIdx.y);
    }""")

In [7]:
# Handle on the compiled `reperage` kernel.
tryprint = gridmod.get_function('reperage')

In [26]:
tryprint(a_gpu, block=(2,1,1), grid=(1,1))
# Wait for the kernel to finish. Device-side printf output is buffered and only
# flushed at synchronisation points, so without this the messages show up later,
# at the next blocking operation.
cuda.Context.synchronize()

In [27]:
# NOTE(review): the kernel's printf output presumably becomes visible here because
# the copy done by get() acts as a synchronisation point -- confirm against the CUDA docs.
a_gpu.get() # ??? Is the kernel only executed once get() is called?

array([[  1.,   2.,   3.,   4.,   5.],
       [  6.,   7.,   8.,   9.,  10.]], dtype=float32)

# Dimension d'un block
Je ne comprends pas exactement quel est l'intérêt de définir un block en plus d'une dimension. Est-ce que la vitesse est meilleure dans certains cas pour (32, 16, 2) que pour (1024, 1, 1) ?
À tester.