# Code

## CUDA Utilities

In [None]:
%%writefile cuda_stuff.cuh
#ifndef cuda_stuff_H
#define cuda_stuff_H

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

/**
 * CUDA runtime error checking, taken from
 * https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
 *
 * gpuErrchk(call) evaluates `call` once and passes the resulting
 * cudaError_t to gpuAssert together with the file and line of the call site.
 * The body is wrapped in do { ... } while (0) so that the macro followed by
 * a semicolon is exactly one statement; a bare { ... } block would break
 * `if (c) gpuErrchk(x); else ...`.
 */
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/**
 * Report a failed CUDA runtime call.
 *
 * @param code  status returned by the CUDA runtime call
 * @param file  source file of the call site (for the message)
 * @param line  source line of the call site (for the message)
 * @param abort when true (the default) the process exits with `code`
 *              as exit status after printing the message
 *
 * Does nothing when code == cudaSuccess.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code == cudaSuccess) return;

   fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
   if (abort) exit(code);
}

void device_synchronize();

#endif


Writing cuda_stuff.cuh


In [None]:
%%writefile cuda_stuff.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

#include "cuda_stuff.cuh"

/* Wait for the device to finish and route the returned status through
   gpuErrchk, which reports any error and terminates the process. */
void device_synchronize(){
    const cudaError_t sync_status = cudaDeviceSynchronize();
    gpuErrchk(sync_status);
}

Writing cuda_stuff.cu


## Matrix Tools

In [None]:
%%writefile fmatrix.cuh
#ifndef fmatrices_H
#define fmatrices_H
#include <stddef.h> 

/* Descriptor of a dense single-precision matrix.
   Elements are addressed through IDX2C/getfm, i.e. in column-major order
   with `rows` as the leading dimension. The descriptor itself does not
   record whether `data` was allocated on the host or on the device. */
typedef struct {
    float* data;  /* element storage, rows*cols floats */
    size_t cols;  /* number of columns */
    size_t rows;  /* number of rows */
} fmatrix;

/* Transform a matrix index (row i, column j) into an offset in the
   underlying vector.
   Storage is column major: consecutive elements of one column are
   adjacent in memory.
   nb_rows = number of rows (the leading dimension). */
#define IDX2C(i,j,nb_rows) (((j)*(nb_rows))+(i))

/* Access element (i,j) of matrix mat (usable as an lvalue).
   `mat` is parenthesized so that any expression of type fmatrix,
   e.g. a dereferenced pointer, can be passed.
   Note that `mat` is evaluated twice. */
#define getfm(mat,i,j) ((mat).data[IDX2C(i,j,(mat).rows)])


/** Number of elements (rows * cols) of mat. */
size_t fmatrix_elements(fmatrix mat);
/** Size in bytes of the element storage of mat. */
size_t fmatrix_size(fmatrix mat);
/** Set every element of mat to f; mat.data is written from host code. */
void fmatrix_init(fmatrix mat, float f);
/** Assert that the matrix is coherent: all fields nonzero.
 *  The parameter list matches the definition in fmatrix.cu; the previous
 *  parameterless prototype declared a different, never-defined overload. */
void fmatrix_assert(fmatrix mat);

fmatrix fmatrix_create_on_host(size_t rows, size_t cols);
fmatrix fmatrix_create_on_device(size_t rows, size_t cols);

void fmatrix_data_to_host(fmatrix mat_host, fmatrix mat_device);
void fmatrix_data_to_device(fmatrix mat_host, fmatrix mat_device);

void fmatrix_free_on_host(fmatrix* mat);
void fmatrix_free_on_device(fmatrix* mat);

/** Print the first nb rows of the matrix mat
 *  on the host. 
 *  If nb<0, print all rows. 
 */
void fmatrix_host_print(fmatrix mat, int nb=-1);

/** Print the first nb rows of the matrix mat
 *  on the device. 
 *  If nb<0, print all rows. 
 */
void fmatrix_device_print(fmatrix mat, int nb=-1);

#endif


Writing fmatrix.cuh


In [None]:
%%writefile fmatrix.cu
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

#include "cuda_stuff.cuh"
#include "fmatrix.cuh"

/* Total number of elements described by mat (rows times columns). */
size_t fmatrix_elements(fmatrix mat) {
     const size_t element_count = mat.rows * mat.cols;
     return element_count;
}

/* Number of bytes occupied by the element storage of mat. */
size_t fmatrix_size(fmatrix mat) {
     const size_t bytes_per_element = sizeof(*mat.data);
     return bytes_per_element * fmatrix_elements(mat);
}

/* Set every element of mat to f.
   mat.data is written directly, so it must be addressable from host code.
   The counters are size_t like the dimensions they are compared with
   (no signed/unsigned mixing, no int overflow for large matrices), and the
   row index runs in the inner loop so that memory is written contiguously
   in the column-major layout. */
void fmatrix_init(fmatrix mat, float f) {
    for (size_t j = 0; j < mat.cols; j++){
        for (size_t i = 0; i < mat.rows; i++){
            mat.data[IDX2C(i,j,mat.rows)] = f;
        }
    }
}

/* Sanity check of a matrix descriptor: the data pointer must be non-null
   and both dimensions must be nonzero. Every check is an assert(), so a
   violation aborts the program and the checks vanish when NDEBUG is
   defined. */
void fmatrix_assert(fmatrix mat) {
    assert(mat.data);
    assert(mat.cols);
    assert(mat.rows);
}



/* Allocate an uninitialized rows x cols matrix with malloc().
   Both dimensions must be positive (asserted).
   An allocation failure is reported on stderr and terminates the process;
   this check is explicit rather than an assert so that it is still
   performed when NDEBUG is defined.
   Release the result with fmatrix_free_on_host(). */
fmatrix fmatrix_create_on_host(size_t rows, size_t cols) {
    assert(cols>0);
    assert(rows>0);
    fmatrix mat;
    mat.cols = cols;
    mat.rows = rows;
    mat.data = (float*)malloc(fmatrix_size(mat));
    if (mat.data == NULL) {
        fprintf(stderr,
                "fmatrix_create_on_host: cannot allocate a %lu x %lu matrix\n",
                (unsigned long)rows, (unsigned long)cols);
        exit(EXIT_FAILURE);
    }
    return mat;
}

/* Allocate an uninitialized rows x cols matrix with cudaMalloc().
   Both dimensions must be positive (asserted); a failed allocation is
   handled by gpuErrchk. */
fmatrix fmatrix_create_on_device(size_t rows, size_t cols) {
    assert(cols>0);
    assert(rows>0);

    fmatrix mat;
    mat.rows = rows;
    mat.cols = cols;

    const size_t byte_count = fmatrix_size(mat);
    gpuErrchk(cudaMalloc((void **)&mat.data, byte_count));
    return mat;
}

/* Copy the elements of mat_host into mat_device (host -> device).
   Both descriptors must be coherent and have identical dimensions
   (asserted); a failed copy is handled by gpuErrchk. */
void fmatrix_data_to_device(fmatrix mat_host, fmatrix mat_device) {
    fmatrix_assert(mat_host);
    fmatrix_assert(mat_device);
    assert(mat_host.cols==mat_device.cols);
    assert(mat_host.rows==mat_device.rows);

    const size_t byte_count = fmatrix_size(mat_host);
    const cudaError_t copy_status =
        cudaMemcpy(mat_device.data, mat_host.data, byte_count,
                   cudaMemcpyHostToDevice);
    gpuErrchk(copy_status);
}

/* Copy the elements of mat_device into mat_host (device -> host).
   Both descriptors must be coherent and have identical dimensions
   (asserted); a failed copy is handled by gpuErrchk. */
void fmatrix_data_to_host(fmatrix mat_host, fmatrix mat_device) {
    fmatrix_assert(mat_host);
    fmatrix_assert(mat_device);
    assert(mat_host.cols==mat_device.cols);
    assert(mat_host.rows==mat_device.rows);

    const size_t byte_count = fmatrix_size(mat_device);
    const cudaError_t copy_status =
        cudaMemcpy(mat_host.data, mat_device.data, byte_count,
                   cudaMemcpyDeviceToHost);
    gpuErrchk(copy_status);
}

/* Release the storage of a matrix with free() and reset the descriptor
   (null pointer, zero dimensions). The descriptor must be coherent on
   entry (checked by fmatrix_assert). */
void fmatrix_free_on_host(fmatrix* mat) {
    fmatrix_assert(*mat);
    free(mat->data);
    mat->data = NULL;
    mat->rows = 0;
    mat->cols = 0;
}

/* Release the storage of a matrix with cudaFree() and reset the descriptor
   (null pointer, zero dimensions). The descriptor must be coherent on
   entry (checked by fmatrix_assert); a failed free is handled by
   gpuErrchk. */
void fmatrix_free_on_device(fmatrix* mat) {
    fmatrix_assert(*mat);
    const cudaError_t free_status = cudaFree(mat->data);
    gpuErrchk(free_status);
    mat->data = NULL;
    mat->rows = 0;
    mat->cols = 0;
}

/* Print the first nb rows of mat to stdout; mat.data is read from host
   code. If nb is negative or larger than the number of rows, all rows are
   printed.
   Format: "[", one line per row with elements separated by ",\t" and rows
   separated by ";", a "..." line when rows were left out, then "]".
   The row limit and the counters are size_t so that they are compared with
   the size_t dimensions without signed/unsigned mixing and without
   narrowing mat.rows to int. */
void fmatrix_host_print(fmatrix mat, int nb){
    size_t row_limit = mat.rows;
    if (nb >= 0 && (size_t)nb < mat.rows) {
        row_limit = (size_t)nb;
    }

    printf("[\n");
    for (size_t i = 0; i < row_limit; i++){
      for (size_t j = 0; j < mat.cols; j++){
        printf("%f", getfm(mat,i,j));
        if (j+1 < mat.cols) {
          printf(",\t");
        }
      }
      if (i+1 < row_limit) {
        printf(";\n");
      }
    }
    if (row_limit < mat.rows) {
      printf("\n...\n");
    }
    printf("\n]\n");
}

/* Print the first nb rows of a device matrix: the data is copied into a
   temporary host matrix, printed with fmatrix_host_print, and the
   temporary is released again. */
void fmatrix_device_print(fmatrix mat, int nb){
   fmatrix host_copy = fmatrix_create_on_host(mat.rows, mat.cols);

   fmatrix_data_to_host(host_copy, mat);
   fmatrix_host_print(host_copy, nb);

   fmatrix_free_on_host(&host_copy);
}



Writing fmatrix.cu


## Matrix Math

In [None]:
%%writefile sgemm.cuh
#ifndef sgemm_H
#define sgemm_H

#include <string>
#include "fmatrix.cuh"

void mat_mul(fmatrix A, fmatrix B, fmatrix C, std::string arg);

#endif

Writing sgemm.cuh


In [None]:
%%writefile sgemm.cu
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"

#include "cuda_stuff.cuh"
#include "sgemm.cuh"
#include "fmatrix.cuh"

/* Threads per block; equals TILE_WIDTH * TILE_WIDTH for the values below. */
#define THREADS_PER_BLOCK 1024
/* Edge length of the square thread blocks and of the shared-memory tiles
   used by the kernels in this file. */
#define TILE_WIDTH 32

using namespace std;

/* File-scope cuBLAS handle (internal linkage). */
static cublasHandle_t handle;

/* Flag, initially 0.
   NOTE(review): presumably records whether `handle` has been created;
   confirm against matrixMultiplication_cublas. */
static int cublas_init = 0;

/* Reference implementation on the host: C = alpha*A*B + beta*C.
   Plain triple loop, used as the baseline for the speedup measurements.
   A, B and C must be host matrices; C(i,j) is read before it is
   overwritten. */
void matrixMultiplication_basic_host(float alpha, fmatrix A, fmatrix B, float beta, fmatrix C) {
  for (int row = 0; row < A.rows; row++) {
    for (int col = 0; col < B.cols; col++) {
      /* dot product of row `row` of A with column `col` of B, scaled by alpha */
      float acc = 0;
      for (int k = 0; k < A.cols; k++) {
        acc += alpha * getfm(A, row, k) * getfm(B, k, col);
      }
      getfm(C, row, col) = beta * getfm(C, row, col) + acc;
    }
  }
}
               
/* TODO : 3 different versions of matrix multiplication C = alpha*A*B + beta*C on device */
/* Naive kernel for C = alpha*A*B + beta*C, one thread per element of C.
 *
 * Launch layout: 2D grid of 2D blocks; the y direction covers the rows of C
 * (nb_LigneA) and the x direction covers its columns (nb_ColB). Threads
 * that fall outside C do nothing, so the grid may overshoot.
 *
 * All matrices are column major. A is nb_LigneA x nb_ColA, B is
 * nb_LigneB x nb_ColB (nb_LigneB is expected to equal nb_ColA) and C is
 * nb_LigneA x nb_ColB, so the leading dimension of C is nb_LigneA.
 */
__global__
void matmul_basic_kernel(float alpha, float *A, float *B, float beta, float *C, int nb_ColA, int nb_ColB, int nb_LigneA, int nb_LigneB) {
  const int i = blockIdx.y * blockDim.y + threadIdx.y;  /* row of C */
  const int j = blockIdx.x * blockDim.x + threadIdx.x;  /* column of C */

  if (i < nb_LigneA && j < nb_ColB){
    float tmp = 0;
    for( int k=0; k < nb_ColA; k++ )
    {
      const float A_i_k = A[IDX2C(i,k,nb_LigneA)];
      const float B_k_j = B[IDX2C(k,j,nb_LigneB)];
      tmp += alpha * A_i_k * B_k_j;
    }

    /* C has as many rows as A: index it with nb_LigneA. Using nb_ColA
       here is only correct when A is square. */
    const int c_idx = IDX2C(i,j,nb_LigneA);
    C[c_idx] = beta * C[c_idx] + tmp;
  }
}


/* Launch matmul_basic_kernel for C = alpha*A*B + beta*C on device matrices.
   The grid covers the result matrix: x spans the columns of B (= columns
   of C), y spans the rows of A (= rows of C). Ceiling division makes sure
   a partial tile at the border is still covered; the kernel ignores
   threads outside C. The launch is asynchronous; only launch-configuration
   errors are checked here. */
void matrixMultiplication_basic(float alpha, fmatrix d_A, fmatrix d_B, float beta, fmatrix d_C) {
  const dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
  const dim3 dimGrid((unsigned int)((d_B.cols + TILE_WIDTH - 1) / TILE_WIDTH),
                     (unsigned int)((d_A.rows + TILE_WIDTH - 1) / TILE_WIDTH));

  matmul_basic_kernel <<< dimGrid, dimBlock >>> (alpha, d_A.data, d_B.data, beta, d_C.data,
                                                 (int)d_A.cols, (int)d_B.cols,
                                                 (int)d_A.rows, (int)d_B.rows);
  /* A kernel launch returns no status; fetch launch errors explicitly. */
  gpuErrchk(cudaGetLastError());
}

/**********************/
/* Shared-memory tiled kernel for C = alpha*A*B + beta*C.
 *
 * Launch layout: blockDim must be (TILE_WIDTH, TILE_WIDTH); the grid's
 * y direction covers the rows of C (nb_LigneA) and its x direction the
 * columns of C (nb_ColB). The grid may overshoot the matrix.
 * Shared memory: two static TILE_WIDTH x TILE_WIDTH float tiles.
 *
 * All matrices are column major. A is nb_LigneA x nb_ColA, B is
 * nb_LigneB x nb_ColB (nb_LigneB is expected to equal nb_ColA) and C is
 * nb_LigneA x nb_ColB.
 *
 * Every thread of a block takes part in loading the tiles and reaches every
 * __syncthreads(), including threads that lie outside C: out-of-range tile
 * entries are filled with 0 and only the final store is guarded. This keeps
 * the barriers out of divergent control flow and makes sizes that are not
 * multiples of TILE_WIDTH work.
 */
__global__
void matmul_tiled_kernel(float alpha, float *A, float *B, float beta, float *C, int nb_ColA, int nb_ColB, int nb_LigneA, int nb_LigneB){
  const int i = blockIdx.y * blockDim.y + threadIdx.y;  /* row of C */
  const int j = blockIdx.x * blockDim.x + threadIdx.x;  /* column of C */

  __shared__ float shared_A[TILE_WIDTH][TILE_WIDTH];
  __shared__ float shared_B[TILE_WIDTH][TILE_WIDTH];

  /* ceil-div: a trailing partial tile along the inner dimension counts too */
  const int nb_tiles = (nb_ColA + TILE_WIDTH - 1) / TILE_WIDTH;

  float tmp = 0.0f;
  for( int l=0; l < nb_tiles; l++ )
  {
    const int a_col = l*TILE_WIDTH + threadIdx.x;  /* column of A loaded by this thread */
    const int b_row = l*TILE_WIDTH + threadIdx.y;  /* row of B loaded by this thread */

    /* column-major loads: element (r,c) lives at IDX2C(r,c,rows) */
    shared_A[threadIdx.y][threadIdx.x] =
        (i < nb_LigneA && a_col < nb_ColA) ? A[IDX2C(i,a_col,nb_LigneA)] : 0.0f;
    shared_B[threadIdx.y][threadIdx.x] =
        (b_row < nb_LigneB && j < nb_ColB) ? B[IDX2C(b_row,j,nb_LigneB)] : 0.0f;
    __syncthreads();  /* both tiles completely written before anyone reads */

    for( int k=0; k<TILE_WIDTH; k++){
      tmp += alpha * shared_A[threadIdx.y][k] * shared_B[k][threadIdx.x];
    }
    __syncthreads();  /* everyone done reading before the tiles are overwritten */
  }

  if (i < nb_LigneA && j < nb_ColB){
    /* C has as many rows as A, so its leading dimension is nb_LigneA */
    const int c_idx = IDX2C(i,j,nb_LigneA);
    C[c_idx] = beta * C[c_idx] + tmp;
  }
}



/* Launch matmul_tiled_kernel for C = alpha*A*B + beta*C on device matrices.
   Blocks are TILE_WIDTH x TILE_WIDTH as the kernel requires. The grid
   covers the result matrix (x: columns of B, y: rows of A) using ceiling
   division so that partial border tiles are included. The launch is
   asynchronous; only launch-configuration errors are checked here. */
void matrixMultiplication_tiled(float alpha, fmatrix d_A, fmatrix d_B, float beta, fmatrix d_C){
  const dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
  const dim3 dimGrid((unsigned int)((d_B.cols + TILE_WIDTH - 1) / TILE_WIDTH),
                     (unsigned int)((d_A.rows + TILE_WIDTH - 1) / TILE_WIDTH));

  matmul_tiled_kernel <<< dimGrid, dimBlock >>> (alpha, d_A.data, d_B.data, beta, d_C.data,
                                                 (int)d_A.cols, (int)d_B.cols,
                                                 (int)d_A.rows, (int)d_B.rows);
  /* A kernel launch returns no status; fetch launch errors explicitly. */
  gpuErrchk(cudaGetLastError());
}

/**********************/
/* C = alpha*A*B + beta*C on device matrices using cublasSgemm.
   The matrices are column major with their row count as leading dimension,
   so no transposition is requested.

   The file-scope cuBLAS handle is created on first use (tracked by
   cublas_init) and then reused by later calls instead of being created and
   destroyed on every call; it is never destroyed explicitly and lives
   until the process ends. Every cuBLAS status is checked; a failure is
   reported on stderr and terminates the process. */
void matrixMultiplication_cublas(float alpha, fmatrix d_A, fmatrix d_B, float beta, fmatrix d_C){
  if (!cublas_init) {
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
      fprintf(stderr, "matrixMultiplication_cublas: cublasCreate failed\n");
      exit(EXIT_FAILURE);
    }
    cublas_init = 1;
  }

  /* m, n: dimensions of C; k: inner dimension (columns of A) */
  const cublasStatus_t status =
      cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                  (int)d_C.rows, (int)d_C.cols, (int)d_A.cols,
                  &alpha,
                  d_A.data, (int)d_A.rows,
                  d_B.data, (int)d_B.rows,
                  &beta,
                  d_C.data, (int)d_C.rows);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "matrixMultiplication_cublas: cublasSgemm failed with status %d\n", (int)status);
    exit(EXIT_FAILURE);
  }
}



/* Main SGEMM entry point: C = alpha*A*B + beta*C, implementation selected
   by arg:
     "cpu"        - host reference loop (A, B, C are host matrices)
     "gpu_basic"  - naive kernel        (A, B, C are device matrices)
     "gpu_tiled"  - shared-memory tiled kernel
     "gpu_cublas" - cuBLAS
   For the device variants the function waits for the device to finish
   before returning. An unknown arg is reported on stderr and terminates
   the process with a failure status. */
void gen_mat_mul(float alpha, fmatrix A, fmatrix B, float beta, fmatrix C, std::string arg){
    if (arg == "cpu"){
        matrixMultiplication_basic_host(alpha, A, B, beta, C);
        return;
    }

    if (arg == "gpu_basic"){
        matrixMultiplication_basic(alpha, A, B, beta, C);
    } else if (arg == "gpu_tiled"){
        matrixMultiplication_tiled(alpha, A, B, beta, C);
    } else if (arg == "gpu_cublas"){
        matrixMultiplication_cublas(alpha, A, B, beta, C);
    } else {
        /* error path: diagnostics belong on stderr and the exit status
           must signal the failure */
        fprintf(stderr, "Matrix Multiplication argument is Wrong");
        exit(EXIT_FAILURE);
    }

    // wait for everything to finish
    device_synchronize();
}

/* Plain product C = A*B: gen_mat_mul with alpha = 1 and beta = 0, using
   the implementation selected by arg. */
void mat_mul(fmatrix A, fmatrix B, fmatrix C, std::string arg){
  const float alpha = 1.0f;
  const float beta = 0.0f;
  gen_mat_mul(alpha, A, B, beta, C, arg);
}


Overwriting sgemm.cu


# Main

In [None]:
%%writefile main.cu

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "fmatrix.cuh"
#include "sgemm.cuh"

#define TILE_WIDTH 32
//#define SIZE 40

/* Largest absolute difference between any element of the host matrix C
   and the value `expected`. */
static float max_abs_error(fmatrix C, float expected) {
  float maxError = 0.0f;
  for (size_t j = 0; j < C.cols; j++){
    for (size_t i = 0; i < C.rows; i++){
      maxError = fmaxf(maxError, fabsf(getfm(C,i,j) - expected));
    }
  }
  return maxError;
}

/* Run mat_mul(A, B, C, variant) once and return the processor time it
   took, in milliseconds, as measured with clock(). */
static float timed_mat_mul(fmatrix A, fmatrix B, fmatrix C, const char* variant) {
  const clock_t start = clock();
  mat_mul(A, B, C, variant);
  const clock_t end = clock();
  return ((double) (end - start)) * 1000 / CLOCKS_PER_SEC;
}

/* Benchmark one device variant: optional untimed warm-up runs, reset of C
   on host and device, one timed run, download of the result and report of
   the time and of the maximum error against `expected`.
   `label` is the name printed in the report line. */
static void benchmark_on_device(const char* variant, const char* label, int warmup_runs,
                                fmatrix d_A, fmatrix d_B, fmatrix d_C,
                                fmatrix C, float expected) {
  for (int warmup = 0; warmup < warmup_runs; warmup++){
    mat_mul(d_A, d_B, d_C, variant);
  }

  /* start from a zeroed result on both sides */
  fmatrix_init(C, 0.0f);
  fmatrix_data_to_device(C, d_C);

  const float elapsed_ms = timed_mat_mul(d_A, d_B, d_C, variant);
  printf("GPU %s matrix multiplication in milliseconcs : %.2f\n", label, elapsed_ms);

  /* Retrieve the result and check it */
  fmatrix_data_to_host(C, d_C);
  printf("Max error: %f\n", max_abs_error(C, expected));
}

/* Benchmark driver: for square matrices of edge TILE_WIDTH*iter
   (iter = 10, 30, 50), multiply A (all 1) by B (all 2) with the "cpu",
   "gpu_basic", "gpu_tiled" and "gpu_cublas" variants, printing the time
   and the maximum error of each. Every element of the product is expected
   to equal 2 * edge. */
int main(void){
  printf("TILE_WIDTH: %.2d\n", TILE_WIDTH);
  for(int iter = 10; iter <= 60; iter +=20){
    const size_t n = (size_t)TILE_WIDTH * iter;          /* matrix edge */
    const float expected = (float)(2 * TILE_WIDTH * iter);

    printf("\n");
    printf("***Size***: %.2d\n", iter);

    /* Allocate and initialize data on host */
    fmatrix A = fmatrix_create_on_host(n, n);
    fmatrix_init(A, 1.0f);
    fmatrix B = fmatrix_create_on_host(n, n);
    fmatrix_init(B, 2.0f);
    fmatrix C = fmatrix_create_on_host(n, n);
    fmatrix_init(C, 0.0f);

    /* Allocate data on device */
    fmatrix d_A = fmatrix_create_on_device(n, n);
    fmatrix d_B = fmatrix_create_on_device(n, n);
    fmatrix d_C = fmatrix_create_on_device(n, n);

    /* Transfer A and B on device (C is uploaded before each device run) */
    fmatrix_data_to_device(A, d_A);
    fmatrix_data_to_device(B, d_B);

    /************** "cpu" *******************/
    const float cpu_time_used = timed_mat_mul(A, B, C, "cpu");
    printf("Time taken by CPU in milliseconds: %.2f\n", cpu_time_used);
    printf("Max error: %f\n", max_abs_error(C, expected));

    /************** "gpu_basic" *******************/
    benchmark_on_device("gpu_basic", "basic", 0, d_A, d_B, d_C, C, expected);

    /************** "gpu_tiled" *******************/
    benchmark_on_device("gpu_tiled", "tiled", 0, d_A, d_B, d_C, C, expected);

    /************** "gpu_cublas" (with 5 warm-up runs) *******************/
    benchmark_on_device("gpu_cublas", "cuBLAS", 5, d_A, d_B, d_C, C, expected);

    /* Free */
    fmatrix_free_on_host(&A);
    fmatrix_free_on_host(&B);
    fmatrix_free_on_host(&C);
    fmatrix_free_on_device(&d_A);
    fmatrix_free_on_device(&d_B);
    fmatrix_free_on_device(&d_C);
  }
  return 0;
}

Overwriting main.cu


# Compiling

In [None]:
!nvcc  -arch=sm_37 -gencode=arch=compute_37,code=sm_37 -Wno-deprecated-gpu-targets -lcublas sgemm.cu  fmatrix.cu  cuda_stuff.cu main.cu





# Experiments

In [None]:
! ./a.out

TILE_WIDTH: 32

***Size***: 10
Time taken by CPU in milliseconds: 139.84
Max error: 0.000000
GPU basic matrix multiplication in milliseconcs : 3.06
Max error: 0.000000
GPU tiled matrix multiplication in milliseconcs : 0.41
Max error: 0.000000
GPU cuBLAS matrix multiplication in milliseconcs : 0.29
Max error: 0.000000

***Size***: 30
Time taken by CPU in milliseconds: 3704.80
Max error: 0.000000
GPU basic matrix multiplication in milliseconcs : 78.49
Max error: 0.000000
GPU tiled matrix multiplication in milliseconcs : 7.90
Max error: 0.000000
GPU cuBLAS matrix multiplication in milliseconcs : 1.16
Max error: 0.000000

***Size***: 50
Time taken by CPU in milliseconds: 23337.30
Max error: 0.000000
GPU basic matrix multiplication in milliseconcs : 267.22
Max error: 0.000000
GPU tiled matrix multiplication in milliseconcs : 19.00
Max error: 0.000000
GPU cuBLAS matrix multiplication in milliseconcs : 1.74
Max error: 0.000000


# Results



> TILE_WIDTH: 32

***Size***: 10 \
Time taken by CPU in milliseconds: 136.99 \
GPU basic matrix multiplication in milliseconcs : 3.59 \
GPU tiled matrix multiplication in milliseconcs : 0.41 \
GPU cuBLAS matrix multiplication in milliseconcs : 0.31 \

***Size***: 30 \
Time taken by CPU in milliseconds: 3709.97 \
GPU basic matrix multiplication in milliseconcs : 78.50 \
GPU tiled matrix multiplication in milliseconcs : 7.90 \
GPU cuBLAS matrix multiplication in milliseconcs : 1.17 \

***Size***: 50 \
Time taken by CPU in milliseconds: 25691.51 \
GPU basic matrix multiplication in milliseconcs : 273.54 \
GPU tiled matrix multiplication in milliseconcs : 13.68 \
GPU cuBLAS matrix multiplication in milliseconcs : 1.69 \


> TILE_WIDTH: 16

***Size***: 10 \
Time taken by CPU in milliseconds: 13.46 \
GPU basic matrix multiplication in milliseconcs : 0.23 \
GPU tiled matrix multiplication in milliseconcs : 0.07 \
GPU cuBLAS matrix multiplication in milliseconcs : 0.33 \

***Size***: 30 \
Time taken by CPU in milliseconds: 456.00 \
GPU basic matrix multiplication in milliseconcs : 5.14 \
GPU tiled matrix multiplication in milliseconcs : 1.00 \
GPU cuBLAS matrix multiplication in milliseconcs : 0.40 \

***Size***: 50 \
Time taken by CPU in milliseconds: 2153.65 \
GPU basic matrix multiplication in milliseconcs : 23.59 \
GPU tiled matrix multiplication in milliseconcs : 4.09 \
GPU cuBLAS matrix multiplication in milliseconcs : 0.97 \



> TILE_WIDTH: 08

***Size***: 10 \
Time taken by CPU in milliseconds: 1.74 \
GPU basic matrix multiplication in milliseconcs : 0.05 \
GPU tiled matrix multiplication in milliseconcs : 0.03 \
GPU cuBLAS matrix multiplication in milliseconcs : 0.34 \

***Size***: 30 \
Time taken by CPU in milliseconds: 46.19 \
GPU basic matrix multiplication in milliseconcs : 0.30 \
GPU tiled matrix multiplication in milliseconcs : 0.22 \
GPU cuBLAS matrix multiplication in milliseconcs : 0.26 \

***Size***: 50 \
Time taken by CPU in milliseconds: 249.98 \
GPU basic matrix multiplication in milliseconcs : 1.27 \
GPU tiled matrix multiplication in milliseconcs : 0.91 \
GPU cuBLAS matrix multiplication in milliseconcs : 0.39 \



# Interpretation

Après implémentation et plusieurs exécutions, une tendance générale se dessine.

Avec TILE_WIDTH 32, cuBLAS est le plus rapide, suivi de GPU tiled, de GPU basic puis du CPU. L'explication est la suivante : GPU basic est plus rapide que le CPU car l'architecture du GPU se prête mieux au calcul parallèle qu'exige la multiplication matricielle. GPU tiled est plus rapide que GPU basic car il effectue moins d'accès à la mémoire globale du GPU. cuBLAS est le plus rapide de tous car il provient d'une bibliothèque bien optimisée pour l'architecture du GPU. 
Ce constat reste globalement le même pour les différentes valeurs de SIZE testées avec TILE_WIDTH 8 et 16, sauf pour les plus petites matrices (par exemple SIZE 10), où cuBLAS est plus lent que GPU tiled, voire que GPU basic : son coût fixe domine alors probablement le temps mesuré.

À TILE_WIDTH fixé, le temps de calcul augmente avec SIZE : plus SIZE est élevé, plus il y a d'opérations à effectuer et plus le calcul prend du temps.


# Debugging
Compile with debugging info on the host (`-g`) and device (`-G`).


# New Section

In [None]:
!nvcc -g -G -I /usr/local/cuda/samples/common/inc/ -L/usr/local/cuda/include -lcublas -lcusolver linear_classification.cu sgemm.cu read_csv.cu preprocess_data.cu xavier_weight.cu fmatrix.cu proba_calcul.cu cuda_stuff.cu

[01m[Kgcc:[m[K [01;31m[Kerror: [m[Klinear_classification.cu: No such file or directory
[01m[Kgcc:[m[K [01;31m[Kfatal error: [m[Kno input files
compilation terminated.


Run the debugger cuda-gdb, stopping at the first error that is detected. It first shows the call stack on the GPU and the values of the local variables, then the call stack on the host (thread 1).

In [None]:
! printf "set cuda memcheck on\nset cuda api_failures stop\ncatch throw\nr UNIT\nbt\ninfo locals\nthread 1\nbt\n" > tmp.txt
! cat tmp.txt
! cuda-gdb -batch -x tmp.txt ./a.out

set cuda memcheck on
set cuda api_failures stop
catch throw
r UNIT
bt
info locals
thread 1
bt
Catchpoint 1 (throw)
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
[Detaching after fork from child process 32257]
[New Thread 0x7f8722156000 (LWP 32265)]
[New Thread 0x7f8721955000 (LWP 32266)]
Time taken by CPU in milliseconds: 15664.87
Max error: 0.000000
GPU basic matrix multiplication in milliseconcs : 7685.00
Max error: 0.000000
GPU tiled matrix multiplication in milliseconcs : 2841.51
Max error: 0.000000
GPU cuBLAS matrix multiplication in milliseconcs : 137.46
Max error: 0.000000
[Thread 0x7f8722156000 (LWP 32265) exited]
[Thread 0x7f872998f000 (LWP 32252) exited]
[Inferior 1 (process 32252) exited normally]
tmp.txt:5: Error in sourced command file:
No stack.


In [None]:
!cuda-memcheck ./a.out "UNIT"

Time taken by CPU in milliseconds: 15570.91
Max error: 0.000000
GPU basic matrix multiplication in milliseconcs : 7655.59
Max error: 0.000000
GPU tiled matrix multiplication in milliseconcs : 6.47
Max error: 2560.000000
