In [1]:
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpxx8asnay".


In [44]:
%%writefile cublas_psacalare.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <time.h>

/*
 * Computes the dot product of two host-generated float vectors on the GPU
 * with cuBLAS (cublasSdot) and prints the result together with the elapsed
 * time measured by a pair of CUDA events around the call.
 *
 * Every CUDA / cuBLAS return code is checked, and all exits go through a
 * single cleanup path so host buffers, device buffers, the cuBLAS handle and
 * the timing events are released on success and on failure alike.
 *
 * Returns EXIT_SUCCESS when the product was computed and printed,
 * EXIT_FAILURE otherwise (a diagnostic is written to stderr).
 */
int main (void){
    /* All locals are declared up front so that `goto cleanup` never jumps
       over an initialisation (which C++ rejects). */
    cudaError_t cudaStat;
    cublasStatus_t stat;
    cublasHandle_t handle;
    cudaEvent_t start_cublas, stop_cublas;
    int handle_created = 0;     // set once cublasCreate succeeded
    int start_created = 0;      // set once start_cublas exists
    int stop_created = 0;       // set once stop_cublas exists
    int device_count = 0;
    int device = 0;
    int M = 1600000;            // number of elements in each vector
    size_t bytes = (size_t)M * sizeof(float);
    float* h_a = NULL;          // host array a
    float* h_b = NULL;          // host array b
    float* d_a = NULL;          // device array a
    float* d_b = NULL;          // device array b
    float result = 0.0f;        // final dot product
    float elapsed_cublas = 0.0f;    // milliseconds, as reported by the events
    int exit_code = EXIT_FAILURE;

    /* Select the GPU. Device 1 is preferred (the original hard-coded choice),
       but only if it actually exists; otherwise device 0 is used explicitly
       instead of relying on a silently failed cudaSetDevice call. */
    cudaStat = cudaGetDeviceCount(&device_count);
    if (cudaStat != cudaSuccess || device_count < 1) {
        fprintf(stderr, "no usable CUDA device: %s\n", cudaGetErrorString(cudaStat));
        goto cleanup;
    }
    device = (device_count > 1) ? 1 : 0;
    cudaStat = cudaSetDevice(device);
    if (cudaStat != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", device, cudaGetErrorString(cudaStat));
        goto cleanup;
    }

    printf("Dimensione degli array: %d.\n", M);

    /* Host allocations. */
    h_a = (float *)malloc(bytes);
    h_b = (float *)malloc(bytes);
    if (!h_a || !h_b) {
        fprintf(stderr, "host memory allocation failed\n");
        goto cleanup;
    }

    /* Fill both vectors with small integers in [-2, 2]. */
    srand((unsigned int) time(0));
    for (int i = 0; i < M; i++) {
        h_a[i] = (float)(rand() % 5 - 2);
        h_b[i] = (float)(rand() % 5 - 2);
    }

    /* Device allocations. */
    cudaStat = cudaMalloc((void**)&d_a, bytes);
    if (cudaStat != cudaSuccess) {
        fprintf(stderr, "device memory allocation failed: %s\n", cudaGetErrorString(cudaStat));
        goto cleanup;
    }
    cudaStat = cudaMalloc((void**)&d_b, bytes);
    if (cudaStat != cudaSuccess) {
        fprintf(stderr, "device memory allocation failed: %s\n", cudaGetErrorString(cudaStat));
        goto cleanup;
    }

    /* cuBLAS handle. */
    stat = cublasCreate(&handle);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "CUBLAS initialization failed (status %d)\n", (int)stat);
        goto cleanup;
    }
    handle_created = 1;

    /* Copy both vectors host -> device. */
    stat = cublasSetVector(M, sizeof(float), h_a, 1, d_a, 1);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "upload of vector a failed (status %d)\n", (int)stat);
        goto cleanup;
    }
    stat = cublasSetVector(M, sizeof(float), h_b, 1, d_b, 1);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "upload of vector b failed (status %d)\n", (int)stat);
        goto cleanup;
    }

    /* Timing events bracketing the dot product. */
    cudaStat = cudaEventCreate(&start_cublas);
    if (cudaStat != cudaSuccess) {
        fprintf(stderr, "cudaEventCreate failed: %s\n", cudaGetErrorString(cudaStat));
        goto cleanup;
    }
    start_created = 1;
    cudaStat = cudaEventCreate(&stop_cublas);
    if (cudaStat != cudaSuccess) {
        fprintf(stderr, "cudaEventCreate failed: %s\n", cudaGetErrorString(cudaStat));
        goto cleanup;
    }
    stop_created = 1;

    cudaStat = cudaEventRecord(start_cublas);
    if (cudaStat != cudaSuccess) {
        fprintf(stderr, "cudaEventRecord failed: %s\n", cudaGetErrorString(cudaStat));
        goto cleanup;
    }

    /* The result pointer is a host variable; no pointer mode is set on the
       handle here, so the library default applies. */
    stat = cublasSdot(handle, M, d_a, 1, d_b, 1, &result);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cublasSdot failed (status %d)\n", (int)stat);
        goto cleanup;
    }

    cudaStat = cudaEventRecord(stop_cublas);
    if (cudaStat == cudaSuccess) {
        /* Wait for the stop event before reading the elapsed time. */
        cudaStat = cudaEventSynchronize(stop_cublas);
    }
    if (cudaStat == cudaSuccess) {
        cudaStat = cudaEventElapsedTime(&elapsed_cublas, start_cublas, stop_cublas);
    }
    if (cudaStat != cudaSuccess) {
        fprintf(stderr, "timing failed: %s\n", cudaGetErrorString(cudaStat));
        goto cleanup;
    }

    printf("Risultato del prodotto --> %f\n", result);
    /* cudaEventElapsedTime reports milliseconds; print seconds. */
    printf("Tempo impiegato: %f\n", elapsed_cublas / 1000);

    exit_code = EXIT_SUCCESS;

cleanup:
    /* Release everything that was acquired, in reverse order. cudaFree and
       free both accept NULL, so unallocated buffers need no special-casing. */
    if (stop_created) {
        cudaEventDestroy(stop_cublas);
    }
    if (start_created) {
        cudaEventDestroy(start_cublas);
    }
    if (handle_created) {
        cublasDestroy(handle);
    }
    cudaFree(d_b);
    cudaFree(d_a);
    free(h_b);
    free(h_a);
    return exit_code;
}


Overwriting cublas_psacalare.cu


In [43]:
!nvcc -o ./cublas_psacalare cublas_psacalare.cu -lcublas
! ./cublas_psacalare

Dimensione degli array: 1280000.
Risultato del prodotto --> 1170.000000
Tempo impiegato: 0.000761
