In [None]:
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpadvoh1rw".


In [None]:
%%writefile cublas_psacalare.cu
#include <cstdlib>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // ld = numero di righe

// Prints an M-by-N matrix to stdout, one matrix row per output line.
// `a` is read through IDX2C with leading dimension M, i.e. element (row, col)
// lives at a[col * M + row]. Every value is followed by a single space.
void print_matrix(int M, int N, float* a){
    int row = 0;
    while (row < M) {
        // Emit every column of the current row before breaking the line.
        for (int col = 0; col < N; ++col) {
            const float value = a[IDX2C(row, col, M)];
            printf("%f ", value);
        }
        printf("\n");
        ++row;
    }
}

int main(void){
    int M, N;
    float *h_a, *d_a;
    float *h_x, *d_x;
    float *h_y, *d_y;
    float alpha = 1.0, beta = 0.0f;
    float elapsed_time = 0;
    cudaError_t alloc_stat;
    cublasStatus_t cub_stat;
    cublasHandle_t handle;
    cudaEvent_t start_cublas, stop_cublas;

    M = 8000;
    N = 8000;

    // Allocazione della memoria sull'host
    h_a = (float *) malloc(sizeof(*h_a) * M * N); // M righe, N colonne
    if(!h_a){
        fprintf(stderr, "Host memory allocation failed for matrix a.");
        return EXIT_FAILURE;
    }

    h_x = (float *) malloc(sizeof(*h_x) * N); // N righe, 1 colonna
    if(!h_x){
        fprintf(stderr, "Host memory allocation failed for vector x.");
        return EXIT_FAILURE;
    }

    h_y = (float *) malloc(sizeof(*h_y) * M);
    if(!h_y){
        fprintf(stderr, "Host memory allocation failed for vector y.");
    }

    // Inizializzazione

    srand((unsigned int) time(0));

    // inizializzo il vettore x
    for (int i=0; i < N; i++) {
        // Inizializzato in row major
        h_x[i] = rand()%5-2;
    }

    // inizializzo la matrice a
    for (int i=0; i < M; i++) {
        for(int j = 0; j < N; j++)
        // Inizializzato in column major
            h_a[IDX2C(i, j, M)] = rand()%5-2;
    }

    for (int i = 0; i < M; i++) {
        h_y[i] = 0.0f;
    }

    if(M * N < 25) {
        printf("Matrix : \n");
        print_matrix(M, N, h_a);
        printf("Vector X: \n");
        print_matrix(N, 1, h_x);
    }

    // Allocazione della memoria sul device
    alloc_stat = cudaMalloc((void **)&d_a, M * N * sizeof(float));
    if (alloc_stat != cudaSuccess) {
        printf ("Device memory allocation failed for matrix a.");
        return EXIT_FAILURE;
    }
    alloc_stat = cudaMalloc((void **)&d_x, N * sizeof(float));
    if (alloc_stat != cudaSuccess) {
        printf ("Device memory allocation failed for vector x.");
        return EXIT_FAILURE;
    }
    alloc_stat = cudaMalloc((void **)&d_y, M * sizeof(float));
    if (alloc_stat != cudaSuccess) {
        printf ("Device memory allocation failed for vector y.");
        return EXIT_FAILURE;
    }

    // Creo l'handle per cublas
    cub_stat = cublasCreate(&handle);
    if (cub_stat != CUBLAS_STATUS_SUCCESS) {
        printf ("CUBLAS initialization failed\n");
        return EXIT_FAILURE;
    }

    cub_stat = cublasSetMatrix(M,N, sizeof(float),h_a,M,d_a,M);    // Setto h_a su d_a
    if (cub_stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data download failed matrix a");
        cudaFree (d_a);
        cudaFree (d_x);
        cudaFree (d_y);
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }

    cub_stat = cublasSetMatrix(N, 1, sizeof(float),h_x, N, d_x, N);
    if (cub_stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data download failed vector x");
        cudaFree (d_x);
        cudaFree (d_y);
        cudaFree (d_a);
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }

    cudaEventCreate(&start_cublas);
    cudaEventCreate(&stop_cublas);
    cudaEventRecord(start_cublas);
    cub_stat = cublasSgemv(handle, CUBLAS_OP_N, M, N, &alpha, d_a, M, d_x, 1, &beta, d_y, 1);        // Calcolo il prodotto
    if (cub_stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data download failed cublasgemv");
        cudaFree (d_x);
        cudaFree (d_a);
        cudaFree (d_y);
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }
    cudaEventRecord(stop_cublas);
    cudaEventSynchronize(stop_cublas);
    cudaEventElapsedTime(&elapsed_time, start_cublas, stop_cublas);
    cudaEventDestroy(start_cublas);
    cudaEventDestroy(stop_cublas);

    cub_stat = cublasGetMatrix(1, M, sizeof(float), d_y, 1, h_y, 1);
    if (cub_stat != CUBLAS_STATUS_SUCCESS){
        printf("data download failed vector y");
        cudaFree(d_y);
        cudaFree(d_a);
        cudaFree(d_x);
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }

    if(M < 10) {
        printf("Vettore risultante:\n");
        print_matrix(1, M, h_y);
    }
    printf("Matrice %d x %d.\n", M, N);
    printf("Tempo impiegato: %f\n", elapsed_time);

    cudaFree(d_y);
    cudaFree(d_a);
    cudaFree(d_x);
    cublasDestroy(handle);

    free(h_a);
    free(h_x);
    free(h_y);

}

Overwriting cublas_psacalare.cu


In [None]:
!nvcc -o ./cublas_psacalare cublas_psacalare.cu -lcublas
! ./cublas_psacalare

Matrice 8000 x 8000.
Tempo impiegato: 10.413152


In [None]:
%%writefile seriale.c
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#define ROWS 2000
#define COLS 2000

/*
 * Serial reference: multiplies a ROWS x COLS matrix (row-major, every entry
 * of row i equal to i) by a vector of ones and prints the wall-clock time of
 * the product loop in milliseconds, measured with gettimeofday().
 *
 * Returns 0 on success, EXIT_FAILURE if any buffer cannot be allocated.
 */
int main(int argc, char **argv) {
    double *matrix, *vector, *result;
    struct timeval start, end;
    long seconds, useconds;
    double elapsed_time;

    /* Command-line arguments are not used. */
    (void)argc;
    (void)argv;

    matrix = (double*)malloc((size_t)ROWS * COLS * sizeof(double));
    vector = (double*)malloc((size_t)COLS * sizeof(double));
    result = (double*)malloc((size_t)ROWS * sizeof(double));
    if (!matrix || !vector || !result) {
        fprintf(stderr, "Host memory allocation failed.\n");
        /* free(NULL) is a no-op, so release whatever was obtained. */
        free(matrix);
        free(vector);
        free(result);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < ROWS; i++) {
        for (int j = 0; j < COLS; j++) {
            matrix[i * COLS + j] = i;
        }
    }

    for (int j = 0; j < COLS; j++) {
        vector[j] = 1.0;
    }

    gettimeofday(&start, NULL);  /* start of the timed region */

    /* NOTE(review): result is never read after this loop; an optimizing
     * build might drop the computation - confirm the build flags used. */
    for (int i = 0; i < ROWS; i++) {
        result[i] = 0.0;
        for (int j = 0; j < COLS; j++) {
            result[i] += matrix[i * COLS + j] * vector[j];
        }
    }

    gettimeofday(&end, NULL);  /* end of the timed region */

    /* Elapsed time in milliseconds; useconds may be negative, which the
     * sum below accounts for. */
    seconds = end.tv_sec - start.tv_sec;
    useconds = end.tv_usec - start.tv_usec;
    elapsed_time = seconds * 1000 + useconds / 1000.0;

    printf("Tempo di esecuzione: %.2f ms\n", elapsed_time);

    free(matrix);
    free(vector);
    free(result);

    return 0;
}

Overwriting seriale.c


In [None]:
!gcc seriale.c -o seriale
! ./seriale

[01m[Kseriale.c:[m[K In function ‘[01m[Kmain[m[K’:
   28 |     [01;35m[Kgettimeofday[m[K(&start, NULL);  // Inizio del cronometro
      |     [01;35m[K^~~~~~~~~~~~[m[K
Tempo di esecuzione: 21.20 ms
