In [2]:
!pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [1]:
!ls /usr/local/ | grep cuda

cuda
cuda-12
cuda-12.5


In [3]:
!export PATH=/usr/local/cuda-12.5/bin:$PATH
!export LD_LIBRARY_PATH=/usr/local/cuda-12.5/lib64:$LD_LIBRARY_PATH

In [4]:
!echo $PATH
!echo $LD_LIBRARY_PATH

/opt/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/tools/node/bin:/tools/google-cloud-sdk/bin
/usr/lib64-nvidia


In [5]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpwpb0anub".


# **Install, set up, and test Cublas**

In [None]:
!ls /usr/local/cuda/lib64/libcublas*

/usr/local/cuda/lib64/libcublasLt.so	       /usr/local/cuda/lib64/libcublas.so
/usr/local/cuda/lib64/libcublasLt.so.12        /usr/local/cuda/lib64/libcublas.so.12
/usr/local/cuda/lib64/libcublasLt.so.12.4.5.8  /usr/local/cuda/lib64/libcublas.so.12.4.5.8
/usr/local/cuda/lib64/libcublasLt_static.a     /usr/local/cuda/lib64/libcublas_static.a


In [6]:
import os
import subprocess

%env LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

CUBLAS_COMPILER_ARGS = "-L/usr/local/cuda/lib64 -lcublas"
print(f"cuBLAS Compiler Args: {CUBLAS_COMPILER_ARGS}")

os.environ['PATH'] += ':/usr/local/cuda/bin'
os.environ['LD_LIBRARY_PATH'] += ':/usr/local/cuda/lib64'

env: LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
cuBLAS Compiler Args: -L/usr/local/cuda/lib64 -lcublas


For reference:

In [None]:
%%cuda --compiler-args "$CUBLAS_COMPILER_ARGS"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cuda_runtime.h>
#include "math.h"
#include "cublas_v2.h"

int main()
{
    const int streamsNum = 2;
    int N = 1 << 10; // Default 1024 x 1024 matrix

    // pinned host memory
    float *h_a, *h_b, *h_c, *h_x, *h_y, *h_z;
    cudaMallocHost((void**)&h_a, sizeof(float) * N * N);
    cudaMallocHost((void**)&h_b, sizeof(float) * N * N);
    cudaMallocHost((void**)&h_c, sizeof(float) * N * N);
    cudaMallocHost((void**)&h_x, sizeof(float) * N * N);
    cudaMallocHost((void**)&h_y, sizeof(float) * N * N);
    cudaMallocHost((void**)&h_z, sizeof(float) * N * N);

    // only set diagonal elements
    for (int i = 0; i < N; i++) {
        int idx = i * N + i; // Diagonal index
        h_a[idx] = sin(idx);
        h_b[idx] = sin(idx);
        h_c[idx] = cos(idx) * cos(idx);
        h_x[idx] = sin(idx);
        h_y[idx] = sin(idx);
        h_z[idx] = cos(idx) * cos(idx);
    }

    // allocate device memory
    float *d_a, *d_b, *d_c, *d_x, *d_y, *d_z;
    cudaMalloc((void**)&d_a, sizeof(float) * N * N);
    cudaMalloc((void**)&d_b, sizeof(float) * N * N);
    cudaMalloc((void**)&d_c, sizeof(float) * N * N);
    cudaMalloc((void**)&d_x, sizeof(float) * N * N);
    cudaMalloc((void**)&d_y, sizeof(float) * N * N);
    cudaMalloc((void**)&d_z, sizeof(float) * N * N);

    // create streams
    cudaStream_t streams[streamsNum];
    for (int i = 0; i < streamsNum; i++) {
        cudaStreamCreate(&streams[i]);
    }

    // create cuBLAS handles
    cublasHandle_t handle[streamsNum];
    for (int i = 0; i < streamsNum; i++) {
        cublasCreate(&handle[i]);
        cublasSetStream(handle[i], streams[i]); // Associate handle with stream
    }

    float alpha = 1.0f, beta = 1.0f;

    // async memory copies to device
    // cublasSetMatrixAsync(int rows, int cols, int elemSize, const void *A,
    //                      int lda, void *B, int ldb, cudaStream_t stream)

    cublasSetMatrixAsync(N, N, sizeof(float), h_a, N, d_a, N, streams[0]);
    cublasSetMatrixAsync(N, N, sizeof(float), h_b, N, d_b, N, streams[0]);
    cublasSetMatrixAsync(N, N, sizeof(float), h_c, N, d_c, N, streams[0]);
    cublasSetMatrixAsync(N, N, sizeof(float), h_x, N, d_x, N, streams[1]);
    cublasSetMatrixAsync(N, N, sizeof(float), h_y, N, d_y, N, streams[1]);
    cublasSetMatrixAsync(N, N, sizeof(float), h_z, N, d_z, N, streams[1]);

    // SGEMM: C = alpha * A * B + beta * C
    // cublasSgemm(cublasHandle_t handle,
    //                     cublasOperation_t transa, cublasOperation_t transb,
    //                     int m, int n, int k,
    //                     const float           *alpha,
    //                     const float           *A, int lda,
    //                     const float           *B, int ldb,
    //                     const float           *beta,
    //                     float           *C, int ldc)

    cublasSgemm(handle[0], CUBLAS_OP_N, CUBLAS_OP_N,
                N, N, N, &alpha, d_a, N, d_b, N, &beta, d_c, N);
    cublasSgemm(handle[1], CUBLAS_OP_N, CUBLAS_OP_N,
                N, N, N, &alpha, d_x, N, d_y, N, &beta, d_z, N);

    // Back to host
    cublasGetMatrixAsync(N, N, sizeof(float), d_c, N, h_c, N, streams[0]);
    cublasGetMatrixAsync(N, N, sizeof(float), d_z, N, h_z, N, streams[1]);

    // Wait for streams to complete
    for (int i = 0; i < streamsNum; i++) {
        cudaStreamSynchronize(streams[i]);
    }

    // Cleanup
    for (int i = 0; i < streamsNum; i++) {
        cublasDestroy(handle[i]);
        cudaStreamDestroy(streams[i]);
    }

    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    cudaFree(d_x); cudaFree(d_y); cudaFree(d_z);
    cudaFreeHost(h_a); cudaFreeHost(h_b); cudaFreeHost(h_c);
    cudaFreeHost(h_x); cudaFreeHost(h_y); cudaFreeHost(h_z);

    return 0;
}

In [20]:
%%cuda --compiler-args "$CUBLAS_COMPILER_ARGS"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cuda_runtime.h>
#include "math.h"
#include "cublas_v2.h"

int main()
{
    const int streamsNum = 2;
    int N = 1 << 10; // Default 1024 x 1024 matrix

    // pinned host memory
    float *h_a, *h_b, *h_c, *h_x, *h_y, *h_z;
    cudaMallocHost((void**)&h_a, sizeof(float) * N * N);
    cudaMallocHost((void**)&h_b, sizeof(float) * N * N);
    cudaMallocHost((void**)&h_c, sizeof(float) * N * N);
    cudaMallocHost((void**)&h_x, sizeof(float) * N * N);
    cudaMallocHost((void**)&h_y, sizeof(float) * N * N);
    cudaMallocHost((void**)&h_z, sizeof(float) * N * N);

    // only set diagonal elements
    for (int i = 0; i < N; i++) {
        int idx = i * N + i; // Diagonal index
        h_a[idx] = sin(idx);
        h_b[idx] = sin(idx);
        h_c[idx] = cos(idx) * cos(idx);
        h_x[idx] = sin(idx);
        h_y[idx] = sin(idx);
        h_z[idx] = cos(idx) * cos(idx);
    }

    // allocate device memory
    float *d_a, *d_b, *d_c, *d_x, *d_y, *d_z;
    cudaMalloc((void**)&d_a, sizeof(float) * N * N);
    cudaMalloc((void**)&d_b, sizeof(float) * N * N);
    cudaMalloc((void**)&d_c, sizeof(float) * N * N);
    cudaMalloc((void**)&d_x, sizeof(float) * N * N);
    cudaMalloc((void**)&d_y, sizeof(float) * N * N);
    cudaMalloc((void**)&d_z, sizeof(float) * N * N);

    // create streams
    cudaStream_t streams[streamsNum];
    for (int i = 0; i < streamsNum; i++) {
        cudaStreamCreate(&streams[i]);
    }

    // create cuBLAS handles
    cublasHandle_t handle[streamsNum];
    for (int i = 0; i < streamsNum; i++) {
        cublasCreate(&handle[i]);
        cublasSetStream(handle[i], streams[i]); // Associate handle with stream
    }

    float alpha = 1.0f, beta = 1.0f;

    // async memory copies to device
    // cublasSetMatrixAsync(int rows, int cols, int elemSize, const void *A,
    //                      int lda, void *B, int ldb, cudaStream_t stream)

    cublasSetMatrixAsync(N, N, sizeof(float), h_a, N, d_a, N, streams[0]);
    cublasSetMatrixAsync(N, N, sizeof(float), h_b, N, d_b, N, streams[0]);
    cublasSetMatrixAsync(N, N, sizeof(float), h_c, N, d_c, N, streams[0]);
    cublasSetMatrixAsync(N, N, sizeof(float), h_x, N, d_x, N, streams[1]);
    cublasSetMatrixAsync(N, N, sizeof(float), h_y, N, d_y, N, streams[1]);
    cublasSetMatrixAsync(N, N, sizeof(float), h_z, N, d_z, N, streams[1]);

    // SGEMM: C = alpha * A * B + beta * C
    // cublasSgemm(cublasHandle_t handle,
    //                     cublasOperation_t transa, cublasOperation_t transb,
    //                     int m, int n, int k,
    //                     const float           *alpha,
    //                     const float           *A, int lda,
    //                     const float           *B, int ldb,
    //                     const float           *beta,
    //                     float           *C, int ldc)

    cublasSgemm(handle[0], CUBLAS_OP_N, CUBLAS_OP_N,
                N, N, N, &alpha, d_a, N, d_b, N, &beta, d_c, N);
    cublasSgemm(handle[1], CUBLAS_OP_N, CUBLAS_OP_N,
                N, N, N, &alpha, d_x, N, d_y, N, &beta, d_z, N);

    // Back to host
    cublasGetMatrixAsync(N, N, sizeof(float), d_c, N, h_c, N, streams[0]);
    cublasGetMatrixAsync(N, N, sizeof(float), d_z, N, h_z, N, streams[1]);

    // Wait for streams to complete
    for (int i = 0; i < streamsNum; i++) {
        cudaStreamSynchronize(streams[i]);
    }

    // Cleanup
    for (int i = 0; i < streamsNum; i++) {
        cublasDestroy(handle[i]);
        cudaStreamDestroy(streams[i]);
    }

    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    cudaFree(d_x); cudaFree(d_y); cudaFree(d_z);
    cudaFreeHost(h_a); cudaFreeHost(h_b); cudaFreeHost(h_c);
    cudaFreeHost(h_x); cudaFreeHost(h_y); cudaFreeHost(h_z);

    return 0;
}


