<a href="https://colab.research.google.com/github/KCREEK/Assembly/blob/main/gpu_info_cu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%writefile gpu_info.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/*
 * Returns the number of CUDA cores per streaming multiprocessor for the
 * given compute capability, or 0 when the capability is not in the table.
 *
 * The values follow the SM-version-to-cores table shipped with NVIDIA's CUDA
 * samples (helper_cuda.h, _ConvertSMVer2Cores). The runtime API does not
 * expose this number, so it has to be looked up by architecture.
 */
static int coresPerMultiprocessorFor(int major, int minor) {
    switch (major) {
        case 2:  // Fermi: 2.1 has 48 cores/SM, 2.0 has 32
            return (minor == 1) ? 48 : 32;
        case 3:  // Kepler
            return 192;
        case 5:  // Maxwell
            return 128;
        case 6:  // Pascal: only 6.0 (GP100) has 64; 6.1 and 6.2 have 128
            return (minor == 0) ? 64 : 128;
        case 7:  // Volta and Turing
            return 64;
        case 8:  // Ampere / Ada: only 8.0 (GA100) has 64; 8.6, 8.7, 8.9 have 128
            return (minor == 0) ? 64 : 128;
        case 9:   // Hopper
        case 10:  // Blackwell (data center)
        case 12:  // Blackwell (consumer)
            return 128;
        default:
            return 0;  // unknown architecture; caller reports it explicitly
    }
}

/*
 * Enumerates every CUDA device visible to the runtime and prints its basic
 * properties (name, compute capability, memory sizes, SM/core counts and
 * launch limits) to stdout.
 *
 * Returns EXIT_SUCCESS when all devices were queried successfully and
 * EXIT_FAILURE as soon as any CUDA runtime call reports an error.
 */
int main() {
    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

    if (error_id != cudaSuccess) {
        printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
        printf("Result = FAIL\n");
        return EXIT_FAILURE;
    }

    printf("Detected %d CUDA-capable GPU(s).\n", deviceCount);

    for (int dev = 0; dev < deviceCount; ++dev) {
        cudaDeviceProp deviceProp;
        error_id = cudaGetDeviceProperties(&deviceProp, dev);

        // Without this check a failed query would print an uninitialised struct.
        if (error_id != cudaSuccess) {
            printf("cudaGetDeviceProperties(device %d) returned %d\n-> %s\n",
                   dev, (int)error_id, cudaGetErrorString(error_id));
            printf("Result = FAIL\n");
            return EXIT_FAILURE;
        }

        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
        printf("  CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor);
        // Divide in double: a float cast would round the byte count before scaling.
        printf("  Total amount of global memory: %.2f GB\n",
               (double)deviceProp.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
        printf("  Multiprocessors: %d\n", deviceProp.multiProcessorCount);

        // Cores per multiprocessor depend on the GPU architecture.
        const int coresPerMultiprocessor =
            coresPerMultiprocessorFor(deviceProp.major, deviceProp.minor);

        if (coresPerMultiprocessor > 0) {
            const int totalCores = coresPerMultiprocessor * deviceProp.multiProcessorCount;
            printf("  Total number of cores: %d\n", totalCores);
        } else {
            // Do not report a misleading "0 cores" for architectures not in the table.
            printf("  Total number of cores: unknown (unrecognized compute capability %d.%d)\n",
                   deviceProp.major, deviceProp.minor);
        }

        // These fields are size_t, so %zu is the portable conversion (%lu is wrong on LLP64).
        printf("  Total amount of constant memory: %zu bytes\n", deviceProp.totalConstMem);
        printf("  Total amount of shared memory per block: %zu bytes\n", deviceProp.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("  Warp size: %d\n", deviceProp.warpSize);
        printf("  Maximum threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
        printf("  Maximum threads per block: %d\n", deviceProp.maxThreadsPerBlock);
        printf("  Max dimension size of a thread block (x, y, z): (%d, %d, %d)\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf("  Max dimension size of a grid size    (x, y, z): (%d, %d, %d)\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
    }

    return EXIT_SUCCESS;
}


Writing gpu_info.cu


In [3]:
!nvcc gpu_info.cu -o gpu_info


In [4]:
!./gpu_info


Detected 1 CUDA-capable GPU(s).

Device 0: "Tesla T4"
  CUDA Capability Major/Minor version number: 7.5
  Total amount of global memory: 14.75 GB
  Multiprocessors: 40
  Total number of cores: 2560
  Total amount of constant memory: 65536 bytes
  Total amount of shared memory per block: 49152 bytes
  Total number of registers available per block: 65536
  Warp size: 32
  Maximum threads per multiprocessor: 1024
  Maximum threads per block: 1024
  Max dimension size of a thread block (x, y, z): (1024, 1024, 64)
  Max dimension size of a grid size    (x, y, z): (2147483647, 65535, 65535)
