# 02 : GPU Check

This is a simple test to see if the GPU is available and working correctly.

- https://stackoverflow.com/questions/76581229/is-it-possible-to-check-if-gpu-is-available-without-using-deep-learning-packages
- https://docs.mlrun.org/en/v1.7.2/runtimes/configuring-job-resources.html
- https://docs.k3s.io/advanced#nvidia-container-runtime

In [1]:
import mlrun

In [2]:
# Display the run DB client; its repr shows the MLRun API server URL
# this notebook is connected to.
mlrun.get_run_db()

HTTPRunDB('http://dragon.local:30070')

In [3]:
# Base name only; with user_project=True the loaded project's name carries
# a per-user suffix (see the printed full name below).
base_project_name = "mlrun-demo"

# Load the project if it already exists, otherwise create it in the current directory
project = mlrun.get_or_create_project(
    name=base_project_name,
    context="./",
    user_project=True,
)

# Keep the resolved (full) project name around and show it
project_name = project.metadata.name
print(f"Full project name: {project_name}")

> 2025-08-06 13:06:27,891 [info] Project loaded successfully: {"project_name":"mlrun-demo-johannes"}
Full project name: mlrun-demo-johannes


## Get GPU Function

In [4]:
%%writefile 02_get_gpu_info.py

import GPUtil
import subprocess

def get_gpu_info(context):
    """Collect basic GPU information and log it through the run context.

    Queries GPUtil for the visible GPUs, prints and logs a summary, and then
    makes a best-effort call to the ``nvidia-smi`` CLI to log its full report.

    Args:
        context: Run context object; only ``context.logger.info`` and
            ``context.logger.warning`` are used here.

    Returns:
        list[dict]: One dict per GPU returned by ``GPUtil.getGPUs()`` with the
        keys ``id``, ``name``, ``load``, ``memory_total``, ``memory_free`` and
        ``memory_used``. The list is empty when GPUtil reports no GPU.
    """
    gpu_info = [
        {
            'id': gpu.id,
            'name': gpu.name,
            'load': gpu.load,
            'memory_total': gpu.memoryTotal,
            'memory_free': gpu.memoryFree,
            'memory_used': gpu.memoryUsed,
        }
        for gpu in GPUtil.getGPUs()
    ]

    print(f"GPU Info v4: {gpu_info}")
    context.logger.info(f"GPU Info: {gpu_info}")

    if not gpu_info:
        # Without this, an empty result is indistinguishable from a passing
        # GPU check when skimming the run logs.
        context.logger.warning("No GPU detected by GPUtil; the job may be running without GPU access")

    # execute the nvidia-smi command on the cli to get detailed GPU info
    try:
        nvidia_smi_output = subprocess.check_output(['nvidia-smi'], universal_newlines=True)
        print("NVIDIA-SMI Output:")
        print(nvidia_smi_output)
        context.logger.info(f"NVIDIA-SMI Output:\n{nvidia_smi_output}")
    except Exception as e:
        # Deliberately best-effort: a missing or failing nvidia-smi must not
        # fail the run, so the error is reported and execution continues.
        error_msg = f"Error running nvidia-smi: {str(e)}"
        print(error_msg)
        context.logger.warning(error_msg)

    return gpu_info

Overwriting 02_get_gpu_info.py


## MLRun Function

In [5]:
# Base image for the job. The commented-out lines are alternative images
# kept for reference; only the uncommented assignment takes effect.
#image = "mlrun/mlrun-gpu:1.9.1-py39"
image =  "registry-service.mlrun.svc.cluster.local/mlrun/mlrun-gpu:1.9.1-py39"
#image =  "mlrun/mlrun-gpu:1.7.2"
#image =  "mlrun/mlrun:1.7.2"

# Register 02_get_gpu_info.py in the project as a job function named "gpu-check",
# with get_gpu_info as its handler and GPUtil pinned as an extra requirement.
fn_gpu_check = project.set_function(
    func="02_get_gpu_info.py",
    name="gpu-check",
    tag="latest",
    kind="job",
    image=image,
    handler="get_gpu_info",
    requirements=["GPUtil==1.4.0"])

# Then set the GPU resources on the function's spec:
# limits (upper bounds) of 2G memory, 2 CPUs and 1 GPU.
fn_gpu_check.with_limits(mem="2G", cpu=2, gpus=1)  # upper bound

# Build the function image; the function is looked up by its registered name.
# The call's return value is shown as the cell output.
project.build_function(function='gpu-check')

> 2025-08-06 13:06:28,035 [info] Started building image: .mlrun/func-mlrun-demo-johannes-gpu-check:latest


The `overwrite_build_params` parameter default will change from 'False' to 'True' in 1.10.0.


[36mINFO[0m[0000] Retrieving image manifest registry-service.mlrun.svc.cluster.local/mlrun/mlrun-gpu:1.9.1-py39 
[36mINFO[0m[0000] Retrieving image registry-service.mlrun.svc.cluster.local/mlrun/mlrun-gpu:1.9.1-py39 from registry registry-service.mlrun.svc.cluster.local 
[36mINFO[0m[0000] Built cross stage deps: map[]                
[36mINFO[0m[0000] Retrieving image manifest registry-service.mlrun.svc.cluster.local/mlrun/mlrun-gpu:1.9.1-py39 
[36mINFO[0m[0000] Returning cached image manifest              
[36mINFO[0m[0000] Executing 0 build triggers                   
[36mINFO[0m[0000] Building stage 'registry-service.mlrun.svc.cluster.local/mlrun/mlrun-gpu:1.9.1-py39' [idx: '0', base-idx: '-1'] 
[36mINFO[0m[0000] Unpacking rootfs as cmd RUN echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt requires it. 
[36mINFO[0m[0092] RUN echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt 
[36mINFO[0m[0092] Initializing snapshotte

BuildStatus(ready=True, outputs={'image': '.mlrun/func-mlrun-demo-johannes-gpu-check:latest'})

In [6]:
# Run the function as a remote job, not locally (local=False).
# auto_build=True is passed so that, presumably, MLRun builds the image
# first if the function still requires a build — confirm against the MLRun docs.
fn_gpu_check.run(
    local=False,
    handler="get_gpu_info",
    auto_build=True
)

> 2025-08-06 13:09:44,469 [info] Storing function: {"db":"http://dragon.local:30070","name":"gpu-check-get-gpu-info","uid":"07c8642390ed425583220b4b17cc210a"}
> 2025-08-06 13:09:44,932 [info] Job is running in the background, pod: gpu-check-get-gpu-info-vssz5
GPU Info v4: [{'id': 0, 'name': 'NVIDIA GeForce RTX 3090 Ti', 'load': 0.39, 'memory_total': 24564.0, 'memory_free': 24056.0, 'memory_used': 183.0}]
> 2025-08-06 11:13:45,641 [info] GPU Info: [{'id': 0, 'name': 'NVIDIA GeForce RTX 3090 Ti', 'load': 0.39, 'memory_total': 24564.0, 'memory_free': 24056.0, 'memory_used': 183.0}]
NVIDIA-SMI Output:
Wed Aug  6 11:13:45 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.163.01             Driver Version: 550.163.01     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. E

project,uid,iter,start,end,state,kind,name,labels,inputs,parameters,results
mlrun-demo-johannes,...cc210a,0,Aug 06 11:13:45,2025-08-06 11:13:45.767476+00:00,completed,run,gpu-check-get-gpu-info,v3io_user=johanneskind=jobowner=johannesmlrun/client_version=1.9.1mlrun/client_python_version=3.9.23host=gpu-check-get-gpu-info-vssz5,,,"return=[{'id': 0, 'name': 'NVIDIA GeForce RTX 3090 Ti', 'load': 0.39, 'memory_total': 24564.0, 'memory_free': 24056.0, 'memory_used': 183.0}]"





> 2025-08-06 13:13:57,405 [info] Run execution finished: {"name":"gpu-check-get-gpu-info","status":"completed"}


<mlrun.model.RunObject at 0x161225f70>

In [7]:
# Refresh the function from the saved .py file so edits to the handler are
# picked up (this is instant).
# NOTE(review): unlike the earlier registration, this call passes no image,
# handler, requirements or resource limits — confirm the refreshed function
# still runs on the built GPU image with the intended limits.
project.set_function(func="02_get_gpu_info.py", name="gpu-check")

# Run the function again by its registered name. You can modify the .py file
# and set the function again; per the author's note this does not rebuild
# the entire image.
project.run_function(
    function='gpu-check',
    handler='get_gpu_info'
)

> 2025-08-06 13:13:57,502 [info] Storing function: {"db":"http://dragon.local:30070","name":"gpu-check-get-gpu-info","uid":"5e7a3eae792f4232a65dde2d5e22a51f"}
> 2025-08-06 13:14:04,971 [info] Job is running in the background, pod: gpu-check-get-gpu-info-5g2jm
GPU Info v4: [{'id': 0, 'name': 'NVIDIA GeForce RTX 3090 Ti', 'load': 0.49, 'memory_total': 24564.0, 'memory_free': 24056.0, 'memory_used': 183.0}]
> 2025-08-06 11:14:59,202 [info] GPU Info: [{'id': 0, 'name': 'NVIDIA GeForce RTX 3090 Ti', 'load': 0.49, 'memory_total': 24564.0, 'memory_free': 24056.0, 'memory_used': 183.0}]
NVIDIA-SMI Output:
Wed Aug  6 11:14:59 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.163.01             Driver Version: 550.163.01     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. E

project,uid,iter,start,end,state,kind,name,labels,inputs,parameters,results
mlrun-demo-johannes,...22a51f,0,Aug 06 11:14:59,2025-08-06 11:14:59.326283+00:00,completed,run,gpu-check-get-gpu-info,v3io_user=johanneskind=jobowner=johannesmlrun/client_version=1.9.1mlrun/client_python_version=3.9.23host=gpu-check-get-gpu-info-5g2jm,,,"return=[{'id': 0, 'name': 'NVIDIA GeForce RTX 3090 Ti', 'load': 0.49, 'memory_total': 24564.0, 'memory_free': 24056.0, 'memory_used': 183.0}]"





> 2025-08-06 13:15:08,355 [info] Run execution finished: {"name":"gpu-check-get-gpu-info","status":"completed"}


<mlrun.model.RunObject at 0x16121bb20>