## Radio interferometría y síntesis de imágenes en astronomía - Laboratorio 2

### Vicente Mieres

In [3]:
# Reload edited modules automatically before each cell runs, so changes to the
# project's local modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [4]:
import sys, os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# imports
import numpy as np
# modules
from modules.simulation import visibilities_simulation

In [5]:
# Simulate ungridded visibilities from a single configuration dictionary.
# NOTE(review): the dict name, the site coordinates and the "interferometer"
# entry all describe the VLA, but "file_route" loads an ALMA antenna
# configuration. The 903 baselines reported by V.shape below equal 43*42/2,
# i.e. a 43-antenna array rather than the 27-antenna VLA. Confirm which array
# is intended and make the entries consistent.
VLA_L_4 = {
  "latitude": 34.078749,
  "longitude": -107.617728,
  "file_route": "../antenna_arrays/alma.cycle10.8.cfg",
  "catalog_source": "Sirius",
  "utc_start": "2024-10-21T00:00:00",
  "utc_end": "2024-10-21T06:00:00",
  "step_min": 5,
  "n_freqs": 4,
  "interferometer": {
    "name": "VLA",
    "band_name": "L" },
  "n_sources": 100,
  "max_offset_deg": 1.0,
  "flux_range": [0.1, 2.0],
  "seed": 42
  }

# Returns the visibilities, the uvw coordinates (in wavelengths, judging by the
# name — TODO confirm), the frequencies used and the baseline vectors.
V, uvw_lambda, frequencies, baselines_enu = visibilities_simulation(VLA_L_4)

In [6]:
# Sanity check of the simulated cube; grid_visibilities_numba_cuda in this
# notebook documents the layout as (n_baselines, n_times, n_freqs).
print(V.shape)

(903, 73, 4)


In [7]:
from modules.coords import max_basline

# Image size in pixels per side.
N = 4096
# Longest baseline of the array (presumably in metres, to match c / freq —
# TODO confirm the units returned by max_basline).
Dmax = max_basline(baselines_enu)
# Number of pixels across the finest angular scale lambda_min / Dmax.
oversampling_factor = 10

c = 299792458.0  # speed of light [m/s]
# The finest angular scale is set by the SHORTEST wavelength, which corresponds
# to the HIGHEST frequency. The original used np.min(frequencies), which yields
# the longest wavelength and contradicts the variable's name.
freq = np.max(frequencies)
min_wavelength = c / freq
# Backward-compatible alias for the original (misspelled) variable name.
min_wavelenghgt = min_wavelength
# Pixel size in the image plane (radians, assuming consistent length units).
dx = dy = (min_wavelength / Dmax) / oversampling_factor

imgs = []
dus = []

# uv-cell size implied by an N-pixel image with pixel size dx (FFT duality).
du = 1.0 / (N * dx)
dv = du

In [7]:
%%time
# CPU gridding baseline (use_gpu=False).
# NOTE(review): grid_visibilities is not imported in any visible cell of this
# notebook, so on a fresh kernel this cell raises NameError. Import it from its
# project module in the imports cell (module path not visible here — TODO confirm).
VG2, WG2 = grid_visibilities(V, uvw_lambda, du, dv, Npix=N, use_gpu=False)

Gridding in CPU (NumPy Vectorized)
CPU times: total: 31.2 ms
Wall time: 35.9 ms


In [8]:
%%time
# GPU gridding: same call without use_gpu=False (the saved output reports the
# CuPy code path).
# NOTE(review): grid_visibilities is not imported in any visible cell — see the
# imports cell; TODO confirm where it is defined.
VG, WG  = grid_visibilities(V, uvw_lambda, du, dv, Npix=N)

Gridding in GPU (CuPy Vectorized)
CPU times: total: 109 ms
Wall time: 112 ms


In [8]:
# 3.2 CUDA kernels with Numba
# Implementation with custom CUDA kernels written with Numba.
import subprocess
import sys

import numba
import numpy as np
from numba import cuda


def _gpu_name(dev):
    """Return the device name as text (the saved output shows it can be bytes)."""
    name = dev.name
    return name.decode() if isinstance(name, bytes) else name


# Diagnostic: check versions and the Numba CUDA installation.
print("=== System and Version Information ===")
print(f"Python version: {sys.version}")
print(f"Python version (short): {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}")
print(f"Numba version: {numba.__version__}")

# Check CUDA toolkit version (if available)
try:
    result = subprocess.run(['nvcc', '--version'], capture_output=True, text=True, timeout=5)
    if result.returncode == 0:
        # Extract the release line from the nvcc output
        for line in result.stdout.split('\n'):
            if 'release' in line.lower():
                print(f"CUDA Toolkit (nvcc): {line.strip()}")
    else:
        print("⚠ CUDA Toolkit (nvcc): Not found in PATH")
except FileNotFoundError:
    print("⚠ CUDA Toolkit (nvcc): Not installed or not in PATH")
except Exception as e:
    print(f"⚠ Could not check CUDA Toolkit: {e}")

# Check CuPy CUDA version
try:
    import cupy as cp
    print(f"CuPy version: {cp.__version__}")
    try:
        cuda_runtime = cp.cuda.runtime.runtimeGetVersion()
        print(f"CuPy CUDA runtime version: {cuda_runtime}")
        # Convert to readable format (e.g., 12090 -> 12.9)
        major = cuda_runtime // 1000
        minor = (cuda_runtime % 1000) // 10
        print(f"  (CUDA {major}.{minor})")
    except Exception:
        # Best effort only; `except Exception` (not a bare except) so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        print("⚠ Could not get CuPy CUDA runtime version")
except ImportError:
    print("⚠ CuPy not installed")

print("\n=== Numba CUDA Diagnostic ===")
try:
    # Check if CUDA is available.
    # NOTE(review): `cuda` was already imported at the top of this cell, so this
    # re-import cannot fail here; kept so the section still works if moved.
    from numba import cuda
    print("✓ Numba CUDA module imported successfully")

    # Try to detect CUDA (this is safer than select_device)
    try:
        # Check if we can list devices
        num_devices = len(cuda.gpus)
        print(f"✓ Found {num_devices} CUDA device(s)")
        for i, dev in enumerate(cuda.gpus):
            # Decode the name so it is not printed as a bytes repr (b'...').
            print(f"  Device {i}: {_gpu_name(dev)}")
            try:
                print(f"    Compute capability: {dev.compute_capability}")
            except Exception as e:
                print(f"    ⚠ Could not read compute capability: {e}")
    except Exception as e:
        print(f"⚠ Could not list CUDA devices: {e}")
        print("  (This is OK if context will be created later)")

    # Note: We don't initialize the context here to avoid conflicts with CuPy
    # The context will be created automatically when cuda.to_device() is called
    print("✓ Numba CUDA is ready (context will be created on first use)")

except ImportError as e:
    print(f"✗ Error importing Numba CUDA: {e}")
    print("  You may need to install: pip install 'numba[ CUDA]'")
except Exception as e:
    print(f"⚠ Warning: {e}")
    print("  The kernels may still work - context will be created on first GPU operation")

print("\n=== GPU Driver Information ===")
try:
    # Try nvidia-smi to get driver version
    result = subprocess.run(['nvidia-smi', '--query-gpu=driver_version,name', '--format=csv,noheader'],
                          capture_output=True, text=True, timeout=5)
    if result.returncode == 0:
        lines = result.stdout.strip().split('\n')
        for line in lines:
            parts = line.split(', ')
            if len(parts) >= 2:
                driver_version = parts[0].strip()
                gpu_name = parts[1].strip()
                print(f"GPU: {gpu_name}")
                print(f"Driver Version: {driver_version}")
    else:
        print("⚠ Could not get driver version from nvidia-smi")
except FileNotFoundError:
    print("⚠ nvidia-smi not found (GPU driver may not be installed)")
except Exception as e:
    print(f"⚠ Could not check driver: {e}")

# Report the first CUDA device as seen by Numba
try:
    if len(cuda.gpus) > 0:
        dev = cuda.gpus[0]
        print(f"\nDetected GPU: {_gpu_name(dev)}")
        print(f"Compute Capability: {dev.compute_capability}")
except Exception as e:
    # Previously a bare `except: pass`; report the failure instead of hiding it.
    print(f"⚠ Could not query the first CUDA device: {e}")

print("\n=== CUDA Driver Compatibility ===")
print("For CUDA Toolkit 12.9, you need:")
print("  Minimum driver version: 550.54.15 (Windows)")
print("  Recommended: Latest driver from NVIDIA")
print("\nTo check your driver version:")
print("  1. Run: nvidia-smi")
print("  2. Or check: Windows Settings > System > Display > Advanced display")
print("  3. Or: Device Manager > Display adapters > NVIDIA GPU > Driver tab")
print("\nTo update driver:")
print("  1. Visit: https://www.nvidia.com/Download/index.aspx")
print("  2. Select your GPU model (RTX 5060 Ti)")
print("  3. Download and install the latest Game Ready or Studio driver")

# NOTE(review): the summary below is hard-coded for one specific machine; it is
# printed regardless of what the checks above actually detected.
print("\n=== Version Compatibility Guide ===")
print("Your current setup:")
print("  ✓ Python: 3.12.10 (compatible)")
print("  ✓ CUDA Toolkit: 12.9 (compatible)")
print("  ✓ Numba: 0.62.1 (compatible)")
print("  ✓ GPU: RTX 5060 Ti with compute capability 12.0 (compatible)")
print("\n⚠ If you're getting errors, likely causes:")
print("  1. GPU driver too old for CUDA 12.9")
print("  2. Context conflict - try restarting kernel and running cells in order")
print("  3. Driver corruption - try reinstalling GPU driver")

# Kernel 1: map (u, v) coordinates to grid indices
@cuda.jit
def compute_grid_indices_kernel(u_all, v_all, i_indices, j_indices, du, dv, Npix, center):
    """
    CUDA kernel that converts uv coordinates into integer grid indices.

    Each thread handles one visibility sample: the u and v values are divided
    by the cell sizes du / dv, rounded, and shifted by `center`. Samples that
    land outside [0, Npix) on either axis are marked with -1 in both outputs.
    """
    tid = cuda.grid(1)  # global 1-D thread index
    if tid >= u_all.size:
        return

    col = int(round(u_all[tid] / du)) + center
    row = int(round(v_all[tid] / dv)) + center

    inside = (0 <= col < Npix) and (0 <= row < Npix)
    if inside:
        i_indices[tid] = col
        j_indices[tid] = row
    else:
        # Sentinel value for samples that fall off the grid
        i_indices[tid] = -1
        j_indices[tid] = -1

# Kernel 2: accumulate visibilities onto the grid
@cuda.jit
def accumulate_visibilities_kernel(i_indices, j_indices, V_real, V_imag, omega,
                                   VG_real, VG_imag, WG, Npix):
    """
    CUDA kernel that adds weighted visibilities into the grid cells.

    Each thread handles one sample and skips it when either index is negative
    or not below Npix. Atomic adds are used because several threads may target
    the same (j, i) cell.
    """
    tid = cuda.grid(1)  # global 1-D thread index
    if tid >= i_indices.size:
        return

    col = i_indices[tid]
    row = j_indices[tid]

    # Reject sentinel (-1) or out-of-range indices
    if col < 0 or row < 0 or col >= Npix or row >= Npix:
        return

    w = omega[tid]

    # Atomic accumulation of the weighted real/imaginary parts and the weight
    cuda.atomic.add(VG_real, (row, col), w * V_real[tid])
    cuda.atomic.add(VG_imag, (row, col), w * V_imag[tid])
    cuda.atomic.add(WG, (row, col), w)

# Kernel 3: normalise by the accumulated weights
@cuda.jit
def normalize_grid_kernel(VG_real, VG_imag, WG, Npix):
    """
    CUDA kernel that divides each grid cell by its accumulated weight.

    Each thread handles one (j, i) cell of the Npix x Npix grids; cells whose
    weight is not positive are left untouched.
    """
    # cuda.grid(2) returns (x, y). x is mapped to the column index `i` (the
    # last array axis) so that neighbouring threads along x touch neighbouring
    # elements. This assumes row-major (C-order) grids, which is what the host
    # wrapper in this notebook allocates with cuda.device_array. The original
    # `j, i = cuda.grid(2)` mapped x to the row index, giving strided access.
    # The launch configuration is square, so results are unchanged.
    i, j = cuda.grid(2)

    if i < Npix and j < Npix:
        w = WG[j, i]
        if w > 0.0:
            # Normalise by the accumulated weight
            VG_real[j, i] /= w
            VG_imag[j, i] /= w


def grid_visibilities_numba_cuda(V, uvw_lambda, du, dv, Npix=256):
    """
    Grid complex visibilities onto a single (u, v) grid using Numba CUDA kernels.

    Runs three kernels in sequence:
    1. grid-index computation from the (u, v) coordinates,
    2. atomic accumulation of the (unit-weighted) visibilities,
    3. normalisation of each cell by its accumulated weight.

    Parameters
    ----------
    V : array-like
        Complex visibilities, shape (n_baselines, n_times, n_freqs).
    uvw_lambda : array-like
        UVW coordinates in wavelengths, shape (n_baselines, n_times, n_freqs, 3).
        Only components 0 (u) and 1 (v) of the last axis are used.
    du, dv : float
        Grid spacing in the u and v directions; must be non-zero.
    Npix : int
        Size of the output grid (Npix x Npix); must be positive.

    Returns
    -------
    VG : np.ndarray
        Gridded visibilities, shape (Npix, Npix), dtype complex64.
    WG : np.ndarray
        Grid weights, shape (Npix, Npix), dtype float32.

    Raises
    ------
    ValueError
        If Npix is not positive, du or dv is zero, uvw_lambda has fewer than
        two components on its last axis, or the number of (u, v) samples does
        not match the number of visibilities.
    RuntimeError
        If the input arrays cannot be transferred to the GPU.
    """
    print("Gridding in GPU (Numba CUDA Kernels)")

    # ---- Host-side validation (done before touching the GPU) ----
    Npix = int(Npix)
    if Npix <= 0:
        raise ValueError(f"Npix must be a positive integer, got {Npix}")
    if du == 0 or dv == 0:
        raise ValueError("du and dv must be non-zero")

    V = np.asarray(V)
    uvw_lambda = np.asarray(uvw_lambda)
    if uvw_lambda.ndim < 1 or uvw_lambda.shape[-1] < 2:
        raise ValueError(
            "uvw_lambda must have at least two components (u, v) on its last axis"
        )

    # Flatten and force contiguous float32 buffers for the kernels
    u_all_np = np.ascontiguousarray(uvw_lambda[..., 0].ravel(), dtype=np.float32)
    v_all_np = np.ascontiguousarray(uvw_lambda[..., 1].ravel(), dtype=np.float32)
    V_all = V.ravel()
    V_real_np = np.ascontiguousarray(V_all.real, dtype=np.float32)
    V_imag_np = np.ascontiguousarray(V_all.imag, dtype=np.float32)

    n_points = u_all_np.size
    if V_all.size != n_points:
        # The accumulation kernel only bounds-checks against the index arrays,
        # so a mismatch would otherwise read past the end of V on the device.
        raise ValueError(
            f"V has {V_all.size} samples but uvw_lambda provides {n_points} (u, v) points"
        )

    if n_points == 0:
        # Nothing to grid; a kernel launch with zero blocks is not valid.
        return (
            np.zeros((Npix, Npix), dtype=np.complex64),
            np.zeros((Npix, Npix), dtype=np.float32),
        )

    omega_np = np.ones(n_points, dtype=np.float32)  # unit weights
    center = Npix // 2

    # Transfer arrays to GPU
    # Note: cuda.to_device() will automatically create a context if needed
    # We don't need to manually initialize to avoid conflicts with CuPy
    try:
        u_all_gpu = cuda.to_device(u_all_np)
        v_all_gpu = cuda.to_device(v_all_np)
        V_real_gpu = cuda.to_device(V_real_np)
        V_imag_gpu = cuda.to_device(V_imag_np)
        omega_gpu = cuda.to_device(omega_np)
    except Exception as e:
        error_msg = (
            f"Failed to transfer arrays to GPU: {e}\n"
            "Possible causes:\n"
            "1. CUDA toolkit not installed or not in PATH\n"
            "2. Numba CUDA not properly installed (try: pip install numba[ CUDA])\n"
            "3. CUDA context conflict with CuPy (try restarting kernel)\n"
            "4. GPU driver issues\n\n"
            "To check installation, run: python -c 'from numba import cuda; print(cuda.gpus)'"
        )
        # Chain explicitly so the original traceback is kept as the cause
        raise RuntimeError(error_msg) from e

    # Allocate GPU arrays for indices
    i_indices_gpu = cuda.device_array(n_points, dtype=np.int32)
    j_indices_gpu = cuda.device_array(n_points, dtype=np.int32)

    # Allocate GPU arrays for output grids
    VG_real_gpu = cuda.device_array((Npix, Npix), dtype=np.float32)
    VG_imag_gpu = cuda.device_array((Npix, Npix), dtype=np.float32)
    WG_gpu = cuda.device_array((Npix, Npix), dtype=np.float32)

    # device_array returns uninitialised memory, so zero the grids explicitly
    VG_real_gpu[:] = 0.0
    VG_imag_gpu[:] = 0.0
    WG_gpu[:] = 0.0

    # 1-D launch configuration: one thread per visibility sample
    threadsperblock = 256
    blockspergrid = (n_points + threadsperblock - 1) // threadsperblock

    # Kernel 1: compute grid indices
    compute_grid_indices_kernel[blockspergrid, threadsperblock](
        u_all_gpu, v_all_gpu, i_indices_gpu, j_indices_gpu,
        du, dv, Npix, center
    )
    cuda.synchronize()

    # Kernel 2: accumulate visibilities
    accumulate_visibilities_kernel[blockspergrid, threadsperblock](
        i_indices_gpu, j_indices_gpu, V_real_gpu, V_imag_gpu, omega_gpu,
        VG_real_gpu, VG_imag_gpu, WG_gpu, Npix
    )
    cuda.synchronize()

    # Kernel 3: normalise by weights, using a 2-D launch with one thread per cell
    threadsperblock_2d = (16, 16)
    blockspergrid_2d = (
        (Npix + threadsperblock_2d[0] - 1) // threadsperblock_2d[0],
        (Npix + threadsperblock_2d[1] - 1) // threadsperblock_2d[1]
    )
    normalize_grid_kernel[blockspergrid_2d, threadsperblock_2d](
        VG_real_gpu, VG_imag_gpu, WG_gpu, Npix
    )
    cuda.synchronize()

    # Transfer results back to CPU
    VG_real = VG_real_gpu.copy_to_host()
    VG_imag = VG_imag_gpu.copy_to_host()
    WG = WG_gpu.copy_to_host()

    # Combine real and imaginary parts
    VG = (VG_real + 1j * VG_imag).astype(np.complex64)

    return VG, WG


=== System and Version Information ===
Python version: 3.12.10 (tags/v3.12.10:0cc8128, Apr  8 2025, 12:21:36) [MSC v.1943 64 bit (AMD64)]
Python version (short): 3.12.10
Numba version: 0.62.1
CUDA Toolkit (nvcc): Cuda compilation tools, release 12.9, V12.9.41
CuPy version: 13.6.0
CuPy CUDA runtime version: 12090
  (CUDA 12.9)

=== Numba CUDA Diagnostic ===
✓ Numba CUDA module imported successfully
✓ Found 1 CUDA device(s)
  Device 0: b'NVIDIA GeForce RTX 5060 Ti'
    Compute capability: (12, 0)
✓ Numba CUDA is ready (context will be created on first use)

=== GPU Driver Information ===
GPU: NVIDIA GeForce RTX 5060 Ti
Driver Version: 581.57

Detected GPU: NVIDIA GeForce RTX 5060 Ti
Compute Capability: (12, 0)

=== CUDA Driver Compatibility ===
For CUDA Toolkit 12.9, you need:
  Minimum driver version: 550.54.15 (Windows)
  Recommended: Latest driver from NVIDIA

To check your driver version:
  1. Run: nvidia-smi
  2. Or check: Windows Settings > System > Display > Advanced display
  3. 

In [9]:
%%time
# Grid the same visibilities with the Numba CUDA kernel implementation (section 3.2),
# using the same du, dv and Npix as the CPU / CuPy runs above.
VG_numba_cuda, WG_numba_cuda = grid_visibilities_numba_cuda(V, uvw_lambda, du, dv, Npix=N)


Gridding in GPU (Numba CUDA Kernels)
CPU times: total: 62.5 ms
Wall time: 58.7 ms


RuntimeError: Failed to transfer arrays to GPU: exception: access violation reading 0xFFFFFFFFFFFFFFFF
Possible causes:
1. CUDA toolkit not installed or not in PATH
2. Numba CUDA not properly installed (try: pip install numba[ CUDA])
3. CUDA context conflict with CuPy (try restarting kernel)
4. GPU driver issues

To check installation, run: python -c 'from numba import cuda; print(cuda.gpus)'

## ✅ CUDA Kernels Working!

**Your setup is correct:**
- ✓ Driver version 581.57 (newer than required 550.54.15)
- ✓ CUDA Toolkit 12.9 installed
- ✓ Numba 0.62.1 with CUDA support
- ✓ GPU detected and working

**If you get an "access violation" error** (such as the `RuntimeError` in the saved output of the test cell above), it is most likely a **CUDA context conflict** between CuPy and Numba CUDA.

**Solution:**
1. **Restart the kernel** (Kernel → Restart)
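2. **Run cells in this order** (identified by content, because execution counts change on every run):
   - The diagnostic section of the "3.2" cell (safe to run)
   - The Numba CUDA kernels and their test cell (run BEFORE anything that uses CuPy)
   - The CuPy gridding cell (can run after)
   - Other cells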

**Why this happens:** When CuPy initializes first, it creates a CUDA context that Numba CUDA sometimes can't access. Running Numba CUDA first avoids this conflict.


(4096, 4096)