<a href="https://colab.research.google.com/github/JRamon19/CUDA_LABS/blob/main/ECU1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# LIBRARY

!uv pip install -q --system numba-cuda==0.4.0
import numpy as np
from numba import cuda
import time
import os
from numba import config
config.CUDA_ENABLE_PYNVJITLINK = 1

In [2]:
# EXAMLE 1: INITIALIZE THE GPU AND CPU

# Libraryes
!uv pip install -q --system numba-cuda==0.4.0
import numpy as np
import time
import os
from numba import cuda
from numba import config

# --- Configuration & Data Preparation ---

config.CUDA_ENABLE_PYNVJITLINK = 1

@cuda.jit
def first_kernel(a, result):
  idx = cuda.grid(1)
  if idx < a.size:
    result[idx] = a[idx]

# HOST CPU
def main():
  #2. Initialize data on CPU
  N = 10_000_000
  a_cpu = np.arange(N, dtype = np.float32)

  #-----------------------------------------
  #CPU computation
  #-----------------------------------------
  start = time.time()
  result_cpu = a_cpu
  cpu_time = time.time() - start
  print(f"CPU Time: {cpu_time * 1e3:.2f}")

  #-----------------------------------------
  #GPU computation
  #-----------------------------------------
  #2.- Transfer from CPU to GPU
  start = time.time()
  a_gpu = cuda.to_device(a_cpu)
  result_gpu = cuda.device_array_like(a_cpu) # reserve memory
  transfer_in_time = time.time() - start

  # Kernel launch
  """
  Cada bloque contiene 128 hilos
  """
  threads_per_block = 128
  blocks_per_grid = (N + threads_per_block -1) //threads_per_block # (10_000_000 + 127) / 128 = 78,125 blocks
  start = time.time()
  first_kernel[blocks_per_grid, threads_per_block](a_cpu, result_gpu) # lunch kernel
  cuda.synchronize()
  kernel_time = time.time() - start

  #Copy back
  start = time.time()
  result_from_gpu = result_gpu.copy_to_host()
  cuda.synchronize()
  transfer_out_time = time.time() - start

  #Report
  print(f"GPU transfer to device: {transfer_in_time * 1e3:.2f} ms")
  print(f"GPU kernel execution: {kernel_time * 1e3:.2f} ms")
  print(f"GPU transfer to host: {transfer_out_time * 1e3:.2f} ms")
  print(f"Total GPU time: {transfer_in_time + kernel_time + transfer_out_time * 1e3:.2f} ms")

  #cleanup
  del a_gpu, result_gpu
  cuda.close()

if __name__ == "__main__":
  main()

CPU Time: 0.00
GPU transfer to device: 192.74 ms
GPU kernel execution: 1875.93 ms
GPU transfer to host: 17.43 ms
Total GPU time: 19.49 ms


