<a href="https://colab.research.google.com/github/HadasRavikovitch/Final-Project---GPU/blob/main/cross_time_tests_copy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

LLG Kernel - Basic function to run on GPU

In [None]:
import numpy as np
from numba import cuda, float64, int64
import math

# Problem-size constants: a (rows, 3) matrix shape and the matching row count.
len_matrix = (100000,3)
len_M0_norm = 100000

@cuda.jit(device=True)
def cross_product(a, b, result):
  """
  Write the cross product a x b of two 3-component vectors into ``result``.

  ``a``, ``b`` and ``result`` are each indexed at positions 0, 1 and 2.
  Nothing is returned; the output is delivered by filling ``result``.

  Every component of ``a`` and ``b`` is read into a local before the first
  store, so the result is still correct when ``result`` is the same buffer
  as ``a`` or ``b``.
  """
  a0 = a[0]
  a1 = a[1]
  a2 = a[2]
  b0 = b[0]
  b1 = b[1]
  b2 = b[2]

  result[0] = a1 * b2 - a2 * b1
  result[1] = a2 * b0 - a0 * b2
  result[2] = a0 * b1 - a1 * b0

@cuda.jit(device=True)
def norm(array):
    """
    Return the Euclidean length of the first three components of ``array``.
    """
    sum_of_squares = array[0]**2 + array[1]**2 + array[2]**2
    return math.sqrt(sum_of_squares)

@cuda.jit(device=True)
def _llg_rhs(m, h, precession_coef, damping_coef, m_cross_h, out):
  """
  Evaluate the LLG right-hand side for a single 3-vector.

  Writes  -precession_coef * (m x h) - damping_coef * (m x (m x h))  into
  ``out``. ``m_cross_h`` is caller-provided scratch that receives m x h.
  ``out`` and ``m_cross_h`` must be distinct buffers, and neither may be
  ``m`` or ``h``.
  """
  cross_product(m, h, m_cross_h)
  cross_product(m, m_cross_h, out)  # out temporarily holds m x (m x h)
  for i in range(3):
    out[i] = -precession_coef * m_cross_h[i] - damping_coef * out[i]


@cuda.jit
def LLG_kernel(array1, array2, dt, alpha, llg_result):
  """
  Advance every row of ``array1`` by one RK4 step of the LLG equation.

  One thread handles one row. For row ``idx`` the kernel evaluates

      f(m) = -llg_gama * miu * (m x H)
             - (llg_lamda * miu / |M|) * (m x (m x H))

  with M = array1[idx], H = array2[idx], |M| the norm of the row at the
  start of the step, llg_gama = gama / (1 + alpha**2) and
  llg_lamda = gama * alpha / (1 + alpha**2), and then writes

      llg_result[idx] = M + (dt / 6) * (an + 2*bn + 2*cn + dn)

  where an..dn are the four classic RK4 slopes.

  Parameters
  ----------
  array1 : 2-D device array, indexed as [row][0..2]   (M)
  array2 : 2-D device array, indexed as [row][0..2]   (H)
  dt : scalar time step
  alpha : scalar damping parameter
  llg_result : 2-D device array receiving the updated rows

  Notes
  -----
  * ``gama`` and ``miu`` are read from the notebook's global scope. Numba
    treats globals as compile-time constants, so they must be defined
    before the first launch and later changes are not picked up by an
    already-compiled kernel.
  * All scratch storage is 3 elements per thread. Threads whose index is
    past the last row return immediately, so the launch grid may be larger
    than the data.
  * Debug variant: to inspect intermediate values, add output arrays to the
    signature (M_0, an_res, bn_res, cn_res, dn_res) and copy ``m_norm`` and
    ``an``..``dn`` into row ``idx`` of them before returning.
  """
  # Per-thread scratch: one 3-vector each (private to the thread).
  m_cross_h = cuda.local.array(3, dtype=float64)
  stage = cuda.local.array(3, dtype=float64)
  an = cuda.local.array(3, dtype=float64)
  bn = cuda.local.array(3, dtype=float64)
  cn = cuda.local.array(3, dtype=float64)
  dn = cuda.local.array(3, dtype=float64)

  idx = cuda.grid(1)
  if idx >= array1.shape[0]:
    return

  m = array1[idx]
  h = array2[idx]

  llg_gama = gama/((1+alpha**2))
  llg_lamda = gama*alpha/(1+alpha**2)

  # The norm is taken once, from the un-advanced row, and reused by all stages.
  m_norm = norm(m)
  precession_coef = llg_gama * miu
  damping_coef = llg_lamda * miu / m_norm

  # Stage 1: slope at M.
  _llg_rhs(m, h, precession_coef, damping_coef, m_cross_h, an)

  # Stage 2: slope at M + (dt/2) * an.
  for i in range(3):
    stage[i] = m[i] + (dt/2) * an[i]
  _llg_rhs(stage, h, precession_coef, damping_coef, m_cross_h, bn)

  # Stage 3: slope at M + (dt/2) * bn.
  for i in range(3):
    stage[i] = m[i] + (dt/2) * bn[i]
  _llg_rhs(stage, h, precession_coef, damping_coef, m_cross_h, cn)

  # Stage 4: slope at M + dt * cn.
  for i in range(3):
    stage[i] = m[i] + (dt) * cn[i]
  _llg_rhs(stage, h, precession_coef, damping_coef, m_cross_h, dn)

  # Weighted RK4 combination.
  for i in range(3):
    llg_result[idx][i] = m[i] + (dt/6)*(an[i] + 2*bn[i] + 2*cn[i] + dn[i])


Running the LLG KERNEL for short input arrays (len 3)

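> Note: the kernel reads `gama` and `miu` from the notebook's global scope, so the cell below must define them before the first kernel launch.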



In [None]:
import numpy as np
from numba import cuda, float64, int64
import math

# ---------------------------------------------------------------------------
# Physical parameters
# ---------------------------------------------------------------------------
eps = 8.854e-12             # [F/m]
miu = 4*np.pi*1e-7          # Magnetic permeability [H/m]
c = 1/(eps*miu)**0.5
heta = (miu/eps)**0.5
q = 1.60217646e-19          # Elementary charge [Coulombs]
g = 2                       # Landau factor
me = 9.1093821545e-31       # Electron mass [kg]
gma_factor = 1
gama = gma_factor * g * q / (2 * me)
alpha = 0

# ---------------------------------------------------------------------------
# Discretisation
# ---------------------------------------------------------------------------
dz = 2e-9/8
dt = 2

# ---------------------------------------------------------------------------
# Host-side inputs: three rows of 3-vectors each
# ---------------------------------------------------------------------------
x = np.array(
    [[1., 2., 3.],
     [4., 5., 6.],
     [7., 2., 5.]],
    dtype=np.float64,
)
y = np.array(
    [[4, 5, 6],
     [1, 7, 3],
     [4, 5, 6]],
    dtype=np.float64,
)
M_norm = np.arange(3, dtype=np.float64)
res = np.empty_like(x)

# ---------------------------------------------------------------------------
# Device-side buffers
# ---------------------------------------------------------------------------
d_x = cuda.to_device(x)
d_y = cuda.to_device(y)
d_res = cuda.device_array_like(d_x)

# Debug buffers - uncomment together with the debug launch and prints below:
#d_an_res = cuda.device_array_like(d_x)
#d_bn_res = cuda.device_array_like(d_x)
#d_cn_res = cuda.device_array_like(d_x)
#d_dn_res = cuda.device_array_like(d_x)
#d_M_norm = cuda.device_array_like(M_norm)

# Launch configuration: 8 * 64 = 512 threads, more than the 3 rows of data.
blocks = 8
threadsperblock = 64

# Debug launch (kernel variant that also exports M0, an, bn, cn, dn):
#LLG_kernel[blocks, threadsperblock](d_x, d_y, dt, alpha, d_res, d_M_norm, d_an_res, d_bn_res, d_cn_res, d_dn_res)

LLG_kernel[blocks, threadsperblock](d_x, d_y, dt, alpha, d_res)
print("res:",d_res.copy_to_host())
#print("M0:",d_M_norm.copy_to_host())
#print("an:",d_an_res.copy_to_host())
#print("bn:",d_bn_res.copy_to_host())
#print("cn:",d_cn_res.copy_to_host())
#print("dn:",d_dn_res.copy_to_host())

ImportError: Minor version compatibility requires ptxcompiler and cubinlinker packages to be available

In [None]:
# Print the GPU / driver status table and the installed CUDA compiler version.
!nvidia-smi
!nvcc --version

Sun Feb 16 09:21:42 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   61C    P0             30W /   72W |     215MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Upgrade numba-cuda to the newest available release (no version pin).
!pip install --upgrade numba-cuda



In [None]:
# Time one kernel launch (4 blocks x 16 threads) on the 3-row device inputs.
# cuda.synchronize() is part of the timed statement so the measurement waits for the kernel to finish.
%timeit LLG_kernel[4,16](d_x, d_y, dt, alpha, d_res); cuda.synchronize()



67.7 µs ± 661 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [None]:
# Force-reinstall numba-cuda pinned to 0.4.0 into the system environment (quiet output).
!uv pip install -q --system --force-reinstall numba-cuda==0.4.0

In [None]:
# Turn on Numba's CUDA_ENABLE_PYNVJITLINK option.
# NOTE(review): presumably this must run before the first kernel is compiled to have any effect — confirm.
from numba import config
config.CUDA_ENABLE_PYNVJITLINK = 1

In [None]:
# Turn on Numba's CUDA minor-version-compatibility option.
# NOTE(review): the ImportError captured earlier in this notebook says this mode needs the
# ptxcompiler and cubinlinker packages — confirm they are installed before enabling it.
from numba import config
config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = True

LLG Step - The basic function to run on CPU

In [None]:
import numpy as np

def LLG_step(M: np.ndarray, H: np.ndarray, dt: float, alpha: float,
             gamma=None, mu0=None) -> np.ndarray:
    """
    Advance every row of ``M`` by one RK4 step of the LLG equation (CPU / NumPy).

    The right-hand side evaluated at each stage is

        f(m) = -gma_LL * mu0 * (m x H) - (LL_lambda * mu0 / M0) * (m x (m x H))

    with gma_LL = gamma / (1 + alpha**2), LL_lambda = gamma * alpha / (1 + alpha**2)
    and M0 the row-wise norm of the *input* ``M`` (computed once, reused by all
    four stages).

    Parameters
    ----------
    M : np.ndarray, shape (n, 3)
        Current vectors, one per row.
    H : np.ndarray, shape (n, 3)
        Field vectors, one per row.
    dt : float
        Time step.
    alpha : float
        Damping parameter.
    gamma : float, optional
        Overrides the notebook-global ``gama`` when given.
    mu0 : float, optional
        Overrides the notebook-global ``miu`` when given.

    Returns
    -------
    np.ndarray, shape (n, 3)
        ``M + (dt / 6) * (an + 2*bn + 2*cn + dn)``.
    """
    # Fall back to the notebook globals so existing four-argument calls keep working.
    if gamma is None:
        gamma = gama
    if mu0 is None:
        mu0 = miu

    M0 = np.linalg.norm(M, axis=1, keepdims=True)
    gma_LL = gamma / (1 + alpha**2)
    LL_lambda = gamma * alpha / (1 + alpha**2)
    damping = LL_lambda * mu0 / M0

    def rhs(m):
        # m x H is computed once and reused for the double cross product.
        m_cross_h = np.cross(m, H, axis=1)
        return -gma_LL * mu0 * m_cross_h - damping * np.cross(m, m_cross_h, axis=1)

    an = rhs(M)
    bn = rhs(M + (dt / 2) * an)
    cn = rhs(M + (dt / 2) * bn)
    dn = rhs(M + dt * cn)

    new_M = M + (dt / 6) * (an + 2 * bn + 2 * cn + dn)
    return new_M

Running the LLG on CPU for short input arrays (len 3)

In [None]:
import numpy as np
from numba import cuda
import math

# ---------------------------------------------------------------------------
# Physical parameters
# ---------------------------------------------------------------------------
eps = 8.854e-12             # [F/m]
miu = 4*np.pi*1e-7          # Magnetic permeability [H/m]
c = 1/(eps*miu)**0.5
heta = (miu/eps)**0.5
q = 1.60217646e-19          # Elementary charge [Coulombs]
g = 2                       # Landau factor
me = 9.1093821545e-31       # Electron mass [kg]
gma_factor = 1
gama = gma_factor * g * q / (2 * me)
alpha = 0

# ---------------------------------------------------------------------------
# Discretisation
# ---------------------------------------------------------------------------
dz = 2e-9/8
dt = 2

# ---------------------------------------------------------------------------
# Host-side inputs: three rows of 3-vectors each
# ---------------------------------------------------------------------------
x = np.array(
    [[1., 2., 3.],
     [4., 5., 6.],
     [7., 2., 5.]],
    dtype=np.float64,
)
y = np.array(
    [[4, 5, 6],
     [1, 7, 3],
     [4, 5, 6]],
    dtype=np.float64,
)
res = np.empty_like(x)

# Bare final expression so the notebook displays the returned array.
LLG_step(x, y, dt, alpha)


array([[-6.24733836e+24, -7.34973942e+23,  4.77737052e+24],
       [ 1.68011243e+25, -9.76156552e+24,  1.71766114e+25],
       [ 3.27066219e+25, -2.27844227e+25, -2.81739568e+24]])

In [None]:
# Time the NumPy (CPU) RK4 step on the 3-row inputs x and y.
%timeit LLG_step(x, y, dt, alpha)

363 µs ± 39.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


Adding the time-propagation loop, where each iteration calls the LLG kernel. This is expected to be less time-efficient because each iteration transfers data between the host and the device.

In [None]:
# Time one kernel launch on the large device arrays; cuda.synchronize() is included so the
# measurement waits for the kernel to finish.
# NOTE(review): d_big_arr1 / d_big_arr2 are not defined in any cell visible here — they must
# already exist in the kernel namespace (and fit d_res) before this cell runs; confirm.
%timeit LLG_kernel[blocks,threadsperblock](d_big_arr1, d_big_arr2, dt, alpha, d_res); cuda.synchronize()

104 µs ± 12.9 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
