In [159]:
!nvidia-smi

Fri Nov  3 08:54:00 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P0    31W /  70W |    111MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [160]:
!lscpu

Architecture:            x86_64
  CPU op-mode(s):        32-bit, 64-bit
  Address sizes:         46 bits physical, 48 bits virtual
  Byte Order:            Little Endian
CPU(s):                  2
  On-line CPU(s) list:   0,1
Vendor ID:               GenuineIntel
  Model name:            Intel(R) Xeon(R) CPU @ 2.00GHz
    CPU family:          6
    Model:               85
    Thread(s) per core:  2
    Core(s) per socket:  1
    Socket(s):           1
    Stepping:            3
    BogoMIPS:            4000.28
    Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clf
                         lush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_
                         good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fm
                         a cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hyp
                         ervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd i

In [161]:
import numpy as np
import pandas as pd
from time import time
from numba import cuda
import math
import random


def cpu_mass_search(N, H, R_, found_):
    R = R_.copy()
    found = found_.copy()
    for j in range(R.shape[1]):
        for i in range(R.shape[0]):
            n = N[i]
            for k in range(len(n)):
                 if n[k] == H[j] and j - k >= 0:
                     R[i, j - k] -= 1
                     if R[i, j - k] == 0 and (found[i] > j - k or found[i] == -1):
                         found[i] = j - k
    return R, found


@cuda.jit
def kern(N, H, R, found):
    idx, jdx, kdx = cuda.grid(3)
    if idx < R.shape[0] and jdx < R.shape[1]:
       y, x, k = idx, jdx, kdx
       n = N[y]
       if n[k] == H[x] and x - k >= 0:
          cuda.atomic.sub(R, (y, x - k), 1)
          #v1
          cuda.atomic.add(found, y, - found[y] + x - k if R[y, x - k] == 0 and (found[y] > x - k or found[y] == -1) else 0)
       #v2
      #  cuda.syncthreads()
      #  if k == 0 and R[y, x] == 0:
      #      for i in range(x - 1, -1, -1):
      #           if R[y, i] == 0:
      #               return
      #      cuda.syncthreads()
      #      found[y] = x

def gpu_mass_search(N, H, R, found, size, str_sz):
    threads_per_block = (32,32,1)
    blocks_per_grid = (math.ceil(size[0]/threads_per_block[0]), math.ceil(size[1]/threads_per_block[1]), str_sz)

    startp = cuda.event()
    endp = cuda.event()
    startp.record()

    dev_N = cuda.to_device(N)
    dev_H = cuda.to_device(H)
    dev_R = cuda.to_device(R)
    dev_found = cuda.to_device(found)

    kern[blocks_per_grid, threads_per_block](dev_N, dev_H, dev_R, dev_found)

    endp.record()
    endp.synchronize()
    elapsedp = cuda.event_elapsed_time(startp, endp)

    return dev_R.copy_to_host(), dev_found.copy_to_host(), elapsedp / 1000


def save_one(pathfile, result, mode='a'):
    import json
    with open(pathfile, mode) as file:
        json.dump(result.tolist(), file)

In [163]:
# sixe_N, size_H
sizes = [[10, 5], [100, 50], [1000, 500], [5000, 2500], [10000, 5000]]

# min_sz = max_sz
str_sz = 2

# alph form 0 to alp
alp = 128

df = pd.DataFrame({"gpu time": pd.Series(dtype='float'),
                   "cpu time": pd.Series(dtype='float'),
                   "acceleration": pd.Series(dtype='float'),
                   "cpu_R == gpu_R": pd.Series(dtype='bool'),
                   "cpu_found ~= gpu_found": pd.Series(dtype='bool'),
                   "cpu_found == gpu_found": pd.Series(dtype='bool')})

for size in sizes:

  N = np.random.randint(alp, size=(size[0], str_sz), dtype=np.int64)
  H = np.random.randint(alp, size=size[1], dtype=np.int64)
  R = np.full((size[0], size[1]), fill_value=str_sz)
  found = np.empty(shape=(size[0]), dtype=np.int64)
  found.fill(-1)

  gpu_R, gpu_found, gpu_time = gpu_mass_search(N, H, R, found, size, str_sz)

  start = time()
  cpu_R, cpu_found = cpu_mass_search(N, H, R, found)
  cpu_time = time() - start

  ind = f"N_{size[0]}__H_{size[1]}"

  is_not_strict_eq = True
  for i in range(size[0]):
      if cpu_found[i] == -1:
          if cpu_found[i] == gpu_found[i]:
              pass
          else:
              is_not_strict_eq = False
              break

  df.loc[ind, "gpu time"] = gpu_time
  df.loc[ind, "cpu time"] = cpu_time
  df.loc[ind, "cpu_R == gpu_R"] = np.array_equal(gpu_R, cpu_R)
  df.loc[ind, "cpu_found ~= gpu_found"] = is_not_strict_eq
  df.loc[ind, "cpu_found == gpu_found"] = np.array_equal(gpu_found, cpu_found)

  # warning - many files will be produced

  # save_one(f"gpu_r_{ind}.json", gpu_R)
  # save_one(f"cpu_r_{ind}.json", cpu_R)
  # save_one(f"H_{ind}.json", H)
  # save_one(f"N_{ind}.json", N)
  # save_one(f"gpu_found_{ind}.json", gpu_found)
  # save_one(f"cpu_found_{ind}.json", cpu_found)

df["acceleration"] = df["cpu time"] / df["gpu time"]
df

Unnamed: 0,gpu time,cpu time,acceleration,cpu_R == gpu_R,cpu_found ~= gpu_found,cpu_found == gpu_found
N_10__H_5,0.001454,5.6e-05,0.038693,True,True,True
N_100__H_50,0.001813,0.004182,2.306301,True,True,True
N_1000__H_500,0.003163,0.389862,123.252923,True,True,True
N_5000__H_2500,0.024101,12.084509,501.401936,True,True,True
N_10000__H_5000,0.090636,43.908902,484.451867,True,True,False


In [164]:
for i in range(size[0]):
    if cpu_found[i] != gpu_found[i]:
        print(i, cpu_found[i], gpu_found[i])


3105 2617 5237
4503 4933 9872
6794 975 1960
