In [1]:
!pip install pycuda



In [None]:
# Sanity check: run NVIDIA's nvidia-smi tool in a shell to print the driver/GPU
# status and confirm a CUDA-capable device is attached to this runtime.
!nvidia-smi

# CUDA implementation

In [43]:
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import math
from itertools import accumulate 

In [65]:
def cuda_validation(file):
  """Remove every ASCII space (byte value 32) from a text file on the GPU.

  The file is read as text, line terminators are dropped, and the remaining
  text is encoded as UTF-8. The bytes are then compacted in three phases:

  1. ``faza12`` flags each byte (1 = keep, 0 = drop) and runs an exclusive
     prefix sum over the flags inside every block of ``BLOCK`` threads, which
     gives each kept byte its position within the block's output. It also
     records how many bytes each block keeps.
  2. The host turns the per-block totals into per-block starting offsets.
  3. ``faza3`` scatters every kept byte to ``block offset + in-block position``.

  Args:
    file: Path of the text file to process, or ``None``.

  Returns:
    ``None`` when ``file`` is ``None``; otherwise the file content with line
    terminators and spaces removed, as a ``str``. Empty or all-space input
    yields ``""``.
  """
  if file is None:
    return None

  # Threads per block. The tree-based scan below needs a power of two, and the
  # kernel's shared array is sized from the same constant via BLOCK_SIZE.
  BLOCK = 1024

  # Strip only the line terminator: blindly cutting the last character would
  # eat real data when the final line has no trailing newline.
  with open(file,'r') as f:
    s = "".join(line.rstrip("\n") for line in f)

  sb = s.encode('utf-8')
  sb_len = len(sb)
  if sb_len == 0:
    # Nothing to filter; a zero-byte allocation or an empty grid is not a
    # valid CUDA request, so answer on the host.
    return ""

  n_blocks = (sb_len + BLOCK - 1) // BLOCK

  s_cuda = cuda.mem_alloc(sb_len)
  cuda.memcpy_htod(s_cuda,sb)

  # In-block exclusive prefix sums, one int32 per input byte. Every element
  # below sb_len is written by faza12, so no initial upload is needed.
  cnt_cuda = cuda.mem_alloc(sb_len * np.dtype(np.int32).itemsize)

  # Number of kept bytes per block, written by thread 0 of each block.
  totals = np.empty(n_blocks,dtype=np.int32)
  totals_cuda = cuda.mem_alloc(totals.nbytes)

  funct_str = SourceModule("#define BLOCK_SIZE %d\n" % BLOCK + """

  __device__ bool good_char(char ch){
    // Keep everything except the ASCII space.
    return ch != 32;
  }

  // Phase 1+2: flag bytes and compute an exclusive prefix sum of the flags
  // within each block (up-sweep followed by down-sweep over shared memory).
  // Must be launched with exactly BLOCK_SIZE threads per block.
  __global__ void faza12(const char* s,int* out,int* blockSums,int size){

      __shared__ int sum[BLOCK_SIZE];

      int col = threadIdx.x;
      int idx = blockIdx.x * blockDim.x + col;

      // Threads past the end of the input contribute 0 instead of skipping
      // the scan, so every thread of the block reaches each __syncthreads().
      sum[col] = (idx < size && good_char(s[idx])) ? 1 : 0;

      // Up-sweep: build partial sums in place.
      int offset = 1;
      for(int d = BLOCK_SIZE>>1;d>0;d>>=1){
        __syncthreads();
        if(col<d){
          int ai = offset*(2*col+1)-1;
          int bi = offset*(2*col+2)-1;

          sum[bi] += sum[ai];
        }
        offset *= 2;
      }

      // Thread 0 performed the final up-sweep addition itself, so it already
      // sees the block total; save it before clearing the root.
      if(col == 0){
        blockSums[blockIdx.x] = sum[BLOCK_SIZE-1];
        sum[BLOCK_SIZE-1] = 0;
      }

      // Down-sweep: turn the partial sums into exclusive prefix sums.
      for(int d = 1;d<BLOCK_SIZE;d*=2){
        offset >>= 1;
        __syncthreads();
        if(col < d){
          int ai = offset*(2*col+1)-1;
          int bi = offset*(2*col+2)-1;

          int tmp = sum[ai];
          sum[ai] = sum[bi];
          sum[bi] += tmp;
        }
      }

      __syncthreads();
      // Each thread stores its own result.
      if(idx < size){
        out[idx] = sum[col];
      }
  }

  // Phase 3: scatter kept bytes to their final position. `size` is the INPUT
  // length; it guards the reads of s and pos in the last, partially filled block.
  __global__ void faza3(const char* s,const int* pos,const int* offsetArr,char* out,int size){

      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if(idx >= size){
        return;
      }

      char ch = s[idx];
      if(good_char(ch)){
        out[offsetArr[blockIdx.x] + pos[idx]] = ch;
      }
  }

  """)
  func = funct_str.get_function("faza12")
  func(s_cuda,cnt_cuda,totals_cuda,np.int32(sb_len),block = (BLOCK,1,1), grid = (n_blocks,1,1))
  cuda.memcpy_dtoh(totals,totals_cuda)

  # Exact output size: the sum of what every block keeps.
  out_len = int(totals.sum(dtype=np.int64))
  if out_len == 0:
    # Input consisted only of spaces; avoid a zero-byte device allocation.
    return ""

  # Starting offset of block b = number of bytes kept by blocks 0..b-1.
  off_np = np.zeros(n_blocks,dtype=np.int32)
  off_np[1:] = np.cumsum(totals[:-1])
  off_cuda = cuda.mem_alloc(off_np.nbytes)
  cuda.memcpy_htod(off_cuda,off_np)

  # Every output slot is written exactly once by faza3, so no pre-fill is needed.
  res_arr_np = np.empty(out_len,dtype=np.byte)
  out_cuda = cuda.mem_alloc(res_arr_np.nbytes)

  func3 = funct_str.get_function("faza3")
  func3(s_cuda,cnt_cuda,off_cuda,out_cuda,np.int32(sb_len),block = (BLOCK,1,1), grid = (n_blocks,1,1))

  cuda.memcpy_dtoh(res_arr_np,out_cuda)
  result = res_arr_np.tobytes().decode('UTF-8')

  return result

# Sequential implementation

In [63]:
def seq_validation(file):
  """Remove every ASCII space from a text file, one character at a time (CPU baseline).

  The file is read as text, line terminators are dropped, and every character
  other than the space (code point 32) is kept in its original order.

  Args:
    file: Path of the text file to process, or ``None``.

  Returns:
    ``None`` when ``file`` is ``None``; otherwise the filtered text as a ``str``.
  """
  if file is None:
    return None

  # Strip only the line terminator: blindly cutting the last character would
  # eat real data when the final line has no trailing newline.
  with open(file,'r') as f:
    s = "".join(line.rstrip("\n") for line in f)

  # join() builds the result in one pass instead of re-copying the growing
  # string on every appended character.
  return "".join(ch for ch in s if ord(ch) != 32)

# Testing

In [None]:
%%time
# Time one run of the GPU implementation on the sample file. The measurement
# covers the whole call, i.e. file reading and the SourceModule kernel build
# that happen inside cuda_validation, not just the kernel launches.
# NOTE(review): the path looks like a mounted Google Drive folder; no mount
# step is visible in this notebook - confirm it is mounted before running.
cuda_validation('/content/drive/MyDrive/Colab Notebooks/Paralelni algoritmi/test_smpl.txt')

In [None]:
%%time
# Time one run of the sequential CPU implementation on the same sample file,
# for comparison with the GPU timing. The measurement includes file reading.
# NOTE(review): the path looks like a mounted Google Drive folder; no mount
# step is visible in this notebook - confirm it is mounted before running.
seq_validation('/content/drive/MyDrive/Colab Notebooks/Paralelni algoritmi/test_smpl.txt')