In [None]:
import pycuda.autoinit
import pycuda.driver as cuda
import numpy as np
from pycuda.compiler import SourceModule
import timeit
import pygame
import random
np.set_printoptions(suppress=True, threshold=np.inf)

In [420]:
# Build the starting state for the batch: `boards` has shape
# (4, batch_size, 4, 4) as int8 and `scores` has shape (4, batch_size) as int32,
# then mirror both arrays into freshly allocated device buffers.
boards = None
scores = None
batch_size = 1000

# Use `is None`, not `== None`: once these names hold ndarrays, `== None` is an
# element-wise comparison and `if` raises on the ambiguous truth value.
if scores is None:
    scores = np.zeros((4, batch_size), dtype=np.int32)
if boards is None:
    # Spawn odds: nine chances in ten for a 1, one in ten for a 2.
    random_number_sample = [1]*9 + [2]
    boards = np.zeros((4, batch_size, 4, 4), dtype=np.int8)
    for z in range(4):
        for i in range(batch_size):
            # Every board starts with two tiles on distinct cells.
            for _ in range(2):
                j, k = np.random.randint(0, 4, 2)
                # Re-draw until the chosen cell is empty.
                while boards[z, i, k, j] != 0:
                    j, k = np.random.randint(0, 4, 2)
                boards[z, i, k, j] = random.choice(random_number_sample)

# Host -> device copies of the initial state.
gpu_boards = cuda.mem_alloc(boards.nbytes)
cuda.memcpy_htod(gpu_boards, boards)
gpu_scores = cuda.mem_alloc(scores.nbytes)
cuda.memcpy_htod(gpu_scores, scores)
print(boards)

[[[[0 0 0 0]
   [0 0 0 1]
   [0 0 0 0]
   [0 0 0 1]]

  [[0 0 0 1]
   [0 0 0 0]
   [0 0 1 0]
   [0 0 0 0]]

  [[0 0 0 0]
   [0 0 0 0]
   [0 0 0 0]
   [0 1 1 0]]

  [[0 0 0 0]
   [0 0 0 0]
   [0 0 1 0]
   [1 0 0 0]]

  [[0 0 0 0]
   [0 1 1 0]
   [0 0 0 0]
   [0 0 0 0]]

  [[0 0 0 0]
   [0 1 1 0]
   [0 0 0 0]
   [0 0 0 0]]

  [[0 0 1 0]
   [1 0 0 0]
   [0 0 0 0]
   [0 0 0 0]]

  [[0 0 0 0]
   [0 0 0 0]
   [0 1 0 0]
   [0 0 0 1]]

  [[0 1 0 1]
   [0 0 0 0]
   [0 0 0 0]
   [0 0 0 0]]

  [[0 1 0 1]
   [0 0 0 0]
   [0 0 0 0]
   [0 0 0 0]]

  [[0 1 0 0]
   [0 0 0 0]
   [0 0 1 0]
   [0 0 0 0]]

  [[0 0 0 1]
   [2 0 0 0]
   [0 0 0 0]
   [0 0 0 0]]

  [[2 0 0 0]
   [0 0 0 0]
   [0 1 0 0]
   [0 0 0 0]]

  [[0 0 0 0]
   [0 1 0 0]
   [0 0 0 0]
   [0 0 0 1]]

  [[0 0 0 0]
   [0 0 1 0]
   [0 1 0 0]
   [0 0 0 0]]

  [[0 0 0 1]
   [0 0 0 1]
   [0 0 0 0]
   [0 0 0 0]]

  [[0 0 0 0]
   [0 0 0 0]
   [0 0 0 0]
   [1 0 1 0]]

  [[0 0 1 1]
   [0 0 0 0]
   [0 0 0 0]
   [0 0 0 0]]

  [[0 0 0 0]
   [1 0 0 0]
  

In [421]:
# CUDA source for the "move left" step. One block handles one 4x4 board and
# each thread (indexed by threadIdx.x) handles one row of that board:
# slide tiles left, merge equal neighbours once, then slide again to close
# the gaps left by the merges.
kernel_code = """
__global__ void moveLeftKernel(int8_t *boards, int *scores, int batch_size) {
    int board_id = blockIdx.x; // Identify which board
    int row = threadIdx.x; // Identify which row in the board

    // Calculate moving left on first move
    if (board_id < batch_size && row < 4) {
        int8_t *current_row = &boards[board_id * 16 + row * 4]; // Pointer to the current row

        // Set whenever a tile moves or merges. Not consumed yet, and
        // `scores` is not written by this kernel yet either.
        int didSomethingHappen = 0;

        // Pass 1: pull the nearest non-zero tile into every empty slot.
        for (int i = 0; i < 4; i++) {
            if (current_row[i] == 0) {
                int j = i + 1;
                while (j < 4) {
                    if (current_row[j] != 0) {
                        current_row[i] = current_row[j];
                        current_row[j] = 0;
                        didSomethingHappen = 1;
                        break;
                    }
                    j++;
                }
            }
        }

        // Pass 2: merge equal neighbours. A merge increments the stored
        // tile value by one and clears the right-hand tile, so a tile
        // cannot merge twice in the same move.
        for (int i = 0; i < 3; i++) {
            if (current_row[i] != 0 && current_row[i] == current_row[i+1]) {
                current_row[i] += 1;
                current_row[i+1] = 0;
                didSomethingHappen = 1;
            }
        }

        // Pass 3: slide again to close the gaps opened by the merges.
        for (int i = 0; i < 4; i++) {
            if (current_row[i] == 0) {
                int j = i + 1;
                while (j < 4) {
                    if (current_row[j] != 0) {
                        current_row[i] = current_row[j];
                        current_row[j] = 0;
                        didSomethingHappen = 1;
                        break;
                    }
                    j++;
                }
            }
        }
    }
    
}
"""
mod = SourceModule(kernel_code)
threads_per_block = 4
blocks = batch_size

# Launch kernel
# The block must be (4, 1, 1): the kernel only looks at threadIdx.x, so a
# (4, 4, 1) block would start four threads per row that all read and write the
# same four cells concurrently.
# The bound passed as `batch_size` is the number of boards in the device buffer
# (4 * batch_size); with `blocks` blocks only the first `batch_size` boards,
# i.e. boards[0], are moved.
move_left_kernel = mod.get_function("moveLeftKernel")
move_left_kernel(gpu_boards, gpu_scores, np.int32(batch_size*4), block=(threads_per_block, 1, 1), grid=(blocks, 1))

# Retrieve data if necessary
print(boards[0,:5])  # host copy before the move
cuda.memcpy_dtoh(boards, gpu_boards)
cuda.memcpy_dtoh(scores, gpu_scores)
print(boards[0,:5])  # same boards after the move

[[[0 0 0 0]
  [0 0 0 1]
  [0 0 0 0]
  [0 0 0 1]]

 [[0 0 0 1]
  [0 0 0 0]
  [0 0 1 0]
  [0 0 0 0]]

 [[0 0 0 0]
  [0 0 0 0]
  [0 0 0 0]
  [0 1 1 0]]

 [[0 0 0 0]
  [0 0 0 0]
  [0 0 1 0]
  [1 0 0 0]]

 [[0 0 0 0]
  [0 1 1 0]
  [0 0 0 0]
  [0 0 0 0]]]
[[[0 0 0 0]
  [1 0 0 0]
  [0 0 0 0]
  [1 0 0 0]]

 [[1 0 0 0]
  [0 0 0 0]
  [1 0 0 0]
  [0 0 0 0]]

 [[0 0 0 0]
  [0 0 0 0]
  [0 0 0 0]
  [2 0 0 0]]

 [[0 0 0 0]
  [0 0 0 0]
  [1 0 0 0]
  [1 0 0 0]]

 [[0 0 0 0]
  [2 0 0 0]
  [0 0 0 0]
  [0 0 0 0]]]


In [None]:
# Removed a stale copy of the VectorAdd benchmark setup that used to live here.
# At this point in the notebook `N` is not defined yet and `mod` still refers to
# the moveLeftKernel module, which has no "VectorAdd" function, so the
# statements failed on a fresh kernel. The same setup (a, b, c, a_gpu, b_gpu,
# c_gpu, vector_add) is performed in the benchmark cell that defines N.

class Game2048:
    """A single 2048 game whose board and score live in device memory.

    `self.board` is a device buffer holding a 4x4 int32 board and `self.score`
    is a device buffer holding one int32 score.
    """

    # Host-side zero board and its device copy, used to initialise new games.
    tempboard = np.zeros((4,4), dtype=np.int32)
    empty_board = cuda.mem_alloc(tempboard.nbytes)
    cuda.memcpy_htod(empty_board, tempboard)

    # Host-side zero score and its device copy.
    tempscore = np.int32(0)
    zero_score = cuda.mem_alloc(tempscore.nbytes)
    cuda.memcpy_htod(zero_score, tempscore)

    def __init__(self, board_score:tuple = None) -> None:
        """Create a game.

        Args:
            board_score: optional ``(board, score)`` pair to start from. When
                omitted, the game starts from an empty board with score 0 and
                two spawned tiles. The pair is assumed to hold host values
                convertible to int32 NumPy arrays (the original code called
                ``.copy()`` on both) -- TODO confirm against callers.
        """
        board_bytes = Game2048.tempboard.nbytes
        score_bytes = Game2048.tempscore.nbytes

        # Keep the allocations themselves: the copy functions return None, so
        # their result must never be assigned back to self.board / self.score.
        self.board = cuda.mem_alloc(board_bytes)
        self.score = cuda.mem_alloc(score_bytes)

        if board_score is None:
            # Device-to-device copies of the zero templates; the score goes
            # into self.score, not self.board.
            cuda.memcpy_dtod(self.board, Game2048.empty_board, board_bytes)
            cuda.memcpy_dtod(self.score, Game2048.zero_score, score_bytes)
            self.addTile()
            self.addTile()
        else:
            # Upload the caller's state so both construction paths leave the
            # board and score in device memory. The original replaced the
            # device buffers with host copies here, unlike the default path.
            host_board = np.ascontiguousarray(board_score[0], dtype=np.int32)
            host_score = np.ascontiguousarray(board_score[1], dtype=np.int32)
            if host_board.shape != Game2048.tempboard.shape:
                raise ValueError("board_score[0] must be a 4x4 board")
            if host_score.size != 1:
                raise ValueError("board_score[1] must be a single score value")
            cuda.memcpy_htod(self.board, host_board)
            cuda.memcpy_htod(self.score, host_score)

    def addTile(self):
        """Spawn one tile on a random empty cell of the board.

        The method had no body in the original notebook. This implementation
        follows the spawn rule used when the batch boards are initialised
        (value 1 nine times out of ten, otherwise 2) -- NOTE(review): confirm
        this is the intended rule. Does nothing when the board is full.
        """
        host_board = np.empty_like(Game2048.tempboard)
        cuda.memcpy_dtoh(host_board, self.board)

        empty_cells = np.argwhere(host_board == 0)
        if len(empty_cells) == 0:
            return

        row, col = empty_cells[np.random.randint(len(empty_cells))]
        host_board[row, col] = random.choice([1]*9 + [2])
        cuda.memcpy_htod(self.board, host_board)


In [None]:
# CUDA source for an element-wise float add: each thread computes its global
# index i = threadIdx.x + blockIdx.x * blockDim.x and, when i < n, writes
# c[i] = a[i] + b[i].
# Note: this rebinds `kernel_code` and `mod`, so after this cell `mod` no longer
# refers to the module that contains moveLeftKernel.
kernel_code = """
__global__ void VectorAdd(float *a, float *b, float *c, int n)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n)
        c[i] = a[i] + b[i];
}
"""
mod = SourceModule(kernel_code)


In [None]:
def gpu_function(a_gpu,b_gpu,c_gpu, N,vector_add, threads_per_block=1024):
    """Launch the vector-add kernel over all N elements.

    Args:
        a_gpu, b_gpu: device buffers holding the two input vectors.
        c_gpu: device buffer that receives the element-wise sum.
        N: number of elements in each vector.
        vector_add: kernel callable accepting ``(a, b, c, n, block=, grid=)``.
        threads_per_block: threads per block for the launch (default 1024).

    The grid is sized from N. A single block of 1024 threads, as launched
    previously, only covers the first 1024 elements and leaves the rest of
    ``c_gpu`` untouched.
    """
    # Ceiling division so a partial last block is still launched; the kernel's
    # own `i < n` guard masks the surplus threads. At least one block is
    # always launched, because a grid dimension of 0 is not a valid launch.
    num_blocks = max(1, (int(N) + threads_per_block - 1) // threads_per_block)
    vector_add(a_gpu, b_gpu, c_gpu, np.int32(N),
               block=(threads_per_block, 1, 1), grid=(num_blocks, 1))

def cpu_function(a, b):
    """CPU reference for the vector-add benchmark.

    Args:
        a, b: operands supporting ``+`` (NumPy arrays in the benchmark).

    Returns:
        The element-wise sum ``a + b``. Returning it, rather than discarding
        it, lets the result be compared against the GPU output; callers that
        only time the call can ignore the return value.
    """
    return a + b

In [None]:
# Benchmark setup: two random float32 vectors of length N, a zeroed result
# vector, and matching device buffers with the inputs uploaded.
N = 10**5

a = np.random.randn(N).astype(np.float32)
b = np.random.randn(N).astype(np.float32)
c = np.zeros(N, dtype=np.float32)

# One device allocation per host array, in the order a, b, c.
a_gpu, b_gpu, c_gpu = (cuda.mem_alloc(host_arr.nbytes) for host_arr in (a, b, c))

# Only the inputs are uploaded; c_gpu is written by the kernel.
for _device_buf, _host_arr in ((a_gpu, a), (b_gpu, b)):
    cuda.memcpy_htod(_device_buf, _host_arr)

vector_add = mod.get_function("VectorAdd")

# GPU variant of the timing, currently disabled:
# timeit.timeit("gpu_function(a_gpu,b_gpu,c_gpu, N,vector_add)", setup="from __main__ import gpu_function,a_gpu,b_gpu,c_gpu, N,vector_add", number=1000000)
timeit.timeit(
    stmt="cpu_function(a,b)",
    setup="from __main__ import cpu_function,a,b",
    number=1000000,
)

In [None]:
# NOTE(review): stray expression cell -- it only echoes the NumPy int32 scalar
# 256 and its value is not assigned to anything. Presumably left over from
# experimenting with integer widths; consider removing it.
np.int32(256)