In [None]:
import pycuda.autoinit
import pycuda.driver as cuda
import numpy as np
from pycuda.compiler import SourceModule
import timeit
import pygame
import random
np.set_printoptions(suppress=True, threshold=np.inf)

In [None]:
def compress_left(board):
    """Pack the non-zero tiles of every row against the left edge, in place.

    Only rows 0-3 and columns 0-3 are visited. The relative order of the
    tiles within a row is preserved and the vacated cells are set to 0.

    Args:
        board: 2-D array indexed as ``board[row, col]``; modified in place.

    Returns:
        The same ``board`` object that was passed in.
    """
    for r in range(4):
        # Snapshot the row's tiles first, then rewrite the row as
        # "tiles followed by zero padding".
        tiles = [board[r, c] for c in range(4) if board[r, c]]
        padded = tiles + [0] * (4 - len(tiles))
        for c, value in enumerate(padded):
            board[r, c] = value
    return board
    
def move_left(board, score):
    """Apply one "left" move to a 4x4 board and accumulate the merge score.

    Tiles are stored as exponents: 0 is an empty cell and a value ``k``
    stands for the tile ``2**k``. The move slides every row to the left,
    merges equal neighbours once (scanning left to right), then slides again
    to close the gaps the merges opened.

    Args:
        board: 2-D array indexed as ``board[row, col]``; modified in place.
        score: Score before the move.

    Returns:
        ``(board, score)`` where ``score`` has grown by ``2**k`` for every
        merge that produced a tile with exponent ``k``.
    """
    # Remove zeros
    board = compress_left(board)

    # Combine adjacent values (replace second val by zero)
    for i in range(4):
        for j in range(3):
            if board[i, j] != 0 and board[i, j] == board[i, j + 1]:
                board[i, j] += 1
                board[i, j + 1] = 0
                # Convert to a Python int before exponentiating: with a
                # narrow board dtype such as int8, NumPy 2 evaluates
                # `2 ** tile` in that dtype, which wraps from exponent 7
                # (tile 128) upward and corrupts the score.
                score += 2 ** int(board[i, j])

    # Remove the zeros left behind by the merges
    board = compress_left(board)

    return board, score

def move(board, score, direction='left'):
    """Apply one move in the given direction by reducing it to a left move.

    Each direction is expressed as a reversed and/or transposed view of
    ``board`` on which ``move_left`` is run; the view is then mapped back to
    the original orientation. Because NumPy views share memory with their
    base array, the caller's array is updated in place for every direction;
    for directions other than 'left' the returned board is a view of it.

    Args:
        board: 2-D NumPy array of tile exponents (0 = empty).
        score: Score before the move.
        direction: One of 'left', 'right', 'up' or 'down'.

    Returns:
        ``(board, score)`` after the move.

    Raises:
        ValueError: If ``direction`` is not one of the four supported values
            (previously this silently returned ``None``).
    """
    if direction == 'left':
        return move_left(board, score)
    if direction == 'right':
        # Mirror the columns so that "right" becomes "left".
        mirrored, score = move_left(board[:, ::-1], score)
        return mirrored[:, ::-1], score
    if direction == 'up':
        # Transpose so that columns become rows.
        transposed, score = move_left(board.T, score)
        return transposed.T, score
    if direction == 'down':
        # Transpose, then mirror: each row is a column read bottom-to-top.
        rotated, score = move_left(board.T[:, ::-1], score)
        return rotated[:, ::-1].T, score
    raise ValueError(
        f"unknown direction {direction!r}; expected 'left', 'right', 'up' or 'down'"
    )

In [63]:
# CUDA source compiled at runtime with pycuda's SourceModule.
# Boards are 16 int8 cells in row-major order; a cell holds the tile's
# exponent (0 = empty, k = tile 2^k). Scores are 32-bit ints.
kernel_code = """
// Slide the non-zero tiles of a four-cell line towards line[0].
// `stride` is the element step from one cell of the line to the next one
// away from the target edge; it may be negative.
__device__ void compressLine(int8_t *line, int stride) {
    for (int dst = 0; dst < 3; dst++) {
        if (line[dst * stride] != 0) {
            continue;
        }
        // Pull the nearest non-empty cell further along the line into dst.
        for (int src = dst + 1; src < 4; src++) {
            if (line[src * stride] != 0) {
                line[dst * stride] = line[src * stride];
                line[src * stride] = 0;
                break;
            }
        }
    }
}

// One full move on a four-cell line: compress, merge equal neighbours once
// (scanning from the target edge), compress again. Each merge adds the value
// of the new tile, 2^exponent, to *score. atomicAdd is used because several
// threads working on different lines of one board share the same score.
__device__ void slideLine(int8_t *line, int stride, int *score) {
    compressLine(line, stride);

    for (int k = 0; k < 3; k++) {
        int8_t *cell = line + k * stride;
        int8_t *next = cell + stride;
        if (*cell != 0 && *cell == *next) {
            *cell += 1;
            *next = 0;
            atomicAdd(score, 1 << *cell);
        }
    }

    compressLine(line, stride);
}

// current_row points at the leftmost cell of a row.
__device__ void moveLeft(int8_t *current_row, int *current_score) {
    slideLine(current_row, 1, current_score);
}

// current_row points at the leftmost cell of a row; tiles slide to index 3.
__device__ void moveRight(int8_t *current_row, int *current_score) {
    slideLine(current_row + 3, -1, current_score);
}

// current_column points at the top cell of a column (cells are 4 apart).
__device__ void moveUp(int8_t *current_column, int *current_score) {
    slideLine(current_column, 4, current_score);
}

// current_column points at the top cell of a column; tiles slide to offset 12.
__device__ void moveDown(int8_t *current_column, int *current_score) {
    slideLine(current_column + 12, -4, current_score);
}

// Expand one board into the four boards reached by moving left, up, right
// and down (output slots 0..3, 16 cells each) together with their scores.
// Intended to be launched as a single block; threads 0..3 each process one
// row (left/right) and one column (up/down).
__global__ void firstMoves(int8_t *g_initial_board, int8_t *g_final_board, int g_initial_score, int *g_final_scores) {
    // Seed the four output boards and scores cooperatively so that every
    // element is written by exactly one thread.
    for (int cell = threadIdx.x; cell < 4 * 16; cell += blockDim.x) {
        g_final_board[cell] = g_initial_board[cell % 16];
    }
    for (int dir = threadIdx.x; dir < 4; dir += blockDim.x) {
        g_final_scores[dir] = g_initial_score;
    }
    // All seeding must be finished and visible before any thread starts
    // moving tiles or adding to a score; without this barrier a thread that
    // is still seeding could overwrite another thread's results.
    __syncthreads();

    int colrow = threadIdx.x;
    if (colrow < 4) {
        moveLeft(&g_final_board[4 * colrow], &g_final_scores[0]);
        moveUp(&g_final_board[16 + colrow], &g_final_scores[1]);
        moveRight(&g_final_board[32 + 4 * colrow], &g_final_scores[2]);
        moveDown(&g_final_board[48 + colrow], &g_final_scores[3]);
    }
}

// Apply a left move to every board of a batch: one block per board, one
// thread per row. scores[b] accumulates the merge score of board b.
__global__ void Simulate(int8_t *boards, int *scores, int batch_size) {
    int board_id = blockIdx.x; // Identify which board
    int row = threadIdx.x;     // Identify which row in the board

    if (board_id < batch_size && row < 4) {
        int *current_score = &scores[board_id];
        int8_t *current_row = &boards[board_id * 16 + row * 4];

        moveLeft(current_row, current_score);
    }
}
"""

In [64]:
# Host-side starting position and score for the single-board kernel.
batch_size = 1000
initial_score = np.int32(0)
initial_board = np.array(
    [
        [1, 1, 1, 1],
        [0, 2, 0, 0],
        [1, 1, 4, 1],
        [3, 0, 5, 0],
    ],
    dtype=np.int8,
)
# Allocate device memory for the board and upload it in one call.
g_initial_board = cuda.to_device(initial_board)

# The host arrays fix the output layout (4 directions x batch_size boards /
# scores); the device buffers are sized from them.
final_boards = np.zeros((4, batch_size, 4, 4), dtype=np.int8)
final_scores = np.zeros((4, batch_size), dtype=np.int32)
g_final_boards = cuda.mem_alloc(final_boards.nbytes)
g_final_scores = cuda.mem_alloc(final_scores.nbytes)


In [66]:
# Compile the CUDA source and look up the single-board kernel.
mod = SourceModule(kernel_code)
firstMoves = mod.get_function("firstMoves")

# Launch geometry used by the batched Simulate launch in a later cell.
threads_per_block = 4
blocks = batch_size * 4

# One block of four threads: thread t handles row t / column t of the
# left, up, right and down boards.
firstMoves(
    g_initial_board,
    g_final_boards,
    initial_score,
    g_final_scores,
    block=(4, 1, 1),
    grid=(1, 1, 1),
)

# Read back the four resulting boards and their scores, then show them
# next to the starting position.
four_boards = np.zeros((4, 4, 4), dtype=np.int8)
four_scores = np.zeros(4, dtype=np.int32)
cuda.memcpy_dtoh(four_boards, g_final_boards)
cuda.memcpy_dtoh(four_scores, g_final_scores)
print(initial_board, four_boards, four_scores, sep="\n")

[[1 1 1 1]
 [0 2 0 0]
 [1 1 4 1]
 [3 0 5 0]]
[[[2 2 0 0]
  [2 0 0 0]
  [2 4 1 0]
  [3 5 0 0]]

 [[2 1 1 2]
  [3 2 4 0]
  [0 1 5 0]
  [0 0 0 0]]

 [[0 0 2 2]
  [0 0 0 2]
  [0 2 4 1]
  [0 0 3 5]]

 [[0 0 0 0]
  [0 1 1 0]
  [2 2 4 0]
  [3 1 5 2]]]
[12  8 12  8]


In [29]:
# Build (or reuse) a batch of boards and scores and upload them to the GPU.
boards = None
scores = None
batch_size = 1000

# Identity checks: `array == None` compares element-wise, and using that
# result in an `if` raises for any array with more than one element.
if scores is None:
    scores = np.zeros((4, batch_size), dtype=np.int32)
if boards is None:
    # Tile exponents to draw from: nine 3s for every 4.
    random_number_sample = [3]*9 + [4]
    # Number of tiles placed on each board (16 fills every cell).
    tiles_per_board = 16
    boards = np.zeros((4, batch_size, 4, 4), dtype=np.int8)
    # Each row of the reshaped view is one board's 16 cells; writes go
    # straight into `boards`.
    for cells in boards.reshape(-1, 16):
        # Choose distinct cells directly instead of retrying until an empty
        # one happens to be hit.
        spots = np.random.choice(16, size=tiles_per_board, replace=False)
        cells[spots] = np.random.choice(random_number_sample, size=tiles_per_board)
gpu_boards = cuda.mem_alloc(boards.nbytes)
cuda.memcpy_htod(gpu_boards, boards)
gpu_scores = cuda.mem_alloc(scores.nbytes)
cuda.memcpy_htod(gpu_scores, scores)

In [30]:
# Launch kernel: one block per board, one thread per board row.
Simulate = mod.get_function("Simulate")
# Size the launch from the array that was actually uploaded rather than from
# a block count computed in another cell, so the grid always covers every
# board (each board is 16 cells).
num_boards = boards.size // 16
Simulate(gpu_boards, gpu_scores, np.int32(num_boards), block=(4, 1, 1), grid=(num_boards, 1))

# Show the first board of each group before and after the move
print(boards[:,0])
cuda.memcpy_dtoh(boards, gpu_boards)
cuda.memcpy_dtoh(scores, gpu_scores)
print(boards[:,0],scores[:,0])

[[[3 4 3 3]
  [3 3 3 3]
  [3 3 3 3]
  [3 3 3 3]]

 [[4 3 3 3]
  [4 3 3 3]
  [3 3 3 3]
  [4 3 3 3]]

 [[3 3 3 3]
  [3 3 3 3]
  [3 4 3 3]
  [3 3 3 3]]

 [[3 3 3 3]
  [3 3 4 3]
  [3 3 3 3]
  [4 3 4 3]]]
[[[3 4 4 0]
  [4 4 0 0]
  [4 4 0 0]
  [4 4 0 0]]

 [[4 4 3 0]
  [4 4 3 0]
  [4 4 0 0]
  [4 4 3 0]]

 [[4 4 0 0]
  [4 4 0 0]
  [3 4 4 0]
  [4 4 0 0]]

 [[4 4 0 0]
  [4 4 3 0]
  [4 4 0 0]
  [4 3 4 3]]] [32 48 32 32]
