In [384]:
import pycuda.autoinit
import pycuda.driver as cuda
import numpy as np
from pycuda.compiler import SourceModule
import timeit
import pygame
import random
import time
np.set_printoptions(suppress=True, threshold=np.inf)

In [385]:
# CPU moveset

def compress_left(board):
    for i in range(4):
        k = 0
        for j in range(4):
            if board[i,j]:
                board[i,k] = board[i,j]
                k += 1
        for j in range(k, 4):
            board[i,j] = 0       
    return board
    
def move_left(board, score):
    # Remove zeros        
    board = compress_left(board)
    
    # Combine adjacent values (replace second val by zero)
    for i in range(4):
        for j in range(3):
            if board[i,j]==board[i,j+1] and board[i,j] != 0:
                board[i,j] += 1                    
                board[i,j+1] = 0
                score += 2**board[i,j]
    # Remove zeros                    
    board = compress_left(board)    
    
    return board, score

def move(board, score, direction='left'):
    if direction=='left':
        board, score = move_left(board, score)
        return board, score
    elif direction=='right':
        board = board[:,::-1]
        board, score = move_left(board, score)
        board = board[:,::-1]
        return board, score
    elif direction=='up':
        board = board.T
        board, score = move_left(board, score)
        board = board.T
        return board, score
    elif direction=='down':
        board = board.T[:,::-1]
        board, score = move_left(board, score)
        board = board[:,::-1].T
        return board, score

In [386]:
# Cuda code
kernel_code = r"""
#include <curand_kernel.h>

extern "C" {
__global__ void setupCurand(curandState *state, unsigned long seed) {
    int idx = blockIdx.x;
    curand_init(seed, idx, 0, &state[idx]);
}

__device__ void moveLeft(int8_t *current_row, int *current_score) {
    int didSomethingHappen = 0;
    for (int i = 0; i < 3; i++) {
        if (current_row[i] == 0) {
            int j = i + 1;
            while (j < 4) {
                if (current_row[j] != 0) {
                    current_row[i] = current_row[j];
                    current_row[j] = 0;
                    didSomethingHappen = 1;
                    break;
                }
                j++;
            }
        }
    }

    for (int i = 0; i < 3; i++) {
        if (current_row[i] != 0 && current_row[i] == current_row[i+1]) {
            current_row[i] += 1;
            current_row[i+1] = 0;
            printf("BlockID: %d, ThreadID: %d, Score: %d\n", blockIdx.x, threadIdx.x, current_score[0]);
            atomicAdd(current_score, (1 << current_row[i]));
            didSomethingHappen = 1;
        }
    }

    for (int i = 0; i < 3; i++) {
        if (current_row[i] == 0) {
            int j = i + 1;
            while (j < 4) {
                if (current_row[j] != 0) {
                    current_row[i] = current_row[j];
                    current_row[j] = 0;
                    didSomethingHappen = 1;
                    break;
                }
                j++;
            }
        }
    }
}

__device__ void moveRight(int8_t *current_row, int *current_score) {
    int didSomethingHappen = 0;
    for (int i = 3; i > 0; i--) {
        if (current_row[i] == 0) {
            int j = i - 1;
            while (j >= 0) {
                if (current_row[j] != 0) {
                    current_row[i] = current_row[j];
                    current_row[j] = 0;
                    didSomethingHappen = 1;
                    break;
                }
                j--;
            }
        }
    }

    for (int i = 3; i > 0; i--) {
        if (current_row[i] != 0 && current_row[i] == current_row[i-1]) {
            current_row[i] += 1;
            current_row[i-1] = 0;
            atomicAdd(current_score, (1 << current_row[i]));
            didSomethingHappen = 1;
        }
    }

    for (int i = 3; i > 0; i--) {
        if (current_row[i] == 0) {
            int j = i - 1;
            while (j >= 0) {
                if (current_row[j] != 0) {
                    current_row[i] = current_row[j];
                    current_row[j] = 0;
                    didSomethingHappen = 1;
                    break;
                }
                j--;
            }
        }
    }
}

__device__ void moveUp(int8_t *current_row, int *current_score) {
    int didSomethingHappen = 0;
    for (int i = 0; i < 12; i += 4) {
        if (current_row[i] == 0) {
            int j = i + 4;
            while (j < 16) {
                if (current_row[j] != 0) {
                    current_row[i] = current_row[j];
                    current_row[j] = 0;
                    didSomethingHappen = 1;
                    break;
                }
                j += 4;
            }
        }
    }

    for (int i = 0; i < 12; i += 4) {
        if (current_row[i] != 0 && current_row[i] == current_row[i+4]) {
            current_row[i] += 1;
            current_row[i+4] = 0;
            atomicAdd(current_score, (1 << current_row[i]));
            didSomethingHappen = 1;
        }
    }

    for (int i = 0; i < 12; i += 4) {
        if (current_row[i] == 0) {
            int j = i + 4;
            while (j < 16) {
                if (current_row[j] != 0) {
                    current_row[i] = current_row[j];
                    current_row[j] = 0;
                    didSomethingHappen = 1;
                    break;
                }
                j += 4;
            }
        }
    }
}

__device__ void moveDown(int8_t *current_row, int *current_score) {
    int didSomethingHappen = 0;
    for (int i = 12; i > 0; i -= 4) {
        if (current_row[i] == 0) {
            int j = i - 4;
            while (j >= 0) {
                if (current_row[j] != 0) {
                    current_row[i] = current_row[j];
                    current_row[j] = 0;
                    didSomethingHappen = 1;
                    break;
                }
                j -= 4;
            }
        }
    }

    for (int i = 12; i > 0; i -= 4) {
        if (current_row[i] != 0 && current_row[i] == current_row[i-4]) {
            current_row[i] += 1;
            current_row[i-4] = 0;
            atomicAdd(current_score, (1 << current_row[i]));
            didSomethingHappen = 1;
        }
    }

    for (int i = 12; i > 0; i -= 4) {
        if (current_row[i] == 0) {
            int j = i - 4;
            while (j >= 0) {
                if (current_row[j] != 0) {
                    current_row[i] = current_row[j];
                    current_row[j] = 0;
                    didSomethingHappen = 1;
                    break;
                }
                j -= 4;
            }
        }
    }
}

__device__ void addTile(int8_t *current_row, curandState *state) {
    int idx = blockIdx.x;
    curandState localState = state[idx];
    
    int numZeros = 0;
    for (int i = 0; i < 16; i++) {
        if (current_row[i] == 0) {
            numZeros++;
        }  
    }
    int randIndex = curand(&localState) % numZeros;
    int zeroCount = 0;
    for (int i = 0; i < 16; i++) {
        if (current_row[i] == 0) {
            if (zeroCount == randIndex) {
                current_row[i] = (curand_uniform(&localState) > 0.9) ? 2 : 1;
                break;
            }
            zeroCount++;
        }
    }
    state[idx] = localState;
}

__global__ void firstMoves(int8_t *g_initial_board, int8_t *g_final_board, int g_initial_score, curandState *states, int *g_final_scores) {
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 16; j++) {
            g_final_board[i * 16 + j] = g_initial_board[j];
        }
        g_final_scores[i] = g_initial_score;
    }
    int colrow = blockIdx.x;
    if (colrow < 4) {
        moveLeft(&g_final_board[4*colrow], &g_final_scores[0]);
        moveUp(&g_final_board[16+colrow], &g_final_scores[1]);
        moveRight(&g_final_board[32+4*colrow], &g_final_scores[2]);
        moveDown(&g_final_board[48+colrow], &g_final_scores[3]);
        //addTile(&g_final_board[16*colrow], states);
        //g_final_board[4*i] = 10;
        //g_final_board[16+i] = 11;
        //g_final_board[32+4*i] = 12;
        //g_final_board[48+i] = 13;
    }
}


//__global__ void Simulate(int8_t *boards, int *scores, int batch_size) {

__global__ void Simulate(int8_t *boards, int *scores, int batch_size) {
    int board_id = blockIdx.x; // Identify which board
    int row = threadIdx.x; // Identify which row in the board

    // Calculate moving left on first move
    if (board_id < batch_size && row < 4) {
        int *current_score = &scores[board_id]; // Pointer to the current score
        int8_t *current_row = &boards[board_id * 16 + row * 4]; // Pointer to the current row
        //int8_t *current_row = &boards[board_id * 16 + row]; // Pointer to the current column

        moveLeft(current_row, current_score);
    }
}
}
"""

In [387]:
# Initialize board and score 
batch_size = 1000
# initial_board = np.array([[1, 1, 1, 1],
#                           [0, 2, 0, 0],
#                           [1, 1, 4, 1],
#                           [3, 0, 5, 0]], dtype=np.int8)
initial_board = np.array([[1, 1, 0, 0],
                          [1, 1, 0, 0],
                          [0, 1, 0, 0],
                          [0, 1, 0, 0]], dtype=np.int8)
initial_score = np.int32(0)
g_initial_board = cuda.mem_alloc(initial_board.nbytes)
cuda.memcpy_htod(g_initial_board, initial_board)
# g_initial_score = cuda.mem_alloc(initial_score.nbytes)
# cuda.memcpy_htod(g_initial_score, initial_score)

# Make room for and format outputs
final_boards = np.zeros((4, batch_size, 4, 4), dtype=np.int8)
g_final_boards = cuda.mem_alloc(final_boards.nbytes)
final_scores = np.zeros((4, batch_size), dtype=np.int32)
g_final_scores = cuda.mem_alloc(final_scores.nbytes)


In [388]:
# Decide parameters
threads_per_block = 4
blocks = batch_size*4

# Load kernel code
mod = SourceModule(kernel_code, no_extern_c=True)

# Setup curand
curand_states = cuda.mem_alloc(blocks * 256)
mod.get_function("setupCurand")(curand_states, np.uint64(time.time()), block=(1, 1, 1), grid=(blocks, 1, 1))

# Setup boards

firstMoves = mod.get_function("firstMoves")

firstMoves(g_initial_board, g_final_boards, initial_score, g_final_scores, curand_states, block=(1, 1, 1), grid=(4, 1, 1))
four_boards = np.zeros((4, 4, 4), dtype=np.int8)
four_scores = np.zeros((4), dtype=np.int32)
cuda.memcpy_dtoh(four_boards, g_final_boards)
cuda.memcpy_dtoh(four_scores, g_final_scores)
print(initial_board)
print(four_boards)
print(four_scores)

[[1 1 0 0]
 [1 1 0 0]
 [0 1 0 0]
 [0 1 0 0]]
[[[2 0 0 0]
  [2 0 0 0]
  [1 0 0 0]
  [1 0 0 0]]

 [[2 2 0 0]
  [0 2 0 0]
  [0 0 0 0]
  [0 0 0 0]]

 [[0 0 0 2]
  [0 0 0 2]
  [0 0 0 1]
  [0 0 0 1]]

 [[0 0 0 0]
  [0 0 0 0]
  [0 2 0 0]
  [2 2 0 0]]]
[-2002515301 -1557981668           0           0]


In [None]:
boards = None
scores = None
batch_size = 1000

if scores == None:
    scores = np.zeros((4, batch_size), dtype=np.int32)
if boards == None:
    random_number_sample = [3]*9 + [4]
    boards = np.zeros((4, batch_size, 4, 4), dtype=np.int8)
    for z in range(4):
        for i in range(batch_size):
            for _ in range(16):
                j, k = np.random.randint(0, 4, 2)
                while boards[z][i][k][j] != 0:
                    j, k = np.random.randint(0, 4, 2)
                boards[z][i][k][j] = random.choice(random_number_sample)
gpu_boards = cuda.mem_alloc(boards.nbytes)
cuda.memcpy_htod(gpu_boards, boards)
gpu_scores = cuda.mem_alloc(scores.nbytes)
cuda.memcpy_htod(gpu_scores, scores)
# print(boards)

In [None]:
# Launch kernel
Simulate = mod.get_function("Simulate")
Simulate(gpu_boards, gpu_scores, np.int32(batch_size*4), block=(threads_per_block, 1, 1), grid=(blocks,1))

# Retrieve data if necessary
print(boards[:,0])
cuda.memcpy_dtoh(boards, gpu_boards)
cuda.memcpy_dtoh(scores, gpu_scores)
print(boards[:,0],scores[:,0])

In [111]:
code2="""
#include <curand_kernel.h>

extern "C" {
__global__ void setupCurand(curandState *state, unsigned long seed) {
    int idx = blockIdx.x;
    curand_init(seed, idx, 0, &state[idx]);
}

__global__ void generateRandomNumbers(curandState *state, int *array) {
    int idx = blockIdx.x;
    array[idx] = curand(&state[idx]) % 7;
}

}
"""
# Decide parameters
threads_per_block = 4
blocks = batch_size*4

# Load kernel code
code2 = SourceModule(code2, no_extern_c=True)

# Setup curand
curand_states = cuda.mem_alloc(10000*256)
code2.get_function("setupCurand")(curand_states, np.uint64(time.time()), block=(1, 1, 1), grid=(10000, 1, 1))

# Setup boards
random_numbers = np.zeros((10000), dtype=np.int32)
gpu_random_numbers = cuda.mem_alloc(random_numbers.nbytes)

generate = code2.get_function("generateRandomNumbers")
generate(curand_states, gpu_random_numbers, block=(1, 1, 1), grid=(10000, 1, 1))

cuda.memcpy_dtoh(random_numbers, gpu_random_numbers)
# print(random_numbers)
forprint = str(random_numbers)
forprint = forprint.replace(" ", ",").replace("[", "c(").replace("]", ")")
print(forprint)

c(2,1,6,6,1,4,1,0,5,1,2,4,4,3,0,1,2,4,0,1,2,2,2,2,0,6,0,0,0,6,2,0,5,6,1,3,2
,5,2,3,5,3,2,2,4,4,0,4,2,4,5,5,4,6,4,1,5,2,0,3,2,5,5,5,1,1,4,5,6,0,3,5,0,0
,2,0,4,0,4,4,4,1,0,0,2,1,3,2,1,0,6,4,1,4,6,3,1,2,3,1,2,2,6,2,5,2,0,6,1,0,3
,3,2,2,2,3,5,5,3,4,0,1,0,4,6,3,0,1,1,6,5,4,3,1,1,4,3,5,1,1,2,6,4,2,0,5,5,5
,6,4,3,6,5,4,5,5,5,4,0,2,6,6,5,0,1,3,0,0,6,0,6,2,1,4,1,4,2,5,5,2,5,2,1,4,3
,5,1,1,5,1,1,6,0,2,2,4,3,4,2,6,3,5,1,4,6,6,6,6,5,4,5,0,2,4,1,3,4,0,4,0,0,2
,0,3,5,0,0,6,6,3,3,6,4,2,4,6,4,5,1,6,5,3,1,6,0,5,1,2,3,5,5,3,1,2,3,4,3,0,4
,2,1,3,1,1,3,5,5,3,2,5,0,4,3,3,4,2,4,6,3,0,4,2,6,6,1,2,3,5,5,6,5,5,4,2,3,0
,6,4,0,3,4,4,0,0,6,5,1,0,3,6,1,5,0,3,6,4,4,5,0,1,5,2,4,3,2,4,5,2,0,6,5,0,4
,4,3,1,5,4,2,0,5,3,5,4,5,0,5,0,6,3,6,6,5,6,0,3,0,1,4,6,3,5,1,2,4,6,3,6,0,1
,5,4,5,0,6,1,6,6,2,5,1,6,6,3,6,6,0,5,2,5,5,5,0,4,0,4,5,0,3,2,6,2,1,5,4,4,4
,6,2,2,3,6,6,1,1,6,1,2,4,2,5,5,2,3,0,0,0,6,5,3,2,4,4,5,4,5,0,1,4,1,1,5,4,3
,3,1,5,5,4,6,4,2,3,4,5,1,1,0,1,1,5,2,3,4,0,2,1,3,3,5,1,2,1,1,0,4,3,3,6,5,3
,4,5,5,2,0,4,3,5,2,4,0,3