In [1]:
import random

import numpy as np
import copy

In [2]:
# Board dimensions in cells: WIDTH columns by HEIGHT rows.
WIDTH = 10
HEIGHT = 20
# Discount factor applied to the next-state value in the TD error and to the
# eligibility-trace decay.
DISCOUNT = 0.9
# Trace-decay parameter (the "lambda" of TD(lambda)).
LAMBDA = 0.2
# Length of the vector built by get_features: 1 bias + WIDTH column heights +
# (WIDTH - 1) adjacent height differences + 1 max height + 1 hole count.
FEATURE_COUNT = 2 * WIDTH + 2
# Step size multiplying each TD weight update.
LEARNING_RATE = 0.0001
# Piece letter -> list of distinct rotations. Every rotation is a 4x4 array in
# which 1 marks a filled cell; the shape is pushed against the top-left corner
# and the remaining cells are 0 padding.
PIECES = {
    # Straight piece: horizontal and vertical.
    'I': [
        np.array([
            [1, 1, 1, 1],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
        ]),
        np.array([
            [1, 0, 0, 0],
            [1, 0, 0, 0],
            [1, 0, 0, 0],
            [1, 0, 0, 0],
        ])
    ],
    # T piece: four rotations.
    'T': [
        np.array([
            [1, 1, 1, 0],
            [0, 1, 0, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
        ]),
        np.array([
            [1, 0, 0, 0],
            [1, 1, 0, 0],
            [1, 0, 0, 0],
            [0, 0, 0, 0],
        ]),
        np.array([
            [0, 1, 0, 0],
            [1, 1, 1, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
        ]),
        np.array([
            [0, 1, 0, 0],
            [1, 1, 0, 0],
            [0, 1, 0, 0],
            [0, 0, 0, 0],
        ])
    ],
    # Z piece: two distinct rotations.
    'Z': [
        np.array([
            [1, 1, 0, 0],
            [0, 1, 1, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
        ]),
        np.array([
            [0, 1, 0, 0],
            [1, 1, 0, 0],
            [1, 0, 0, 0],
            [0, 0, 0, 0],
        ]),
    ],
    # S piece: two distinct rotations.
    'S': [
        np.array([
            [0, 1, 1, 0],
            [1, 1, 0, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
        ]),
        np.array([
            [1, 0, 0, 0],
            [1, 1, 0, 0],
            [0, 1, 0, 0],
            [0, 0, 0, 0],
        ]),
    ],
    # Square piece: a single rotation.
    'O': [
        np.array([
            [1, 1, 0, 0],
            [1, 1, 0, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
        ])
    ],
    # L piece: four rotations.
    'L': [
        np.array([
            [1, 0, 0, 0],
            [1, 0, 0, 0],
            [1, 1, 0, 0],
            [0, 0, 0, 0],
        ]),
        np.array([
            [0, 0, 1, 0],
            [1, 1, 1, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
        ]),
        np.array([
            [1, 1, 0, 0],
            [0, 1, 0, 0],
            [0, 1, 0, 0],
            [0, 0, 0, 0],
        ]),
        np.array([
            [1, 1, 1, 0],
            [1, 0, 0, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
        ]),
    ],
    # J piece: four rotations.
    'J': [
        np.array([
            [0, 1, 0, 0],
            [0, 1, 0, 0],
            [1, 1, 0, 0],
            [0, 0, 0, 0],
        ]),
        np.array([
            [1, 0, 0, 0],
            [1, 1, 1, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
        ]),
        np.array([
            [1, 1, 0, 0],
            [1, 0, 0, 0],
            [1, 0, 0, 0],
            [0, 0, 0, 0],
        ]),
        np.array([
            [1, 1, 1, 0],
            [0, 0, 1, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
        ]),
    ],
}

# Dict view over the piece letters; the players convert it to a list before
# sampling a random piece from it.
PIECES_NAMES = PIECES.keys()

# Empty board: HEIGHT rows of WIDTH zeros (0 = free cell), one list per row.
START_BOARD = [[0] * WIDTH for _ in range(HEIGHT)]

In [3]:
def can_be_placed_x_y(board, piece, x, y):
    """Return True if ``piece`` fits on ``board`` with its top-left at (x, y).

    Args:
        board: 2-D grid (sequence of rows); a cell equal to 1 is occupied.
        piece: 2-D grid of 0/1 cells; only the cells equal to 1 are checked,
            so padding may hang outside the board.
        x: Board row of the piece's top-left cell.
        y: Board column of the piece's top-left cell.

    Returns:
        False as soon as a filled piece cell would fall outside the board or
        onto an occupied cell, True otherwise.
    """
    # Bounds come from the board itself rather than module constants, so the
    # check is valid for any board and piece size.
    row_count = len(board)
    col_count = len(board[0]) if row_count else 0

    for i, piece_row in enumerate(piece):
        for j, cell in enumerate(piece_row):
            if cell != 1:
                continue
            row, col = x + i, y + j
            # Negative coordinates are rejected explicitly: Python indexing
            # would otherwise wrap around to the opposite edge.
            if row < 0 or col < 0 or row >= row_count or col >= col_count:
                return False
            if board[row][col] == 1:
                return False
    return True


def can_be_placed_y(board, piece, y):
    """Drop ``piece`` straight down column ``y`` and return its resting row.

    Rows are tried from 0 downwards; the scan stops at the first row where
    the piece no longer fits.

    Returns:
        The last row at which the piece fitted, or -1 if it does not even fit
        at row 0.
    """
    resting_row = -1
    row = 0
    while row < HEIGHT and can_be_placed_x_y(board, piece, row, y):
        resting_row = row
        row += 1
    return resting_row


def place(board, piece, x, y):
    """Return a deep copy of ``board`` with ``piece`` stamped at (x, y).

    Every piece cell equal to 1 sets the board cell at (x + i, y + j) to 1.
    The input board is left untouched.
    """
    stamped = copy.deepcopy(board)
    piece_width = len(piece[0])
    for row_offset, piece_row in enumerate(piece):
        for col_offset in range(piece_width):
            if piece_row[col_offset] == 1:
                stamped[x + row_offset][y + col_offset] = 1
    return stamped


def print_board(board, name=""):
    """Print ``name``, then the board one row per line, then a blank gap.

    Truthy cells are drawn as "#" and falsy cells as ".".
    """
    print(name)
    for cells in board:
        symbols = ["#" if cell else "." for cell in cells]
        print("".join(symbols))
    print("\n")


def get_piece_actions(board, piece):
    """List every legal drop of ``piece`` on ``board``.

    Args:
        board: Current board grid.
        piece: Iterable of rotations of a single piece.

    Returns:
        A list of ``(row, column, rotation)`` tuples, ordered by rotation and
        then by column, one for each column where the rotation can be dropped.
    """
    drops = (
        (can_be_placed_y(board, rotation, column), column, rotation)
        for rotation in piece
        for column in range(WIDTH)
    )
    # A resting row of -1 means the rotation does not fit in that column.
    return [drop for drop in drops if drop[0] > -1]

def get_reward(board):
    """Clear the full rows of ``board`` and report how many were cleared.

    Returns:
        ``(new_board, cleared)``: the surviving rows pushed to the bottom
        under fresh empty rows, and HEIGHT minus the number of surviving rows.
    """
    surviving_rows = [row for row in board if not all(row)]
    cleared = HEIGHT - len(surviving_rows)
    empty_rows = [[0] * WIDTH for _ in range(cleared)]
    return empty_rows + surviving_rows, cleared

In [4]:
def get_features(board):
    """Build the feature vector describing ``board``.

    Layout of the returned 1-D array, for a board with ``w`` columns:
        [0]            constant bias 1.0
        [1 : w+1]      height of each column (0 for an empty column)
        [w+1 : 2w]     absolute height difference between adjacent columns
        [2w]           maximum column height
        [2w+1]         number of holes, i.e. empty cells lying below the
                       topmost filled cell of their column
    """
    grid = np.array(board)
    row_count, col_count = grid.shape

    column_heights = np.zeros(col_count, dtype=int)
    hole_count = 0
    for col_index in range(col_count):
        column = grid[:, col_index]
        filled_rows = np.where(column == 1)[0]
        if len(filled_rows) == 0:
            # Empty column: height stays 0 and it cannot contain holes.
            continue
        top_row = filled_rows[0]
        column_heights[col_index] = row_count - top_row
        hole_count += np.sum(column[top_row + 1:] == 0)

    bumpiness = np.abs(np.diff(column_heights))
    tallest = np.max(column_heights)

    return np.concatenate([
        [1.0],
        column_heights,
        bumpiness,
        [tallest],
        [hole_count],
    ])


In [5]:
def get_value_with_weights(board, weights):
    """Return the linear value estimate of ``board``: features dot weights."""
    return np.dot(get_features(board), weights)

In [6]:
def get_best_action_with_weights(weights, initial_board, initial_piece):
    """Pick the greedy placement of a piece under a linear value function.

    Each legal drop is scored as the rows it clears plus the value estimate
    of the resulting board; the first drop with the strictly highest score
    wins.

    Args:
        weights: Value-function weights used for scoring.
        initial_board: Board before the move.
        initial_piece: Key into PIECES naming the piece to place.

    Returns:
        ``(action, value, reward, new_board)`` for the best drop, where
        ``action`` is ``(row, column, rotation)``. All four are None when the
        piece cannot be placed anywhere.
    """
    # NOTE(review): the next-state value is not multiplied by DISCOUNT here,
    # whereas the TD error in get_weight_update_and_traces does discount it
    # -- confirm this difference is intended.
    best = (None, None, None, None)
    for candidate in get_piece_actions(initial_board, PIECES[initial_piece]):
        row, column, rotation = candidate
        stamped = place(initial_board, rotation, row, column)
        board_after, cleared = get_reward(stamped)
        score = cleared + get_value_with_weights(board_after, weights)
        if best[1] is None or score > best[1]:
            best = (candidate, score, cleared, board_after)
    return best

In [7]:
def get_target(index, value_estimates, errors):
    """Return the value estimate at ``index`` plus its decayed future errors.

    The error at position ``index + k`` is weighted by ``LAMBDA ** k``.
    """
    # NOTE(review): the decay uses LAMBDA alone, while the eligibility traces
    # elsewhere decay by DISCOUNT * LAMBDA -- confirm which one is intended.
    total = 0
    total += value_estimates[index]
    for step, position in enumerate(range(index, len(errors))):
        total += (LAMBDA ** step) * errors[position]
    return total

In [8]:
def play_with_weights(weights):
    """Play one game greedily with ``weights`` and return the rows cleared.

    Pieces are drawn uniformly at random; the game ends when the drawn piece
    has no legal placement.
    """
    board = START_BOARD
    rows_cleared = 0

    while True:
        piece_name = random.choice(list(PIECES_NAMES))
        action, _, reward, board_after = get_best_action_with_weights(
            weights, board, piece_name
        )
        if action is None:
            return rows_cleared

        board = board_after
        rows_cleared += reward

In [9]:
def play_avg_reward(weights, count=50):
    """Return the mean total reward over ``count`` games played with ``weights``."""
    return sum(play_with_weights(weights) for _ in range(count)) / count

In [10]:
def get_weight_update_and_traces(old_state, new_state, reward, weights, traces):
    """Compute one TD(lambda) weight increment and the updated traces.

    Args:
        old_state: Board before the move.
        new_state: Board after the move, or None when the episode ended; the
            TD error is then just minus the current value estimate.
        reward: Reward obtained by the move; unused when ``new_state`` is None.
        weights: Current weights of the linear value function.
        traces: Eligibility traces carried over from the previous step.

    Returns:
        ``(weight_update, new_traces)``: the increment to add to ``weights``
        and the traces to pass to the next step.
    """
    current_value = get_value_with_weights(old_state, weights)

    if new_state is not None:
        next_value = get_value_with_weights(new_state, weights)
        error = reward + DISCOUNT * next_value - current_value
    else:
        error = -current_value

    # Decay the old traces and add the gradient of the linear value function
    # at the state being updated, which is simply its feature vector.
    new_traces = DISCOUNT * LAMBDA * traces + get_features(old_state)

    # The update must use the traces that already include the current state;
    # using the stale ``traces`` delays every credit assignment by one step
    # and makes the first update of each episode identically zero.
    weight_update = LEARNING_RATE * error * new_traces

    return weight_update, new_traces


In [11]:
def update_weights_per_episode(weights_to_play_with, value_function_weights):
    """Run one episode under a fixed policy and return the learned weights.

    Moves are chosen greedily with ``weights_to_play_with``; after every move
    a TD update is applied to the value-function weights. The episode stops
    after 1000 moves or when the drawn piece cannot be placed.

    Args:
        weights_to_play_with: Weights defining the policy; never modified.
        value_function_weights: Starting value-function weights; never
            modified, the updated weights are returned instead.

    Returns:
        A new array holding the value-function weights after the episode.
    """
    # Work on a private copy. Updating the caller's array in place would also
    # change the policy whenever both arguments refer to the same array.
    value_function_weights = np.array(value_function_weights, dtype=float)

    traces = np.zeros(FEATURE_COUNT)
    board = START_BOARD
    piece_names = list(PIECES_NAMES)

    for _ in range(1000):
        next_piece = random.choice(piece_names)

        action, _, reward, updated_board = get_best_action_with_weights(
            weights_to_play_with, board, next_piece
        )

        if action is None:
            # Terminal step: no successor state and no reward.
            weight_update, _ = get_weight_update_and_traces(
                board, None, None, value_function_weights, traces
            )
            value_function_weights += weight_update
            break

        weight_update, traces = get_weight_update_and_traces(
            board, updated_board, reward, value_function_weights, traces
        )
        value_function_weights += weight_update
        board = updated_board

    return value_function_weights

In [None]:
# Alternate between evaluating the current policy and improving it: the value
# function is learned over `episodes` games played with the fixed
# strategy_weights, then becomes the next policy.
strategy_weights = np.zeros(FEATURE_COUNT)

episodes = 100

for i in range(75):
    print(i, flush=True)
    print(play_avg_reward(strategy_weights), flush=True)
    # Start from a copy: a plain assignment would make both names refer to
    # the same array, so in-place learning updates would silently change the
    # policy while it is being evaluated.
    value_function_weights = strategy_weights.copy()
    for _ in range(episodes):
        value_function_weights = update_weights_per_episode(strategy_weights, value_function_weights)

    strategy_weights = value_function_weights

print(strategy_weights)
print(play_avg_reward(strategy_weights), flush=True)