# In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import random
from IPython import display

# Board geometry and Q-learning hyper-parameters
MATRIX_SIZE = 10  # number of cells along each side of the board
grid_size = (MATRIX_SIZE, MATRIX_SIZE)  # 10x10 grid
Q = dict()  # Q-table, filled in by initialize_q_table
EPSILON = 0.2  # exploration probability for the epsilon-greedy strategy
GAMMA = 0.9  # discount factor
ALPHA = 0.1  # learning rate

def _cell_center(position, cell_size_x, cell_size_y):
    """Return the pixel coordinates of the centre of grid cell *position*.

    *position* is an ``(x, y)`` grid cell. The y component is flipped because
    grid cell (0, 0) is assumed to be the bottom-left corner of the board
    image.
    """
    x, y = position
    center_x = x * cell_size_x + cell_size_x / 2
    # grid_size[0] is the same extent cell_size_y was derived from.
    center_y = (grid_size[0] - y - 1) * cell_size_y + cell_size_y / 2
    return center_x, center_y


def render_game_board(board_position, score, episode):
    """Draw the board with the snake head and the food, then show the figure.

    Args:
        board_position: ``(x, y)`` grid cell of the snake head.
        score: Value shown after "Score:" in the x-axis label.
        episode: Value shown after "epoch:" in the x-axis label.

    Reads the module-level ``food_position``, ``grid_size`` and
    ``MATRIX_SIZE``, and loads ``board.png``, ``snake_head.png`` and
    ``food.png`` via relative paths.
    """
    # Never draw the snake on top of the food: pick another random cell for
    # display. Only the local copy changes; the caller's position is untouched.
    # The board is drawn once below, so no recursive re-render is needed (the
    # previous recursive call also omitted ``episode`` and raised TypeError).
    while board_position == food_position:
        board_position = generate_random_coordinates(MATRIX_SIZE)

    # Load and display the game board
    board_img = mpimg.imread('board.png')
    fig, ax = plt.subplots()
    ax.imshow(board_img)

    # Pixel size of a single grid cell
    cell_size_x = board_img.shape[1] / grid_size[1]
    cell_size_y = board_img.shape[0] / grid_size[0]

    zoom_factor = 0.45  # Adjust zoom factor to fit the icon in the cell

    # Overlay the snake head first, then the food, each centred in its cell.
    overlays = (
        ('snake_head.png', board_position),
        ('food.png', food_position),
    )
    for icon_path, cell in overlays:
        imagebox = OffsetImage(mpimg.imread(icon_path), zoom=zoom_factor)
        center = _cell_center(cell, cell_size_x, cell_size_y)
        ax.add_artist(AnnotationBbox(imagebox, center, frameon=False))

    ax.set_xlabel(f"Score: {score} | epoch: {episode}", fontsize=12, color='blue')

    plt.show()
    # Close this specific figure so repeated renders do not accumulate figures.
    plt.close(fig)


def generate_random_coordinates(size):
    """Return a random ``(x, y)`` cell on a ``size`` x ``size`` board.

    Each component is drawn independently with ``random.randint`` and lies in
    the inclusive range ``0 .. size - 1``.
    """
    upper = size - 1
    x = random.randint(0, upper)
    y = random.randint(0, upper)
    return (x, y)

# The food sits in the cell with the largest x and y on the board. It is
# derived from MATRIX_SIZE so it stays on the board if the size changes.
food_position = (MATRIX_SIZE - 1, MATRIX_SIZE - 1)
# Random initial cell for the snake.
pos = generate_random_coordinates(MATRIX_SIZE)

def initialize_q_table(Q):
    """Set a zero Q-value in *Q* for every (cell, action) pair, in place.

    Keys have the form ``((x, y), action)`` with ``x`` and ``y`` in
    ``range(MATRIX_SIZE)`` and ``action`` in ``range(4)``. Nothing is
    returned; existing entries under those keys are overwritten with 0.
    """
    Q.update(
        {
            ((col, row), move): 0
            for col in range(MATRIX_SIZE)
            for row in range(MATRIX_SIZE)
            for move in range(4)  # four possible actions
        }
    )

def choose_action(state, Q, epsilon=0.2):
    """Pick an action index (0-3) for *state* using an epsilon-greedy rule.

    Args:
        state: Key component identifying the current state in *Q*.
        Q: Mapping from ``(state, action)`` to a Q-value.
        epsilon: Threshold compared against a draw from
            ``random.uniform(0, 1)``; a draw below it triggers exploration.

    Returns:
        A random action when exploring, otherwise the action with the highest
        Q-value for *state* (the lowest index wins ties).
    """
    explore = random.uniform(0, 1) < epsilon
    if explore:
        # explore: choose a random action
        return random.randint(0, 3)
    # exploit: max() keeps the first maximal candidate, so ties resolve to the
    # lowest action index.
    return max(range(4), key=lambda candidate: Q[(state, candidate)])

def update_q_value(prev_state, action, reward, next_state, Q):
    """Apply one Q-learning update to ``Q[(prev_state, action)]`` in place.

    The stored value is blended with ``reward`` plus the discounted best
    Q-value over the four actions available in *next_state*, using the
    module-level ``ALPHA`` (learning rate) and ``GAMMA`` (discount factor).
    Nothing is returned.
    """
    best_next = max(Q[(next_state, candidate)] for candidate in range(4))
    entry = (prev_state, action)
    old_value = Q[entry]
    # Q-learning formula
    Q[entry] = (1 - ALPHA) * old_value + ALPHA * (reward + GAMMA * best_next)

# Grid displacement (dx, dy) applied by each action index.
_ACTION_DELTAS = {
    0: (-1, 0),  # Move left
    1: (1, 0),  # Move right
    2: (0, 1),  # Move up
    3: (0, -1),  # Move down
}


def get_reward(new_position, action):
    """Apply *action* to a position and return the resulting cell and reward.

    Args:
        new_position: ``(x, y)`` cell the move starts from (the name is kept
            for compatibility with the original signature).
        action: Action index 0-3 (left, right, up, down). This is the second
            positional argument, which is how the training loop already calls
            this function; previously it was bound to a parameter named
            ``food_position`` while the body read a global ``action``, so the
            food reward could never be granted.

    Returns:
        ``(position, reward)`` where *position* is the cell after the move,
        clamped to the board, and *reward* is 10 when that cell equals the
        module-level ``food_position`` and -0.5 otherwise.

    Raises:
        KeyError: If *action* is not one of 0, 1, 2, 3.
    """
    dx, dy = _ACTION_DELTAS[action]
    moved = (new_position[0] + dx, new_position[1] + dy)

    # Ensure the resulting cell is within the board boundary
    clamped = (
        max(0, min(moved[0], grid_size[0] - 1)),
        max(0, min(moved[1], grid_size[1] - 1)),
    )

    if clamped == food_position:
        reward = 10  # reward for reaching the food
    else:
        reward = -0.5  # small penalty for each move
    return clamped, reward

# Train the agent: 500 episodes of at most 50 steps each, redrawing the board
# before every step.
initialize_q_table(Q)  # Initialize the Q-table

for episode in range(500):
    # Every episode starts at cell (0, 0) with a zero score and step count.
    pos = (0,0)
    total_reward = 0
    steps = 0
    # An episode ends when the snake reaches the food or after 50 steps.
    while pos != food_position and steps < 50:
        # Clear the previous frame so each render replaces the last one.
        display.clear_output(wait=True)
        render_game_board(pos, total_reward, episode)
        # Epsilon-greedy action for the current cell.
        action = choose_action(pos, Q, EPSILON)
        # Second positional argument is the chosen action; the call returns
        # the cell after the move and the reward for it.
        new_position, reward = get_reward(pos, action)
        # pos is the state before the move, new_position the state after it.
        update_q_value(pos, action, reward, new_position, Q)
        pos = new_position
        total_reward += reward
        steps += 1