In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from collections import deque
import random
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from IPython import display
import time

In [2]:
#Define the neural network
def build_model(input_size, output_size, learning_rate=0.001):
    """Build and compile a small fully connected Q-network.

    The network stacks two 64-unit ReLU hidden layers and a linear output
    layer, and is compiled with the Adam optimizer and a mean-squared-error
    loss.

    Args:
        input_size (int): number of features in one state vector.
        output_size (int): number of output units (one value per action).
        learning_rate (float): step size handed to the Adam optimizer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential([
        # An explicit Input layer replaces the `input_dim=` keyword on the
        # first Dense layer, which recent Keras releases warn against.
        tf.keras.Input(shape=(input_size,)),
        Dense(64, activation='relu'),
        Dense(64, activation='relu'),
        Dense(output_size, activation='linear'),
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
    return model


In [3]:
# Constants for the board and Q-learning
# The board dimension is defined once; everything else is derived from it so
# the three values cannot drift apart.
MATRIX_SIZE = 10  # cells per side of the square board
grid_size = (MATRIX_SIZE, MATRIX_SIZE)  # 10x10 grid
food_position = (MATRIX_SIZE - 1, MATRIX_SIZE - 1)  # last cell on both axes: (9, 9)
Q = {}  # tabular Q-values keyed by ((x, y), action); filled by initialize_q_table
ALPHA = 0.1  # learning rate (only referenced by the commented-out tabular update)
GAMMA = 0.9  # discount factor
EPSILON = 0.2  # for epsilon-greedy strategy

# ===================================================================================
# Initialize network and memory buffer
state_size = 2  # because the state is given by (x, y) coordinates
action_size = 4  # possible actions: up, down, left, right
model = build_model(state_size, action_size)  # online network, trained in replay()
target_model = build_model(state_size, action_size)  # provides the bootstrap targets
target_model.set_weights(model.get_weights())  # start both networks from identical weights
memory = deque(maxlen=2000)  # replay buffer; oldest transitions are dropped first
batch_size = 32
update_every = 5  # update target network weights every 5 episodes

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
def generate_random_coordinates(size):
    """Pick a random cell on a square board.

    Args:
        size (int): number of cells per side; valid indices are 0..size-1.

    Returns:
        tuple: an (x, y) pair of integers, each drawn independently with
        ``random.randint``.
    """
    upper = size - 1
    x = random.randint(0, upper)
    y = random.randint(0, upper)
    return (x, y)

def render_game_board(board_position, score, episode):
    """Draw the board image with the snake and food icons placed on their cells.

    Grid cell (0, 0) is drawn at the bottom-left of the board. Image pixel rows
    grow downwards, so the y coordinate is flipped before it is converted to
    pixels. The food cell is read from the module-level ``food_position`` and
    the board extent from the module-level ``grid_size``.

    Args:
        board_position (tuple): (x, y) grid cell currently occupied by the snake.
        score (int): total reward so far, shown in the axis label.
        episode (int): index of the episode being run, shown in the axis label.
    """
    # Read every image up front so a missing file fails before a figure exists.
    board_img = mpimg.imread('board.png')
    snake_img = mpimg.imread('snake_head.png')
    food_img = mpimg.imread('food.png')

    fig, ax = plt.subplots()
    ax.imshow(board_img)

    # grid_size is indexed as (x extent, y extent), matching how get_reward
    # clamps positions: index 0 divides the image width, index 1 the height.
    n_cols, n_rows = grid_size
    cell_w = board_img.shape[1] / n_cols
    cell_h = board_img.shape[0] / n_rows

    def cell_centre(cell):
        """Return the pixel centre of a grid cell, flipping y to image rows."""
        centre_x = cell[0] * cell_w + cell_w / 2
        centre_y = (n_rows - cell[1] - 1) * cell_h + cell_h / 2
        return (centre_x, centre_y)

    zoom_factor = 0.50  # Adjust zoom factor to fit the icon in the cell

    # Overlay both icons, snake first and then the food.
    for icon, cell in ((snake_img, board_position), (food_img, food_position)):
        imagebox = OffsetImage(icon, zoom=zoom_factor)
        ax.add_artist(AnnotationBbox(imagebox, cell_centre(cell), frameon=False))

    # The x-axis label doubles as a status line.
    ax.set_xlabel(f"Total Reward: {score} | Epoch: {episode}", fontsize=12, color='blue')
    plt.show()
    # Close this specific figure so repeated per-step renders do not accumulate.
    plt.close(fig)

def initialize_q_table(Q, size=None, n_actions=4):
    """Fill ``Q`` in place with a zero entry for every (cell, action) pair.

    Keys have the form ``((x, y), action)``. An existing entry with the same
    key is reset to 0; entries under other keys are left alone.

    Args:
        Q (dict): table to populate; mutated in place.
        size (int, optional): cells per side of the square board. Defaults to
            the module-level ``MATRIX_SIZE`` when omitted.
        n_actions (int): number of discrete actions per cell. Defaults to 4.
    """
    if size is None:
        size = MATRIX_SIZE
    Q.update(
        (((x, y), action), 0)
        for x in range(size)
        for y in range(size)
        for action in range(n_actions)
    )

def choose_action(state, epsilon=0.2):
    """Select an action for ``state`` with an epsilon-greedy policy.

    With probability ``epsilon`` a random action index is drawn; otherwise the
    module-level ``model`` is queried and the index of its largest output is
    returned.

    Args:
        state (tuple): the (x, y) position fed to the network.
        epsilon (float): probability of taking a random action.

    Returns:
        int: an action index in ``range(action_size)``.
    """
    if np.random.rand() <= epsilon:
        return random.randint(0, action_size - 1)
    # predict() works on batches, so wrap the single state as a one-row array.
    batch = np.array([state])
    # verbose=0 stops Keras from printing a progress bar on every step.
    q_values = model.predict(batch, verbose=0)
    # Return a plain int so both branches yield the same type.
    return int(np.argmax(q_values[0]))

# def update_q_value(prev_state, action, reward, next_state, Q):
#     """this function takes a set of inputs and updates the Q table

#     Args:
#         prev_state (tuple): position of the snake.
#         action (int): the next direction to take for the snake.
#         reward (int): the score value
#         next_state (tuple): the next calculated position
#         Q (list): Q list of the table.
#     """
#     max_future_q = max([Q[(next_state, a)] for a in range(4)])
#     current_q = Q[(prev_state, action)]
#     # Q-learning formula
#     new_q = (1 - ALPHA) * current_q + ALPHA * (reward + GAMMA * max_future_q)
#     Q[(prev_state, action)] = new_q

def replay():
    """Fit the online network on one random minibatch from the replay memory.

    Does nothing until ``memory`` holds at least ``batch_size`` transitions.
    Each stored transition is ``(state, action, reward, next_state, done)``.
    For every sampled transition the regression target of the taken action is
    ``reward + GAMMA * max(target_model(next_state))``, with the bootstrap term
    dropped when ``done`` is true. The outputs for all other actions keep the
    online network's current predictions, so they contribute no error.
    """
    if len(memory) < batch_size:
        return
    minibatch = random.sample(memory, batch_size)
    # Transpose the list of transitions into one array per field.
    states, actions, rewards, next_states, dones = (
        np.array(column) for column in zip(*minibatch)
    )

    # verbose=0 keeps Keras from printing a progress bar for every call.
    next_q = target_model.predict(next_states, verbose=0)
    # `dones` holds booleans; convert explicitly to a 0/1 mask of live states.
    not_done = 1.0 - dones.astype(np.float32)
    target = rewards + GAMMA * np.amax(next_q, axis=1) * not_done

    target_full = model.predict(states, verbose=0)
    # Overwrite only the column of the action actually taken in each row.
    target_full[np.arange(batch_size), actions] = target

    model.fit(states, target_full, epochs=1, verbose=0)

def update_target_model():
    """Synchronise the target network with the online network.

    Copies the current weights of the module-level ``model`` into the
    module-level ``target_model``.
    """
    online_weights = model.get_weights()
    target_model.set_weights(online_weights)

def manhattan_distance(point1,point2):
    """Return the Manhattan (L1) distance between two 2-D points.

    Only the first two components of each point are used.

    Args:
        point1 (tuple): x, y coordinates of the first point
        point2 (tuple): x, y coordinates of the second point

    Returns:
        int: |x1 - x2| + |y1 - y2|
    """
    dx = point1[0] - point2[0]
    dy = point1[1] - point2[1]
    return abs(dx) + abs(dy)

def get_reward(old_position, food_position, action):
    """Apply one move on the board and score it.

    The move is clamped to the board, so stepping into a wall leaves the
    position unchanged. The reward is 10 for landing on the food, 4 for
    ending up closer to it (Manhattan distance), -1 for ending up farther
    away, and 0 when the distance is unchanged.

    NOTE(review): the +4 / -1 shaping is asymmetric, so stepping back and
    forth can accumulate positive reward without reaching the food --
    confirm this is intended.

    Args:
        old_position (tuple): (x, y) cell the snake is on before the move
        food_position (tuple): (x, y) cell holding the food
        action (int): 0 = left, 1 = right, 2 = up, 3 = down

    Returns:
        tuple: ``(new_position, reward)`` -- the clamped (x, y) cell after
        the move and the integer reward for that move
    """
    # (dx, dy) displacement for every action index
    moves = {
        0: (-1, 0),  # left
        1: (1, 0),   # right
        2: (0, 1),   # up
        3: (0, -1),  # down
    }
    dx, dy = moves[action]

    # Take the step, then pull each coordinate back inside the board.
    max_x = grid_size[0] - 1
    max_y = grid_size[1] - 1
    new_position = (
        max(0, min(old_position[0] + dx, max_x)),
        max(0, min(old_position[1] + dy, max_y)),
    )

    before = manhattan_distance(old_position, food_position)
    after = manhattan_distance(new_position, food_position)

    if new_position == food_position:
        return new_position, 10  # reached the food
    if after < before:
        return new_position, 4   # moved closer
    if after > before:
        return new_position, -1  # moved away
    return new_position, 0       # distance unchanged (e.g. blocked by a wall)

initialize_q_table(Q)  # Initialize the Q-table

NUM_EPISODES = 20  # number of training episodes to run
MAX_STEPS_PER_EPISODE = 50  # step cap so an episode cannot loop forever

for episode in range(NUM_EPISODES):
    pos = (0,0)
    total_reward = 0
    steps = 0
    #food_position = generate_random_coordinates(MATRIX_SIZE)
    food_position = (9,9)
    while pos != food_position and steps < MAX_STEPS_PER_EPISODE:
        display.clear_output(wait=True) #Removes the old graph
        render_game_board(pos, total_reward, episode) #Function renders the graph
        action = choose_action(pos, EPSILON) #Decides the next action to take
        new_position, reward = get_reward(pos, food_position, action) #got the reward
        done = new_position == food_position
        # Store the transition, then train on a random minibatch of past ones.
        memory.append((np.array(pos), action, reward, np.array(new_position), done))
        replay()

        # update_q_value(pos, action, reward, new_position, Q) #updating the Q values 
        pos = new_position
        total_reward += reward
        steps += 1
        # The step count goes inside the f-string; the previous form passed
        # `{steps}` as a second argument and printed a set such as {3}.
        print(f"Number of Steps: {steps}")

    # Refresh the target network on episodes 0, update_every, 2*update_every, ...
    if episode % update_every == 0:
        update_target_model()