### Imports

In [1]:
import numpy as np
import random
import sys         
import pandas as pd
pd.options.display.float_format = '{:,.3f}'.format

In [2]:
# Moves an agent can request from World.get_next_state.
ACTIONS = ("up", "down", "left", "right")
# Reward looked up by World.get_reward for each cell marker.
REWARDS = {" ": 0, ".": 0.1, "+": 10, "-": -10}
# Cell markers for which World.is_terminal reports True (obstacles included).
TERMINALS = ("+", "-", "#")
# One-element tuple. The trailing comma matters: ("#") is just the string "#",
# which turns `marker in OBSTACLES` into a substring test.
OBSTACLES = ("#",)

# NOTE(review): not read by the code in this notebook section; q_learning
# takes its own `gamma` argument. Presumably a default discount factor.
gamma = 1
# Probability that World.get_next_state ignores the requested action and
# performs a random move instead.
rand_move_probability = 0

class World:
    """
    Rectangular grid world indexed as ``grid[x, y]``.

    Every cell stores a single character: ' ' for an empty cell, or a marker
    checked against the module-level REWARDS, TERMINALS and OBSTACLES.
    """

    def __init__(self, width, height):
        """
        Create a ``width`` x ``height`` grid in which every cell is ' '.
        """
        self.width = width
        self.height = height
        self.grid = np.full((width, height), ' ', dtype='U1')

    def add_obstacle(self, start_x, start_y, end_x=None, end_y=None):
        """
        Create an obstacle in either a single cell or rectangle.

        ``end_x`` and ``end_y`` are inclusive; each defaults to the matching
        start coordinate when omitted.
        """
        if end_x is None:
            end_x = start_x
        if end_y is None:
            end_y = start_y

        self.grid[start_x:end_x + 1, start_y:end_y + 1] = OBSTACLES[0]

    def add_reward(self, x, y, reward):
        """
        Write a reward marker into cell (x, y).

        ``reward`` is the marker character (a key of REWARDS), not the
        numeric value. Fails an assertion for unknown markers.
        """
        assert reward in REWARDS, f"{reward} not in {REWARDS}"
        self.grid[x, y] = reward

    def add_terminal(self, x, y, terminal):
        """
        Write a terminal marker into cell (x, y).

        Fails an assertion when ``terminal`` is not listed in TERMINALS.
        """
        assert terminal in TERMINALS, f"{terminal} not in {TERMINALS}"
        self.grid[x, y] = terminal

    def is_obstacle(self, x, y):
        """
        Return True when (x, y) is outside the grid or holds an obstacle.
        """
        # Everything beyond the border is treated as a wall.
        if x < 0 or x >= self.width or y < 0 or y >= self.height:
            return True
        return self.grid[x, y] in OBSTACLES

    def is_terminal(self, x, y):
        """
        Return True when the marker in cell (x, y) is listed in TERMINALS.

        The coordinates index the grid directly; no bounds check is made.
        """
        return self.grid[x, y] in TERMINALS

    def get_reward(self, x, y):
        """
        Return the reward associated with a given location.

        The value is looked up in REWARDS by the cell's marker, so a marker
        without an entry there raises KeyError.
        """
        return REWARDS[self.grid[x, y]]

    def get_next_state(self, current_state, action):
        """
        Get the next state given a current state and an action. The outcome
        can be stochastic, where rand_move_probability determines the
        probability of ignoring the action and performing a random move.

        Returns None when ``current_state`` is terminal, the neighbouring
        (x, y) when that cell is free, and ``current_state`` unchanged when
        the move is blocked by an obstacle or the grid border.
        """
        assert action in ACTIONS, f"Unknown action {action} must be one of {ACTIONS}"

        x, y = current_state

        if self.grid[x, y] in TERMINALS:
            return None

        # With probability rand_move_probability the request is overridden.
        if np.random.rand() < rand_move_probability:
            action = np.random.choice(ACTIONS)

        # "up" decreases y and "down" increases it.
        if action == "up":
            y -= 1
        elif action == "down":
            y += 1
        elif action == "left":
            x -= 1
        elif action == "right":
            x += 1

        return (x, y) if not self.is_obstacle(x, y) else current_state


In [None]:
def q_learning(world, alpha=0.1, epsilon=0.1, gamma=1.0, episodes=1000):
    """
    Learn an action-value table for ``world`` with tabular Q-learning.

    Each episode starts in a randomly drawn non-terminal cell and follows an
    epsilon-greedy policy until a terminal cell is reached. The reward for a
    step is the reward of the cell the agent ends up in.

    Parameters:
    - world: the World object representing the environment
    - alpha: step size of each temporal-difference update
    - epsilon: probability of picking a random action instead of the greedy one
    - gamma: discount applied to the best value of the next state
    - episodes: number of episodes to run

    Returns:
    - Q: dict mapping every non-terminal (x, y) to a dict of action -> value;
      empty when the world has no non-terminal cell
    """
    Q = {}
    for x in range(world.width):
        for y in range(world.height):
            if world.grid[x, y] not in TERMINALS:
                Q[(x, y)] = {action: 0.0 for action in ACTIONS}

    if not Q:
        # No state to learn from: the start-state sampling below would never
        # find a valid cell (or would fail outright on a zero-sized grid).
        return Q

    def choose_action(state):
        # Epsilon-greedy: explore with probability epsilon, else exploit.
        if np.random.rand() < epsilon:
            return np.random.choice(ACTIONS)
        else:
            q_values = Q[state]
            return max(q_values, key=q_values.get)

    def update_Q(state, action, reward, next_state):
        # Terminal successors have no entry in Q and contribute no future value.
        if next_state is None or world.is_terminal(next_state[0], next_state[1]):
            next_q_value = 0
        else:
            next_q_value = max(Q[next_state].values())
        Q[state][action] += alpha * (reward + gamma * next_q_value - Q[state][action])

    for _ in range(episodes):
        # Rejection-sample a start cell until it is non-terminal.
        start_x = np.random.randint(world.width)
        start_y = np.random.randint(world.height)
        while world.is_terminal(start_x, start_y):
            start_x = np.random.randint(world.width)
            start_y = np.random.randint(world.height)

        state = (start_x, start_y)

        while not world.is_terminal(state[0], state[1]):
            action = choose_action(state)
            next_state = world.get_next_state(state, action)
            reward = world.get_reward(*next_state)

            update_Q(state, action, reward, next_state)

            state = next_state

    return Q

# 5x5 world: an obstacle in the centre cell, a '+' terminal at (4, 4) and a
# '-' terminal at (0, 0).
world = World(width=5, height=5)
world.add_obstacle(start_x=2, start_y=2)
world.add_terminal(x=4, y=4, terminal='+')
world.add_terminal(x=0, y=0, terminal='-')

# Learn the action-value table for that world.
Q = q_learning(
    world,
    alpha=0.1,
    epsilon=0.1,
    gamma=0.9,
    episodes=100000,
)

# The grid is indexed [x, y]; transposing makes each printed row one y value.
print(world.grid.T)
def generate_policy_grid(world, Q):
    """
    Render the greedy policy stored in a Q-value table as a character grid.

    Parameters:
    - world: the World object representing the environment
    - Q: the learned Q-value table, keyed by (x, y) for non-terminal cells

    Returns:
    - policy_grid: array of shape (width, height). Terminal cells carry the
      world's own marker; other cells hold 'L', 'R', 'U' or 'D' for the
      highest-valued action (or stay ' ' if that action is none of the four).
    """
    symbols = {"left": 'L', "right": 'R', "up": 'U', "down": 'D'}
    policy_grid = np.full((world.width, world.height), ' ', dtype='U1')

    # Visit cells column by column: x is the outer index, y the inner one.
    cells = ((x, y) for x in range(world.width) for y in range(world.height))
    for x, y in cells:
        if world.is_terminal(x, y):
            # Terminal cells have no policy; copy their marker through.
            policy_grid[x, y] = world.grid[x, y]
            continue

        action_values = Q[(x, y)]
        greedy = max(action_values, key=action_values.get)
        if greedy in symbols:
            policy_grid[x, y] = symbols[greedy]

    return policy_grid

# Derive the greedy policy from the learned table and print it after a blank
# separator line, transposed so each printed row is one y value.
policy_grid = generate_policy_grid(world, Q)
print("", policy_grid.T, sep="\n")

# Dump the raw table: one (state, action-values) pair per line.
for item in Q.items():
    print(item)


[['-' ' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ']
 [' ' ' ' '#' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ' '+']]

[['-' 'R' 'R' 'D' 'D']
 ['D' 'D' 'R' 'D' 'D']
 ['R' 'D' '#' 'D' 'D']
 ['R' 'D' 'D' 'D' 'D']
 ['R' 'R' 'R' 'R' '+']]
((0, 1), {'up': -9.999999999305235, 'down': 5.314409999999974, 'left': 4.782958966220196, 'right': 5.3143897448276505})
((0, 2), {'up': 4.782968999986369, 'down': 5.904899999699186, 'left': 5.314409999973163, 'right': 5.904899999999975})
((0, 3), {'up': 5.314409999999974, 'down': 6.560981470767228, 'left': 5.904888414796627, 'right': 6.560999999999977})
((0, 4), {'up': 5.90489668749574, 'down': 6.560584033944391, 'left': 6.560993915514217, 'right': 7.289999999999978})
((1, 0), {'up': 4.782968964917476, 'down': 5.314407293956412, 'left': -9.999999999999993, 'right': 5.314409999999974})
((1, 1), {'up': 4.782968999999973, 'down': 5.904899999999975, 'left': 4.782968277996572, 'right': 5.904898167663782})
((1, 2), {'up': 5.314409999999974, 'down': 6.560999999999