### Imports

In [1]:
import numpy as np
import random
import sys    
import pandas as pd
pd.options.display.float_format = '{:,.3f}'.format

In [2]:
ACTIONS = ("up", "down", "left", "right") 
REWARDS = {" ": 0, ".": 0.1, "+": 10, "-": -10}
TERMINALS = ("+", "-")
OBSTACLES = ("#")

gamma = 1

rand_move_probability = 0

class World:  
  def __init__(self, width, height):
    self.width = width
    self.height = height
    self.grid = np.full((width, height), ' ', dtype='U1')
  
  def add_obstacle(self, start_x, start_y, end_x=None, end_y=None):
    """
    Create an obstacle in either a single cell or rectangle.
    """
    if end_x == None: end_x = start_x
    if end_y == None: end_y = start_y
    
    self.grid[start_x:end_x + 1, start_y:end_y + 1] = OBSTACLES[0]

  def add_reward(self, x, y, reward):
    assert reward in REWARDS, f"{reward} not in {REWARDS}"
    self.grid[x, y] = reward

  def add_terminal(self, x, y, terminal):
    assert terminal in TERMINALS, f"{terminal} not in {TERMINALS}"
    self.grid[x, y] = terminal

  def is_obstacle(self, x, y):
    if x < 0 or x >= self.width or y < 0 or y >= self.height:
      return True
    else:
      return self.grid[x ,y] in OBSTACLES 

  def is_terminal(self, x, y):
    return self.grid[x ,y] in TERMINALS

  def get_reward(self, x, y):
    """ 
    Return the reward associated with a given location
    """ 
    return REWARDS[self.grid[x, y]]

  def get_next_state(self, current_state, action):
    """
    Get the next state given a current state and an action. The outcome can be
    stochastic  where rand_move_probability determines the probability of 
    ignoring the action and performing a random move.
    """    
    assert action in ACTIONS, f"Unknown acion {action} must be one of {ACTIONS}"

    x, y = current_state 
    
    if self.grid[x, y] in TERMINALS:
      return None

    if np.random.rand() < rand_move_probability:
      action = np.random.choice(ACTIONS)

    if action == "up":      y -= 1
    elif action == "down":  y += 1
    elif action == "left":  x -= 1
    elif action == "right": x += 1

    return (x, y) if not self.is_obstacle(x, y) else current_state


In [3]:
world = World(8, 8)
world.add_terminal(7, 7, "+")

Q = {}
for x in range(world.width):
    for y in range(world.height):
        for action in ACTIONS:
            Q[((x, y), action)] = 0.0

            
def epsilon_greedy_policy(state, epsilon=0.1):
    if random.uniform(0, 1) < epsilon:
        return random.choice(ACTIONS) 
    else:
        q_values = [Q[(state, action)] for action in ACTIONS]
        max_q = max(q_values)
        max_actions = [action for action, q in zip(ACTIONS, q_values) if q == max_q]
        return random.choice(max_actions)




def expected_sarsa(world, alpha=0.99, gamma=0.9, epsilon=0.1, episodes=1000):
    for _ in range(episodes):
        state = (random.randint(0, world.width - 1), random.randint(0, world.height - 1))
        action = epsilon_greedy_policy(state, epsilon)

        while not world.is_terminal(*state):
            next_state = world.get_next_state(state, action)
            reward = world.get_reward(*next_state)

            expected_value = sum(
            [Q[(next_state, a)] * (epsilon / len(ACTIONS)) for a in ACTIONS]
            )
            expected_value += Q[(next_state, action)] * (1 - epsilon + epsilon / len(ACTIONS))

            Q[(state, action)] += alpha * (reward + gamma * expected_value - Q[(state, action)])

            state = next_state
            action = epsilon_greedy_policy(state, epsilon)



In [4]:
expected_sarsa(world, alpha=0.99, gamma=0.99, epsilon=0.1, episodes=1000)

for action in ACTIONS:
    print(f"Q((7, 6), {action}): {Q[((7, 6), action)]}", "position above terminal")
    print(f"Q((6, 7), {action}): {Q[((6, 7), action)]}", "position to the left of terminal")


Q((7, 6), up): 0.5417716272790034 position above terminal
Q((6, 7), up): 0.21677358897261062 position to the left of terminal
Q((7, 6), down): 10.0 position above terminal
Q((6, 7), down): 1.148172296772975 position to the left of terminal
Q((7, 6), left): 0.10592751836884097 position above terminal
Q((6, 7), left): 0.4347915747057224 position to the left of terminal
Q((7, 6), right): 3.1617957362950757 position above terminal
Q((6, 7), right): 10.0 position to the left of terminal


In [None]:
rand_move_probability = 0.0
epsilon = 0.1
gamma = 0.9

def sarsa(world, alpha, gamma=0.9, epsilon=0.1, episodes=1000):
    for _ in range(episodes):
        state = (0,0)
        action = epsilon_greedy_policy(state, epsilon)

        while not world.is_terminal(*state):
            next_state = world.get_next_state(state, action)
            reward = world.get_reward(*next_state)
            
            next_action = epsilon_greedy_policy(next_state, epsilon)

            Q[(state, action)] += alpha * (reward + gamma * Q[(next_state, next_action)] - Q[(state, action)])
            state = next_state
            action = next_action
 
def measure_steps_to_goal(world):
    steps = 0
    state = (0, 0)
    while not world.is_terminal(*state):
        action = epsilon_greedy_policy(state, epsilon=0.1)
        next_state = world.get_next_state(state, action)
        state = next_state
        steps += 1
    return steps


def run_experiment(method, alpha, episodes=1000, runs=100):
    results = []
    for _ in range(runs):
        world = World(8, 8)
        world.add_terminal(7, 7, "+")
        
        if method == "SARSA":
            sarsa(world, alpha, gamma=0.9, epsilon=0.1, episodes=episodes)
        elif method == "Expected SARSA":
            expected_sarsa(world, alpha, gamma=0.9, epsilon=0.1, episodes=episodes)
        
        steps = measure_steps_to_goal(world)
        results.append(steps)
    
    return sum(results) / len(results) 

alphas = [0.1, 0.5, 0.99]
sarsa_results = [run_experiment("SARSA", alpha) for alpha in alphas]
expected_sarsa_result = run_experiment("Expected SARSA", alpha=0.99)

print("SARSA Results (Different Alphas, 0.1, 0.5, 0.99):", sarsa_results)
print("Expected SARSA Result:", expected_sarsa_result)