### Initial Class

In [6]:
import random
import timeit
import numpy as np
from enum import IntEnum
from copy import deepcopy
from collections import defaultdict
import matplotlib.pyplot as plt
from tabulate import tabulate

plt.style.use('seaborn-notebook')
plt.style.use('seaborn-whitegrid')
import matplotlib.colors as mcolors

  plt.style.use('seaborn-notebook')
  plt.style.use('seaborn-whitegrid')


In [7]:
class Action(IntEnum):
    up = 0
    right = 1
    down = 2
    left = 3

action_to_str = {
    Action.up : "up",
    Action.right : "right",
    Action.down : "down",
    Action.left : "left",
}

action_to_offset = {
    Action.up : (-1, 0),
    Action.right : (0, 1),
    Action.down : (1, 0),
    Action.left : (0, -1),
}

### Grid World Class

In [8]:
class GridWorld:
    def __init__(self, height, width, goal, start, danger, blocked, goal_value=1.0,  danger_value=-1.0, noise=0.0):
        """
        Initialize the GridWorld environment.

        Parameters:
        - height (int): Number of rows.
        - width (int): Number of columns.
        - goal (int): Index number of the goal cell.
        - goal_value (float): Reward given for the goal cell.
        - danger (list of int): Indices of cells marked as danger.
        - danger_value (float): Reward given for danger cell.
        - blocked (list of int): Indices of cells marked as blocked (cannot enter).
        - noise (float): Probability of resulting state not being what was expected.
        """
        self._width = width
        self._height = height
        self._grid_values = np.zeros(height * width)
        self._goal_value = goal_value
        self._danger_value = danger_value
        self._goal_cell = goal
        self._danger_cells = danger
        self._blocked_cells = blocked
        self._start_cell = start
        self._noise = noise
        assert 0 <= noise < 1  # Ensure valid noise value
        self.create_next_values()

    def reset(self):
        """
        Reset the state values to their initial state.
        """
        self._grid_values = np.zeros(self._height * self._width)
        self.create_next_values()

    def _inbounds(self, state):
        """
        Check if a state index is within the grid boundaries.
        """
        return 0 <= state < self._width * self._height

    def _inbounds_rc(self, state_r, state_c):
        """
        Check if row and column indices are within the grid boundaries.
        """
        return 0 <= state_r < self._height and 0 <= state_c < self._width

    def _state_to_rc(self, state):
        """
        Convert a state index to row and column indices.
        """
        return state // self._width, state % self._width

    def _state_from_action(self, state, action):
        """
        Gets the state as a result of applying the given action
        """
        dr, dc = action_to_offset[action]
        new_r, new_c = self._state_to_rc(state)
        new_r += dr
        new_c += dc
        new_state = new_r * self._width + new_c

        if self._inbounds(new_state) and new_state != self._blocked_cells:
            return new_state
        return state

    def is_terminal(self, state):
        """
        Returns true if a state is terminal (goal or danger).
        """
        return state == self._goal_cell or state == self._danger_cells

    def get_states(self):
        """
        Gets all non-terminal states in the environment.
        """
        return [s for s in range(self._width * self._height) if not self.is_terminal(s) and s != self._blocked_cells]

    def get_actions(self, state):
        """
        Returns a list of valid actions given the current state.
        """
        actions = []
        for action in Action:
            next_state = self._state_from_action(state, action)
            if next_state != state:
                actions.append(action)
        return actions

    def get_reward(self, state):
        """
        Get the reward for being in the current state.
        """
        if state == self._goal_cell:
            return self._goal_value
        elif state == self._danger_cells:
            return self._danger_value
        else:
            return -0.1

    def get_transitions(self, state, action):
        """
        Get a list of transitions as a result of attempting the action in the current state.
        Each item in the list is a tuple containing the probability of reaching that state and the next state itself.
        """
        next_state = self._state_from_action(state, action)
        if next_state == state:
            return [(1.0, state)]
        return [(1.0 - self._noise, next_state), (self._noise, state)]

    def get_value(self, state):
        """
        Get the current value of the state.
        """
        return self._grid_values[state]

    def create_next_values(self):
        """
        Creates a temporary storage for state value updating.
        """
        self._next_values = np.zeros_like(self._grid_values)

    def set_next_values(self):
        """
        Set the state values from the temporary copied values.
        """
        self._grid_values = np.copy(self._next_values)

    def set_value(self, state, value):
        """
        Set the value of the state into the temporary copy.
        This value will not update into the main storage until self.set_next_values() is called.
        """
        self._next_values[state] = value

    def __str__(self):
        """
        Pretty print the state values.
        """
        out_str = ""
        for r in range(self._height):
            for c in range(self._width):
                cell = r * self._width + c
                if cell == self._blocked_cells:
                    out_str += "{:>6}".format("----")
                elif cell == self._goal_cell:
                    out_str += "{:>6}".format("GOAL")
                elif cell == self._danger_cells:
                    out_str += "{:>6.2f}".format(self._danger_value)
                else:
                    out_str += "{:>6.2f}".format(self._grid_values[cell])
                out_str += " "
            out_str += "\n"
        return out_str

    def print_policy(self, Q):
        """
        Pretty print the policy values.
        """
        out_str = ""
        for r in range(self._height):
            for c in range(self._width):
                cell = r * self._width + c
                if cell == self._blocked_cells:
                    out_str += "{:>6}".format("----")
                elif cell == self._goal_cell:
                    out_str += "{:>6}".format("GOAL")
                elif cell == self._danger_cells:
                    out_str += "{:>6}".format("DANGR")
                else:
                    if cell in Q:
                        best_action = np.argmax(Q[cell])
                        out_str += "{:>6}".format(action_to_str[best_action])
                    else:
                        out_str += "{:>6}".format("????")
                out_str += " "
            out_str += "\n"
        print(out_str)

In [9]:
# Initialize your GridWorld
simple_gw = GridWorld(height=3, width=4, goal=3, start=8, danger=7, blocked=5, noise=0.0)
print(simple_gw)


  0.00   0.00   0.00   GOAL 
  0.00   ----   0.00  -1.00 
  0.00   0.00   0.00   0.00 





---



### Algorithms Implementation

##### On-policy MC with exploring starts (10 points)

In [10]:
def on_policy_mc_with_exploring_starts(env, num_episodes, gamma=1.0):
    """Implement the on policy MC with exploring starts Algorithm"""
    # intialize variables to count the average
    Q = defaultdict(lambda: np.zeros(len(Action)))
    returns_sum = defaultdict(lambda: np.zeros(len(Action)))
    returns_count = defaultdict(lambda: np.zeros(len(Action)))

    def policy(state):
        # epsilon greedy to find the next action
        if np.random.rand() < 0.1:
            return np.random.choice(list(Action))
        else:
            return np.argmax(Q[state])

    for _ in range(num_episodes):
        # Generating episodes with exploring starts
        state = np.random.choice(env.get_states())
        episode = []
        while not env.is_terminal(state):
            # select the state & action chosen and add the pair
            action = policy(state)
            next_state = env._state_from_action(state, action)
            reward = env.get_reward(next_state)
            episode.append((state, action, reward))
            state = next_state

        G = 0  # G - discounted return
        # Iterate over trajectory in reverse
        for t in range(len(episode) - 1, -1, -1):
            # calculate the average returns and update the Q values and policy
            state, action, reward = episode[t]
            G = reward + gamma * G
            if (state, action) not in [(ep_state, ep_action) for ep_state, ep_action, _ in episode[:t]]:
                # condition to use first visit evaluation
                returns_sum[state][action] += G
                returns_count[state][action] += 1.0
                Q[state][action] = returns_sum[state][action] / returns_count[state][action]

    return Q


In [11]:
print("On-policy MC with Exploring Starts:")
Q_on_policy_exploring_starts = on_policy_mc_with_exploring_starts(simple_gw, num_episodes=10000)

for state in range(simple_gw._height * simple_gw._width):
    if state in simple_gw.get_states():
        print(f"State {state}: {action_to_str[np.argmax(Q_on_policy_exploring_starts[state])]}")

On-policy MC with Exploring Starts:
State 0: right
State 1: right
State 2: right
State 4: left
State 6: up
State 8: right
State 9: right
State 10: up
State 11: left


In [12]:
simple_gw.print_policy(Q_on_policy_exploring_starts)

 right  right  right   GOAL 
  left   ----     up  DANGR 
 right  right     up   left 



Experiments

In [19]:
epsides = [10000, 50000, 100000]
gamma = 1.0
es_results = []

for i, n_episodes in enumerate(epsides):
    print(f"On-policy MC with Exploring Starts for {n_episodes} episodes:")
    Q_on_policy_exploring_starts = on_policy_mc_with_exploring_starts(simple_gw, num_episodes=n_episodes, gamma=gamma)
    simple_gw.print_policy(Q_on_policy_exploring_starts)
    es_results.append((i, n_episodes, gamma))
    print()

On-policy MC with Exploring Starts for 10000 episodes:
  down  right  right   GOAL 
  left   ----     up  DANGR 
 right  right     up   left 


On-policy MC with Exploring Starts for 50000 episodes:
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 


On-policy MC with Exploring Starts for 100000 episodes:
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up  right     up   left 




In [22]:
Gamma= [0.9, 0.95, 1.1]
n_episodes = 10000

for i, gamma in enumerate(Gamma):
    print(f"On-policy MC with Exploring Starts for {gamma} episodes:")
    Q_on_policy_exploring_starts = on_policy_mc_with_exploring_starts(simple_gw, num_episodes=n_episodes, gamma=gamma)
    simple_gw.print_policy(Q_on_policy_exploring_starts)
    es_results.append((i, n_episodes, gamma))
    print()

On-policy MC with Exploring Starts for 0.9 episodes:
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 


On-policy MC with Exploring Starts for 0.95 episodes:
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 


On-policy MC with Exploring Starts for 1.1 episodes:
 right  right  right   GOAL 
  left   ----  right  DANGR 
  left   left     up     up 




In [21]:
headers = ["#Trial", "#Episodes", "Gamma"]
print(tabulate(es_results, headers=headers, tablefmt="grid"))

+----------+-------------+---------+
|   #Trial |   #Episodes |   Gamma |
|        0 |       10000 |    1    |
+----------+-------------+---------+
|        1 |       50000 |    1    |
+----------+-------------+---------+
|        2 |      100000 |    1    |
+----------+-------------+---------+
|        0 |       10000 |    0.9  |
+----------+-------------+---------+
|        1 |       10000 |    0.95 |
+----------+-------------+---------+
|        2 |       10000 |    1.1  |
+----------+-------------+---------+




---


##### On-policy MC control without exploring starts

In [23]:
def on_policy_mc_control(env, num_episodes, gamma=1.0, epsilon=0.1):
    """ returns the Q function, which approximates the expected return for each state-action pair"""
    # Initialize Q, C
    env.reset()
    Q = defaultdict(lambda: np.zeros(len(Action)))
    returns_sum = defaultdict(lambda: np.zeros(len(Action)))
    returns_count = defaultdict(lambda: np.zeros(len(Action)))

    def policy(state):
        # Deinfe the epsioln greedy policy
        if np.random.rand() < epsilon:
            return np.random.choice(list(Action))
        else:
            return np.argmax(Q[state])

    for _ in range(num_episodes):
        # always start from the same state (8)
        state = 8
        episode = []
        while not env.is_terminal(state):
            action = policy(state)
            next_state = env._state_from_action(state, action)
            reward = env.get_reward(next_state)
            episode.append((state, action, reward))
            state = next_state

        G = 0  # G - discounted return
        # Iterate over trajectory in reverse
        for t in range(len(episode) - 1, -1, -1):
            state, action, reward = episode[t]
            G = reward + gamma * G
            # condition to use first visit evaluation
            if (state, action) not in [(ep_state, ep_action) for ep_state, ep_action, _ in episode[:t]]:
                returns_sum[state][action] += G
                returns_count[state][action] += 1.0
                # Update Q values
                Q[state][action] = returns_sum[state][action] / returns_count[state][action]

    return Q


In [24]:
print("On-policy MC Control without Exploring Starts:")
Q_on_policy_control = on_policy_mc_control(simple_gw, num_episodes=10000)
for state in range(simple_gw._height * simple_gw._width):
    if state in simple_gw.get_states():
        print(f"State {state}: {action_to_str[np.argmax(Q_on_policy_control[state])]}")


On-policy MC Control without Exploring Starts:
State 0: down
State 1: right
State 2: right
State 4: left
State 6: up
State 8: up
State 9: right
State 10: up
State 11: left


In [25]:
simple_gw.print_policy(Q_on_policy_control)

  down  right  right   GOAL 
  left   ----     up  DANGR 
    up  right     up   left 



Experiments

In [26]:
epsides = [10000, 50000, 100000]
Gammas= [0.9, 0.95, 1.1]
Epsilons= [0.01, 0.2, 0.3]
control_results = []

gamma = 1.0
ep = 0.1
for i, n_episodes in enumerate(epsides):
    print(f"On-policy MC Control without Exploring Starts for {n_episodes} episodes:")
    Q_on_policy_control = on_policy_mc_control(simple_gw, num_episodes=n_episodes, gamma=gamma, epsilon=ep)
    simple_gw.print_policy(Q_on_policy_control)
    control_results.append((i, n_episodes, gamma, ep))
    print()

On-policy MC Control without Exploring Starts for 10000 episodes:
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up  right     up   left 


On-policy MC Control without Exploring Starts for 50000 episodes:
 right  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up     up 


On-policy MC Control without Exploring Starts for 100000 episodes:
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up  right     up   left 




In [27]:
n_episodes = 10000

for i, gamma in enumerate(Gammas):
    print(f"On-policy MC Control without Exploring Starts for gamma = {gamma}:")
    Q_on_policy_control = on_policy_mc_control(simple_gw, num_episodes=n_episodes, gamma=gamma, epsilon=ep)
    simple_gw.print_policy(Q_on_policy_control)
    control_results.append((i+3, n_episodes, gamma, ep))
    print()

On-policy MC Control without Exploring Starts for gamma = 0.9:
 right  right  right   GOAL 
  left   ----   down  DANGR 
    up   left   left  right 


On-policy MC Control without Exploring Starts for gamma = 0.95:
  down   left  right   GOAL 
  left   ----     up  DANGR 
    up   left   left  right 


On-policy MC Control without Exploring Starts for gamma = 1.1:
 right   down  right   GOAL 
  left   ----     up  DANGR 
  left  right     up   down 




In [28]:
gamma = 1.0

for i, ep in enumerate(Epsilons):
    print(f"On-policy MC Control without Exploring Starts for epsilon = {ep}:")
    Q_on_policy_control = on_policy_mc_control(simple_gw, num_episodes=n_episodes, gamma=gamma, epsilon=ep)
    simple_gw.print_policy(Q_on_policy_control)
    control_results.append((i+6, n_episodes, gamma, ep))
    print()

On-policy MC Control without Exploring Starts for epsilon = 0.01:
  left  right  right   GOAL 
  left   ----     up  DANGR 
  left  right     up  right 


On-policy MC Control without Exploring Starts for epsilon = 0.2:
  down   left  right   GOAL 
  left   ----  right  DANGR 
    up   left   left   down 


On-policy MC Control without Exploring Starts for epsilon = 0.3:
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 




In [29]:
headers = ["#Trial", "#Episodes", "Gammas", "Epsilons"]
print(tabulate(control_results, headers=headers, tablefmt="grid"))

+----------+-------------+----------+------------+
|   #Trial |   #Episodes |   Gammas |   Epsilons |
|        0 |       10000 |     1    |       0.1  |
+----------+-------------+----------+------------+
|        1 |       50000 |     1    |       0.1  |
+----------+-------------+----------+------------+
|        2 |      100000 |     1    |       0.1  |
+----------+-------------+----------+------------+
|        3 |       10000 |     0.9  |       0.1  |
+----------+-------------+----------+------------+
|        4 |       10000 |     0.95 |       0.1  |
+----------+-------------+----------+------------+
|        5 |       10000 |     1.1  |       0.1  |
+----------+-------------+----------+------------+
|        6 |       10000 |     1    |       0.01 |
+----------+-------------+----------+------------+
|        7 |       10000 |     1    |       0.2  |
+----------+-------------+----------+------------+
|        8 |       10000 |     1    |       0.3  |
+----------+-------------+-----



---


##### Off-policy MC prediction (10 points)

In [30]:
# Define the epsilon-greedy policy
def epsilon_greedy_policy(Q, state, epsilon=0.1, n_actions=4):
    A = np.ones(n_actions, dtype=float) * epsilon / n_actions
    best_action = np.argmax(Q[state])
    A[best_action] += (1.0 - epsilon)
    return A

# Off-policy MC prediction
def off_policy_mc_prediction(env, num_episodes, gamma=1.0, epsilon=0.1):
    env.reset()
    # Initialize Q and C arbitrarily
    Q = defaultdict(lambda: np.zeros(4))
    C = defaultdict(lambda: np.zeros(4))

    # Generate episodes
    for i_episode in range(1, num_episodes + 1):
        # Generate an episode using behavior policy (uniform random policy)
        episode = []
        state = env._start_cell
        while True:
            probs = np.ones(4) / 4
            action = np.random.choice(np.arange(len(probs)), p=probs)
            _, next_state = env.get_transitions(state, Action(action))[0]
            reward = env.get_reward(next_state)
            episode.append((state, action, reward))
            state = next_state
            if env.is_terminal(state):
                break

        # Initialize variables
        G = 0
        W = 1

        # Iterate over trajectory in reverse
        for t in range(len(episode) - 1, -1, -1):
            state, action, reward = episode[t]
            G = gamma * G + reward
            C[state][action] += W
            Q[state][action] += (W / C[state][action]) * (G - Q[state][action])
            if action != np.argmax(epsilon_greedy_policy(Q, state, epsilon)):
                break
            W *= 1.0 / (0.25)

    return Q


In [31]:
Q_off_policy = off_policy_mc_prediction(simple_gw, num_episodes=10000)

print("Q-values:")
for state in range(simple_gw._width * simple_gw._height):
    if state in Q_off_policy:
        print(f"State {state}: {Q_off_policy[state]}")

Q-values:
State 0: [0.8 0.8 0.9 0.8]
State 1: [0.8 0.9 0.8 0.8]
State 2: [0.9 1.  0.8 0.8]
State 4: [0.7974359 0.9       0.8       1.       ]
State 6: [ 0.9 -1.   0.7  0.8]
State 8: [ 0.9  0.7  0.8 -1. ]
State 9: [0.7 0.7 0.7 0.8]
State 10: [0.8 0.6 0.7 0.7]
State 11: [-1.   0.6  0.6  0.7]


In [32]:
simple_gw.print_policy(Q_off_policy)

  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 



Experiments

In [33]:
epsides = [10000, 50000, 100000]
Gammas= [0.9, 0.95, 1.1]
Epsilons= [0.01, 0.2, 0.3]
control_results = []

gamma = 1.0
ep = 0.1
for i, n_episodes in enumerate(epsides):
    print(f"Off-policy MC Prediction for {n_episodes} episodes:")
    Q_off_policy = off_policy_mc_prediction(simple_gw, num_episodes=n_episodes, gamma=gamma, epsilon=ep)
    simple_gw.print_policy(Q_off_policy)
    control_results.append((i, n_episodes, gamma, ep))
    print()

Off-policy MC Prediction for 10000 episodes:
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 


Off-policy MC Prediction for 50000 episodes:
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 


Off-policy MC Prediction for 100000 episodes:
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 




In [35]:
n_episodes = 10000

for i, gamma in enumerate(Gammas):
    print(f"Off-policy MC Prediction for gamma = {gamma}: ")
    Q_off_policy = off_policy_mc_prediction(simple_gw, num_episodes=n_episodes, gamma=gamma, epsilon=ep)
    simple_gw.print_policy(Q_off_policy)
    control_results.append((i+3, n_episodes, gamma, ep))
    print()

Off-policy MC Prediction for gamma = 0.9: 
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 


Off-policy MC Prediction for gamma = 0.95: 
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 


Off-policy MC Prediction for gamma = 1.1: 
  down   ????     up   GOAL 
    up   ----     up  DANGR 
    up   ????     up  right 




In [37]:
n_episodes = 100000
gamma = 1

for i, ep in enumerate(Epsilons):
    print(f"Off-policy MC Prediction for epsilon = {ep}: ")
    Q_off_policy = off_policy_mc_prediction(simple_gw, num_episodes=n_episodes, gamma=gamma, epsilon=ep)
    simple_gw.print_policy(Q_off_policy)
    control_results.append((i+6, n_episodes, gamma, ep))
    print()

Off-policy MC Prediction for epsilon = 0.01: 
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 


Off-policy MC Prediction for epsilon = 0.2: 
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 


Off-policy MC Prediction for epsilon = 0.3: 
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 




In [38]:
headers = ["#Trial", "#Episodes", "Gammas", "Epsilons"]
print(tabulate(control_results, headers=headers, tablefmt="grid"))

+----------+-------------+----------+------------+
|   #Trial |   #Episodes |   Gammas |   Epsilons |
|        0 |       10000 |     1    |       0.1  |
+----------+-------------+----------+------------+
|        1 |       50000 |     1    |       0.1  |
+----------+-------------+----------+------------+
|        2 |      100000 |     1    |       0.1  |
+----------+-------------+----------+------------+
|        3 |      100000 |     0.9  |       0.1  |
+----------+-------------+----------+------------+
|        4 |      100000 |     0.95 |       0.1  |
+----------+-------------+----------+------------+
|        5 |      100000 |     1.1  |       0.1  |
+----------+-------------+----------+------------+
|        6 |      100000 |     1    |       0.01 |
+----------+-------------+----------+------------+
|        7 |      100000 |     1    |       0.2  |
+----------+-------------+----------+------------+
|        8 |      100000 |     1    |       0.3  |
+----------+-------------+-----



---


##### Off-policy MC control (10 points)

In [39]:
# Generate an episode using the behavior policy
def generate_episode(env):
    episode = []
    state = env._start_cell
    while not env.is_terminal(state):
        probs = np.ones(4) / 4  # Uniform random policy
        action = np.random.choice(np.arange(len(probs)), p=probs)
        transitions = env.get_transitions(state, action)
        next_state = transitions[0][1]
        reward = env.get_reward(next_state)
        episode.append((state, action, reward))
        state = next_state
    return episode

In [40]:
def off_policy_mc_control(env, num_episodes, gamma=1.0, epsilon=0.1):
    env.reset()
    Q = defaultdict(lambda: np.zeros(4))
    C = defaultdict(lambda: np.zeros(4))
    policy = {}

    for i_episode in range(1, num_episodes + 1):
        episode = generate_episode(env)

        G = 0
        W = 1

        # Iterate over trajectory in reverse
        for t in range(len(episode) - 1, -1, -1):
            state, action, reward = episode[t]
            G = gamma * G + reward
            C[state][action] += W
            Q[state][action] += (W / C[state][action]) * (G - Q[state][action])
            policy[state] = np.argmax(Q[state])
            if action != policy[state]:
                break
            W *= 1.0 / (1.0 / len(env.get_actions(state)))

    return Q, policy

In [41]:
Q_off_policy_control, policy = off_policy_mc_control(simple_gw, num_episodes=10000)

print("Q-values:")
for state in range(simple_gw._width * simple_gw._height):
    if state in Q_off_policy_control:
        print(f"State {state}: {Q_off_policy_control[state]}")

Q-values:
State 0: [0.8 0.8 0.9 0.8]
State 1: [0.8 0.9 0.8 0.8]
State 2: [0.9 1.  0.8 0.8]
State 4: [0.8 0.9 0.8 1. ]
State 6: [ 0.9 -1.   0.7  0.8]
State 8: [ 0.9  0.7  0.8 -1. ]
State 9: [0.7 0.7 0.7 0.8]
State 10: [0.8 0.6 0.7 0.7]
State 11: [-1.   0.6  0.6  0.7]


In [42]:
simple_gw.print_policy(Q_off_policy_control)

  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 



Experiments

In [43]:
epsides = [10000, 50000, 100000]
Gammas= [0.9, 0.95, 1.1]
Epsilons= [0.01, 0.2, 0.3]
control_results = []

gamma = 1.0
ep = 0.1
for i, n_episodes in enumerate(epsides):
    print(f"Off-policy MC Control for {n_episodes} episodes:")
    Q_off_policy_control, policy = off_policy_mc_control(simple_gw, num_episodes=n_episodes, gamma=gamma, epsilon=ep)
    simple_gw.print_policy(Q_off_policy_control)
    control_results.append((i, n_episodes, gamma, ep))
    print()

Off-policy MC Control for 10000 episodes:
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 


Off-policy MC Control for 50000 episodes:
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 


Off-policy MC Control for 100000 episodes:
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 




In [44]:
n_episodes = 1000

for i, gamma in enumerate(Gammas):
    print(f"Off-policy MC Control for gamma = {gamma}:")
    Q_off_policy_control, policy = off_policy_mc_control(simple_gw, num_episodes=n_episodes, gamma=gamma, epsilon=ep)
    simple_gw.print_policy(Q_off_policy_control)
    control_results.append((i+3, n_episodes, gamma, ep))
    print()

Off-policy MC Control for gamma = 0.9:
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up  right 


Off-policy MC Control for gamma = 0.95:
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 


Off-policy MC Control for gamma = 1.1:
  ????   ????     up   GOAL 
 right   ----     up  DANGR 
    up     up   ????  right 




In [45]:
gamma = 1
n_episodes = 50000

for i, ep in enumerate(Epsilons):
    print(f"Off-policy MC Control for Epsilon = {ep}:")
    Q_off_policy_control, policy = off_policy_mc_control(simple_gw, num_episodes=n_episodes, gamma=gamma, epsilon=ep)
    simple_gw.print_policy(Q_off_policy_control)
    control_results.append((i+6, n_episodes, gamma, ep))
    print()

Off-policy MC Control for Epsilon = 0.01:
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 


Off-policy MC Control for Epsilon = 0.2:
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 


Off-policy MC Control for Epsilon = 0.3:
  down  right  right   GOAL 
  left   ----     up  DANGR 
    up   left     up   left 




In [46]:
headers = ["#Trial", "#Episodes", "Gammas", "Epsilons"]
print(tabulate(control_results, headers=headers, tablefmt="grid"))

+----------+-------------+----------+------------+
|   #Trial |   #Episodes |   Gammas |   Epsilons |
|        0 |       10000 |     1    |       0.1  |
+----------+-------------+----------+------------+
|        1 |       50000 |     1    |       0.1  |
+----------+-------------+----------+------------+
|        2 |      100000 |     1    |       0.1  |
+----------+-------------+----------+------------+
|        3 |        1000 |     0.9  |       0.1  |
+----------+-------------+----------+------------+
|        4 |        1000 |     0.95 |       0.1  |
+----------+-------------+----------+------------+
|        5 |        1000 |     1.1  |       0.1  |
+----------+-------------+----------+------------+
|        6 |       50000 |     1    |       0.01 |
+----------+-------------+----------+------------+
|        7 |       50000 |     1    |       0.2  |
+----------+-------------+----------+------------+
|        8 |       50000 |     1    |       0.3  |
+----------+-------------+-----



---



### Resources & Refrences
* https://github.com/nums11/rl/tree/main/chapter_5_monte_carlo