### Imports

In [10]:
import numpy as np
import random
import sys       
import pandas as pd
pd.options.display.float_format = '{:,.3f}'.format

In [11]:
# Moves the agent can request in any cell.
ACTIONS = ("up", "down", "left", "right")

# Reward associated with each cell symbol stored in the grid.
REWARDS = {" ": 0, ".": 0.1, "+": 10, "-": -10}
# Cell symbols that end an episode.
TERMINALS = ("+", "-")
# Cell symbols the agent cannot enter. The trailing comma is required:
# ("#") is just the string "#", which would turn `cell in OBSTACLES` into a
# substring test instead of a membership test.
OBSTACLES = ("#",)

# Discount factor applied to future rewards when returns are accumulated.
gamma = 1

# Probability that a requested action is ignored and a random move is made.
rand_move_probability = 0

class World:
  """
  Rectangular grid world stored as a (width, height) array of one-character
  cells, addressed as grid[x, y].

  Cell symbols are interpreted through the module-level REWARDS, TERMINALS and
  OBSTACLES tables.
  """

  # Change in (x, y) produced by each action; "up" decreases y.
  _MOVES = {"up": (0, -1), "down": (0, 1), "left": (-1, 0), "right": (1, 0)}

  def __init__(self, width, height):
    self.width = width
    self.height = height
    # Create an empty world where the agent can move to all cells
    self.grid = np.full((width, height), ' ', dtype='U1')

  def add_obstacle(self, start_x, start_y, end_x=None, end_y=None):
    """
    Create an obstacle in either a single cell or rectangle.

    The rectangle spans start_x..end_x and start_y..end_y inclusive; an omitted
    end coordinate defaults to the matching start coordinate.
    """
    if end_x is None: end_x = start_x
    if end_y is None: end_y = start_y

    self.grid[start_x:end_x + 1, start_y:end_y + 1] = OBSTACLES[0]

  def add_reward(self, x, y, reward):
    """
    Mark cell (x, y) with a reward symbol; the symbol must be a key of REWARDS.
    """
    assert reward in REWARDS, f"{reward} not in {REWARDS}"
    self.grid[x, y] = reward

  def add_terminal(self, x, y, terminal):
    """
    Mark cell (x, y) with a terminal symbol; the symbol must be in TERMINALS.
    """
    assert terminal in TERMINALS, f"{terminal} not in {TERMINALS}"
    self.grid[x, y] = terminal

  def is_obstacle(self, x, y):
    """
    Return True when (x, y) lies outside the grid or holds an obstacle symbol.
    """
    inside = 0 <= x < self.width and 0 <= y < self.height
    if not inside:
      return True
    return self.grid[x, y] in OBSTACLES

  def is_terminal(self, x, y):
    """
    Return True when cell (x, y) holds a terminal symbol.
    """
    return self.grid[x, y] in TERMINALS

  def get_reward(self, x, y):
    """
    Return the reward associated with a given location
    """
    return REWARDS[self.grid[x, y]]

  def get_next_state(self, current_state, action):
    """
    Get the next state given a current state and an action. The outcome can be
    stochastic  where rand_move_probability determines the probability of
    ignoring the action and performing a random move.

    Returns None when current_state is terminal, the unchanged current_state
    when the move is blocked by an obstacle or the grid edge, and the new
    (x, y) tuple otherwise.
    """
    assert action in ACTIONS, f"Unknown action {action} must be one of {ACTIONS}"

    x, y = current_state

    # There are no transitions out of a terminal cell.
    if self.is_terminal(x, y):
      return None

    if np.random.rand() < rand_move_probability:
      action = np.random.choice(ACTIONS)

    # An action without an entry in the table leaves the position unchanged.
    dx, dy = self._MOVES.get(action, (0, 0))
    target = (x + dx, y + dy)

    return current_state if self.is_obstacle(*target) else target


In [12]:
# 4x4 grid whose only special cell is the positive terminal at (3, 3).
world = World(width=4, height=4)

world.add_terminal(x=3, y=3, terminal="+")

def equiprobable_random_policy(x, y):
  """Return the same probability for every action in ACTIONS; (x, y) is ignored."""
  share = 1 / len(ACTIONS)
  distribution = {}
  for action in ACTIONS:
    distribution[action] = share
  return distribution

# The grid is indexed [x, y]; transposing makes each printed row share one y.
grid_by_row = world.grid.T
print(grid_by_row)

[[' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ']
 [' ' ' ' ' ' '+']]


In [13]:
def generate_episode(world, policy, start_state, max_steps=None):
    """
    Roll out one episode by following ``policy`` from ``start_state``.

    Args:
        world: environment offering ``is_terminal(x, y)``,
            ``get_next_state(state, action)`` and ``get_reward(x, y)``.
        policy: callable ``policy(x, y)`` returning a dict that maps each
            action to the weight with which it is sampled.
        start_state: ``(x, y)`` tuple where the episode begins.
        max_steps: optional cap on the number of recorded steps. ``None``
            (the default) runs until a terminal state is reached, which never
            returns if the policy cannot reach one.

    Returns:
        A list of ``[state, action, reward]`` entries, one per step, where
        ``reward`` is the reward of the state the action led to. The list is
        empty when ``start_state`` is already terminal.
    """
    current_state = start_state
    episode = []
    while not world.is_terminal(*current_state):
        # Safety valve against policies that never reach a terminal state.
        if max_steps is not None and len(episode) >= max_steps:
            break

        action_weights = policy(*current_state)
        # random.choices documents weights as a sequence, so pass lists
        # rather than a dict view.
        candidates = list(action_weights)
        weights = list(action_weights.values())
        action = random.choices(candidates, weights=weights, k=1)[0]

        next_state = world.get_next_state(current_state, action)
        reward = world.get_reward(*next_state)
        episode.append([current_state, action, reward])
        current_state = next_state

    return episode

In [14]:
# Sample a few episodes under the uniform random policy and show each as a table.
for i in range(5):
    episode = generate_episode(world, equiprobable_random_policy, (0, 0))
    episode_table = pd.DataFrame(episode, columns=["State", "Action", "Reward"])
    print(f"Episode {i}:")
    print(episode_table, end="\n\n")

Episode 0:
      State Action  Reward
0    (0, 0)     up       0
1    (0, 0)     up       0
2    (0, 0)   down       0
3    (0, 1)     up       0
4    (0, 0)   left       0
..      ...    ...     ...
131  (0, 2)  right       0
132  (1, 2)  right       0
133  (2, 2)  right       0
134  (3, 2)  right       0
135  (3, 2)   down      10

[136 rows x 3 columns]

Episode 1:
     State Action  Reward
0   (0, 0)     up       0
1   (0, 0)  right       0
2   (1, 0)   down       0
3   (1, 1)   left       0
4   (0, 1)     up       0
5   (0, 0)     up       0
6   (0, 0)   down       0
7   (0, 1)     up       0
8   (0, 0)  right       0
9   (1, 0)   left       0
10  (0, 0)  right       0
11  (1, 0)  right       0
12  (2, 0)  right       0
13  (3, 0)  right       0
14  (3, 0)     up       0
15  (3, 0)  right       0
16  (3, 0)     up       0
17  (3, 0)   left       0
18  (2, 0)   down       0
19  (2, 1)     up       0
20  (2, 0)  right       0
21  (3, 0)   down       0
22  (3, 1)     up       0
23  (

Implementation of on-policy Monte Carlo control with an $\epsilon$-soft policy for estimating action values.

In [31]:
# Settings read by the Monte Carlo control cell below.
gamma = 0.9           # discount factor used when accumulating returns
epsilon = 0.1         # exploration share of the epsilon-soft policy
iterations = 100_000  # number of episodes to sample

def policy(x, y, policy_dict):
    """Look up the entry stored for cell (x, y) in ``policy_dict``."""
    return policy_dict[(x, y)]

def MonteCarloControl(world, policy, iterations, start_state=(0, 0)):
    """
    On-policy first-visit Monte Carlo control with an epsilon-soft policy.

    Args:
        world: environment exposing ``width`` and ``height``; it is also
            handed to ``generate_episode``.
        policy: callable ``policy(x, y, policy_dict)`` returning the action
            distribution of state ``(x, y)``.
        iterations: number of episodes to sample.
        start_state: state every episode starts from (default ``(0, 0)``).

    Returns:
        ``(Q, policy_dict)``. ``Q[state][action]`` is the mean of the
        first-visit returns observed for that pair (0 if never visited);
        ``policy_dict[state][action]`` is the resulting epsilon-soft policy.

    Reads the module-level ``gamma``, ``epsilon`` and ``ACTIONS``.
    """
    # initialise
    states = [(x, y) for x in range(world.width) for y in range(world.height)]
    Q = {state: {action: 0 for action in ACTIONS} for state in states}
    # Number of returns averaged into each Q entry. Keeping a count and updating
    # the mean incrementally avoids storing and re-summing every return.
    visit_counts = {state: {action: 0 for action in ACTIONS} for state in states}
    policy_dict = {state: {action: 1 / len(ACTIONS) for action in ACTIONS} for state in states}

    greedy_probability = 1 - epsilon + epsilon / len(ACTIONS)
    explore_probability = epsilon / len(ACTIONS)

    def behaviour(x, y):
        # Always consults the live policy_dict, so improvements made after one
        # episode are used when sampling the next.
        return policy(x, y, policy_dict)

    for _ in range(iterations):
        episode = generate_episode(world, behaviour, start_state)

        # Earliest time step at which each (state, action) pair occurs. Returns
        # are accumulated backwards, so marking pairs as "visited" during that
        # sweep would select the LAST occurrence instead of the first.
        first_visit = {}
        for t, (state, action, _reward) in enumerate(episode):
            first_visit.setdefault((state, action), t)

        G = 0
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = gamma * G + reward

            if first_visit[(state, action)] != t:
                continue

            visit_counts[state][action] += 1
            Q[state][action] += (G - Q[state][action]) / visit_counts[state][action]
            # Ties resolve to the first maximal action in ACTIONS order.
            max_action = max(Q[state], key=Q[state].get)

            for a in ACTIONS:
                if a == max_action:
                    policy_dict[state][a] = greedy_probability
                else:
                    policy_dict[state][a] = explore_probability

    return Q, policy_dict

Q, learned_policy = MonteCarloControl(world, policy, iterations)

# Report the learned action values and policy for every non-terminal state.
for state in Q:
    if world.is_terminal(*state):
        continue
    action_probabilities = learned_policy[state]
    best_action = max(action_probabilities, key=action_probabilities.get)
    print(f"State {state}: ")
    print(f"Q-values: {Q[state]}")
    print(f"Best action: {best_action}")
    print(f"Action probabilities: {action_probabilities}")
    print()


State (0, 0): 
Q-values: {'up': 5.043828780198102, 'down': 5.58181546268142, 'left': 5.043727515901997, 'right': 5.644629285297216}
Best action: right
Action probabilities: {'up': 0.025, 'down': 0.025, 'left': 0.025, 'right': 0.925}

State (0, 1): 
Q-values: {'up': 5.012854492419005, 'down': 6.287415537474025, 'left': 5.6430082690783765, 'right': 6.108725594039598}
Best action: down
Action probabilities: {'up': 0.025, 'down': 0.925, 'left': 0.025, 'right': 0.025}

State (0, 2): 
Q-values: {'up': 5.631122419342105, 'down': 7.045635377565554, 'left': 6.320587592647055, 'right': 7.045537989754582}
Best action: down
Action probabilities: {'up': 0.025, 'down': 0.925, 'left': 0.025, 'right': 0.025}

State (0, 3): 
Q-values: {'up': 6.1187443514532855, 'down': 6.850741616110698, 'left': 6.894616014495413, 'right': 7.913054537543168}
Best action: right
Action probabilities: {'up': 0.025, 'down': 0.025, 'left': 0.025, 'right': 0.925}

State (1, 0): 
Q-values: {'up': 5.646452979074558, 'down': 6.