In [1]:
import gym
import numpy as np

In [2]:
env = gym.make('CliffWalking-v0')

  and should_run_async(code)
  deprecation(
  deprecation(


Inputs:

env: The environment for which the optimal policy is to be learned.
n_episodes: Number of episodes to run the algorithm. Default is 500.

Variables:

Q: Q-table representing state-action values.
N: Counter for state-action pairs visited.
gamma: Discount factor.
total_steps: List to store the number of steps taken in each episode.

Iteration:

Runs for n_episodes.
For each episode, it starts with a random state-action pair and follows the policy until the episode terminates.
At each step, it updates the episode trajectory and counts the number of steps taken.

Policy Update:

After each episode, it updates the Q-values using the episode trajectory and the returns obtained.
Returns are calculated recursively from the end of the episode.
Q-values are updated using the incremental mean formula.

Policy Derivation:

Once all episodes are completed, it derives the optimal policy by selecting actions with the highest Q-values for each state.

Output:

Returns the derived optimal policy, Q-values, and the total number of steps taken in each episode.
Overall, Monte Carlo ES learns the optimal policy by iteratively exploring the environment through episodes and updating Q-values accordingly. Finally, it extracts the optimal policy from the learned Q-values.

In [3]:
def monte_carlo_es(env, n_episodes=500):
    Q = np.zeros((env.observation_space.n, env.action_space.n))
    N = np.zeros((env.observation_space.n, env.action_space.n))
    gamma = 1.0
    total_steps = []

    for i in range(n_episodes):
        state = env.reset()
        episode = []
        done = False
        steps = 0

        # generate an episode using exploring starts
        while not done:
            action = np.random.choice(env.action_space.n)
            next_state, reward, done, info = env.step(action)
            episode.append((state, action, reward))
            state = next_state
            steps += 1
        total_steps.append(steps)

        # update Q values using the episode
        returns = 0
        for j in range(len(episode)-1, -1, -1):
            state, action, reward = episode[j]
            returns = gamma*returns + reward
            N[state][action] += 1
            Q[state][action] += (returns - Q[state][action])/N[state][action]

    # derive optimal policy from Q values
    policy = np.argmax(Q, axis=1)

    return policy, Q, total_steps

In [4]:
def on_policy_mc_control(env, n_episodes=500, epsilon=0.1):
    Q = np.zeros((env.observation_space.n, env.action_space.n))
    N = np.zeros((env.observation_space.n, env.action_space.n))
    gamma = 1.0
    total_steps = []

    for i in range(n_episodes):
        state = env.reset()
        done = False
        steps = 0

        # generate an episode using Ɛ-soft policy
        while not done:
            if np.random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[state])
            next_state, reward, done, info = env.step(action)
            N[state][action] += 1
            Q[state][action] += (reward + gamma*np.max(Q[next_state]) - Q[state][action])/N[state][action]
            state = next_state
            steps += 1
        total_steps.append(steps)

    # derive optimal policy from Q values
    policy = np.argmax(Q, axis=1)

    return policy, Q, total_steps

In [5]:
monte_carlo_es_policy, monte_carlo_es_q, total_steps_es = monte_carlo_es(env)
on_policy_mc_control_policy, on_policy_mc_control_q, total_steps_control = on_policy_mc_control(env)

  if not isinstance(terminated, (bool, np.bool8)):


In [6]:
print(str.format('Total Number of Steps taken to reach Optimal Policy using Monte Carlo ES: {}', sum(total_steps_es)))
print(str.format('Total Number of Steps taken to reach Optimal Policy using On-Policy First-Visit MC Control: {}', sum(total_steps_control)))


Total Number of Steps taken to reach Optimal Policy using Monte Carlo ES: 3484417
Total Number of Steps taken to reach Optimal Policy using On-Policy First-Visit MC Control: 18064


In [7]:
print(str.format('Average Number of Steps per Episode taken to reach Optimal Policy using Monte Carlo ES: {}', sum(total_steps_es)/len(total_steps_es)))
print(str.format('Average Number of Steps per Episode taken to reach Optimal Policy using On-Policy First-Visit MC Control: {}', sum(total_steps_control)/len(total_steps_control)))


Average Number of Steps per Episode taken to reach Optimal Policy using Monte Carlo ES: 6968.834
Average Number of Steps per Episode taken to reach Optimal Policy using On-Policy First-Visit MC Control: 36.128
