In [15]:
pip install git+https://github.com/mimoralea/gym-walk#egg=gym-walk gymnasium

Collecting gym-walk
  Cloning https://github.com/mimoralea/gym-walk to /tmp/pip-install-10e4zxpa/gym-walk_dbb6fcb8247e4c8f89e7711a0f6a1c91
  Running command git clone --filter=blob:none --quiet https://github.com/mimoralea/gym-walk /tmp/pip-install-10e4zxpa/gym-walk_dbb6fcb8247e4c8f89e7711a0f6a1c91
  Resolved https://github.com/mimoralea/gym-walk to commit b915b94cf2ad16f8833a1ad92ea94e88159279f5
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [74]:
import warnings ; warnings.filterwarnings('ignore')

import gymnasium as gym # changed import gym, gym_walk to gymnasium as gym, gym_walk
import gym_walk
import numpy as np

import random
import warnings

warnings.filterwarnings('ignore', category=DeprecationWarning)
np.set_printoptions(suppress=True)
random.seed(123); np.random.seed(123)

In [75]:
def print_policy(pi, P, action_symbols=('<', 'v', '>', '^'), n_cols=4, title='Policy:'):
    print(title)
    arrs = {k:v for k,v in enumerate(action_symbols)}
    for s in range(len(P)):
        a = pi[s]
        print("| ", end="")
        if np.all([done for action in P[s].values() for _, _, _, done in action]):
            print("".rjust(9), end=" ")
        else:
            print(str(s).zfill(2), arrs[a].rjust(6), end=" ")
        if (s + 1) % n_cols == 0: print("|")

In [87]:
def print_state_value_function(V, P, n_cols=4, prec=3, title='State-value function:'):
    print(title)
    for s in range(len(P)):
        v = V[s]
        print("| ", end="")
        if np.all([done for action in P[s].values() for _, _, _, done in action]):
            print("".rjust(9), end=" ")
        else:
            print(str(s).zfill(2), '{}'.format(np.round(v, prec)).rjust(6), end=" ")
        if (s + 1) % n_cols == 0: print("|")

In [116]:
def probability_success(env, pi, goal_state, n_episodes=100, max_steps=200):
    random.seed(123); np.random.seed(123)
    results = []
    for _ in range(n_episodes):
        state, _ = env.reset(seed=123) # Unpack state from the tuple
        done, steps = False, 0
        while not done and steps < max_steps:
            next_state, reward, terminated, truncated, _ = env.step(pi[state])
            done = terminated or truncated # Check for termination or truncation
            state = next_state # Update state
            steps += 1
        results.append(state == goal_state)
    return np.sum(results)/len(results)

In [99]:
def mean_return(env, pi, n_episodes=100, max_steps=200):
    random.seed(123); np.random.seed(123)
    results = []
    for _ in range(n_episodes):
        state, _ = env.reset(seed=123) # Unpack state from the tuple
        done, steps = False, 0
        results.append(0.0)
        while not done and steps < max_steps:
            next_state, reward, terminated, truncated, _ = env.step(pi[state]) # Unpack results from the 5-tuple
            done = terminated or truncated # Check for termination or truncation
            results[-1] += reward
            state = next_state # Update state
            steps += 1
    return np.mean(results)

In [108]:
env = gym.make('FrozenLake-v1')
P = env.unwrapped.P
init_state = env.reset()
goal_state = 15 # Corrected goal state for 4x4 FrozenLake
LEFT, DOWN, RIGHT, UP = range(4) # Defined all actions

In [109]:
P

{0: {0: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 4, 0.0, False)],
  1: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 4, 0.0, False),
   (0.3333333333333333, 1, 0.0, False)],
  2: [(0.3333333333333333, 4, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)],
  3: [(0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)]},
 1: {0: [(0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 5, 0.0, True)],
  1: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 5, 0.0, True),
   (0.3333333333333333, 2, 0.0, False)],
  2: [(0.3333333333333333, 5, 0.0, True),
   (0.3333333333333333, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False)],
  3: [(0.3333333333333333, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)]},
 2:

Exponentially decaying schedule


In [110]:
def decay_schedule(
    init_value, min_value, decay_ratio,
    max_steps, log_start = -2, log_base=10):

    decay_steps = int(max_steps * decay_ratio)
    rem_steps = max_steps - decay_steps

    values = np.logspace(
        log_start, 0, decay_steps,
        base=log_base, endpoint=True)[::-1]

    values = (values - values.min()) / (values.max() - values.min())
    values = (init_value - min_value) * values + min_value
    values = np.pad(values, (0, rem_steps), 'edge')
    return values

Exploratory Policy Trajectories

In [111]:
from itertools import count
import numpy as np
from tqdm import tqdm

In [112]:
from itertools import count
import numpy as np
from tqdm import tqdm

def generate_trajectory(
    select_action, Q, epsilon,
    env, max_steps=200):
    done, trajectory = False, []

    while True:
        state, _ = env.reset()
        trajectory = []

        for t in count():
            action = select_action(state, Q, epsilon)

            next_state, reward, done, _, _ = env.step(action)

            experience = (state, action, reward, next_state, done)
            trajectory.append(experience)

            if done:
                break
            if t >= max_steps - 1:
                break

            state = next_state

        if len(trajectory) > 0 or done:
            break

    return np.array(trajectory, dtype=object)

Monte Carlo control

In [113]:
def mc_control(env,
               gamma=1.0,
               init_alpha=0.5,
               min_alpha=0.01,
               alpha_decay_ratio=0.5,
               init_epsilon=1.0,
               min_epsilon=0.1,
               epsilon_decay_ratio=0.9,
               n_episodes=3000,
               max_steps=200,
               first_visit=True):

    nS, nA = env.observation_space.n, env.action_space.n

    discounts = np.logspace(
        0, max_steps,
        num=max_steps + 1, base=gamma,
        endpoint=True)

    alphas = decay_schedule(
        init_alpha, min_alpha,
        alpha_decay_ratio,
        n_episodes)

    epsilons = decay_schedule(
        init_epsilon, min_epsilon,
        epsilon_decay_ratio,
        n_episodes)

    pi_track = []
    Q = np.zeros((nS, nA), dtype=np.float64)
    Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float64)

    select_action = lambda state, Q, epsilon:np.argmax(Q[state]) if np.random.random() > epsilon else np.random.randint(len(Q[state]))

    for e in tqdm(range(n_episodes), leave=False):
        trajectory = generate_trajectory(select_action, Q, epsilons[e], env, max_steps)
        visited = np.zeros((nS, nA), dtype=bool)

        for t, (state, action, reward, _, _) in enumerate(trajectory):
            if visited[state][action] and first_visit:
                continue
            visited[state][action] = True

            G = 0
            for i, (_, _, r, _, _) in enumerate(trajectory[t:]):
                G += (gamma**i) * r

            Q[state][action] = Q[state][action] + alphas[e] * (G - Q[state][action])

        Q_track[e] = Q.copy()
        pi_track.append(np.argmax(Q, axis=1))

    v = np.max(Q, axis=1)
    pi = np.argmax(Q, axis=1)
    return Q, v, pi, Q_track, pi_track

In [114]:
print("NAME: JERUSHLIN JOSE JB")
print("REG NO: 212222240039")

optimal_Q, optimal_V, optimal_pi, Q_track, pi_track = mc_control (env)
print_state_value_function(optimal_Q, P, n_cols=4, prec=2, title='\nAction-value function:')
print_state_value_function(optimal_V, P, n_cols=4, prec=2, title='State-value function:')
print_policy(optimal_pi, P)

NAME: JERUSHLIN JOSE JB
REG NO: 212222240039


                                                     


Action-value function:
| 00 [0.23 0.2  0.19 0.2 ] | 01 [0.02 0.05 0.04 0.18] | 02 [0.07 0.13 0.05 0.05] | 03 [0.07 0.03 0.02 0.03] |
| 04 [0.25 0.09 0.15 0.11] |           | 06 [0.17 0.05 0.05 0.02] |           |
| 08 [0.13 0.2  0.16 0.28] | 09 [0.13 0.32 0.18 0.11] | 10 [0.42 0.27 0.28 0.17] |           |
|           | 13 [0.1  0.37 0.26 0.25] | 14 [0.4  0.67 0.69 0.61] |           |
State-value function:
| 00   0.23 | 01   0.18 | 02   0.13 | 03   0.07 |
| 04   0.25 |           | 06   0.17 |           |
| 08   0.28 | 09   0.32 | 10   0.42 |           |
|           | 13   0.37 | 14   0.69 |           |
Policy:
| 00      < | 01      ^ | 02      v | 03      < |
| 04      < |           | 06      < |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      v | 14      > |           |




In [115]:
print("NAME: JERUSHLIN JOSE JB")
print("REG NO: 212222240039")

print('\nReaches goal {:.2f}%. Obtains an average undiscounted return of {:.4f}.'.format(probability_success(env, optimal_pi, goal_state=goal_state)*100,mean_return(env, optimal_pi)))

NAME: JERUSHLIN JOSE JB
REG NO: 212222240039

Reaches goal 100.00%. Obtains an average undiscounted return of 1.0000.
