In [41]:
pip install git+https://github.com/mimoralea/gym-walk#egg=gym-walk

Collecting gym-walk
  Cloning https://github.com/mimoralea/gym-walk to /tmp/pip-install-7p0vj5j3/gym-walk_8fe43e9fad88412ea3005c6c0b497825
  Running command git clone --filter=blob:none --quiet https://github.com/mimoralea/gym-walk /tmp/pip-install-7p0vj5j3/gym-walk_8fe43e9fad88412ea3005c6c0b497825
  Resolved https://github.com/mimoralea/gym-walk to commit b915b94cf2ad16f8833a1ad92ea94e88159279f5
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [42]:
# Silence every warning up front; this runs before gym is imported, so warnings
# raised while importing gym are hidden as well.
import warnings ; warnings.filterwarnings('ignore')

import gym
import numpy as np

import random
# NOTE(review): `warnings` is already imported at the top of this cell; this second import is redundant.
import warnings

# NOTE(review): redundant while the blanket 'ignore' filter installed above is active.
warnings.filterwarnings('ignore', category=DeprecationWarning)
# Print numpy floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)
# Seed both global random number generators.
random.seed(123); np.random.seed(123);

In [43]:
def print_policy(pi, P, action_symbols=('<', 'v', '>', '^'), n_cols=4, title='Policy:'):
    """Print a deterministic policy as a grid of action symbols.

    Args:
        pi: callable mapping a state index to an action index.
        P: transition model; ``P[s][a]`` is an iterable of
            ``(prob, next_state, reward, done)`` tuples.
        action_symbols: symbol to print for each action index.
        n_cols: number of cells per printed row.
        title: heading printed before the grid.

    A state whose every transition is flagged ``done`` is treated as terminal:
    its cell is left blank and ``pi`` is not queried for it, so a policy only
    needs to be defined on non-terminal states to be printed.
    """
    print(title)
    arrs = dict(enumerate(action_symbols))
    for s in range(len(P)):
        print("| ", end="")
        if np.all([done for action in P[s].values() for _, _, _, done in action]):
            print("".rjust(9), end=" ")
        else:
            # Only look the action up where it is actually displayed.
            a = pi(s)
            print(str(s).zfill(2), arrs[a].rjust(6), end=" ")
        if (s + 1) % n_cols == 0: print("|")
    # Close a partially filled last row, as print_state_value_function does.
    if len(P) % n_cols != 0: print("|")

In [67]:
def print_state_value_function(V, P, n_cols=4, prec=3, title='State-value function:'):
    """Print a state-value function as a grid, one cell per state.

    Args:
        V: indexable of state values, read as ``V[s]`` for every state in ``P``.
        P: transition model; ``P[s][a]`` is an iterable of
            ``(prob, next_state, reward, done)`` tuples.
        n_cols: number of cells per printed row.
        prec: number of decimals shown for each value.
        title: heading printed before the grid.

    States whose every transition is flagged ``done`` are printed as blank cells.
    A partially filled last row is still closed with a trailing ``|``.
    """
    print(title)
    width = prec + 7
    n_states = len(P)
    for state in range(n_states):
        value = V[state]
        is_terminal = all(
            done
            for transitions in P[state].values()
            for _, _, _, done in transitions
        )
        cell = "" if is_terminal else f"{state:02d} {value:.{prec}f}"
        print("|", cell.rjust(width), end=" ")
        if (state + 1) % n_cols == 0:
            print("|")
    if n_states % n_cols:
        print("|")

In [52]:
def probability_success(env, pi, goal_state, n_episodes=100, max_steps=200):
    """Estimate how often following ``pi`` ends an episode in ``goal_state``.

    Args:
        env: environment exposing ``seed(n)``, ``reset() -> state`` and
            ``step(action) -> (state, reward, done, info)``.
        pi: callable mapping a state to an action.
        goal_state: state that counts as a success when an episode stops there.
        n_episodes: number of episodes to simulate.
        max_steps: per-episode step cap; an episode cut off by the cap counts
            as a success only if it happens to be in ``goal_state``.

    Returns:
        Fraction of episodes whose final state equals ``goal_state``.
    """
    # Re-seed everything so repeated calls simulate the same episodes.
    for seeder in (random.seed, np.random.seed, env.seed):
        seeder(123)

    reached_goal = []
    for _episode in range(n_episodes):
        state = env.reset()
        done = False
        steps = 0
        while steps < max_steps and not done:
            state, _reward, done, _info = env.step(pi(state))
            steps += 1
        reached_goal.append(state == goal_state)
    return np.sum(reached_goal) / len(reached_goal)

In [53]:
def mean_return(env, pi, n_episodes=100, max_steps=200):
    """Estimate the average undiscounted return obtained by following ``pi``.

    Args:
        env: environment exposing ``seed(n)``, ``reset() -> state`` and
            ``step(action) -> (state, reward, done, info)``.
        pi: callable mapping a state to an action.
        n_episodes: number of episodes to simulate.
        max_steps: per-episode step cap.

    Returns:
        Mean over episodes of the summed per-step rewards.
    """
    # Re-seed everything so repeated calls simulate the same episodes.
    random.seed(123)
    np.random.seed(123)
    env.seed(123)

    episode_returns = []
    for _episode in range(n_episodes):
        state = env.reset()
        done, steps, total = False, 0, 0.0
        while steps < max_steps and not done:
            state, reward, done, _info = env.step(pi(state))
            total += reward
            steps += 1
        episode_returns.append(total)
    return np.mean(episode_returns)

**FROZEN LAKE MDP**

In [54]:
# Build the FrozenLake environment used throughout the notebook.
env = gym.make('FrozenLake-v1')
# `unwrapped` reaches the base environment however many wrappers gym.make adds,
# whereas `env.env` strips exactly one wrapper and relies on attribute
# forwarding through the rest.
P = env.unwrapped.P
init_state = env.reset()
# State index treated as the goal by probability_success below.
goal_state = 15
# Action indices, in the same order as print_policy's default symbols ('<', 'v', '>', '^').
LEFT, DOWN, RIGHT, UP = range(4)

In [55]:
# Transition model: P[state][action] is a list of (probability, next_state, reward, done) tuples.
P

{0: {0: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 4, 0.0, False)],
  1: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 4, 0.0, False),
   (0.3333333333333333, 1, 0.0, False)],
  2: [(0.3333333333333333, 4, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)],
  3: [(0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)]},
 1: {0: [(0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 5, 0.0, True)],
  1: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 5, 0.0, True),
   (0.3333333333333333, 2, 0.0, False)],
  2: [(0.3333333333333333, 5, 0.0, True),
   (0.3333333333333333, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False)],
  3: [(0.3333333333333333, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)]},
 2:

In [56]:
# State returned by env.reset() when the environment was set up.
init_state

0

In [57]:
# Action chosen by the first hand-written policy in each state.
# The table is built once, so every pi_frozenlake(s) call is a plain lookup
# instead of re-creating the dict. An entry is needed for every state because
# policy_evaluation queries the policy for all of them.
PI_FROZENLAKE_ACTIONS = {
    0: RIGHT,
    1: DOWN,
    2: RIGHT,
    3: LEFT,
    4: DOWN,
    5: LEFT,
    6: RIGHT,
    7: LEFT,
    8: UP,
    9: DOWN,
    10: LEFT,
    11: DOWN,
    12: RIGHT,
    13: RIGHT,
    14: DOWN,
    15: LEFT,  # Stop
}


def pi_frozenlake(s):
    """Return the first policy's action for state ``s`` (KeyError if ``s`` is unknown)."""
    return PI_FROZENLAKE_ACTIONS[s]


print_policy(pi_frozenlake, P, action_symbols=('<', 'v', '>', '^'), n_cols=4)

Policy:
| 00      > | 01      v | 02      > | 03      < |
| 04      v |           | 06      > |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      v |           |


In [59]:
# Simulate the first policy (helper defaults: 100 episodes, re-seeded on each call)
# and report its success rate and mean undiscounted return.
print('Reaches goal {:.2f}%. Obtains an average undiscounted return of {:.4f}.'.format(
    probability_success(env, pi_frozenlake, goal_state=goal_state) * 100,
    mean_return(env, pi_frozenlake)))

Reaches goal 10.00%. Obtains an average undiscounted return of 0.1000.


In [60]:
# Action chosen by the second hand-written policy in each state.
# The table is built once, so every pi_2(s) call is a plain lookup instead of
# re-creating the dict. An entry is needed for every state because
# policy_evaluation queries the policy for all of them.
PI_2_ACTIONS = {
    0: RIGHT,
    1: DOWN,
    2: LEFT,
    3: UP,
    4: LEFT,
    5: RIGHT,
    6: RIGHT,
    7: LEFT,
    8: RIGHT,
    9: UP,
    10: LEFT,
    11: LEFT,
    12: RIGHT,
    13: DOWN,
    14: DOWN,
    15: DOWN,  # Stop
}


def pi_2(s):
    """Return the second policy's action for state ``s`` (KeyError if ``s`` is unknown)."""
    return PI_2_ACTIONS[s]


print("Name: DHARSHINI K")
print("Register Number: 212223230047\n")
print_policy(pi_2, P, action_symbols=('<', 'v', '>', '^'), n_cols=4)

Name: DHARSHINI K
Register Number: 212223230047

Policy:
| 00      > | 01      v | 02      < | 03      ^ |
| 04      < |           | 06      > |           |
| 08      > | 09      ^ | 10      < |           |
|           | 13      v | 14      v |           |


In [61]:
# Estimate the probability of success and the mean undiscounted return of the
# second policy (pi_2), using the same helper defaults and seeds as for the first policy.
print('Reaches goal {:.2f}%. Obtains an average undiscounted return of {:.4f}.'.format(
    probability_success(env, pi_2, goal_state=goal_state) * 100,
    mean_return(env, pi_2)))

Reaches goal 2.00%. Obtains an average undiscounted return of 0.0200.


In [62]:
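# Compare your policy with the first policy
# The first policy is better than the second: in simulation it reaches the goal in 10% of episodes (mean return 0.10), versus 2% (mean return 0.02) for the second policy.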

**POLICY EVALUATION**

In [63]:
def policy_evaluation(pi, P, gamma=1.0, theta=1e-10):
    """Iteratively compute the state-value function of policy ``pi``.

    Args:
        pi: callable mapping a state index to an action index.
        P: transition model; ``P[s][a]`` is an iterable of
            ``(prob, next_state, reward, done)`` tuples.
        gamma: discount factor applied to the value of the next state.
        theta: convergence threshold on the largest per-state change
            between two consecutive sweeps.

    Returns:
        numpy array with one value per state, taken from the first sweep
        whose largest change from the previous sweep is below ``theta``.
    """
    n_states = len(P)
    previous = np.zeros(n_states)
    while True:
        current = np.zeros(n_states)
        for s in range(n_states):
            expected = 0.0
            for prob, next_state, reward, done in P[s][pi(s)]:
                # Transitions flagged done contribute no bootstrapped value.
                bootstrapped = gamma * previous[next_state] * (not done)
                expected += prob * (reward + bootstrapped)
            current[s] = expected
        largest_change = np.max(np.abs(previous - current))
        if largest_change < theta:
            return current
        previous = current

In [81]:
# Evaluate the first policy: state values under pi_frozenlake with discount factor 0.99,
# printed as a 4-column grid with 5 decimals.
V1 = policy_evaluation(pi_frozenlake, P,gamma=0.99)
print_state_value_function(V1, P, n_cols=4, prec=5)

State-value function:
|   00 0.11448 |   01 0.08191 |   02 0.13372 |   03 0.06586 |
|   04 0.15053 |              |   06 0.20562 |              |
|   08 0.30562 |   09 0.46997 |   10 0.48938 |              |
|              |   13 0.62915 |   14 0.80739 |              |


In [82]:
# Evaluate the second policy: state values under pi_2 with discount factor 0.99,
# printed as a 4-column grid with 5 decimals.
V2 = policy_evaluation(pi_2, P, gamma=0.99)
print_state_value_function(V2, P, n_cols=4, prec=5)

State-value function:
|   00 0.04451 |   01 0.04176 |   02 0.08202 |   03 0.07961 |
|   04 0.04861 |              |   06 0.12478 |              |
|   08 0.05419 |   09 0.11559 |   10 0.29609 |              |
|              |   13 0.32353 |   14 0.65686 |              |


In [83]:
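# Comparing the two policies
# The first policy has the higher state value in every non-terminal state except state 03 (0.06586 vs 0.07961), so it is better overall, although it does not dominate the second policy in every state.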

In [93]:
# Compare the two policies based on the value function using the above equation and find the best policy.
# Element-wise result: True wherever the first policy's state value is at least the second policy's.
V1>=V2

array([ True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

In [94]:
# Declare a policy the better one when its state value is at least the other's
# in 11 or more states; otherwise report no clear winner.
# NOTE(review): 11 is a magic number, presumably the count of non-terminal
# states; confirm. States where both value functions are equal count as True in
# both comparisons, so they contribute towards the threshold for either policy.
if (V1 >= V2).sum() >= 11:
    print("The first policy is the better policy")
elif (V2 >= V1).sum() >= 11:
    print("The second policy is the better policy")
else:
    print("Both policies have their merits.")

The first policy is the better policy
