In [None]:
# Environment setup cell: installs GitPython and clones the course repository
# into the current working directory.
# install gitPython
import os, sys, time
# NOTE(review): the install is unpinned; consider `%pip install -q gitPython==<version>`
# so the package lands in the kernel's environment and re-runs are reproducible.
!pip install gitPython

# clone my repository
# NOTE(review): `git` (GitPython) is imported here, but the clone below is done
# with the shell `git` command and `git` is not referenced in the visible cells.
import git
!git clone https://github.com/sungbinlim/RLclass.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gitPython
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.10-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting smmap<6,>=3.0.1
  Downloading smmap-5.0.0-py3-none-any.whl (24 kB)
Installing collected packages: smmap, gitdb, gitPython
Successfully installed gitPython-3.1.31 gitdb-4.0.10 smmap-5.0.0
Cloning into 'RLclass'...
remote: Enumerating objects: 59, done.[K
remote: Counting objects: 100% (59/59), done.[K
remote: Compressing objects: 100% (47/47), done.[K
remote: Total 59 (delta 15), reused 44 (delta 0), pack-reused 0[K
Unpacking objects: 100% (59/59), 10.74 KiB | 423.00 KiB/s

In [None]:
# append package address
# Changes the working directory to the grid_world folder of the cloned
# repository and makes that folder importable.
# NOTE(review): the absolute /content/... path assumes the repository was cloned
# under /content (presumably a Colab runtime) -- confirm before running elsewhere.
%cd /content/RLclass/STAT436/grid_world/
sys.path.append("/content/RLclass/STAT436/grid_world/") 
# Star import: names used by later cells (e.g. `Agent`) presumably come from
# this module -- verify against grid_world.
from grid_world import *

/content/RLclass/STAT436/grid_world


## Monte-Carlo Methods

Now we turn to the realistic case in which we have no access to the dynamics of the environment, i.e., we can solve neither the Bellman equation nor the Bellman optimality equation. Hence we should not rely on the `pi_dynamics` class. How can we obtain the optimal policy $\pi_{\ast}$?

As a matter of fact, *Policy Iteration* requires the value functions $v_{\pi}$ and $q_{\pi}$, which can be obtained without solving the Bellman equation: they can be estimated from the returns of sampled episodes. We will see how to do this below.

In [None]:
import numpy as np

def compute_position(state, n_cols=4):
    """
    Convert a flat state index into a (row, column) grid position.

    input: state  -- flat, row-major state index (0..11 on the default 3 x 4 grid)
           n_cols -- number of columns of the grid (default 4)
    output: np.array([row, column])
    """
    # divmod yields (state // n_cols, state % n_cols) in a single step
    row, col = divmod(state, n_cols)
    return np.array([row, col])

def reverse_position(array, n_cols=4):
    """
    Convert a (row, column) grid position back into a flat state index.

    input: array  -- indexable pair, array[0] is the row and array[1] the column
           n_cols -- number of columns of the grid (default 4)
    output: row * n_cols + column (0..11 on the default 3 x 4 grid)
    """
    row, col = array[0], array[1]
    return row * n_cols + col

def create_value_memory(dim=(12, 4)):
    """
    Allocate an all-zero value table.

    input: dim -- shape of the table, (12, 4) by default
    output: float array of zeros with shape `dim`
    """
    return np.zeros(shape=dim)

def action_to_index(action):
    """
    Map an action name to its column index.

    input: action -- one of 'up', 'left', 'right', 'down'
    output: 0, 1, 2 or 3 respectively
    raises: ValueError for any other value
    """
    # the position in this tuple is the index: up, left, right, down
    for index, name in enumerate(('up', 'left', 'right', 'down')):
        if action == name:
            return index
    raise ValueError('not proper action')

def transform_trajectory_memory(trajectory):
    """
    Encode a trajectory as a list of (state index, action index) tuples.

    input: trajectory -- iterable of pairs; pair[0] is a grid position and
                         pair[1] an action name
    output: list of (state, action) index tuples, in the same order
    """
    encoded = []
    for pair in trajectory:
        state_index = reverse_position(pair[0])
        action_index = action_to_index(pair[1])
        encoded.append((state_index, action_index))
    return encoded

def mc_eval(history, reward_stat, gamma, update=0.99):
    """
    Monte-Carlo estimation of the action-value table from sampled episodes.

    input: history     -- iterable of trajectories; each trajectory is a sequence
                          of (position, action name) pairs
           reward_stat -- one reward per trajectory, credited to its last step
           gamma       -- discount factor applied once per step going backwards
           update      -- step size of the incremental update (default 0.99)
    output: value estimation, an array shaped like create_value_memory()
    """
    value_memory = create_value_memory()

    for trajectory, reward in zip(history, reward_stat):

        trajectory_ = transform_trajectory_memory(trajectory)
        if not trajectory_:
            # an empty episode carries no (state, action) pair to learn from
            continue

        returns = np.zeros_like(value_memory)
        visited = np.zeros(value_memory.shape, dtype=bool)

        # Walk the episode backwards: the last step receives the raw reward and
        # every earlier step one more factor of gamma. When a pair occurs more
        # than once, the earliest occurrence is written last and therefore wins.
        G = reward
        for state, action in reversed(trajectory_):
            returns[state, action] = G
            visited[state, action] = True
            G = gamma * G

        # Move only the pairs seen in this episode towards their return. Applying
        # the step to the whole table would drag every unvisited pair towards 0
        # on each episode and erase what earlier episodes had learned.
        value_memory[visited] += update * (returns[visited] - value_memory[visited])

    return value_memory

# from Policy Iteration
def one_hot(scalar, dim):
    """
    Build a one-hot vector.

    input: scalar -- index of the entry set to 1
           dim    -- length of the vector
    output: float array of zeros with a single 1 at `scalar`
    """
    encoded = np.full(dim, 0.0)
    encoded[scalar] = 1.0
    return encoded

def greedy_action(array, dim):
    """
    Spread probability mass uniformly over a set of action indices.

    input: array -- integer index array of the selected actions
           dim   -- length of the output vector
    output: float array that holds 1 / len(array) at every selected index and
            0 elsewhere (all zeros when `array` is empty)
    """
    probs = np.zeros(dim)
    n_selected = array.shape[0]
    if n_selected:
        # one vectorized write instead of one write per index
        probs[array] = 1 / n_selected
    return probs

def argmax(vec, tie=True):
    """
    Arg-max with optional tie reporting.

    input: vec -- array of values
           tie -- when true, return every index attaining the maximum;
                  otherwise behave like np.argmax
    output: index array of all maximizers (tie) or a single index (no tie)
    """
    if not tie:
        # ordinary argmax
        return np.argmax(vec)

    peak = np.max(vec)
    return np.nonzero(vec == peak)[0]

# update policy w/ greedy policy
def update_policy(policy, action_value):
    """
    Build the greedy policy with respect to an action-value table.

    input: policy       -- current policy table (states x actions); only its
                           shape and dtype are used
           action_value -- action-value table with one row per state
    output: table shaped like `policy` whose row for each state puts equal
            probability on every action attaining that state's maximum value
    """
    greedy_policy = np.zeros_like(policy)
    # take the table size from the policy itself instead of assuming 12 x 4
    n_states, n_actions = greedy_policy.shape[0], greedy_policy.shape[1]

    for state in range(n_states):
        # all maximizing actions, ties included
        best_actions = argmax(action_value[state, :])
        greedy_policy[state] = greedy_action(best_actions, n_actions)

    return greedy_policy

# Policy Improvement w/ MC
def mc_policy_iteration(pi_init, agent, gamma, eps=1e-8, play_num=100, epsilon=None):
    """
    Policy iteration whose evaluation step is done with Monte-Carlo rollouts.

    input: pi_init  -- initial policy table
           agent    -- callable building an agent: agent(pi) / agent(pi, epsilon);
                       the agent's play(play_num, stat=False) must return
                       (history, reward_stat, success_rate)
           gamma    -- discount factor forwarded to mc_eval
           eps      -- stop once the summed absolute change of the action values
                       between two consecutive evaluations is <= eps
           play_num -- number of episodes played per evaluation
           epsilon  -- initial value of the second argument given to agent
                       (presumably an exploration rate -- confirm against the
                       agent class); it is divided by the iteration count after
                       each iteration. None is forwarded unchanged.
    output: (pi_new, action_value_new) -- the last policy and its estimated values
    """

    # evaluate the initial policy
    pi = pi_init
    agent_ = agent(pi_init)
    epsilon_init = epsilon
    history, reward_stat, success_rate = agent_.play(play_num, stat=False)
    print("Iteration: 0, Success rate:{} %".format(success_rate * 100))
    action_value = mc_eval(history, reward_stat, gamma)

    advances = np.inf
    n_it = 0

    # `n_it <= 2` forces at least three iterations before the test on eps applies
    while advances > eps or n_it <= 2:

        # policy improvement
        pi_new = update_policy(pi, action_value)

        # policy evaluation
        agent_ = agent(pi_new, epsilon)
        history, reward_stat, success_rate = agent_.play(play_num, stat=False)
        action_value_new = mc_eval(history, reward_stat, gamma)

        # stop condition: total absolute change of the action-value table
        advances = np.sum(np.abs(action_value_new - action_value))

        # save policy and update values
        pi = pi_new
        action_value = action_value_new
        n_it += 1
        if epsilon_init is not None:
            # decay schedule; skipped when no epsilon was given, because
            # None / n_it would raise TypeError
            epsilon = epsilon_init / n_it

        if n_it % 10 == 0:
            print("Iteration: {}, Success rate:{} %, Error: {}, eps: {}".format(play_num * n_it, success_rate * 100, advances, epsilon))

    print("Monte-Carlo Policy Iteration converged. (Iteration={}, Error={})".format(play_num * n_it, advances))

    return pi_new, action_value_new

In [None]:
# Driver cell: runs Monte-Carlo policy iteration from a uniform random policy,
# times it, then plays the resulting policy and prints its statistics.
# NOTE(review): `time` is already imported in the first cell; this re-import is redundant.
import time

gamma = 0.99
# random policy function
pi = np.array([0.25, 0.25, 0.25, 0.25]) #up, left, right, down
# repeat the uniform row for each of the 12 states -> table of shape (12, 4)
pi = np.reshape(np.tile(pi, 12), (12, 4))

print("\nUpdating Policy via Policy Iteration w/ Monte-Carlo")
start_time = time.time()
# NOTE(review): no random seed is set in the visible cells, so results may differ between runs.
pi_new, action_value_new = mc_policy_iteration(pi, Agent, gamma, play_num=100, epsilon=0.1)
end_time = time.time()
computation_time = end_time - start_time
print("Wall-clock time for Policy Iteration: {} sec\n".format(np.round(computation_time, 4)))

print("Let's run grid world!")
agent = Agent(pi_new)
# NOTE(review): with stat=False the result of play() is unpacked into three values
# elsewhere; this line assumes stat=True returns the success rate alone -- confirm.
success_rate = agent.play(100, stat=True)
agent.show_policy()
print("action value:\n {}".format(np.round(action_value_new, 3)))
print("Success rate:{} %".format(success_rate * 100))



Updating Policy via Policy Iteration w/ Monte-Carlo
Iteration: 0, Success rate:26.0 %
Iteration: 1000, Success rate:100.0 %, Error: 0.9605959906929904, eps: 0.01
Iteration: 2000, Success rate:90.0 %, Error: 2.3986406845221917, eps: 0.005
Iteration: 3000, Success rate:100.0 %, Error: 0.01930702880806834, eps: 0.0033333333333333335
Iteration: 4000, Success rate:100.0 %, Error: 1.920813525790919e-06, eps: 0.0025
Monte-Carlo Policy Iteration converged. (Iteration=4400, Error=9.415733559358212e-53)
Wall-clock time for Policy Iteration: 43.3109 sec

Let's run grid world!
-----------------
| → | → | → | ↑ | 
-----------------
| ↑ | z | ↑ | ↑ | 
-----------------
| ↑ | ← | ↑ | ↑ | 
-----------------
action value:
 [[0.    0.    0.98  0.   ]
 [0.    0.    0.99  0.   ]
 [0.    0.    1.    0.   ]
 [0.    0.    0.    0.   ]
 [0.97  0.    0.    0.   ]
 [0.    0.    0.    0.   ]
 [0.    0.    0.    0.   ]
 [0.    0.    0.    0.   ]
 [0.961 0.    0.    0.   ]
 [0.    0.    0.    0.   ]
 [0.    0.   