In [1]:
import numpy as np
import pandas as pd
import scipy
from typing import TypeVar,Mapping, Set, Generic, Sequence, Callable

### Interface for tabular RL algorithms
- The core of this interface should be a mapping from a (state, action) pair to a sampling of the (next state, reward) pair. It is important that this interface doesn't present the state-transition probability model or the reward model.

In [2]:
# Modify MDP class
class MDPforRL_TB():
    
    # note that state and actions are defined as int in this part 
    # extension 1: can create a dict of state2idx outside of the class
    # extension 2: can change input simulator to s RV generation function (vs. pre-defined np)
    def __init__(self, state_action_tab: dict, # get available actions from each state
                 state_action_simulator: np.ndarray, # probability of [s_cur,a,s_next]
                 reward_simulator: np.ndarray, # reward if [s_cur,a]
                 terminal_states: list,
                 gamma: float) -> None:
        super(MDPforRL_TB, self).__init__()
        self.state = state_action_tab.keys()
        self.state_action_tab = state_action_tab
        self.state_action_simulator = state_action_simulator
        self.reward_simulator = reward_simulator
        self.terminal_states = terminal_states
        self.gamma = gamma
    
    def get_state_action_simulator():
        return self.state_action_simulator
    
    def gen_init_state(self): # uniform start
        init_state = np.random.choice(len(self.state_action_simulator), 1)[0]
        return init_state
    
    def gen_next_state_reward(self,S,A):
        u = np.random.uniform(0,1)
        cdf = np.cumsum(self.state_action_simulator[S,A,:])
        next_state = np.where(cdf > u)[0][0]
        step_reward = self.reward_simulator[S,A]
        return next_state, step_reward
    
    def get_avail_actions(self, S):
        return self.state_action_tab[S]
    
    def get_state(self):
        return list(self.state)
    
    def get_terminal_states(self):
        return self.terminal_states

In [3]:
class tab_RL_interface():

    def __init__(self, mdp: MDPforRL_TB):
        super(tab_RL_interface).__init__()
        self.mdp = mdp
#         self.num_episode= num_episode
#         self.len_episode= len_episode
#         self.init_method = init_method
    
    # Generate initial step
    def init_state_gen(self) -> tuple:
        return mdp.gen_init_state()
    
    # Generate next step
    def next_state_gen(self, cur_state: int, cur_act: int) -> tuple:
        return mdp.gen_next_state_reward(cur_state,cur_act)
    
    # Get available actions
    def get_avail_actions(self, cur_state):
        return mdp.get_avail_actions(cur_state)
    
    # Get states
    def get_states(self):
        return list(self.mdp.get_state())
    def get_terminal_states(self):
        return mdp.terminal_states

### Every visit Monte-Carlo Value prediction
$$V(S_t) \leftarrow V(S_t) + \frac{1}{N(S_t)}(G_t - V(S_t))$$

In [56]:
# Step 1: Generate episodes by policy get_action() S->A
def gen_episode(tb_rl:tab_RL_interface, num_episode: int, len_episode: int, 
                get_action: Callable[[int], int]) ->list:
    # get_action: a rv generation function which takes a state index and generates an action index
    MC_path = []
    for i in range(num_episode):
        trial = []
        s_cur = tb_rl.init_state_gen()
        act = get_action(s_cur)
        s_next, reward = tb_rl.next_state_gen(s_cur,act)
        trial.append((s_cur, act, reward))
        for j in range(len_episode):
            if s_next not in tb_rl.get_terminal_states():
                s_cur = s_next
                act = get_action(s_cur)
                s_next, reward = tb_rl.next_state_gen(s_cur,act)
                trial.append((s_cur, act, reward))
        MC_path.append(trial)
    
    return MC_path # list of list of tuples (s_cur, action, reward)

# Step 2: Get value function prediction
def mc_prediction(tb_rl:tab_RL_interface, num_episode: int, len_episode: int, 
                  get_action: Callable[[int], int]) -> dict:
    MC_path = gen_episode(tb_rl, num_episode, len_episode, get_action)
    sum_return = dict()
    count_return = dict()
    for i in range(len(MC_path)):
        for j,elem in enumerate(MC_path[i]):
            if MC_path[i][j][0] in sum_return.keys():
                sum_return[MC_path[i][j][0]] += MC_path[i][j][2]
                count_return[MC_path[i][j][0]] += 1
            else:
                sum_return.update({MC_path[i][j][0]: MC_path[i][j][2]})
                count_return.update({MC_path[i][j][0]: 1})
    val = dict()
    for s in sum_return.keys():
        val[s] = sum_return[s]/count_return[s]
    
    return val

### 1-step TD algorithm Value Function prediction
$$V(S_t) \leftarrow V(S_t) + \alpha (R_{t+1} + \gamma V(S_{t+1}) - V(S_t))$$

In [62]:
def td_prediction(tb_rl:tab_RL_interface, num_episode: int, len_episode: int, 
                  alpha: float, gamma: float, get_action: Callable[[int], int]) -> dict:
    # initiate value function
    val = {s: 0.0 for s in tb_rl.get_states()}
    i = 0
    while i < num_episode:
        j = 0
        s_cur = tb_rl.init_state_gen()
        while j < len_episode:
            act = get_action(s_cur)
            if s_next not in tb_rl.get_terminal_states():
                s_next, reward = tb_rl.next_state_gen(s_cur,act)
                val[s_cur] += alpha*(reward + gamma*val[s_next] - val[s_cur])
                s_cur = s_next
                j += 1
            else:
                break
        i += 1
    return val

### Test the above implementations

In [53]:
state_action_tab = {0:[0,1,2],1:[0,1,2],
                   2:[0,1,2],3:[0,1,2]} # get available actions from each state
state_action_simulator = np.asarray([0.25]*48).reshape((4,3,4))# probability of [s_cur,a,s_next]
reward_simulator = np.asarray([0,1,3,2,1,3,2,0,3,2,1,4]).reshape((4,3))# reward if [s_cur,a]
gamma = 1
terminal_states = []
mdp = MDPforRL_TB(state_action_tab,state_action_simulator,reward_simulator,terminal_states,gamma)
tb_rl = tab_RL_interface(mdp)
def get_action(s):
    if s == 0:
        return 2
    elif s == 1:
        return 1
    else:
        return 0

##### MC

In [58]:
MC_path = gen_episode(tb_rl, 10, 10, get_action)
print(MC_path)
print(mc_prediction(tb_rl, 10, 10, get_action))

[[(0, 2, 3), (1, 1, 1), (0, 2, 3), (1, 1, 1), (1, 1, 1), (3, 0, 2), (2, 0, 2), (0, 2, 3), (0, 2, 3), (0, 2, 3), (0, 2, 3)], [(2, 0, 2), (1, 1, 1), (3, 0, 2), (1, 1, 1), (1, 1, 1), (3, 0, 2), (2, 0, 2), (0, 2, 3), (3, 0, 2), (3, 0, 2), (2, 0, 2)], [(0, 2, 3), (1, 1, 1), (3, 0, 2), (1, 1, 1), (3, 0, 2), (3, 0, 2), (2, 0, 2), (0, 2, 3), (2, 0, 2), (2, 0, 2), (1, 1, 1)], [(0, 2, 3), (0, 2, 3), (2, 0, 2), (2, 0, 2), (1, 1, 1), (1, 1, 1), (1, 1, 1), (1, 1, 1), (3, 0, 2), (3, 0, 2), (3, 0, 2)], [(2, 0, 2), (0, 2, 3), (0, 2, 3), (3, 0, 2), (3, 0, 2), (1, 1, 1), (1, 1, 1), (2, 0, 2), (2, 0, 2), (0, 2, 3), (0, 2, 3)], [(0, 2, 3), (0, 2, 3), (2, 0, 2), (0, 2, 3), (1, 1, 1), (2, 0, 2), (2, 0, 2), (0, 2, 3), (0, 2, 3), (1, 1, 1), (0, 2, 3)], [(3, 0, 2), (0, 2, 3), (3, 0, 2), (2, 0, 2), (2, 0, 2), (1, 1, 1), (2, 0, 2), (0, 2, 3), (1, 1, 1), (2, 0, 2), (1, 1, 1)], [(0, 2, 3), (1, 1, 1), (1, 1, 1), (0, 2, 3), (3, 0, 2), (0, 2, 3), (3, 0, 2), (3, 0, 2), (0, 2, 3), (2, 0, 2), (1, 1, 1)], [(2, 0, 2), (2,

##### TD

In [64]:
print(td_prediction(tb_rl, 10, 10, 0.1, 1, get_action))

{0: 6.4136523505484675, 1: 3.9193447595540682, 2: 5.190022426616782, 3: 5.009242945308076}


### Prove that fixed learning rate for MC is equivalent to an exponentially decaying average of episode returns
Given that
$$V^{k}(S_t) = V^{k-1}(S_t) + \alpha (G_t^{k-1} -V^{k-1}(S_t))$$
We may get
$$
\begin{split}
V^{k}(S_t) &= (1-\alpha) V^{k-1}(S_t) + \alpha G_t^{k-1}\\
& = (1-\alpha)(V^{k-2}(S_t)+\alpha (G_t^{k-2} -V^{k-2}(S_t)))+ \alpha G_t^{k-1}\\
& = \alpha G_t^{k-1} +(1-\alpha)\alpha G_t^{k-2} +...+ (1-\alpha)^{k-1} \alpha G_t^{0} +(1-\alpha)^k V^{0}(S_t)
\end{split}
$$
Which is equivalent to an exponentially decaying average:  https://en.wikipedia.org/wiki/Exponential_smoothing