In [2]:
import numpy as np
import pandas as pd
import scipy
from typing import TypeVar,Mapping, Set, Generic, Sequence, Callable

### Forward-View TD

In [3]:
# Import class
class MDPforRL_TB():
    
    # note that state and actions are defined as int in this part 
    # extension 1: can create a dict of state2idx outside of the class
    # extension 2: can change input simulator to s RV generation function (vs. pre-defined np)
    def __init__(self, state_action_tab: dict, # get available actions from each state
                 state_action_simulator: np.ndarray, # probability of [s_cur,a,s_next]
                 reward_simulator: np.ndarray, # reward if [s_cur,a]
                 gamma: float) -> None:
        super(MDPforRL_TB, self).__init__()
        self.state = state_action_tab.keys()
        self.state_action_tab = state_action_tab
        self.state_action_simulator = state_action_simulator
        self.reward_simulator =  reward_simulator
        self.gamma = gamma
    
    def get_state_action_simulator():
        return self.state_action_simulator
    
    def gen_init_state(self): # uniform start
        init_state = np.random.choice(len(self.state_action_simulator), 1)[0]
        return init_state
    
    def gen_next_state_reward(self,S,A):
        u = np.random.uniform(0,1)
        cdf = np.cumsum(self.state_action_simulator[S,A,:])
        next_state = np.where(cdf > u)[0][0]
        step_reward = self.reward_simulator[S,A]
        return next_state, step_reward
    
    def get_avail_actions(self, S):
        return self.state_action_tab[S]
    
    def get_state(self):
        return list(self.state)

class tab_RL_interface():

    def __init__(self, mdp: MDPforRL_TB):
        super(tab_RL_interface).__init__()
        self.mdp = mdp
#         self.num_episode= num_episode
#         self.len_episode= len_episode
#         self.init_method = init_method
    
    # Generate initial step
    def init_state_gen(self) -> tuple:
        return mdp.gen_init_state()
    
    # Generate next step
    def next_state_gen(self, cur_state: int, cur_act: int) -> tuple:
        return mdp.gen_next_state_reward(cur_state,cur_act)
    
    # Get available actions
    def get_avail_actions(self, cur_state):
        return mdp.get_avail_actions(cur_state)
    
    # Get states
    def get_states(self):
        return list(self.mdp.get_state())

In [5]:
# Step 1: Generate episodes by policy get_action() S->A
def gen_episode(tb_rl:tab_RL_interface, num_episode: int, len_episode: int, 
                get_action: Callable[[int], int]) ->list:
    # get_action: a rv generation function which takes a state index and generates an action index
    TD_path = []
    for i in range(num_episode):
        trial = []
        s_cur = tb_rl.init_state_gen()
        act = get_action(s_cur)
        s_next, reward = tb_rl.next_state_gen(s_cur,act)
        trial.append((s_cur, act, reward))
        for j in range(len_episode):
            s_cur = s_next
            act = get_action(s_cur)
            s_next, reward = tb_rl.next_state_gen(s_cur,act)
            trial.append((s_cur, act, reward))
        TD_path.append(trial)
    
    return TD_path # list of list of tuples (s_cur, action, reward)

# Step 2: Get value function prediction
def td_forward(tb_rl:tab_RL_interface, num_episode: int, len_episode: int,
               alpha: float, gamma: float, _lambda: float, 
               get_action: Callable[[int], int]) -> dict:
    
    TD_path = gen_episode(tb_rl, num_episode, len_episode, get_action)
    
    # initiate value function
    val = {s: 0.0 for s in tb_rl.get_states()}
    for i in range(len(TD_path)):
        G_path = np.zeros(len(TD_path[i]))
        for j in range(len(TD_path[i])):
            for k in range(j,len(TD_path[i])):
                G_path[k] += gamma**j*TD_path[i][j][2]
            G_path[j] += gamma**(j+1)*val(TD_path[i][j][0])
        G_lambda = 0
        for j in range(G_path.shape[0]):
            G_lambda += (1-_lambda)*_lambda**j*G_path[j]
        val[TD_path[i][0][0]] = val[TD_path[i][0][0]] + alpha*(G_lambda - val[TD_path[i][0][0]])
    return val

### Backward-View TD

In [23]:
def td_backward(tb_rl:tab_RL_interface, num_episode: int, len_episode: int,
               alpha: float, gamma: float, _lambda: float, 
               get_action: Callable[[int], int]) -> dict:
        
    # initiate value function
    val = np.zeros(len(tb_rl.get_states()))
    e_t = np.zeros(len(tb_rl.get_states()))
    
    i = 0
    while i < num_episode:
        j = 0
        s_cur = tb_rl.init_state_gen()
        while j < len_episode:
            act = get_action(s_cur)
            s_next, reward = tb_rl.next_state_gen(s_cur,act)
            e_t *= _lambda * gamma
            e_t[s_cur] += 1.0
            td_error = reward + gamma * val[s_next] - val[s_cur]
            val[s_cur] += alpha*(reward + gamma*val[s_next] - val[s_cur])
            s_cur = s_next
            val += alpha * td_error * e_t
            j += 1
        i += 1
    return val

### Test the above implementions

In [24]:
state_action_tab = {0:[0,1,2],1:[0,1,2],
                   2:[0,1,2],3:[0,1,2]} # get available actions from each state
state_action_simulator = np.asarray([0.25]*48).reshape((4,3,4))# probability of [s_cur,a,s_next]
reward_simulator = np.asarray([0,1,3,2,1,3,2,0,3,2,1,4]).reshape((4,3))# reward if [s_cur,a]
gamma = 1
mdp = MDPforRL_TB(state_action_tab,state_action_simulator,reward_simulator,gamma)
tb_rl = tab_RL_interface(mdp)
def get_action(s):
    if s == 0:
        return 2
    elif s == 1:
        return 1
    else:
        return 0
print(td_backward(tb_rl, 10, 10, 0.1, 1, 0.1, get_action))

[11.53842121 10.08099553 10.1583265  10.54858894]


### Offline Forward-View and Backward-View TD
We know offline forward-view TD($\lambda$) as:
$$G_t^{(n)} = R_{t+1}+...+\gamma^n V(S_{t+n})$$
$$G_t^{\lambda} = (1-\lambda)\sum_{n=1}^{\infty}\lambda^{n-1} G_t^{(n)}$$
$$V(S_t) = V(S_t) + \alpha(G_t^{\lambda} - V(S_t))$$
And backward-view TD($\lambda$) as:
$$\delta_t = R_{t+1} + \gamma V(S_{t+1}) - V(S_t)$$
$$\mathbb{E}_t(s) = \gamma \lambda \mathbb{E}_{t-1}(s) + \mathbb{1}(S_t = s)$$
$$V(S_t) = V(S_t) + \alpha \delta_t \mathbb{E}_t(s) | \mathbb{E}_0(s) = 0$$
Consider an episode where $s$ is visited once at time-step $k$,then $\mathbb{E}_t(s) = 0$ for $t < k$ and $E_t(s) = (\lambda \gamma)^{t-k}$ for $t \geq k$. We also know that for forward TD
$$G_t^{\lambda} - V(S_t) = \delta_t + \gamma \lambda \delta_{t+1} + (\gamma \lambda)^2 \delta_{t+2} +...$$
Thus for backward TD
$$\sum_{t=1}^T \delta_t \mathbb{E}_t(s) = G_t^{\lambda} - V(S_t)$$
i.e.,
$V(S_t) + \alpha(G_t^{\lambda} - V(S_t))$ is equivalent to $V(S_t) + \alpha \delta_t \mathbb{E}_t(s)$.