In [1]:
import numpy as np
import pandas as pd
import scipy
from typing import TypeVar,Mapping, Set, Generic, Sequence, Callable, Tuple, List

###  RL interface with value function approximation
##### Prediction

In [2]:
class MDPforRL_FA():
    """Bundle of simulator callables describing an MDP for RL with function approximation.

    States and actions are represented as ints in this part of the notebook.
    The class performs no computation itself: every ``gen_*`` method forwards
    its arguments to the callable supplied at construction time.
    """

    def __init__(self,
                 state_action_simulator: Callable[[int], int],
                 v_func_simulator: Callable[[int], float],
                 q_func_simulator: Callable[[int, int], float],
                 init_state: Callable[[], int],
                 state_reward_func: Callable[[int, int], Tuple[int, float]],
                 gamma: float) -> None:
        """Store the simulator callables and the discount factor.

        Args:
            state_action_simulator: called as ``f(S)`` by ``gen_action``.
            v_func_simulator: called as ``f(S)`` by ``gen_v_func``.
            q_func_simulator: called as ``f(S, A)`` (two positional
                arguments) by ``gen_q_func``.
            init_state: zero-argument callable used by ``gen_init_state``.
            state_reward_func: called as ``f(S, A)`` (two positional
                arguments) by ``gen_state_reward``; annotated as returning
                a ``(next_state, reward)`` pair.
            gamma: discount factor, stored unchanged on ``self.gamma``.
        """
        super().__init__()

        self.init_state = init_state
        self.state_action_func = state_action_simulator
        self.state_v_func = v_func_simulator
        self.state_q_func = q_func_simulator
        self.state_reward_func = state_reward_func
        self.gamma = gamma

    def gen_init_state(self) -> int:
        """Return whatever the ``init_state`` callable produces."""
        return self.init_state()

    def gen_action(self, S: int):
        """Forward ``S`` to the state-action simulator and return its result."""
        return self.state_action_func(S)

    def gen_v_func(self, S: int):
        """Return the state-value estimate for ``S`` (for example, a linear approximation)."""
        return self.state_v_func(S)

    def gen_q_func(self, S: int, A: int):
        """Return the action-value estimate for ``(S, A)`` (for example, a linear approximation)."""
        return self.state_q_func(S, A)

    def gen_state_reward(self, S: int, A: int):
        """Forward ``(S, A)`` to the transition/reward simulator and return its result."""
        return self.state_reward_func(S, A)
    
class FA_RL_interface():
    """Thin facade over an MDP object plus a state-feature function.

    All environment queries are delegated to the wrapped ``mdp`` instance held
    on ``self.mdp`` (never to a notebook-level global), so several interfaces
    wrapping different MDPs can coexist in one kernel.
    """

    def __init__(self, mdp: "MDPforRL_FA", features: Callable[[int], List[float]]):
        """Store the wrapped MDP and the feature function.

        Args:
            mdp: object providing ``gen_init_state``, ``gen_action``,
                ``state_reward_func``, ``state_v_func``, ``state_q_func`` and
                ``gamma``.
            features: callable mapping a state to its feature vector.
        """
        super().__init__()
        self.mdp = mdp
        self.feature_func = features

    @property
    def gamma(self) -> float:
        """Discount factor of the wrapped MDP."""
        return self.mdp.gamma

    # Generate initial step
    def init_state_gen(self) -> int:
        """Return an initial state from the wrapped MDP."""
        return self.mdp.gen_init_state()

    def action_gen(self, S: int) -> int:
        """Return the wrapped MDP's ``gen_action(S)`` result."""
        return self.mdp.gen_action(S)

    def gen_action(self, S: int):
        """Same as ``action_gen``; kept because both spellings are part of the interface."""
        return self.mdp.gen_action(S)

    def get_action(self, S: int):
        """Alias of ``gen_action`` for callers that use the ``get_action`` spelling."""
        return self.gen_action(S)

    # Generate next step
    def gen_state_reward(self, S: int, A: int):
        """Return the wrapped MDP's ``state_reward_func(S, A)`` result."""
        return self.mdp.state_reward_func(S, A)

    def next_state_gen(self, S: int, A: int):
        """Alias of ``gen_state_reward`` for callers that use the ``next_state_gen`` spelling."""
        return self.gen_state_reward(S, A)

    # Estimate value
    def gen_v_func(self, S: int):
        """Return the wrapped MDP's state-value estimate for ``S``."""
        return self.mdp.state_v_func(S)

    def gen_q_func(self, S: int, A: int):
        """Return the wrapped MDP's action-value estimate for ``(S, A)``."""
        return self.mdp.state_q_func(S, A)

    # Generate features
    def gen_features(self, S: int):
        """Return the feature vector of state ``S``."""
        return self.feature_func(S)

class linearFA():
    """Linear function approximator ``v(x) = params . x`` trained by gradient steps."""

    def __init__(self, lr: float, features: np.ndarray):
        """Create a zero-initialised approximator.

        Args:
            lr: learning rate applied by ``update_params``.
            features: array whose first dimension gives the number of
                parameters (``features.shape[0]``). Presumably a sample
                feature vector -- TODO confirm against the caller.
        """
        self.lr = lr
        self.features = features
        self.n_features = np.asarray(features).shape[0]
        self.params = np.zeros(self.n_features)

    def v_func_predict(self, new_feature):
        """Return the linear prediction ``params . new_feature``."""
        return self.params.dot(new_feature)

    def update_params(self, new_feature, vf):
        """Take one gradient step toward target ``vf`` and return the gradient.

        The gradient is that of ``0.5 * (prediction - vf) ** 2`` with respect
        to the parameters, i.e. ``(prediction - vf) * new_feature``. The step
        ``params -= lr * gradient`` is applied in place, so arrays previously
        obtained from ``get_params`` see the update.

        Args:
            new_feature: feature vector of the sample (array-like).
            vf: regression target for that sample.

        Returns:
            The gradient evaluated before the step (same value the method
            returned before it applied any update).
        """
        feature_vec = np.asarray(new_feature, dtype=float)
        gradient = (self.v_func_predict(feature_vec) - vf) * feature_vec
        self.params -= self.lr * gradient
        return gradient

    def get_params(self):
        """Return the live parameter array (not a copy)."""
        return self.params

#### Monte-Carlo Prediction algorithm with Value Function approximation

In [8]:
# Step 1: Generate episodes by policy get_action() S->A
def gen_episode(fa_rl: "FA_RL_interface", num_episode: int, len_episode: int,
                get_action: Callable[[int], int]) -> list:
    """Simulate episodes under the policy ``get_action``.

    Args:
        fa_rl: environment interface; ``init_state_gen()`` and
            ``gen_state_reward(state, action)`` are called on it.
        num_episode: number of episodes to simulate.
        len_episode: number of steps recorded in every episode.
        get_action: policy, a random-variable generator that takes a state
            index and returns an action index (for example, eps-greedy).

    Returns:
        A list with ``num_episode`` entries; each entry is a list of exactly
        ``len_episode`` tuples ``(state, action, reward)`` in time order.
    """
    MC_path = []
    for _ in range(num_episode):
        trial = []
        s_cur = fa_rl.init_state_gen()
        for _ in range(len_episode):
            act = get_action(s_cur)
            s_next, reward = fa_rl.gen_state_reward(s_cur, act)
            trial.append((s_cur, act, reward))
            s_cur = s_next
        MC_path.append(trial)

    return MC_path  # list of list of tuples (s_cur, action, reward)

# Step 2: Get value function prediction
def mc_prediction(fa_rl: "FA_RL_interface", num_episode: int, len_episode: int,
                  get_action: Callable[[int], int], linear_FA: "linearFA"):
    """Monte-Carlo prediction: fit ``linear_FA`` to sampled discounted returns.

    Episodes are generated with ``gen_episode``. For every visited state the
    discounted return from that time step to the end of its episode is used
    as the regression target passed to ``linear_FA.update_params``.

    Args:
        fa_rl: environment interface; its ``mdp.gamma`` is the discount
            factor and ``gen_features(state)`` supplies the features.
        num_episode: number of episodes to simulate.
        len_episode: episode length forwarded to ``gen_episode``.
        get_action: policy mapping a state index to an action index.
        linear_FA: approximator updated in time order, episode by episode.

    Returns:
        The same ``linear_FA`` object that was passed in.
    """
    gamma = fa_rl.mdp.gamma
    MC_path = gen_episode(fa_rl, num_episode, len_episode, get_action)

    for episode in MC_path:
        # Backward recursion G_t = r_t + gamma * G_{t+1} gives every return
        # of the episode in a single pass.
        returns = [0.0] * len(episode)
        running_return = 0.0
        for t in range(len(episode) - 1, -1, -1):
            running_return = episode[t][2] + gamma * running_return
            returns[t] = running_return

        for (state, _action, _reward), target in zip(episode, returns):
            linear_FA.update_params(fa_rl.gen_features(state), target)

    return linear_FA

#### TD Prediction algorithm with Value Function approximation

In [9]:
def td_prediction(fa_rl: "FA_RL_interface", num_episode: int, len_episode: int,
                  alpha: float, gamma: float, get_action: Callable[[int], int], linear_FA: "linearFA"):
    """TD(0) prediction with value-function approximation.

    At every step the one-step target ``reward + gamma * v(s_next)`` is
    computed from the current approximator and handed, together with the
    features of the current state, to ``linear_FA.update_params``.

    Args:
        fa_rl: environment interface (``init_state_gen``,
            ``gen_state_reward``, ``gen_features`` are used).
        num_episode: number of episodes.
        len_episode: number of steps per episode.
        alpha: accepted for signature compatibility; not read here -- the
            step size is left to ``linear_FA.update_params``.
        gamma: discount factor applied to the bootstrapped value.
        get_action: policy mapping a state index to an action index.
        linear_FA: approximator being trained.

    Returns:
        The same ``linear_FA`` object that was passed in.
    """
    for _ in range(num_episode):
        s_cur = fa_rl.init_state_gen()
        for _ in range(len_episode):
            act = get_action(s_cur)
            s_next, reward = fa_rl.gen_state_reward(s_cur, act)
            # Bootstrap from the successor state, not the current one.
            target = reward + gamma * linear_FA.v_func_predict(fa_rl.gen_features(s_next))
            linear_FA.update_params(fa_rl.gen_features(s_cur), target)
            s_cur = s_next
    return linear_FA

#### TD($\lambda$) Prediction algorithm with Value Function approximation

In [10]:
def td_backward(fa_rl: "FA_RL_interface", num_episode: int, len_episode: int,
               alpha: float, gamma: float, _lambda: float,
               get_action: Callable[[int], int], linear_FA: "linearFA") -> "linearFA":
    """Backward-view TD(lambda) prediction with a linear approximator.

    One eligibility-trace vector (same shape as the parameters) is kept per
    episode. Per step:

        delta = reward + gamma * v(s_next) - v(s_cur)
        trace = gamma * _lambda * trace + features(s_cur)
        params += alpha * delta * trace

    The parameter array returned by ``linear_FA.get_params()`` is modified in
    place, so that method must return the live array rather than a copy.

    Args:
        fa_rl: environment interface (``init_state_gen``,
            ``gen_state_reward``, ``gen_features`` are used).
        num_episode: number of episodes.
        len_episode: number of steps per episode.
        alpha: step size of the trace update.
        gamma: discount factor.
        _lambda: trace-decay parameter.
        get_action: policy mapping a state index to an action index.
        linear_FA: approximator being trained.

    Returns:
        The same ``linear_FA`` object that was passed in.
    """
    for _ in range(num_episode):
        # Traces are reset at the start of every episode.
        trace = np.zeros_like(linear_FA.get_params(), dtype=float)
        s_cur = fa_rl.init_state_gen()
        for _ in range(len_episode):
            act = get_action(s_cur)
            s_next, reward = fa_rl.gen_state_reward(s_cur, act)
            phi_cur = np.asarray(fa_rl.gen_features(s_cur), dtype=float)
            phi_next = np.asarray(fa_rl.gen_features(s_next), dtype=float)
            target = reward + gamma * linear_FA.v_func_predict(phi_next)
            delta = target - linear_FA.v_func_predict(phi_cur)
            trace = gamma * _lambda * trace + phi_cur
            weights = linear_FA.get_params()
            weights += alpha * delta * trace
            s_cur = s_next
    return linear_FA

##### Control

#### SARSA with Value Function approximation

In [3]:
def q_features(S: int, A: int) -> np.ndarray:
    """Return the feature vector of the state-action pair ``(S, A)``.

    The features are simply ``[S, A]`` as a float array; replace this
    function to define richer features.
    """
    return np.asarray([S, A], dtype=float)  # define features

def epsilon_greedy(fa_rl: "FA_RL_interface", S: int, eps: float, linear_FA) -> np.ndarray:
    """Return epsilon-greedy action probabilities for state ``S``.

    ``fa_rl.gen_action(S)`` is treated as the collection of actions available
    in ``S`` (NOTE(review): its annotation says ``int`` -- confirm). Each
    action is scored with ``linear_FA.v_func_predict(q_features(S, a))``.

    Args:
        fa_rl: environment interface.
        S: current state.
        eps: exploration probability, spread uniformly over all actions.
        linear_FA: action-value approximator.

    Returns:
        1-D array with one probability per available action, in the order
        ``fa_rl.gen_action(S)`` yields them; the best-scoring action gets
        the extra ``1 - eps``.

    Raises:
        ValueError: if no action is available in ``S``.
    """
    actions = list(fa_rl.gen_action(S))
    n_actions = len(actions)
    if n_actions == 0:
        raise ValueError("no actions available in state %r" % (S,))
    pol = eps * np.ones(n_actions) / n_actions
    q_values = [linear_FA.v_func_predict(q_features(S, a)) for a in actions]
    A_ast = int(np.argmax(q_values))
    pol[A_ast] += 1 - eps
    return pol

def sarsa(fa_rl: "FA_RL_interface", num_episode: int, len_episode: int,
          alpha: float, gamma: float, epsilon_greedy: Callable[..., np.ndarray], eps: float, linear_FA: "linearFA"):
    """SARSA control with action-value function approximation.

    Actions are drawn from the probability vector returned by
    ``epsilon_greedy(fa_rl, state, eps, linear_FA)``; the sampled index is
    used directly as the action. Per step the on-policy target
    ``reward + gamma * q(s_next, a_next)`` is passed, together with the
    features of ``(s_cur, a_cur)``, to ``linear_FA.update_params``.

    Args:
        fa_rl: environment interface (``init_state_gen`` and
            ``gen_state_reward`` are used).
        num_episode: number of episodes.
        len_episode: number of steps per episode.
        alpha: accepted for signature compatibility; not read here -- the
            step size is left to ``linear_FA.update_params``.
        gamma: discount factor.
        epsilon_greedy: callable returning action probabilities.
        eps: exploration probability forwarded to ``epsilon_greedy``.
        linear_FA: action-value approximator being trained.

    Returns:
        The same ``linear_FA`` object that was passed in.
    """
    def _sample_action(state):
        # Inverse-CDF draw from the policy's probability vector.
        pol = epsilon_greedy(fa_rl, state, eps, linear_FA)
        u = np.random.uniform(0, 1)
        return int(np.where(np.cumsum(pol) > u)[0][0])

    for _ in range(num_episode):
        s_cur = fa_rl.init_state_gen()
        act_cur = _sample_action(s_cur)

        for _ in range(len_episode):
            s_next, reward = fa_rl.gen_state_reward(s_cur, act_cur)
            act_next = _sample_action(s_next)
            f_cur = q_features(s_cur, act_cur)
            f_next = q_features(s_next, act_next)
            target = reward + gamma * linear_FA.v_func_predict(f_next)
            linear_FA.update_params(f_cur, target)
            s_cur, act_cur = s_next, act_next
    return linear_FA

#### SARSA($\lambda$) with Value Function approximation

In [4]:
def q_features(S: int, A: int) -> np.ndarray:
    """Return the feature vector ``[S, A]`` of a state-action pair as a float array.

    NOTE: ``q_features`` is also defined in the SARSA cell of this notebook;
    whichever cell ran last wins, so keep the two definitions in sync (or
    hoist a single definition into a shared cell).
    """
    return np.asarray([S, A], dtype=float)  # define features

def epsilon_greedy(fa_rl: "FA_RL_interface", S: int, eps: float, linear_FA=None) -> np.ndarray:
    """Return epsilon-greedy action probabilities for state ``S``.

    ``fa_rl.gen_action(S)`` is treated as the collection of actions available
    in ``S`` (NOTE(review): its annotation says ``int`` -- confirm). Each
    action is scored with ``linear_FA.v_func_predict(q_features(S, a))``.

    Args:
        fa_rl: environment interface.
        S: current state.
        eps: exploration probability, spread uniformly over all actions.
        linear_FA: action-value approximator. When omitted, a notebook-level
            variable named ``linear_FA`` is used, which is what the
            three-argument form of this function read.

    Returns:
        1-D array with one probability per available action, in the order
        ``fa_rl.gen_action(S)`` yields them.

    Raises:
        ValueError: if no approximator is available or ``S`` has no actions.
    """
    if linear_FA is None:
        linear_FA = globals().get("linear_FA")
        if linear_FA is None:
            raise ValueError("linear_FA must be passed or defined at notebook level")
    actions = list(fa_rl.gen_action(S))
    n_actions = len(actions)
    if n_actions == 0:
        raise ValueError("no actions available in state %r" % (S,))
    pol = eps * np.ones(n_actions) / n_actions
    q_values = [linear_FA.v_func_predict(q_features(S, a)) for a in actions]
    A_ast = int(np.argmax(q_values))
    pol[A_ast] += 1 - eps
    return pol

def sarsa_backward(fa_rl: "FA_RL_interface", num_episode: int, len_episode: int,
                   alpha: float, gamma: float, _lambda: float, epsilon_greedy: Callable[..., np.ndarray],
                   eps: float, linear_FA: "linearFA"):
    """Backward-view SARSA(lambda) control with a linear action-value approximator.

    Actions are drawn from the probability vector returned by
    ``epsilon_greedy(fa_rl, state, eps, linear_FA)``; the sampled index is
    used directly as the action. One eligibility-trace vector (same shape as
    the parameters) is kept per episode. Per step:

        delta = reward + gamma * q(s_next, a_next) - q(s_cur, a_cur)
        trace = gamma * _lambda * trace + q_features(s_cur, a_cur)
        params += alpha * delta * trace

    The parameter array returned by ``linear_FA.get_params()`` is modified in
    place, so that method must return the live array rather than a copy.

    Args:
        fa_rl: environment interface (``init_state_gen`` and
            ``gen_state_reward`` are used).
        num_episode: number of episodes.
        len_episode: number of steps per episode.
        alpha: step size of the trace update.
        gamma: discount factor.
        _lambda: trace-decay parameter.
        epsilon_greedy: callable returning action probabilities.
        eps: exploration probability forwarded to ``epsilon_greedy``.
        linear_FA: action-value approximator being trained.

    Returns:
        The same ``linear_FA`` object that was passed in.
    """
    def _sample_action(state):
        # Inverse-CDF draw from the policy's probability vector.
        pol = epsilon_greedy(fa_rl, state, eps, linear_FA)
        u = np.random.uniform(0, 1)
        return int(np.where(np.cumsum(pol) > u)[0][0])

    for _ in range(num_episode):
        s_cur = fa_rl.init_state_gen()
        a_cur = _sample_action(s_cur)
        # Traces are reset at the start of every episode.
        trace = np.zeros_like(linear_FA.get_params(), dtype=float)
        for _ in range(len_episode):
            s_next, reward = fa_rl.gen_state_reward(s_cur, a_cur)
            a_next = _sample_action(s_next)
            f_cur = np.asarray(q_features(s_cur, a_cur), dtype=float)
            f_next = np.asarray(q_features(s_next, a_next), dtype=float)
            target = reward + gamma * linear_FA.v_func_predict(f_next)
            delta = target - linear_FA.v_func_predict(f_cur)
            trace = gamma * _lambda * trace + f_cur
            weights = linear_FA.get_params()
            weights += alpha * delta * trace
            s_cur, a_cur = s_next, a_next
    return linear_FA

#### Q-Learning with Value Function approximation

In [6]:
def greedy(fa_rl: "FA_RL_interface", S: int, linear_FA) -> int:
    """Return the index of the best-scoring action in state ``S``.

    ``fa_rl.gen_action(S)`` is treated as the collection of actions available
    in ``S`` (NOTE(review): its annotation says ``int`` -- confirm). Each
    action is scored with ``linear_FA.v_func_predict(q_features(S, a))``.

    Args:
        fa_rl: environment interface.
        S: current state.
        linear_FA: action-value approximator.

    Returns:
        Position, within ``fa_rl.gen_action(S)``, of the action with the
        highest score (first one on ties).
    """
    q_values = [linear_FA.v_func_predict(q_features(S, a)) for a in fa_rl.gen_action(S)]
    A_ast = int(np.argmax(q_values))
    return A_ast

def q_learning(fa_rl: "FA_RL_interface", num_episode: int, len_episode: int,
               alpha: float, gamma: float, epsilon_greedy: Callable[..., np.ndarray], greedy: Callable[..., int],
               eps: float, linear_FA) -> "linearFA":
    """Q-learning control with action-value function approximation.

    The behaviour action is drawn from the probability vector returned by
    ``epsilon_greedy(fa_rl, state, eps, linear_FA)`` (the sampled index is
    used directly as the action). The bootstrap action is the one returned by
    ``greedy(fa_rl, s_next, linear_FA)``, which makes the update off-policy.
    Per step the target ``reward + gamma * q(s_next, a_greedy)`` is passed,
    together with the features of ``(s_cur, a_cur)``, to
    ``linear_FA.update_params``.

    Args:
        fa_rl: environment interface (``init_state_gen`` and
            ``gen_state_reward`` are used).
        num_episode: number of episodes.
        len_episode: number of steps per episode.
        alpha: accepted for signature compatibility; not read here -- the
            step size is left to ``linear_FA.update_params``.
        gamma: discount factor.
        epsilon_greedy: callable returning behaviour-policy probabilities.
        greedy: callable returning the greedy action for a state.
        eps: exploration probability forwarded to ``epsilon_greedy``.
        linear_FA: action-value approximator being trained.

    Returns:
        The same ``linear_FA`` object that was passed in.
    """
    for _ in range(num_episode):
        s_cur = fa_rl.init_state_gen()
        for _ in range(len_episode):
            # Behaviour policy: inverse-CDF draw from the eps-greedy vector.
            pol = epsilon_greedy(fa_rl, s_cur, eps, linear_FA)
            u = np.random.uniform(0, 1)
            act_cur = int(np.where(np.cumsum(pol) > u)[0][0])

            s_next, reward = fa_rl.gen_state_reward(s_cur, act_cur)

            # Target policy: greedy with respect to the current estimate.
            act_best = greedy(fa_rl, s_next, linear_FA)
            f_cur = q_features(s_cur, act_cur)
            f_next = q_features(s_next, act_best)
            target = reward + gamma * linear_FA.v_func_predict(f_next)
            linear_FA.update_params(f_cur, target)
            s_cur = s_next

    return linear_FA

### Test in mini project