Random Walk
---
## n-step TD Method

<img style="float" src="rw-game.png" alt="drawing" width="700"/>

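In this MRP, all episodes start in the center state, C, then proceed either left or right by one state on each step, with equal probability. Episodes terminate either on the extreme left or the extreme right. When an episode terminates on the right, a reward of +1 occurs; all other rewards are zero. The implementation below uses a larger, 19-state version of the same walk: states are numbered 0 to 18, states 0 and 18 are terminal, and every episode starts in the middle state, 9.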

<img style="float" src="n-step.png" alt="drawing" width="700"/>

In [3]:
import numpy as np

In [2]:
# Length of the chain, counting both terminal states.
NUM_STATES = 19
# The two ends of the chain are the terminal states.
END_0 = 0
END_1 = NUM_STATES - 1
# Default starting position: the middle of the chain.
START = NUM_STATES // 2

In [None]:
class RandomWalk:
    """Random-walk agent that learns action values with n-step Sarsa.

    The walk lives on a chain of ``NUM_STATES`` states. ``END_0`` and
    ``END_1`` are terminal; reaching ``END_1`` yields a reward of 1 and
    every other transition yields 0.

    Attributes:
        actions: the two available moves, ``"left"`` and ``"right"``.
        start: state every episode begins in.
        state: current state of the walk.
        end: ``True`` once a terminal state has been reached.
        n: number of steps looked ahead before bootstrapping.
        exp_rate: probability of choosing a random (exploratory) action.
        lr: step size used in the Q-value update.
        gamma: discount factor.
        state_actions: ``(state, action)`` pairs of the current episode,
            indexed by time step.
        Q_values: nested dict ``Q_values[state][action]`` of estimates.
    """

    def __init__(self, n, start=START, end=False, exp_rate=0.3, lr=0.1, gamma=1):
        """Set up the agent and zero-initialise all action values.

        Raises:
            ValueError: if ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError("n must be at least 1, got {}".format(n))
        self.actions = ["left", "right"]
        self.start = start  # remembered so every episode can restart here
        self.state = start  # current state
        self.end = end
        self.n = n
        self.exp_rate = exp_rate
        self.lr = lr
        self.gamma = gamma
        self.state_actions = []
        # init q estimates
        self.Q_values = {
            s: {a: 0 for a in self.actions} for s in range(NUM_STATES)
        }

    def reset(self):
        """Return the walk to its start state and clear the episode history."""
        self.state = self.start
        self.end = False
        self.state_actions = []

    def chooseAction(self):
        """Pick an action for the current state with an epsilon-greedy policy.

        With probability ``exp_rate`` a uniformly random action is returned;
        otherwise the action with the highest Q-value. Ties go to the action
        listed last in ``self.actions``.
        """
        if np.random.uniform(0, 1) <= self.exp_rate:
            return np.random.choice(self.actions)

        # greedy action; -inf guarantees some action is always selected
        best_action = ""
        best_value = float("-inf")
        for a in self.actions:
            value = self.Q_values[self.state][a]
            if value >= best_value:
                best_action = a
                best_value = value
        return best_action

    def takeAction(self, action):
        """Move one step in the given direction and return the new state.

        ``"left"`` decrements the state; any other action increments it.
        Once the episode has ended the state is left unchanged.
        """
        if not self.end:
            if action == "left":
                self.state -= 1
            else:
                self.state += 1
            if self.state in (END_0, END_1):
                self.end = True
        return self.state

    def giveReward(self):
        """Return the reward for the current state: 1 at ``END_1``, else 0."""
        return 1 if self.state == END_1 else 0

    def play(self, rounds=100):
        """Run ``rounds`` episodes, updating ``Q_values`` with n-step Sarsa.

        Each episode restarts from ``self.start``. After the episode
        terminates the loop keeps advancing ``t`` until the last visited
        state-action pair (time ``T - 1``) has received its update.
        """
        for _ in range(rounds):
            self.reset()
            t = 0
            T = np.inf  # episode length, unknown until a terminal state is hit
            rewards = [0]  # rewards[i] is R_i; index 0 is a placeholder

            # A_0 is chosen before the loop so that A_{t+1} is always
            # available when the update at time t bootstraps from it.
            action = self.chooseAction()
            self.state_actions.append((self.state, action))

            while True:
                if t < T:
                    self.takeAction(action)
                    rewards.append(self.giveReward())

                    if self.end:
                        T = t + 1
                    else:
                        action = self.chooseAction()
                        self.state_actions.append((self.state, action))

                # time step whose estimate is updated now
                tau = t - self.n + 1
                if tau >= 0:
                    # discounted sum of the (up to) n rewards following tau
                    G = 0
                    for i in range(tau + 1, min(tau + self.n, T) + 1):
                        G += self.gamma ** (i - tau - 1) * rewards[i]
                    # bootstrap from Q if the episode outlasts the n steps
                    if tau + self.n < T:
                        s_n, a_n = self.state_actions[tau + self.n]
                        G += self.gamma ** self.n * self.Q_values[s_n][a_n]
                    # update Q values
                    s_tau, a_tau = self.state_actions[tau]
                    self.Q_values[s_tau][a_tau] += self.lr * (
                        G - self.Q_values[s_tau][a_tau]
                    )

                if tau == T - 1:
                    break
                t += 1

In [6]:
# Scratch check: 1 raised to any power is 1. Presumably this confirms that the
# default gamma=1 leaves rewards undiscounted -- TODO confirm intent.
np.power(1, 3)

1