Cliff Walking
---
<img style="float:left" src="cliff.png" alt="drawing" width="600"/>
---
This is a standard undiscounted, episodic task, with start and goal states, and the usual actions causing movement up, down, right, and left. Reward is `-1` on all transitions except those into the region marked `Cliff`. Stepping
into this region incurs a reward of optimal path `-100` and sends the agent instantly back to the start.

In [2]:
import numpy as np

In [3]:
ROWS = 4
COLS = 12
S = (3, 0)
G = (3, 11)

In [137]:
class Cliff:
    
    def __init__(self):
        self.end = False
        self.pos = S
        self.board = np.zeros([4, 12])
        # add cliff marked as -1
        self.board[3, 1:11] = -1
        
    def nxtPosition(self, action):
        if action == "up":
            nxtPos = (self.pos[0]-1, self.pos[1])
        elif action == "down":
            nxtPos = (self.pos[0]+1, self.pos[1])
        elif action == "left":
            nxtPos = (self.pos[0], self.pos[1]-1)
        else:
            nxtPos = (self.pos[0], self.pos[1]+1)
        # check legitimacy
        if nxtPos[0] >= 0 and nxtPos[0] <= 3:
            if nxtPos[1] >= 0 and nxtPos[1] <= 11:
                self.pos = nxtPos
                
        if self.pos == G:
            self.end = True
            print("Game ends reaching goal")
        if self.board[self.pos] == -1:
            self.end = True
            print("Game ends falling off cliff")
            
        return self.pos
    
    def giveReward(self):
        # give reward
        if self.pos == G:
            return -1
        if self.board[self.pos] == 0:
            return -1
        return -100
    
    
    def show(self):
        for i in range(0, ROWS):
            print('-------------------------------------------------')
            out = '| '
            for j in range(0, COLS):
                if self.board[i, j] == -1:
                    token = '*'
                if self.board[i, j] == 0:
                    token = '0'
                if (i, j) == self.pos:
                    token = 'S'
                if (i, j) == G:
                    token = 'G'
                out += token + ' | '
            print(out)
        print('-------------------------------------------------') 

In [138]:
c = Cliff()
c.show()

-------------------------------------------------
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
-------------------------------------------------
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
-------------------------------------------------
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
-------------------------------------------------
| S | * | * | * | * | * | * | * | * | * | * | G | 
-------------------------------------------------


SARSA(on-policy) VS Q-Learning(off-policy)
---
<img style="float:left" src="sarsa.png" alt="drawing" width="500"/>
<img style="float:left" src="Q-learning.png" alt="drawing" width="500"/>

In [150]:
class Agent:
    
    def __init__(self, exp_rate=0.3, lr=0.1, sarsa=True):
        self.cliff = Cliff()
        self.actions = ["up", "left", "right", "down"]
        self.states = []  # record position and action of each episode
        self.pos = S
        self.exp_rate = exp_rate
        self.lr = lr
        self.sarsa = sarsa
        self.state_actions = {}
        for i in range(ROWS):
            for j in range(COLS):
                self.state_actions[(i, j)] = {}
                for a in self.actions:
                    self.state_actions[(i, j)][a] = 0
        
    def chooseAction(self):
        # epsilon-greedy
        mx_nxt_reward = -999
        action = ""
        
        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
        else:
            # greedy action
            for a in self.actions:
                current_position = self.pos
                nxt_reward = self.state_actions[current_position][a]
                if nxt_reward >= mx_nxt_reward:
                    action = a
                    mx_nxt_reward = nxt_reward
        return action
    
    def reset(self):
        self.states = []
        self.cliff = Cliff()
        self.pos = S
       
    def play(self, rounds=10):
        for _ in range(rounds):
            while 1:
                curr_state = self.pos
                cur_reward = self.cliff.giveReward()
                action = self.chooseAction()

                # next position
                self.cliff.pos = self.cliff.nxtPosition(action)
                self.pos = self.cliff.pos
                
                self.states.append([curr_state, action, cur_reward])
                if self.cliff.end:
                    break
            # game end update estimates
            reward = self.cliff.giveReward()
            print("End game reward", reward)
            # reward of all actions in end state is same
            for a in self.actions:
                self.state_actions[self.pos][a] = reward
            
            if self.sarsa:  
                for s in reversed(self.states):
                    pos, action, r = s[0], s[1], s[2]
                    current_value = self.state_actions[pos][action]
                    reward = current_value + self.lr*(r + reward - current_value)
                    self.state_actions[pos][action] = round(reward, 3)
            else:
                for s in reversed(self.states):
                    pos, action, r = s[0], s[1], s[2]
                    current_value = self.state_actions[pos][action]
                    reward = current_value + self.lr*(r + reward - current_value)
                    self.state_actions[pos][action] = round(reward, 3)
                    # update using the max value of S'
                    reward = np.mean(list(self.state_actions[pos].values()))  # max

            self.reset()

After an initial transient, `Q-learning` learns values for the optimal policy, that which travels right along the edge of the cliff. Unfortunately, this results in its occasionally falling off the cliff because of the "epsilon-greedy" action selection. `Sarsa`, on the other hand, takes the action selection into account and learns the longer but
safer path through the upper part of the grid.

In [151]:
ag = Agent(exp_rate=0.1, sarsa=False)
ag.play(rounds=300)

Game ends falling off cliff
End game reward -100
Game ends falling off cliff
End game reward -100
Game ends falling off cliff
End game reward -100
Game ends falling off cliff
End game reward -100
Game ends falling off cliff
End game reward -100
Game ends falling off cliff
End game reward -100
Game ends falling off cliff
End game reward -100
Game ends falling off cliff
End game reward -100
Game ends falling off cliff
End game reward -100
Game ends falling off cliff
End game reward -100
Game ends falling off cliff
End game reward -100
Game ends falling off cliff
End game reward -100
Game ends falling off cliff
End game reward -100
Game ends falling off cliff
End game reward -100
Game ends falling off cliff
End game reward -100
Game ends falling off cliff
End game reward -100
Game ends falling off cliff
End game reward -100
Game ends reaching goal
End game reward -1
Game ends falling off cliff
End game reward -100
Game ends reaching goal
End game reward -1
Game ends falling off cliff
End 

Game ends reaching goal
End game reward -1
Game ends falling off cliff
End game reward -100
Game ends reaching goal
End game reward -1
Game ends reaching goal
End game reward -1
Game ends reaching goal
End game reward -1
Game ends reaching goal
End game reward -1
Game ends reaching goal
End game reward -1
Game ends falling off cliff
End game reward -100
Game ends reaching goal
End game reward -1
Game ends reaching goal
End game reward -1
Game ends reaching goal
End game reward -1
Game ends reaching goal
End game reward -1
Game ends reaching goal
End game reward -1
Game ends reaching goal
End game reward -1
Game ends reaching goal
End game reward -1
Game ends reaching goal
End game reward -1
Game ends reaching goal
End game reward -1
Game ends reaching goal
End game reward -1
Game ends reaching goal
End game reward -1
Game ends reaching goal
End game reward -1
Game ends reaching goal
End game reward -1
Game ends reaching goal
End game reward -1
Game ends reaching goal
End game reward -1

In [152]:
ag.state_actions

{(0, 0): {'up': -25.237, 'left': -25.756, 'right': -24.605, 'down': -26.075},
 (0, 1): {'up': -24.246, 'left': -24.044, 'right': -23.43, 'down': -25.93},
 (0, 2): {'up': -22.728, 'left': -22.988, 'right': -21.508, 'down': -22.999},
 (0, 3): {'up': -20.565, 'left': -21.182, 'right': -20.39, 'down': -20.625},
 (0, 4): {'up': -20.497, 'left': -19.998, 'right': -19.428, 'down': -19.498},
 (0, 5): {'up': -18.288, 'left': -18.772, 'right': -18.11, 'down': -19.971},
 (0, 6): {'up': -18.272, 'left': -17.51, 'right': -15.511, 'down': -17.989},
 (0, 7): {'up': -14.678, 'left': -14.706, 'right': -14.445, 'down': -14.572},
 (0, 8): {'up': -13.465, 'left': -13.629, 'right': -13.222, 'down': -14.283},
 (0, 9): {'up': -12.826, 'left': -13.61, 'right': -11.224, 'down': -11.422},
 (0, 10): {'up': -10.35, 'left': -11.22, 'right': -9.698, 'down': -9.71},
 (0, 11): {'up': -8.614, 'left': -9.586, 'right': -8.553, 'down': -8.158},
 (1, 0): {'up': -25.991, 'left': -26.087, 'right': -25.982, 'down': -27.007},