Cliff Walking
---
<img style="float:left" src="cliff.png" alt="drawing" width="600"/>
---
This is a standard undiscounted, episodic task, with start and goal states, and the usual actions causing movement up, down, right, and left. Reward is `-1` on all transitions except those into the region marked `Cliff`. Stepping
into this region incurs a reward of `-100` and sends the agent instantly back to the start.

In [2]:
import numpy as np

In [3]:
# Grid dimensions of the cliff-walking board.
ROWS, COLS = 4, 12
# Start (S) and goal (G) cells as (row, col): the two ends of the bottom row.
S = (ROWS - 1, 0)
G = (ROWS - 1, COLS - 1)

In [20]:
class Cliff:
    """Cliff-walking grid world: a board plus the agent's current position.

    Board cells hold 0 for ordinary ground and -1 for cliff cells. The agent
    starts on ``S``; the episode is flagged as finished (``end = True``) when
    the agent reaches ``G`` or steps onto a cliff cell.
    """

    # (row, col) offset applied for each named action.
    _MOVES = {
        "up": (-1, 0),
        "down": (1, 0),
        "left": (0, -1),
        "right": (0, 1),
    }

    def __init__(self):
        self.end = False
        self.pos = S
        self.board = np.zeros([ROWS, COLS])
        # The cliff fills the bottom row between the two corner cells; mark it as -1.
        self.board[ROWS - 1, 1:COLS - 1] = -1

    def nxtPositon(self, action):
        """Move the agent one cell in the direction of *action*.

        Any action other than "up", "down" or "left" is treated as "right".
        A move that would leave the board is ignored and the agent stays put.

        Returns:
            The agent's position after the move, as a ``(row, col)`` tuple.
        """
        d_row, d_col = self._MOVES.get(action, self._MOVES["right"])
        row, col = self.pos[0] + d_row, self.pos[1] + d_col
        # Bounds come from the board itself so they can never drift from its size.
        n_rows, n_cols = self.board.shape
        if 0 <= row < n_rows and 0 <= col < n_cols:
            self.pos = (row, col)
        return self.pos

    def giveReward(self):
        """Return the reward for the current position and flag episode end.

        Returns:
            -100 when the agent stands on a cliff cell (the episode ends),
            otherwise -1 (the episode also ends if the position is ``G``).
            A message is printed whenever the episode ends.
        """
        if self.board[self.pos] != 0:
            self.end = True
            print("Game ends falling off cliff")
            return -100
        if self.pos == G:
            self.end = True
            print("Game ends reaching goal")
        return -1

    def show(self):
        """Print the board: '*' cliff, '0' ground, 'S' agent, 'G' goal."""
        # Fixed-width separator kept verbatim; it matches the width of a 12-column row.
        rule = '-------------------------------------------------'
        n_rows, n_cols = self.board.shape
        for i in range(n_rows):
            print(rule)
            tokens = []
            for j in range(n_cols):
                # Every cell gets a token, so none is left unset or carried over.
                token = '*' if self.board[i, j] == -1 else '0'
                if (i, j) == self.pos:
                    token = 'S'
                # The goal marker takes precedence over the agent marker.
                if (i, j) == G:
                    token = 'G'
                tokens.append(token)
            print('| ' + ' | '.join(tokens) + ' | ')
        print(rule)

In [21]:
# Build a fresh environment and print its initial board (agent on the start cell).
c = Cliff()
c.show()

-------------------------------------------------
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
-------------------------------------------------
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
-------------------------------------------------
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
-------------------------------------------------
| S | * | * | * | * | * | * | * | * | * | * | G | 
-------------------------------------------------


SARSA (on-policy) vs. Q-Learning (off-policy)
---
<img style="float:left" src="sarsa.png" alt="drawing" width="500"/>
<img style="float:left" src="Q-learning.png" alt="drawing" width="500"/>

In [None]:
class Agent:
    
    def __init__(self, exp_rate=0.3, lr=0.1):
        """Set up the agent at the start cell with an all-zero action-value table.

        Args:
            exp_rate: threshold compared against a uniform random draw in
                ``chooseAction``; draws at or below it pick a random action.
            lr: stored as ``self.lr`` (presumably the learning rate; its use
                is outside this method).
        """
        self.actions = ["up", "left", "right", "down"]
        self.postions = []  # record positions of each episode
        self.pos = S
        self.exp_rate = exp_rate
        self.lr = lr
        # One {action: value} table per grid cell, every entry starting at 0.
        self.state_actions = {
            (row, col): dict.fromkeys(self.actions, 0)
            for row in range(ROWS)
            for col in range(COLS)
        }
        
    def chooseAction(self):
        # epsilon-greedy
        mx_nxt_reward = 0
        action = ""
        
        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
        else:
            # greedy action
            for a in self.actions:
                current_position = self.pos
                nxt_reward = self.state_actions[current_position][a]
                if nxt_reward >= mx_nxt_reward:
                    action = a
                    mx_nxt_reward = nxt_reward
            # print("current pos: {}, greedy aciton: {}".format(self.State.state, action))
        return action
    
    def reset(self):
        self.states = []
       
    def play(self, rounds=10):
        