In [4]:
import time
import random
import numpy as np

In [5]:
class Env():
    """Cliff-walking grid world.

    The grid has ``height`` rows and ``length`` columns. ``self.x`` is the
    agent's row and ``self.y`` its column. Row 0 holds the start cell
    (column 0), the cliff cells, and the terminal cell (last column).
    """

    def __init__(self, length, height):
        # map dimensions
        self.length = length
        self.height = height
        # the agent always begins on the start cell (row 0, column 0)
        self.x = 0
        self.y = 0

    def render(self, frames=50):
        """Print the grid to stdout, one text line per row.

        'S' is the start, 'T' the terminal, 'x' a cliff cell, '.' a free cell
        and 'o' the agent. After drawing, an ANSI cursor-up escape for
        ``height + 1`` lines is printed so the next call draws over this one.

        ``frames`` is currently unused (the frame-rate sleep is disabled).
        """
        cliff_row = ['S'] + ['x'] * (self.length - 2) + ['T']
        for row in range(self.height):
            # only row 0 contains the start / cliff / terminal cells
            cells = list(cliff_row) if row == 0 else ['.'] * self.length
            if row == self.x:
                cells[self.y] = 'o'
            print(''.join(cells))
        print('\033[' + str(self.height + 1) + 'A')

    def step(self, action):
        """Apply one of the 4 legal actions and report the outcome.

        Action effects (moves are clamped to the grid borders):
            0: column + 1, 1: column - 1, 2: row - 1, 3: row + 1

        Returns ``(reward, [row, column], terminal)``. Every step costs -1;
        landing on a cliff cell costs -100 instead. The episode terminates on
        any row-0 cell other than the start (a cliff cell or the terminal
        cell).
        """
        d_row, d_col = ((0, 1), (0, -1), (-1, 0), (1, 0))[action]
        self.x = min(self.height - 1, max(0, self.x + d_row))
        self.y = min(self.length - 1, max(0, self.y + d_col))

        on_cliff_row = self.x == 0
        at_goal = self.y == self.length - 1
        # leaving the start cell along row 0 ends the episode
        terminal = on_cliff_row and self.y > 0
        # a terminal cell that is not the goal is a cliff cell
        reward = -100 if (terminal and not at_goal) else -1
        return reward, [self.x, self.y], terminal

    def reset(self):
        """Put the agent back on the start cell."""
        self.x = 0
        self.y = 0

In [7]:
class Q_table():
    """Tabular action-value function Q(s, a) for a grid of ``height`` rows and
    ``length`` columns, with epsilon-greedy action selection and both
    Q-learning and SARSA update rules.

    States are ``[x, y]`` pairs (row, column); actions are integers in
    ``range(actions)``.
    """

    def __init__(self, length, height, actions=4, alpha=0.1, gamma=0.9):
        # flat storage, one entry per (action, x, y); all Q(s, a) start at zero
        self.table = [0] * actions * length * height
        self.actions = actions
        self.length = length
        self.height = height
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor

    def _index(self, a, x, y):
        """Return the index of Q([x,y], a) in Q_table."""
        return a * self.height * self.length + x * self.length + y

    def _epsilon(self, num_episode):
        """Exploration rate for the given episode.

        Starts at 0.2, decays to 0.05 after 300 episodes and tends to 0.
        """
        return 20. / (num_episode + 100)

    def _greedy(self, x, y):
        """Return ``(best_value, best_action)`` for state [x, y].

        Ties are resolved in favour of the lowest action index. No sentinel
        start value is used, so arbitrarily negative Q-values are handled.
        """
        values = [self.table[self._index(a, x, y)] for a in range(self.actions)]
        # max() returns the first maximal item, i.e. the lowest tied action
        best_action = max(range(self.actions), key=values.__getitem__)
        return values[best_action], best_action

    def take_action(self, x, y, num_episode):
        """Epsilon-greedy action for state [x, y].

        With probability ``_epsilon(num_episode)`` a uniformly random action
        from ``range(self.actions)`` is returned, otherwise the greedy one.
        """
        if self._epsilon(num_episode) > random.random():
            return random.randint(0, self.actions - 1)
        return self._greedy(x, y)[1]

    def max_q(self, x, y):
        """Return max over a of Q([x,y], a)."""
        return self._greedy(x, y)[0]

    def update(self, a, s0, s1, r, is_terminated):
        """Q-learning update of Q(s0, a) after observing reward r and state s1.

        Both s0 and s1 have the form [x, y]. On a terminal transition the
        target is just ``r``; otherwise it bootstraps from ``max_q(s1)``.
        """
        idx = self._index(a, s0[0], s0[1])
        pre = self.table[idx]
        if is_terminated:
            target = r
        else:
            target = r + self.gamma * self.max_q(s1[0], s1[1])
        self.table[idx] = pre + self.alpha * (target - pre)

    def epilson_q(self, x, y, num_episode):
        """Epsilon-greedy ``(Q([x,y], a), a)`` pair for state [x, y].

        With probability ``_epsilon(num_episode)`` the action is uniformly
        random over ``range(self.actions)``, otherwise it is the greedy one.
        (The method name's spelling is kept for existing callers.)
        """
        if self._epsilon(num_episode) > random.random():
            ran = random.randint(0, self.actions - 1)
            return self.table[self._index(ran, x, y)], ran
        return self._greedy(x, y)

    def sarsa_update(self, a, s0, s1, r, is_terminated, num_episode):
        """SARSA update of Q(s0, a); returns the next action a'.

        Both s0 and s1 have the form [x, y]. The next action a' is drawn
        epsilon-greedily in s1 and its value Q(s1, a') is used as the
        bootstrap target. On a terminal transition the target is just ``r``
        and 0 is returned as a placeholder action.
        """
        idx = self._index(a, s0[0], s0[1])
        pre = self.table[idx]
        if is_terminated:
            target = r
            next_action = 0
        else:
            # next_q: Q(s', a'), next_action: a'
            next_q, next_action = self.epilson_q(s1[0], s1[1], num_episode)
            target = r + self.gamma * next_q
        self.table[idx] = pre + self.alpha * (target - pre)
        return next_action
        

In [8]:
def sarsa_cliff_walk():
    """Train a tabular SARSA agent on a 12-column by 4-row cliff-walking map.

    Runs 3000 episodes. Each episode starts at [0, 0], picks a first action
    epsilon-greedily, then alternates environment steps with SARSA updates
    (each update also yields the next action) until the environment reports
    termination. The episode's summed reward is printed every 20th episode.
    """
    grid = Env(length=12, height=4)
    q_values = Q_table(length=12, height=4)
    for num_episode in range(3000):
        score = 0
        state = [0, 0]
        action = q_values.take_action(state[0], state[1], num_episode)
        while True:
            reward, next_state, done = grid.step(action)
            # the update chooses a' in next_state; it becomes the next move
            action = q_values.sarsa_update(
                action, state, next_state, reward, done, num_episode
            )
            score += reward
            state = next_state
            if done:
                break
        if num_episode % 20 == 0:
            print("Episode: {}, Score: {}".format(num_episode, score))
        # put the agent back on the start cell for the next episode
        grid.reset()
        

In [9]:
# Run the SARSA training loop; the score of every 20th episode is printed.
sarsa_cliff_walk()

Episode: 0, Score: -101
Episode: 20, Score: -161
Episode: 40, Score: -60
Episode: 60, Score: -93
Episode: 80, Score: -103
Episode: 100, Score: -44
Episode: 120, Score: -41
Episode: 140, Score: -39
Episode: 160, Score: -100
Episode: 180, Score: -35
Episode: 200, Score: -25
Episode: 220, Score: -31
Episode: 240, Score: -100
Episode: 260, Score: -29
Episode: 280, Score: -37
Episode: 300, Score: -115
Episode: 320, Score: -16
Episode: 340, Score: -19
Episode: 360, Score: -15
Episode: 380, Score: -15
Episode: 400, Score: -23
Episode: 420, Score: -19
Episode: 440, Score: -112
Episode: 460, Score: -17
Episode: 480, Score: -21
Episode: 500, Score: -17
Episode: 520, Score: -17
Episode: 540, Score: -17
Episode: 560, Score: -15
Episode: 580, Score: -15
Episode: 600, Score: -15
Episode: 620, Score: -15
Episode: 640, Score: -15
Episode: 660, Score: -17
Episode: 680, Score: -15
Episode: 700, Score: -15
Episode: 720, Score: -16
Episode: 740, Score: -15
Episode: 760, Score: -15
Episode: 780, Score: -15