In [126]:
import numpy as np
from IPython.display import clear_output

In [151]:
class Maze():
    def __init__(self):
        self.maze = np.array([
            ['S', '0', '1', '0', '0'],
            ['0', '1', '0', '1', '0'],
            ['0', '0', '0', '1', '0'],
            ['0', '1', '1', '1', '0'],
            ['0', '0', '0', '0', 'G']
        ])
        self.reward        = 1
        self.punish        = -1
        self.gamma         = 0.5
        self.start         = [0, 0]
        self.goal          = (4, 4)
        self.current_pos   = self.start
        self.n_actions     = 4
        self.current_steps = 0

    def reset(self):
        self.current_pos = self.start
        self.current_action = 0
        return self.current_pos

    def step(self, action):
        self.current_steps += 1
        new_pos = self.current_pos.copy()
        
        if action == 0: # right
            new_pos[1] += 1
        elif action == 1: # down
            new_pos[0] += 1
        elif action == 2: # left
            new_pos[1] -= 1
        elif action == 3: # up
            new_pos[0] -= 1

        if new_pos[0] < 0 or new_pos[0] >= 5 or new_pos[1] < 0 or new_pos[1] >= 5 or self.maze[new_pos[0], new_pos[1]] == '1':
            reward = -1
            new_pos = self.current_pos

        elif self.maze[new_pos[0], new_pos[1]] == "G":
            reward = 1
            done = True
            
        else:
            reward = -0.1
            done = False
        
        self.current_pos = new_pos
        
        if tuple(self.current_pos) == self.goal:
            done = True
            reward = 1
            
        else:
            done = False
        
        return self.current_pos, reward, done
    
    def demonstrate(self):
        maze_copy = self.maze.copy()
        maze_copy[self.current_pos[0], self.current_pos[1]] = 'A'
        for row in maze_copy:
            print(' '.join(row))
        print()

In [187]:
env = Maze()
state = env.reset()
done = False
gamma = 0.6
g = 0
t = 0

In [188]:
env.demonstrate()

A 0 1 0 0
0 1 0 1 0
0 0 0 1 0
0 1 1 1 0
0 0 0 0 G



In [189]:
while not done:
    action = np.random.choice(env.n_actions)
    next_state, reward, done = env.step(action)
    g = gamma ** t * reward
    t += 1
env.demonstrate()
print(f"in {t} steps we got return {g}")

S 0 1 0 0
0 1 0 1 0
0 0 0 1 0
0 1 1 1 0
0 0 0 0 A

in 345 steps we got return 4.8309231851427355e-77


In [190]:
def uniform_policy(state):
    return np.array([1 / env.n_actions] * env.n_actions)

In [191]:
env = Maze()
state = env.reset()
done = False
gamma = 0.6
g = 0
t = 0

In [192]:
env.demonstrate()

A 0 1 0 0
0 1 0 1 0
0 0 0 1 0
0 1 1 1 0
0 0 0 0 G



In [193]:
while not done:
    action = np.random.choice(env.n_actions, p = uniform_policy(next_state))
    next_state, reward, done = env.step(action)
    g = gamma ** t * reward
    t += 1
env.demonstrate()
print(f"in {t} steps we got return {g}")

S 0 1 0 0
0 1 0 1 0
0 0 0 1 0
0 1 1 1 0
0 0 0 0 A

in 420 steps we got return 1.1101301830742745e-93
