## Rules
---
<img style="float:left" src="Maze.png" alt="drawing" width="300"/>

> Consider the simple maze shown inset in the Figure. In each of the 47 states there are four actions, `up`, `down`, `right`, and `left`, which take the agent deterministically to the corresponding neighboring states, except when movement is blocked by an obstacle or the edge of the maze, in which case the agent remains where it is. Reward is zero on all transitions, except those into the goal state, on which it is +1. After reaching the goal state `(G)`, the agent returns to the start state `(S)` to begin a new episode. This is a discounted, episodic task with `gamma = 0.95`.
---

## Dyna-Q

---
<img style="float:left" src="Tabular_Dyna-Q.png" alt="drawing" width="600"/>

In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [11]:
# Grid dimensions of the maze (row-major: 6 rows by 9 columns).
ROWS, COLS = 6, 9

# Start and goal cells, given as (row, col) with (0, 0) in the top-left corner.
S = (2, 0)
G = (0, 8)

# Obstacle cells the agent can never enter.
BLOCKS = [
    (1, 2), (2, 2), (3, 2),  # vertical wall in column 2
    (0, 7), (1, 7), (2, 7),  # vertical wall in column 7
    (4, 5),                  # single isolated block
]

# Action names understood by Maze.nxtPosition.
ACTIONS = ["left", "up", "right", "down"]

In [19]:
class Maze:
    """Deterministic grid-world maze with obstacles, a start and a goal.

    The layout is taken from the module-level constants ROWS, COLS, S, G and
    BLOCKS. Cells are addressed as (row, col) with (0, 0) in the top-left
    corner. The maze tracks the agent's current cell in ``self.state`` and
    sets ``self.end`` once the goal has been rewarded.
    """

    def __init__(self):
        self.rows = ROWS
        self.cols = COLS
        self.start = S
        self.goal = G
        self.blocks = BLOCKS
        self.state = S
        self.end = False
        # Static grid used for rendering: 0 = free cell, -1 = obstacle.
        self.maze = np.zeros((self.rows, self.cols))
        for b in self.blocks:
            self.maze[b] = -1

    def nxtPosition(self, action):
        """Apply ``action`` to the current state and return the new state.

        ``action`` is one of "left", "right", "up" or "down"; any other value
        is treated as "down". A move that would leave the grid or enter an
        obstacle is ignored, so the agent stays where it is.
        """
        r, c = self.state
        if action == "left":
            c -= 1
        elif action == "right":
            c += 1
        elif action == "up":
            r -= 1
        else:
            r += 1

        inside = 0 <= r <= self.rows - 1 and 0 <= c <= self.cols - 1
        if inside and (r, c) not in self.blocks:
            self.state = (r, c)
        return self.state

    def giveReward(self):
        """Return 1 and mark the episode finished at the goal, else return 0."""
        if self.state == self.goal:
            self.end = True
            return 1
        return 0

    def showMaze(self):
        """Print the grid: '*' is the agent, 'z' an obstacle, '0' a free cell.

        The agent marker is derived from ``self.state`` on every call instead
        of being written into ``self.maze``; writing it into the grid left a
        stale '*' behind on every cell that had ever been displayed.
        """
        # One dash per character of a printed row (minus its trailing space).
        separator = '-' * (4 * self.cols + 1)
        for i in range(self.rows):
            print(separator)
            tokens = []
            for j in range(self.cols):
                if (i, j) == self.state:
                    tokens.append('*')
                elif self.maze[i, j] == -1:
                    tokens.append('z')
                else:
                    tokens.append('0')
            print('| ' + ' | '.join(tokens) + ' | ')
        print(separator)

In [20]:
# Build a maze in its initial state (agent on the start cell) and print it.
m = Maze()
m.showMaze()

-------------------------------------
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | z | 0 | 
-------------------------------------
| 0 | 0 | z | 0 | 0 | 0 | 0 | z | 0 | 
-------------------------------------
| * | 0 | z | 0 | 0 | 0 | 0 | z | 0 | 
-------------------------------------
| 0 | 0 | z | 0 | 0 | 0 | 0 | 0 | 0 | 
-------------------------------------
| 0 | 0 | 0 | 0 | 0 | z | 0 | 0 | 0 | 
-------------------------------------
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
-------------------------------------


In [None]:
class DynaAgent:
    """Tabular Dyna-Q agent that learns to solve the maze.

    After every real step the agent performs a direct Q-learning update,
    records the observed transition in a deterministic model, and then runs
    ``n_steps`` planning updates on transitions replayed from that model.
    """

    def __init__(self, exp_rate=0.3, lr=0.1, n_steps=5, gamma=0.95):
        """Create the agent with zero-initialised Q-values and model.

        exp_rate: probability of taking a random (exploratory) action.
        lr:       step size used in every Q-value update.
        n_steps:  number of planning updates performed after each real step.
        gamma:    discount factor applied to the value of the next state.
        """
        self.maze = Maze()
        self.state = S
        self.actions = ACTIONS
        self.state_actions = []  # (state, action) pairs taken this episode
        self.exp_rate = exp_rate
        self.lr = lr
        self.steps = n_steps
        self.gamma = gamma

        self.Q_values = {}
        # model function: model[state][action] -> (reward, next state)
        self.model = {}
        for row in range(ROWS):
            for col in range(COLS):
                self.Q_values[(row, col)] = {a: 0 for a in self.actions}
                self.model[(row, col)] = {a: (0, (0, 0)) for a in self.actions}

    def chooseAction(self):
        """Pick an action for the current state with an epsilon-greedy rule.

        With probability ``exp_rate`` a uniformly random action is returned.
        Otherwise the highest-valued action is returned; if every action has
        the same value the choice is random, and a tie among only some of the
        actions goes to the one listed last in ``self.actions``.
        """
        if np.random.uniform(0, 1) <= self.exp_rate:
            return np.random.choice(self.actions)

        q_here = self.Q_values[self.state]
        # if all actions have same value, then select randomly
        if len(set(q_here.values())) == 1:
            return np.random.choice(self.actions)

        action = ""
        best = float("-inf")
        for a in self.actions:
            if q_here[a] >= best:
                action = a
                best = q_here[a]
        return action

    def reset(self):
        """Start a new episode: fresh maze, start state, empty visit history.

        Q-values and the learned model are kept across episodes.
        """
        self.maze = Maze()
        self.state = S
        self.actions = ACTIONS
        self.state_actions = []

    def _update_q(self, state, action, reward, nxt_state):
        """Move Q(state, action) toward reward + gamma * max_a Q(nxt_state, a)."""
        target = reward + self.gamma * max(self.Q_values[nxt_state].values())
        self.Q_values[state][action] += self.lr * (target - self.Q_values[state][action])

    def play(self):
        """Run one episode from the start state until the goal is reached."""
        self.reset()

        while not self.maze.end:
            action = self.chooseAction()
            self.state_actions.append((self.state, action))

            nxtState = self.maze.nxtPosition(action)
            reward = self.maze.giveReward()

            # direct update from the real transition
            self._update_q(self.state, action, reward, nxtState)

            # update model
            self.model[self.state][action] = (reward, nxtState)
            self.state = nxtState

            # planning: replay n transitions already taken in this episode
            for _ in range(self.steps):
                # np.random.choice cannot sample a list of (tuple, str) pairs,
                # so draw an index and look the pair up instead.
                idx = np.random.randint(len(self.state_actions))
                _state, _action = self.state_actions[idx]
                _reward, _nxtState = self.model[_state][_action]
                self._update_q(_state, _action, _reward, _nxtState)
            