In [1]:
import numpy as np


State space:
$$
\{x_p \in \{0, \dots, N-1\},\ y_p \in \{0, \dots, N-1\},\ x_h \in \{0, \dots, N-1\},\ \text{button} \in \{0, 1\}\}
$$
Action space:
$$
\{\text{direction} \in \{\text{up}, \text{down}, \text{left}, \text{right}\}\}
$$

$$
Q \in \mathcal{R}^{4 \times N \times N \times N \times 2}
$$

In [2]:
def my_argmax(array):
    """Return the index of the largest non-NaN entry, breaking ties at random.

    The maximum is taken with ``np.nanmax`` so NaN entries are ignored; when
    several positions hold that maximum, one of them is drawn uniformly with
    ``np.random.choice``.
    """
    best = np.nanmax(array)
    # NaN never compares equal to `best`, so NaN positions are never candidates.
    candidates = []
    for position in range(len(array)):
        if array[position] == best:
            candidates.append(position)
    return np.random.choice(candidates)


In [7]:
class Agent:
    """Tabular Q-learning agent for states with four discrete components.

    All tables are indexed with the action first and the state components
    after it: ``Q[a, s[0], s[1], s[2], s[3]]``. Visit counters drive both the
    learning rate (see ``learn``) and the exploration rate (see
    ``choose_action``).
    """

    def __init__(self, n_actions, n_dims, gamma, alpha_p=1,
                 Q=None, s_count=None, sa_count=None):
        """Create the agent, optionally resuming from existing tables.

        Parameters
        ----------
        n_actions : int
            Number of discrete actions.
        n_dims : sequence of int
            Shape of the Q-table and of ``sa_count``; ``n_dims[1:]`` is the
            shape of ``s_count``. (Presumably ``n_dims[0] == n_actions`` --
            this is not checked.)
        gamma : float
            Discount factor applied to the best next-state value.
        alpha_p : float, optional
            Exponent of the learning-rate schedule ``1 / count ** alpha_p``.
        Q, s_count, sa_count : np.ndarray, optional
            Pre-existing tables. They are stored as given (not copied);
            zero-filled tables are created for any that are ``None``.
        """
        self.n_actions = n_actions
        self.action_space = range(n_actions)
        self.n_dims = n_dims
        self.gamma = gamma
        self.alpha_p = alpha_p

        self.s_count = np.zeros(n_dims[1:]) if s_count is None else s_count
        self.sa_count = np.zeros(n_dims) if sa_count is None else sa_count
        self.Q = np.zeros(n_dims) if Q is None else Q

    @staticmethod
    def _state_index(s):
        """Return the four state components of ``s`` as an index tuple."""
        return (s[0], s[1], s[2], s[3])

    def _action_values(self, s):
        """Return the Q-values of every action in state ``s``."""
        return self.Q[(self.action_space, *self._state_index(s))]

    def learn(self, s, a, r, sp):
        """Apply one Q-learning update for the transition ``(s, a, r, sp)``.

        The visit count of ``(s, a)`` is incremented first, and the step size
        is ``1 / count ** alpha_p`` using the incremented count.

        Returns
        -------
        tuple
            ``(Q, s_count, sa_count)`` -- the agent's own tables, not copies.
        """
        sa = (a, *self._state_index(s))
        self.sa_count[sa] += 1

        # TD error: reward plus discounted best next-state value, minus the
        # current estimate.
        error = r + self.gamma * np.max(self._action_values(sp)) - self.Q[sa]
        alpha = 1 / self.sa_count[sa] ** self.alpha_p
        self.Q[sa] += alpha * error

        return self.Q, self.s_count, self.sa_count

    def choose_action(self, s):
        """Pick an action epsilon-greedily with a visit-count-based epsilon.

        The visit count of ``s`` is incremented, then
        ``eps = 1 / sqrt(count)``; the first visit to a state is therefore
        always a random action.
        """
        state = self._state_index(s)
        self.s_count[state] += 1

        eps = 1 / np.sqrt(self.s_count[state])
        if np.random.uniform() < eps:
            return np.random.choice(self.n_actions)
        return my_argmax(self._action_values(s))

    def choose_action_egreedy(self, s, eps=0.1):
        """Pick an action epsilon-greedily with a fixed ``eps``.

        Unlike ``choose_action`` this does not touch the visit counters.
        """
        if np.random.uniform() < eps:
            return np.random.choice(self.n_actions)
        return my_argmax(self._action_values(s))

    def reset(self):
        """Zero the Q-table; the visit counters are left unchanged."""
        # Use the shape stored on the instance rather than a notebook global.
        self.Q = np.zeros(self.n_dims)


In [None]:
class AgentSmall:
    """Tabular Q-learning agent for states with three discrete components.

    All tables are indexed with the action first and the state components
    after it: ``Q[a, s[0], s[1], s[2]]``. Visit counters drive both the
    learning rate (see ``learn``) and the exploration rate (see
    ``choose_action``).
    """

    def __init__(self, n_actions, n_dims, gamma, alpha_p=1,
                 Q=None, s_count=None, sa_count=None):
        """Create the agent, optionally resuming from existing tables.

        Parameters
        ----------
        n_actions : int
            Number of discrete actions.
        n_dims : sequence of int
            Shape of the Q-table and of ``sa_count``; ``n_dims[1:]`` is the
            shape of ``s_count``. (Presumably ``n_dims[0] == n_actions`` --
            this is not checked.)
        gamma : float
            Discount factor applied to the best next-state value.
        alpha_p : float, optional
            Exponent of the learning-rate schedule ``1 / count ** alpha_p``.
        Q, s_count, sa_count : np.ndarray, optional
            Pre-existing tables. They are stored as given (not copied);
            zero-filled tables are created for any that are ``None``.
        """
        self.n_actions = n_actions
        self.action_space = range(n_actions)
        self.n_dims = n_dims
        self.gamma = gamma
        self.alpha_p = alpha_p

        self.s_count = np.zeros(n_dims[1:]) if s_count is None else s_count
        self.sa_count = np.zeros(n_dims) if sa_count is None else sa_count
        self.Q = np.zeros(n_dims) if Q is None else Q

    @staticmethod
    def _state_index(s):
        """Return the three state components of ``s`` as an index tuple."""
        return (s[0], s[1], s[2])

    def _action_values(self, s):
        """Return the Q-values of every action in state ``s``."""
        return self.Q[(self.action_space, *self._state_index(s))]

    def learn(self, s, a, r, sp):
        """Apply one Q-learning update for the transition ``(s, a, r, sp)``.

        The visit count of ``(s, a)`` is incremented first, and the step size
        is ``1 / count ** alpha_p`` using the incremented count.

        Returns
        -------
        tuple
            ``(Q, s_count, sa_count)`` -- the agent's own tables, not copies.
        """
        sa = (a, *self._state_index(s))
        self.sa_count[sa] += 1

        # TD error: reward plus discounted best next-state value, minus the
        # current estimate.
        error = r + self.gamma * np.max(self._action_values(sp)) - self.Q[sa]
        alpha = 1 / self.sa_count[sa] ** self.alpha_p
        self.Q[sa] += alpha * error

        return self.Q, self.s_count, self.sa_count

    def choose_action(self, s):
        """Pick an action epsilon-greedily with a visit-count-based epsilon.

        The visit count of ``s`` is incremented, then
        ``eps = 1 / sqrt(count)``; the first visit to a state is therefore
        always a random action.
        """
        state = self._state_index(s)
        self.s_count[state] += 1

        eps = 1 / np.sqrt(self.s_count[state])
        if np.random.uniform() < eps:
            return np.random.choice(self.n_actions)
        return my_argmax(self._action_values(s))

    def choose_action_egreedy(self, s, eps=0.1):
        """Pick an action epsilon-greedily with a fixed ``eps``.

        Unlike ``choose_action`` this does not touch the visit counters.
        """
        if np.random.uniform() < eps:
            return np.random.choice(self.n_actions)
        return my_argmax(self._action_values(s))

    def reset(self):
        """Zero the Q-table; the visit counters are left unchanged."""
        # Use the shape stored on the instance rather than a notebook global.
        self.Q = np.zeros(self.n_dims)
