In [3]:
from itertools import product
import numpy as np

In [2]:
class ActorCriticETAgent:
    """Actor-critic agent with eligibility traces on a Fourier basis.

    The critic is a linear state-value function ``v(s) = w @ x(s)`` and the
    actor is a softmax policy over linear action preferences
    ``h(s, a) = theta @ x(s, a)``.  Both parameter vectors are updated with
    accumulating eligibility traces (``z_w`` and ``z_theta``).

    Parameters
    ----------
    n_features : length of the state feature vector ``x(s)``.  Must be at
        least ``(degree + 1) ** n_states``; surplus entries stay zero.
    degree : order of the Fourier basis (coefficients range over
        ``0..degree`` in every state dimension).
    n_states : number of state dimensions.
    n_actions : number of discrete actions.
    alpha : base step size; it is scaled per feature by ``get_alpha_w`` and
        ``get_alpha_theta``.
    gamma : discount factor.
    lambda_theta, lambda_w : trace-decay rates for the actor and the critic.
    """

    def __init__(self, n_features, degree, n_states, n_actions,
                 alpha, gamma=0.99, lambda_theta=0.5, lambda_w=0.5):
        n_terms = (degree + 1) ** n_states
        if n_terms > n_features:
            raise ValueError(
                "n_features=%d is too small for a degree-%d Fourier basis over "
                "%d state dimension(s): need at least %d"
                % (n_features, degree, n_states, n_terms)
            )

        self.n_features = n_features
        self.n_states = n_states
        self.degree = degree
        self.n_actions = n_actions
        self.lambda_theta = lambda_theta
        self.lambda_w = lambda_w

        self.alpha = alpha
        self.alpha_w = alpha * self.get_alpha_w()
        self.alpha_theta = alpha * self.get_alpha_theta()

        self.gamma = gamma

        self.w = np.zeros(self.n_features)
        self.theta = np.zeros(self.n_actions * self.n_features)

        self.reset()

    def learn(self, s, a, r, sp, terminal, policy):
        """Perform one actor-critic update for the transition (s, a, r, sp).

        ``policy`` is the action distribution at ``s`` that ``a`` was sampled
        from (as returned by ``choose_action``).  Returns the updated
        ``(w, theta)``; both are the agent's own arrays, not copies.
        """
        # The value of a terminal successor is zero by definition.
        v_sp = 0 if terminal else self.v_s(sp)

        # TD error, computed before any weights change.
        delta = r + self.gamma * v_sp - self.v_s(s)

        # Critic: decay the trace, add the current features, step along it.
        self.z_w = self.gamma * self.lambda_w * self.z_w + self.x_s(s)
        self.w += self.alpha_w * delta * self.z_w

        # Actor: same scheme, with the gradient weighted by the accumulated
        # discount I (gamma ** t within the episode).
        gradient = self.get_pi_gradient(s, a, policy)
        self.z_theta = (self.gamma * self.lambda_theta * self.z_theta
                        + self.I * gradient)
        self.theta += self.alpha_theta * delta * self.z_theta

        self.I *= self.gamma

        return self.w, self.theta

    def choose_action(self, s):
        """Sample an action at state s; return ``(action, policy)``."""
        policy = self.pi_s(s)
        return np.random.choice(self.n_actions, p=policy), policy

    def reset(self):
        """Clear both eligibility traces and the discount accumulator.

        Intended to be called at the start of every episode; the learned
        weights ``w`` and ``theta`` are left untouched.
        """
        self.z_w = np.zeros_like(self.w)
        self.z_theta = np.zeros_like(self.theta)
        self.I = 1

    def pi_s(self, s):
        """Return the softmax policy (one probability per action) at state s."""
        h = self.h_s(s)
        # Subtracting the max keeps exp() from overflowing without changing
        # the resulting distribution.
        exp = np.exp(h - np.max(h))
        return exp / np.sum(exp)

    def h_s(self, s):
        """Return the preference of every action in state s.

        ``theta`` is laid out as ``n_actions`` consecutive blocks of
        ``n_features`` weights, so ``theta @ x(s, a)`` is the dot product of
        block ``a`` with ``x(s)``.
        """
        blocks = self.theta.reshape(self.n_actions, self.n_features)
        return blocks @ self.x_s(s)

    def x_sa(self, s, a):
        """Return x(s, a): x(s) placed in the block of action ``a``, zeros elsewhere."""
        x = np.zeros(self.n_features * self.n_actions)
        start = self.n_features * a
        x[start:start + self.n_features] = self.x_s(s)
        return x

    def x_s(self, s):
        """Return the Fourier-basis features of state s.

        Feature ``i`` is ``cos(pi * s . c_i)`` where ``c_i`` is the i-th
        coefficient vector in ``itertools.product`` order.  ``s`` may be any
        array-like holding ``n_states`` values.
        """
        coeffs = self._fourier_coefficients()
        state = np.asarray(s, dtype=float).reshape(-1)
        x = np.zeros(self.n_features)
        x[:coeffs.shape[0]] = np.cos(coeffs @ (np.pi * state))
        return x

    def v_s(self, s):
        """Return the critic's value estimate ``w @ x(s)`` for state s."""
        return self.w @ self.x_s(s)

    def get_pi_gradient(self, s, a, policy):
        """Return grad ln pi(a|s, theta) for the softmax policy.

        This equals ``x(s, a) - sum_b pi(b|s, theta) x(s, b)``: block ``b`` of
        the result holds ``-policy[b] * x(s)``, and block ``a`` additionally
        receives ``+x(s)``.
        """
        x = self.x_s(s)
        probs = np.asarray(policy, dtype=float)[:self.n_actions]
        gradient = -np.outer(probs, x).reshape(-1)
        start = self.n_features * a
        gradient[start:start + self.n_features] += x
        return gradient

    def get_alpha_w(self):
        """Return the per-feature step-size scale for the critic.

        Each Fourier feature is scaled by ``1 / ||c_i||``; the constant
        feature (all-zero coefficient vector) keeps a scale of 1.
        """
        norms = np.linalg.norm(self._fourier_coefficients(), axis=1)
        scale = np.ones_like(norms)
        # Divide only where the norm is non-zero so the constant feature
        # never triggers a division by zero.
        np.divide(1.0, norms, out=scale, where=norms > 0)
        alpha = np.zeros(self.n_features)
        alpha[:scale.shape[0]] = scale
        return alpha

    def get_alpha_theta(self):
        """Return the per-weight step-size scale for the actor.

        The critic's scale vector is repeated once per action so that it
        lines up with the block layout of ``theta``.
        """
        return np.tile(self.get_alpha_w(), self.n_actions)

    def _fourier_coefficients(self):
        """Return the (K, n_states) coefficient matrix, K = (degree+1)**n_states.

        The matrix is cached and rebuilt only if ``degree`` or ``n_states``
        has changed since it was last computed.
        """
        key = (self.degree, self.n_states)
        cached = getattr(self, "_coeff_cache", None)
        if cached is None or cached[0] != key:
            rows = list(product(range(self.degree + 1), repeat=self.n_states))
            coeffs = np.array(rows, dtype=float).reshape(len(rows), self.n_states)
            cached = (key, coeffs)
            self._coeff_cache = cached
        return cached[1]
    