In [133]:
class QTable:
    def __init__(self, n_states, n_actions):
        self.table = [[0] * n_actions for _ in range(n_states)]

    def get_greedy_action_policy(self, state):
        action_values = self.table[state]
        max_q_actions = [
            action 
            for action, q in enumerate(action_values) 
            if q == max(action_values)
        ]
        return [
            1/len(max_q_actions) if action in max_q_actions else 0 
            for action in range(len(action_values))
        ]
    
    def print(self):
        for state, actions in enumerate(self.table):
            print('State', state, '->', actions)

class StateValueTable:
    def __init__(self, n_states):
        self.table = [0] * n_states

    def get_greedy_action_policy(self, action_to_next_state_dynamics, exit_dynamics):
        exit_actions = [action for action, next_state in enumerate(action_to_next_state_dynamics) if next_state in exit_dynamics]
        greedy_actions = exit_actions
        if not greedy_actions:
            actions_next_state_values = [self.table[next_state] for next_state in action_to_next_state_dynamics]
            max_next_state_value = max([next_state_value for next_state_value in actions_next_state_values])
            max_actions = [action for action, next_state_value in enumerate(actions_next_state_values) if next_state_value == max_next_state_value]
            greedy_actions = max_actions
        return [
            1/len(greedy_actions) if action in greedy_actions else 0 
            for action in range(len(action_to_next_state_dynamics))
        ]

    def print(self):
        print(self.table)

class PolicyFunctions:
    @staticmethod
    def combine_policy_with_exploratory_policy(action_policy, exploration_rate):
        exploratory_action_policy_part = [exploration_rate/len(action_policy)] * len(action_policy)
        action_policy_part = [action_policy[action] * (1 - exploration_rate) for action in range(len(action_policy))]
        return [exploratory_action_policy_part[action] + action_policy_part[action] for action in range(len(action_policy))]

    @staticmethod
    def get_max_actions(state_action_value_function):
        max_q = max(state_action_value_function)
        max_actions = [action for action, q in enumerate(state_action_value_function) if q == max_q]
        return max_actions
    
    @staticmethod
    def single_action_as_action_probabilities(chosen_action, n_actions):
        return [
            1 if action == chosen_action else 0 
            for action in range(n_actions)
        ]
    

In [134]:
from AbstractAgent import AbstractAgent
from Runner import Runner
from AbstractEnvironment import AbstractEnvironment
from OneDMazeEnvironment import OneDMazeEnvironment

In [135]:
def test_agent(agent: AbstractAgent, environment: AbstractEnvironment):
    environment.reset()
    runner = Runner(agent)
    runner.run(environment, n_episodes=10000, learn=True)
    runner.run(environment, n_episodes=1, verbose=True, learn=False)
    agent.print()
    
# Single goal
# environment = OneDMazeEnvironment(initial_agent_position=1, n_states=10, exits=[9], initial_state_rewards={9: 10})
    
# Duel goal
environment = OneDMazeEnvironment(initial_agent_position=3, n_states=10, exits=[0,9], initial_state_rewards={0:1, 9:10})

In [222]:
class QLearningAgent(AbstractAgent):
    def __init__(self, environment: AbstractEnvironment, learning_rate=0.1, discount_factor=0.999, exploration_rate=0.2):
        self.q_table = QTable(environment.get_states_count(), environment.get_actions_count())
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate

    def get_action_policy(self, state):
        greedy_action_policy = self.q_table.get_greedy_action_policy(state)
        return PolicyFunctions.combine_policy_with_exploratory_policy(greedy_action_policy, self.exploration_rate)

    def post_act_learning(self, state, action, reward, next_state):
        max_next_q = max(self.q_table.table[next_state])
        self.q_table.table[state][action] += self.learning_rate * (reward + self.discount_factor * max_next_q - self.q_table.table[state][action])

    def print(self):
        self.q_table.print()

test_agent(QLearningAgent(environment), environment)


Episode: 1
[E| |A| | | | | | |E]
[E|A| | | | | | | |E]
[A| | | | | | | | |E]
State 0 -> [0, 0]
State 1 -> [0.9999999999999996, 0.9980009999999986]
State 2 -> [0.998999999999999, 0.997002998999998]
State 3 -> [0.9980009999999986, 0.9960059960009975]
State 4 -> [0.997002998999998, 0.9950058506766059]
State 5 -> [0.9960057309317163, 0.44638237372489803]
State 6 -> [0.7651121000444705, 0.005140708877284425]
State 7 -> [0.05145854732016441, 0.0]
State 8 -> [0, 1.0]
State 9 -> [0, 0]


In [209]:
import random


class DoubleQLearning(AbstractAgent):
    def __init__(self, environment: AbstractEnvironment, learning_rate=0.1, discount_factor=0.999, exploration_rate=0.2):
        self.q_table1 = QTable(environment.get_states_count(), environment.get_actions_count())
        self.q_table2 = QTable(environment.get_states_count(), environment.get_actions_count())
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate

    def get_action_policy(self, state):
        greedy_action_policy = [
            (q1 + q2) / 2 
            for q1, q2 in zip(self.q_table1.table[state], self.q_table2.table[state])
        ]
        return PolicyFunctions.combine_policy_with_exploratory_policy(greedy_action_policy, self.exploration_rate)

    def post_act_learning(self, state, action, reward, next_state):
        if random.choice([True, False]):
            q_table = self.q_table1.table
            source_q_table = self.q_table2.table
        else:
            q_table = self.q_table2.table
            source_q_table = self.q_table1.table
        max_action = q_table[next_state].index(max(q_table[next_state]))
        q_table[state][action] += self.learning_rate * (reward + self.discount_factor * source_q_table[next_state][max_action] - q_table[state][action])

    def print(self):
        self.q_table1.print()
        self.q_table2.print()

test_agent(DoubleQLearning(environment), environment)

Episode: 1
[E| |A| | | | | | |E]
[E|A| | | | | | | |E]
[E| |A| | | | | | |E]
[E|A| | | | | | | |E]
[E| |A| | | | | | |E]
[E| | |A| | | | | |E]
[E| |A| | | | | | |E]
[E| | |A| | | | | |E]
[E| |A| | | | | | |E]
[E| | |A| | | | | |E]
[E| |A| | | | | | |E]
[E| | |A| | | | | |E]
[E| |A| | | | | | |E]
[E|A| | | | | | | |E]
[E| |A| | | | | | |E]
[E| | |A| | | | | |E]
[E| | | |A| | | | |E]
[E| | |A| | | | | |E]
[E| |A| | | | | | |E]
[E|A| | | | | | | |E]
[A| | | | | | | | |E]
State 0 -> [0, 0]
State 1 -> [0.9999999999999996, 9.930209650349724]
State 2 -> [9.920279440699368, 9.940149800149882]
State 3 -> [9.930209650349724, 9.950099900049942]
State 4 -> [9.940149800149882, 9.96005996000996]
State 5 -> [9.950099900049942, 9.970029989999968]
State 6 -> [9.96005996000996, 9.980009999999975]
State 7 -> [9.970029989999968, 9.989999999999984]
State 8 -> [9.980009999999975, 9.999999999999993]
State 9 -> [0, 0]
State 0 -> [0, 0]
State 1 -> [0.9999999999999996, 9.930209650349724]
State 2 -> [9.920279440

In [225]:
class SarsaAgent(AbstractAgent):
    def __init__(self, environment: AbstractEnvironment, learning_rate=0.1, discount_factor=0.999, exploration_rate=0.2):
        self.q_table = QTable(environment.get_states_count(), environment.get_actions_count())
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.prior = None

    def get_action_policy(self, state):
        greedy_action_policy = self.q_table.get_greedy_action_policy(state)
        return PolicyFunctions.combine_policy_with_exploratory_policy(greedy_action_policy, self.exploration_rate)

    def post_act_learning(self, state, action, reward, next_state):
        if self.prior is not None:
            prior_state, prior_action, prior_reward = self.prior
            prior_q = self.q_table.table[prior_state][prior_action]
            current_q = 0
            if state is not None and action is not None:
                current_q = self.q_table.table[state][action]
            self.q_table.table[prior_state][prior_action] += self.learning_rate * (prior_reward + self.discount_factor * current_q - prior_q)
        self.prior = state, action, reward
    
    def post_episode_learning(self):
        self.post_act_learning(None, None, 0, None)
        self.prior = None

    def print(self):
        self.q_table.print()

test_agent(SarsaAgent(environment), environment)

Episode: 1
[E| | | |A| | | | |E]
[E| | | | |A| | | |E]
[E| | | | | |A| | |E]
[E| | | | | | |A| |E]
[E| | | | | | | |A|E]
[E| | | | | | |A| |E]
[E| | | | | | | |A|E]
[E| | | | | | | | |A]
State 0 -> [0, 0]
State 1 -> [0.9999999999999996, 7.063417773139303]
State 2 -> [3.5804579405352013, 9.914180144280406]
State 3 -> [9.333577447504139, 9.933765604925165]
State 4 -> [9.824156012520659, 9.950222713707285]
State 5 -> [9.922467526249662, 9.960823710545855]
State 6 -> [9.948868636582626, 9.973974888678107]
State 7 -> [9.961779510140545, 9.988239792874955]
State 8 -> [9.97212915535188, 9.999999999999993]
State 9 -> [0, 0]


In [241]:
class ExpectedSarsaAgent(AbstractAgent):
    def __init__(self, environment: AbstractEnvironment, learning_rate=0.1, discount_factor=0.999, exploration_rate=0.2):
        self.q_table = QTable(environment.get_states_count(), environment.get_actions_count())
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate

    def get_action_policy(self, state):
        greedy_action_policy = self.q_table.get_greedy_action_policy(state)
        return PolicyFunctions.combine_policy_with_exploratory_policy(greedy_action_policy, self.exploration_rate)

    def post_act_learning(self, state, action, reward, next_state):
        current_q = self.q_table.table[state][action]
        expected_q = sum(self.q_table.table[next_state]) / len(self.q_table.table[next_state])
        self.q_table.table[state][action] += self.learning_rate * (reward + self.discount_factor * expected_q - current_q)

    def print(self):
        self.q_table.print()

test_agent(ExpectedSarsaAgent(environment), environment)

Episode: 1
[E| |A| | | | | | |E]
[E|A| | | | | | | |E]
[A| | | | | | | | |E]
State 0 -> [0, 0]
State 1 -> [0.9999999999999996, 0.6677660748717797]
State 2 -> [0.8330212746126426, 0.5043133570450882]
State 3 -> [0.6679404553555607, 0.34319521984279094]
State 4 -> [0.5043541262314737, 0.18492459115252094]
State 5 -> [0.34049698725593247, 0.04502289744945435]
State 6 -> [0.15043442834730886, 0.00029280689206155386]
State 7 -> [0.012576547180063326, 0.0]
State 8 -> [0, 1.0]
State 9 -> [0, 0]


In [247]:
from OneDMazeDynamics import OneDMazeDynamics


class StateValueTemporalDifferenceZero(AbstractAgent):
    def __init__(self, environment: AbstractEnvironment, learning_rate=0.2, discount_factor=0.999, exploration_rate=0.2):
        self.state_value_table = StateValueTable(environment.get_states_count())
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.environment = environment

    def get_action_policy(self, state):
        action_to_next_state_dynamics = OneDMazeDynamics.get_action_to_next_state_dynamics(state)
        goal_dynamics = environment.get_exits()
        greedy_action_policy = self.state_value_table.get_greedy_action_policy(action_to_next_state_dynamics, goal_dynamics)
        return PolicyFunctions.combine_policy_with_exploratory_policy(greedy_action_policy, self.exploration_rate)

    def post_act_learning(self, state, _, reward, next_state):
        next_state_value = self.state_value_table.table[next_state]
        current_state = self.state_value_table.table[state]
        self.state_value_table.table[state] += self.learning_rate * (reward + self.discount_factor * next_state_value - current_state)

    def print(self):
        self.state_value_table.print()

test_agent(StateValueTemporalDifferenceZero(environment), environment)

Episode: 1
[E| |A| | | | | | |E]
[E|A| | | | | | | |E]
[A| | | | | | | | |E]
[0, 0.9996136198452017, 0.9984869628206533, 0.9970666775103557, 0.9956497263654203, 0.9642809950914902, 0.6838100506051362, 0.4703476832104218, 3.6, 0]


In [251]:
class TemporalDifferenceNStepsToExpectedSarsa(AbstractAgent):
    def __init__(self, environment: AbstractEnvironment, learning_rate=0.2, discount_factor=0.999, exploration_rate=0.2, n_steps=2):
        self.q_table = QTable(environment.get_states_count(), environment.get_actions_count())
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.n_steps = n_steps
        self.priors = []

    def get_action_policy(self, state):
        greedy_action_policy = self.q_table.get_greedy_action_policy(state)
        return PolicyFunctions.combine_policy_with_exploratory_policy(greedy_action_policy, self.exploration_rate)

    def post_act_learning(self, state, action, reward, next_state):
        self.priors.append((state, action, reward, next_state))
        if len(self.priors) >= self.n_steps:
            self._update_once_from_priors()

    def post_episode_learning(self):
        while(self.priors):
            self._update_once_from_priors()

    def _update_once_from_priors(self):
        returns = 0
        target_state, target_action, _, _ = self.priors[0]
        for step in range(min(self.n_steps, len(self.priors))):
            _, _, step_reward, _ = self.priors[step]
            returns += step_reward * self.discount_factor ** step
        if len(self.priors) == self.n_steps:
            _, _, _, last_step_next_state = self.priors[self.n_steps-1]
            last_step_expected_q = sum(self.q_table.table[last_step_next_state]) / len(self.q_table.table[last_step_next_state])
            returns += last_step_expected_q * self.discount_factor ** self.n_steps
        target_current_q = self.q_table.table[target_state][target_action]
        self.q_table.table[target_state][target_action] += self.learning_rate * (returns - target_current_q)
        self.priors.pop(0)

    def print(self):
        self.q_table.print()

test_agent(TemporalDifferenceNStepsToExpectedSarsa(environment, n_steps=2), environment)

Episode: 1
[E| | | |A| | | | |E]
[E| | | | |A| | | |E]
[E| | | | | |A| | |E]
[E| | | | | | |A| |E]
[E| | | | | | | |A|E]
[E| | | | | | | | |A]
State 0 -> [0, 0]
State 1 -> [0.9999149294082696, 8.924814244002501]
State 2 -> [9.216081111739111, 9.746494627146994]
State 3 -> [8.665104727508094, 9.641825737535338]
State 4 -> [9.809185661148655, 9.9040381836926]
State 5 -> [9.578223400106046, 9.887812569191041]
State 6 -> [9.901347793551393, 9.948053720099756]
State 7 -> [9.867391864468159, 9.983755238735613]
State 8 -> [9.942824556775816, 9.999999999999996]
State 9 -> [0, 0]


In [256]:
import sys


class MonteCarlo(TemporalDifferenceNStepsToExpectedSarsa):
    def __init__(self, environment: AbstractEnvironment, learning_rate=0.1, discount_factor=0.999, exploration_rate=0.2):
        super().__init__(environment, learning_rate, discount_factor, exploration_rate, n_steps=sys.maxsize)

test_agent(MonteCarlo(environment), environment)

Episode: 1
[E| |A| | | | | | |E]
[E| | |A| | | | | |E]
[E| | | |A| | | | |E]
[E| | | | |A| | | |E]
[E| | | | | |A| | |E]
[E| | | | | | |A| |E]
[E| | | | | | | |A|E]
[E| | | | | | | | |A]
State 0 -> [0, 0]
State 1 -> [0.9282102012308148, 8.212036801132694]
State 2 -> [7.160882110376837, 9.926711130591636]
State 3 -> [9.912442269175735, 9.913263225722307]
State 4 -> [9.929928635275699, 9.892433178229991]
State 5 -> [9.897796226413234, 9.939371634835833]
State 6 -> [9.94011899941772, 9.95581485115493]
State 7 -> [9.947579338328529, 9.983774534137396]
State 8 -> [9.974080562144744, 9.999999999999993]
State 9 -> [0, 0]


In [143]:
class BellmansEquations:
    @staticmethod
    def evaluate_and_improve_policy_iteration(n_states, n_actions, discount, environment_probabilities, environment_rewards, max_evaluate_iterations=100, max_improve_iterations=100, min_evaluation_delta=1e-4):
        policy = [0] * n_states
        for iteration_index in range(max_improve_iterations):
            state_value_function = BellmansEquations._evaluate_policy(n_states, policy, max_evaluate_iterations, discount, min_evaluation_delta, environment_probabilities, environment_rewards)
            policy, is_stable = BellmansEquations._improve_policy(n_states, n_actions, policy, state_value_function, discount, environment_probabilities, environment_rewards)
            if is_stable:
                print(f'Policy improvement stopped at iteration {iteration_index}'); 
                break
        return policy

    @staticmethod
    def _evaluate_policy(n_states, policy, max_iterations, discount, min_evaluation_delta, environment_probabilities, environment_rewards):
        state_value_function = [0] * n_states
        for iteration_index in range(max_iterations):
            max_delta = 0
            for state in range(n_states):
                prior_state_value_function = state_value_function[state]
                action = policy[state]
                state_value_function[state] += BellmansEquations._action_value_update(state, action, state_value_function, environment_probabilities, environment_rewards, discount)
                max_delta = max(max_delta, abs(prior_state_value_function-state_value_function[state]))
            if max_delta < min_evaluation_delta: 
                print(f'Policy evaluation stopped at iteration {iteration_index}'); 
                break
        return state_value_function
    
    @staticmethod
    def _improve_policy(n_states, n_actions, policy, state_value_function, discount, environment_probabilities, environment_rewards):
        action_value_function = [[0] * n_actions for _ in range(n_states)]
        is_stable = True
        for state in range(n_states):
            for action in range(n_actions):
                action_value_function[state][action] = BellmansEquations._action_value_update(state, action, state_value_function, environment_probabilities, environment_rewards, discount)
            new_max_actions = PolicyFunctions.get_max_actions(action_value_function[state])
            if policy[state] != new_max_actions[0]:
                is_stable = False
            policy[state] = new_max_actions[0]
        return policy, is_stable
    
    @staticmethod
    def policy_value_iteration(n_states, n_actions, discount, environment_probabilities, environment_rewards, max_iterations=100, min_evaluation_delta=1e-4):
        state_value_function = [0] * n_states
        policy = [0] * n_states
        for iteration_index in range(max_iterations):
            max_delta = 0
            for state in range(n_states):
                prior_state_value_function = state_value_function[state]
                action_value_function = [
                    BellmansEquations._action_value_update(state, action, state_value_function, environment_probabilities, environment_rewards, discount)
                    for action in range(n_actions)
                ]
                state_value_function[state] = max(action_value_function)
                max_delta = max(max_delta, abs(prior_state_value_function-state_value_function[state]))
                policy[state] = action_value_function.index(max(action_value_function))
            if max_delta < min_evaluation_delta: 
                print(f'Stopped at iteration {iteration_index}'); 
                break
        return policy
    
    @staticmethod
    def _action_value_update(state, action, state_value_function, environment_probabilities, environment_rewards, discount):
        return sum([
            (next_state_reward_probability * (environment_rewards[reward_index] + discount * state_value_function[next_state]))
            for next_state, next_state_reward_probabilities in enumerate(environment_probabilities[state][action])
            for reward_index, next_state_reward_probability in enumerate(next_state_reward_probabilities)
        ])

In [161]:
class PolicyIterationAgent(AbstractAgent):
    def __init__(self, environment: OneDMazeEnvironment, discount=0.9):
        self.n_actions = environment.get_actions_count()
        environment_probabilities = OneDMazeDynamics.build_environment_probabilities(environment)
        environment_rewards = environment.get_reward_values()
        self.policy = BellmansEquations.evaluate_and_improve_policy_iteration(environment.get_states_count(), self.n_actions, discount, environment_probabilities, environment_rewards)

    def get_action_policy(self, state):
        policy_action = self.policy[state]
        return PolicyFunctions.single_action_as_action_probabilities(policy_action, self.n_actions)

    def print(self):
        print(self.policy)

test_agent(PolicyIterationAgent(environment), environment)


Policy improvement stopped at iteration 1
Episode: 1
[E| | | |A| | | | |E]
[E| | | | |A| | | |E]
[E| | | | | |A| | |E]
[E| | | | | | |A| |E]
[E| | | | | | | |A|E]
[E| | | | | | | | |A]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]


In [145]:
class ValueIterationAgent(AbstractAgent):
    def __init__(self, environment: OneDMazeEnvironment, discount=0.9):
        self.n_actions = environment.get_actions_count()
        environment_probabilities = OneDMazeDynamics.build_environment_probabilities(environment)
        environment_rewards = environment.get_reward_values()
        self.policy = BellmansEquations.policy_value_iteration(environment.get_states_count(), self.n_actions, discount, environment_probabilities, environment_rewards)

    def get_action_policy(self, state):
        policy_action = self.policy[state]
        return PolicyFunctions.single_action_as_action_probabilities(policy_action, self.n_actions)

    def print(self):
        print(self.policy)
    
test_agent(ValueIterationAgent(environment), environment)

Stopped at iteration 59
Episode: 1
[E| | | |A| | | | |E]
[E| | | | |A| | | |E]
[E| | | | | |A| | |E]
[E| | | | | | |A| |E]
[E| | | | | | | |A|E]
[E| | | | | | | | |A]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]


In [146]:
import math


class SoftmaxFunctions:
    @staticmethod
    def actions_softmax(state_q, temperature):
        action_exponentials = [math.exp(q / temperature) for q in state_q]
        exponentials_sum = sum(action_exponentials)
        return [action_exponential / exponentials_sum for action_exponential in action_exponentials]

    @staticmethod
    def action_softmax(state_q, action, temperature):
        softmaxed_actions = SoftmaxFunctions.actions_softmax(state_q, temperature)
        return softmaxed_actions[action]


In [259]:
class ReinforceAgent(AbstractAgent):
    def __init__(self, environment: AbstractEnvironment, discount=0.9, learning_rate=0.1, softmax_temperature=0.9):
        self.learning_rate = learning_rate
        self.softmax_temperature = softmax_temperature
        self.discount = discount
        self.critic_state_value_table = StateValueTable(environment.get_states_count())
        self.actor_q_table = QTable(environment.get_states_count(), environment.get_actions_count())
        self.priors = []

    def get_action_policy(self, state):
        return SoftmaxFunctions.actions_softmax(self.actor_q_table.table[state], self.softmax_temperature)

    def post_act_learning(self, state, action, reward, next_state):
        self.priors.append((state, action, reward, next_state))

    def post_episode_learning(self):
        step_discount = self.discount ** (len(self.priors)-1)
        returns = 0
        for step in range(len(self.priors)-1, -1, -1):
            state, action, reward, _ = self.priors[step]
            returns = reward + step_discount * returns
            state_value_delta = returns - self.critic_state_value_table.table[state]
            self.critic_state_value_table.table[state] += self.learning_rate * state_value_delta
            action_softmax = SoftmaxFunctions.action_softmax(self.actor_q_table.table[state], action, self.softmax_temperature)
            self.actor_q_table.table[state][action] += self.learning_rate * state_value_delta * step_discount * (1 - action_softmax) / self.softmax_temperature
            step_discount /= self.discount
        self.priors = []

    def print(self):
        self.actor_q_table.print()

test_agent(ReinforceAgent(environment), environment)

Episode: 1
[E| | | |A| | | | |E]
[E| | | | |A| | | |E]
[E| | | | | |A| | |E]
[E| | | | | | |A| |E]
[E| | | | | | | |A|E]
[E| | | | | | | | |A]
State 0 -> [0, 0]
State 1 -> [0.7596410486503444, -1.6248582840303691]
State 2 -> [0.9691967865042435, -1.1260080418237726]
State 3 -> [-5.612780188750774, 1.5796043060647196]
State 4 -> [-5.836897658421763, 1.2693089783412013]
State 5 -> [-5.262794346677717, 1.34586350573414]
State 6 -> [-5.634121571548922, 1.279968737323111]
State 7 -> [-5.974904052622924, 1.2470817782988437]
State 8 -> [-7.158473275612698, 1.178099171541393]
State 9 -> [0, 0]


In [262]:
class ActorCriticAgent(AbstractAgent):
    def __init__(self, environment: AbstractEnvironment, discount=0.9, learning_rate=0.1, softmax_temperature=0.9):
        self.learning_rate = learning_rate
        self.softmax_temperature = softmax_temperature
        self.discount = discount
        self.critic_state_value_table = StateValueTable(environment.get_states_count())
        self.actor_q_table = QTable(environment.get_states_count(), environment.get_actions_count())

    def get_action_policy(self, state):
        return SoftmaxFunctions.actions_softmax(self.actor_q_table.table[state], self.softmax_temperature)

    def post_act_learning(self, state, action, reward, next_state):
        state_value = self.critic_state_value_table.table[state]
        next_state_value = self.critic_state_value_table.table[next_state]
        state_value_delta = reward + self.discount * next_state_value - state_value
        self.critic_state_value_table.table[state] += self.learning_rate * state_value_delta
        action_softmax = SoftmaxFunctions.action_softmax(self.actor_q_table.table[state], action, self.softmax_temperature)
        self.actor_q_table.table[state][action] += self.learning_rate * state_value_delta * (1 - action_softmax) / self.softmax_temperature

    def print(self):
        self.actor_q_table.print()

test_agent(ActorCriticAgent(environment), environment)
    

Episode: 1
[E| | | |A| | | | |E]
[E| | | | |A| | | |E]
[E| | | | | |A| | |E]
[E| | | | | | |A| |E]
[E| | | | | | | |A|E]
[E| | | | | | | | |A]
State 0 -> [0, 0]
State 1 -> [0.7016086143141989, -0.5748003161339443]
State 2 -> [0.1421739831692055, 0.9595776159965699]
State 3 -> [-7.045608529465373, 1.5434598751824706]
State 4 -> [-5.270055173661396, 1.6417733282204743]
State 5 -> [-5.640474119903142, 1.598290386464439]
State 6 -> [-5.539051128453065, 1.6172599457150714]
State 7 -> [-5.171342295341374, 1.813922982691584]
State 8 -> [-5.000259998721385, 2.04695087290606]
State 9 -> [0, 0]
