# On-policy Prediction with Approximation

The novelty in this chapter is that the approximate value function is represented not as a table  
but as a parameterized functional form with weight vector $\mathbf{w} \in \mathbb{R}^d$.  
We will write $\hat{v}(s,\mathbf{w}) \approx v_\pi(s)$ for the approximate value of state $s$ given weight vector $\mathbf{w}$.

Changing one weight changes the estimated value of many states.  
Consequently, when a single state is updated, the change generalizes from that state to affect the values of many other states.  
Such **generalization** makes the learning potentially more powerful but also potentially more difficult to manage and understand.

Extending reinforcement learning to function approximation also makes it applicable to partially observable problems,
in which the full state is not available to the agent.

## Value Function Approximation

## The Prediction Objective ($\overline{VE}$)

## Stochastic-gradient and Semi-gradient Methods

In [None]:
import collections

class GradientMonteCarloEstimateV():
    """Every-visit gradient Monte Carlo estimator of a state-value table.

    Transitions are buffered with `observe` during an episode; `optimize`
    then walks the episode backwards, accumulates the discounted return G
    and moves each visited entry towards it:

        state_value[s] += alpha * (G - state_value[s])

    With one weight per table entry the gradient of the estimate with
    respect to its own weight is 1, so this is the gradient Monte Carlo
    update for a table-lookup approximator.

    Args:
        gamma: Discount factor applied to future rewards.
        alpha: Step size of the update.
        policy: Callable mapping a state to an action.
    """

    def __init__(self, gamma, alpha, policy):
        self.gamma = gamma
        self.alpha = alpha

        self.policy = policy

        # NOTE(review): the (1, 10) shape is kept from the original; states are
        # used directly as indices into this array, so only index 0 (or a tuple
        # such as (0, k)) is valid -- confirm the intended state encoding.
        self.state_value = np.ones((1, 10))

        # Per-episode buffers, emptied at the end of `optimize`.
        self.states = []
        self.rewards = []

    def action(self, state):
        """Return the action chosen by the policy for `state`."""
        return self.policy(state)

    def observe(self, state, action, reward, next_state=None):
        """Record one transition of the current episode.

        `action` and `next_state` are accepted so the agent can be driven by
        callers that pass the full transition, but Monte Carlo prediction only
        needs the visited state and the reward that followed it.
        """
        self.states.append(state)
        self.rewards.append(reward)

    def optimize(self):
        """Update the value estimates from the buffered episode, then clear it."""
        g = 0

        # Walk backwards so the return can be accumulated incrementally.
        for state, reward in zip(reversed(self.states), reversed(self.rewards)):
            g = self.gamma * g + reward

            # The original divided by an undefined `self.returns` counter;
            # the step size passed to the constructor is used instead.
            self.state_value[state] += self.alpha * (g - self.state_value[state])

        self.states = []
        self.rewards = []

In [102]:
# Dyna Maze Env
from enum import Enum

import numpy as np

import gymnasium as gym
from gymnasium import spaces

class Actions(Enum):
    """Integer codes of the four moves, numbered 0 to 3 in declaration order."""

    RIGHT, UP, LEFT, DOWN = range(4)

class DynaMaze(gym.Env):
    """Deterministic grid maze with walls, a fixed start cell and a fixed target.

    Locations are (row, column) pairs. The agent starts at (2, 0) and an
    episode terminates when it reaches the target at (0, 8). Every step yields
    a reward of -1, except the step that reaches the target, which yields 0.
    A move into a wall cell leaves the agent where it is; a move off the grid
    is clipped to the grid border.

    Args:
        render_mode: None, or "ascii" to print the grid after every reset/step.
        grid_shape: (rows, columns) of the grid.
    """

    metadata = {"render_modes": ["ascii"]}

    def __init__(self, render_mode=None, grid_shape=(6, 9)):
        self._grid_shape = grid_shape

        # Rows and columns have different extents, so each axis gets its own
        # upper bound (the original bounded both axes by the row count, which
        # left the target column outside the declared space).
        # NOTE(review): `_get_obs` returns a string, not a dict matching this
        # space -- confirm which of the two is the intended observation format.
        lowest_cell = np.zeros(2, dtype=int)
        highest_cell = np.array(
            [self._grid_shape[0] - 1, self._grid_shape[1] - 1], dtype=int
        )
        self.observation_space = spaces.Dict(
            {
                "agent": spaces.Box(lowest_cell, highest_cell, dtype=int),
                "target": spaces.Box(lowest_cell, highest_cell, dtype=int),
            }
        )

        self._agent_location = np.array([2, 0], dtype=int)
        self._target_location = np.array([0, 8], dtype=int)

        self._walls_locations = np.array([[1, 2], [2, 2], [3, 2], [4, 5], [0, 7], [1, 7], [2, 7]], dtype=int)

        # We have 4 actions, corresponding to "right", "up", "left", "down"
        self.action_space = spaces.Discrete(4)

        # Maps abstract actions from `self.action_space` to the (row, column)
        # offset we walk in if that action is taken,
        # i.e. 0 corresponds to "right", 1 to "up" etc.
        self._action_to_direction = {
            Actions.UP.value: np.array([-1, 0]),
            Actions.DOWN.value: np.array([1, 0]),
            Actions.LEFT.value: np.array([0, -1]),
            Actions.RIGHT.value: np.array([0, 1]),
        }

        assert render_mode is None or render_mode in self.metadata["render_modes"]
        self.render_mode = render_mode

    def _get_obs(self):
        """Return the agent location formatted as a string."""
        return str(self._agent_location)

    def _get_info(self):
        """Return the L1 (Manhattan) distance between agent and target."""
        return {
            "distance": np.linalg.norm(
                self._agent_location - self._target_location, ord=1
            )
        }

    def _render_frame(self):
        """Print the grid when rendering is enabled (1 = agent, 6 = target)."""
        if self.render_mode == "ascii":
            # Use the configured shape instead of a hard-coded (6, 9) grid.
            grid = np.zeros(self._grid_shape)
            grid[self._agent_location[0], self._agent_location[1]] = 1
            grid[self._target_location[0], self._target_location[1]] = 6
            print(grid, flush=True)

    def step(self, action):
        """Apply `action` and return (observation, reward, terminated, truncated, info)."""
        # Map the action (element of {0,1,2,3}) to the direction we walk in
        direction = self._action_to_direction[action]
        candidate = self._agent_location + direction

        # A move into a wall cell is cancelled: the agent stays in place.
        hits_wall = np.any(np.all(candidate == self._walls_locations, axis=1))
        if not hits_wall:
            # We use `np.clip` to make sure we don't leave the grid
            last_cell = np.array(
                [self._grid_shape[0] - 1, self._grid_shape[1] - 1]
            )
            self._agent_location = np.clip(candidate, 0, last_cell)

        # An episode is done iff the agent has reached the target.
        # Cast to a plain bool so callers do not receive a numpy scalar.
        terminated = bool(np.all(self._agent_location == self._target_location))
        reward = 0 if terminated else -1
        observation = self._get_obs()
        info = self._get_info()

        self._render_frame()

        # This environment never truncates by itself.
        return observation, reward, terminated, False, info

    def reset(self, seed=None, options=None):
        """Put the agent back on the start cell and return (observation, info)."""
        # We need the following line to seed self.np_random
        super().reset(seed=seed)

        # reset agent's position
        self._agent_location = np.array([2, 0], dtype=int)

        observation = self._get_obs()
        info = self._get_info()

        self._render_frame()

        return observation, info

In [103]:
def argmax(array):
    """Return the index of a maximal entry of `array`, picking at random among ties."""
    is_best = array == np.max(array)
    tied_indices = np.where(is_best)[0]
    return np.random.choice(tied_indices)

def get_epsilon_greedy_policy(epsilon=0.1):
    """Build an epsilon-greedy policy.

    The returned callable takes `(state_action_value, state)`. It draws a
    number in [0, 1); below `epsilon` it returns a random action index for
    that state, otherwise the `argmax` of that state's action values.
    """
    def epsilon_greedy_policy(state_action_value, state):
        draw = np.random.uniform(0, 1)

        # Exploration branch: any action index of this state.
        if draw < epsilon:
            nb_actions = len(state_action_value[state])
            return np.random.randint(0, nb_actions)

        # Exploitation branch: best-valued action.
        return argmax(state_action_value[state])

    return epsilon_greedy_policy

In [104]:
def play_env(env, agent):
    """Run one episode of `env` with `agent` and let the agent learn from it.

    The agent is asked for an action at every step, shown the transition via
    `agent.observe(observation, action, reward, new_observation)`, and asked
    to `optimize()` once when the episode is over.

    Args:
        env: Environment exposing the Gymnasium `reset()` / `step(action)` API.
        agent: Object exposing `action`, `observe` and `optimize`.

    Returns:
        Tuple `(reward_sum, nb_steps)`: the undiscounted sum of rewards and
        the number of steps taken in the episode.
    """
    reward_sum = 0
    nb_steps = 0

    observation, _ = env.reset()

    # An episode ends on termination OR truncation; ignoring `truncated`
    # would loop forever on environments that only ever truncate.
    terminated = False
    truncated = False
    while not (terminated or truncated):
        action = agent.action(observation)

        new_observation, reward, terminated, truncated, _ = env.step(action)

        agent.observe(observation, action, reward, new_observation)

        observation = new_observation

        reward_sum += reward
        nb_steps += 1

    agent.optimize()

    return reward_sum, nb_steps

In [105]:
# Build the maze environment with its default render mode and grid shape.
env = DynaMaze()
# NOTE(review): TabularDynaQ is not defined in the visible part of this file;
# the positional arguments 0.95, 0.1 and 5 are presumably the discount factor,
# the step size and the number of planning steps -- confirm against its constructor.
agent = TabularDynaQ(env.action_space, 0.95, 0.1, 5, get_epsilon_greedy_policy())


In [106]:
# Per-episode reward sums, one entry per played episode.
rewards = []
# Cumulative number of environment steps; starts at 0 so that entry k holds
# the total number of steps taken after k episodes.
time_steps = [0]


# Play 300 episodes, recording the result of each one.
for i in range(300):
    reward, steps = play_env(env, agent)

    rewards.append(reward)
    time_steps.append(time_steps[-1] + steps)

In [107]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# `subplot_titles` expects one string per subplot; a bare string would be
# indexed character by character and title the panels "W" and "i".
fig = make_subplots(rows=1, cols=2, subplot_titles=("Time steps", "Mean Reward"))

# Left panel: number of finished episodes against cumulative time steps.
x = np.array(time_steps)
y = np.arange(len(time_steps))
fig.add_trace(
    go.Scatter(
        x=x,
        y=y,
        line_color="red",
        name="Time steps",
    ),
    row=1,
    col=1,
)

# Right panel: moving average of the per-episode reward sum.
window = 5
y = np.convolve(rewards, np.ones(window) / window, mode="valid")
# Derive x from y so both always have the same length ("valid" mode drops
# window - 1 points, not the 10 the original slice removed).
x = np.arange(len(y))

fig.add_trace(
    go.Scatter(
        x=x,
        y=y,
        line_color="green",
        name="Mean Reward",
    ),
    row=1,
    col=2,
)

fig.update_layout(
    title="test",
    legend_title="Parameters",
)

fig.show()