# Homework 9

## Value from Q


### Mountain Car MDP and Regressors
**Note**: these imports and functions are available in catsoop. You do not need to copy them in.

In [None]:
import numpy as np
from functools import partial


class MountainCar:
    '''
    Mountain Car MDP.

    The state is a tuple of two floats (x, v), denoting the position and
    velocity of the car.

    Args:
    - goal_velocity : float - minimum velocity required at the goal position
                      for a state to be terminal
    - difficulty : str - 'hard' draws the start position from [-0.6, -0.4];
                   any other value draws it from [0.0, 0.5]
    - rng : np.random.Generator or None - source of randomness for start
            states and force noise. When None, a generator seeded with 42
            is created for this instance.
    '''
    def __init__(self, goal_velocity=0, difficulty='hard', rng=None):
        self.difficulty = difficulty
        self.min_position = -1.2
        self.max_position = 0.6
        self.max_speed = 0.07
        self.goal_position = 0.5
        self.goal_velocity = goal_velocity

        self.force = 0.001
        self.gravity = 0.0025
        self.force_noise = 0.0002

        self.low = np.array([self.min_position, -self.max_speed], dtype=np.float32)
        self.high = np.array([self.max_position, self.max_speed], dtype=np.float32)
        self.actions = (0, 1, 2)          # (left_acc, none, right_acc)
        self.discount_factor = 1.0
        # A Generator written as a default argument is built once, when the
        # function is defined, and would then be shared (and advanced) by
        # every instance. Create it here so each instance owns its stream.
        self.rng = np.random.default_rng(42) if rng is None else rng

        self.time_factor = 10.

        self.init_state()

    def init_state(self):
        '''
        Draws a new start state (zero velocity, random position) and stores
        it in self.state.

        Returns:
        - state : (x_position : float, velocity : float)
        '''
        if self.difficulty == 'hard':
            self.state = (self.rng.uniform(-0.6, -0.4), 0.0)
        else:
            self.state = (self.rng.uniform(0.0, 0.5), 0.0)
        return self.state

    def terminal(self, s):
        '''
        Returns True when the position has reached goal_position and the
        velocity is at least goal_velocity.
        '''
        position, velocity = s
        return bool(position >= self.goal_position and velocity >= self.goal_velocity)

    def sim_transition(self, action: int):
        '''
        Advances self.state by one step under the given action.

        Args:
        - action : {0, 1, 2}, indicating left, none, right
        Returns:
        - reward : float
        - state : (x_position : float, velocity : float)
        '''
        position, velocity = self.state
        noise = self.rng.normal(scale=self.force_noise)
        acceleration = (action - 1) * self.force + np.cos(3 * position) * (-self.gravity) + noise
        velocity = velocity + acceleration * self.time_factor
        velocity = np.clip(velocity, -self.max_speed, self.max_speed)
        position = position + velocity * self.time_factor
        position = np.clip(position, self.min_position, self.max_position)
        # The car stops when it runs into the left boundary.
        if position == self.min_position and velocity < 0:
            velocity = 0
        reward = -1.0

        self.state = (position, velocity)

        return reward, self.state

    def sim_episode(self, policy, max_iters = 20):
        '''
        Rolls out one episode from a fresh start state.

        When a terminal state is encountered, one zero-reward self-transition
        per action is appended and the episode ends.

        Args:
        - policy : state -> action
        - max_iters : int - maximum number of simulated steps
        Returns:
        - traj : [tuple (state, action, reward, next_state)]
        '''
        traj = []
        s = self.init_state()
        for _ in range(max_iters):
            if self.terminal(s):
                traj.extend((s, a, 0, s) for a in self.actions)
                return traj
            a = policy(s)
            (r, s_prime) = self.sim_transition(a)
            traj.append((s, a, r, s_prime))
            s = s_prime
        return traj

    def evaluate(self, n_play, traj_length, policy):
        '''
        Returns the total episode reward averaged over n_play episodes of at
        most traj_length steps each.
        '''
        score = 0
        for _ in range(n_play):
            score += sum(x[2] for x in self.sim_episode(policy=policy, max_iters=traj_length)) # reward
        return score/n_play


class QFRegressor:
    '''
    Base class for fitted Q-function regressors.

    Subclasses override fq_q_value; the value and policy helpers here are
    derived from it. Until self.fitted is set to True, the policy helpers
    return a uniformly random action.

    Args:
    - mdp : object exposing an `actions` tuple
    - rng : np.random.Generator or None - used for random actions. When
            None, a generator seeded with 42 is created for this instance.
    '''
    def __init__(self, mdp, rng=None):
        self.mdp = mdp
        self.fitted = False
        # Build the generator per instance: a Generator used as a default
        # argument would be shared by every regressor ever constructed.
        self.rng = np.random.default_rng(42) if rng is None else rng

    def fq_q_value(self, s, a):
        '''Returns the estimated Q value of taking action a in state s.'''
        raise NotImplementedError('Override me')

    def fq_value(self, s):
        '''Returns the state value of s derived from fq_q_value.'''
        return compute_value_from_q(self.mdp.actions, self.fq_q_value, s)

    def fq_greedy(self, s):
        '''Returns the greedy action at s, or a random action before fitting.'''
        if not self.fitted:
            return self.rng.choice(self.mdp.actions)
        return greedy_policy_from_q(self.mdp.actions, self.fq_q_value, s)

    def fq_epsilon_greedy(self, s, eps):
        '''Returns an epsilon-greedy action at s, or a random action before fitting.'''
        if not self.fitted:
            return self.rng.choice(self.mdp.actions)
        return epsilon_greedy_policy_from_q(self.mdp.actions, self.fq_q_value, s, eps, self.rng)

class NeuralRegressor(QFRegressor):
    '''Q-function regressor that keeps one scikit-learn MLPRegressor per action.'''

    def initialize(self, max_iter=1000, hidden_layer_sizes=(40,40)):
        '''Creates a fresh network for every action and marks the regressor as unfitted.'''
        from sklearn.neural_network import MLPRegressor
        models = {}
        for action in self.mdp.actions:
            models[action] = MLPRegressor(
                hidden_layer_sizes=hidden_layer_sizes,
                max_iter=max_iter,
                learning_rate_init=0.03,
            )
        self.fq_models = models
        self.fitted = False

    def fq_q_value(self, s, a):
        '''Returns the prediction of action a's model for the single state s.'''
        features = np.array(s).reshape(1, -1)   # one sample, one row
        prediction = self.fq_models[a].predict(features)
        return prediction[0]

    def fit(self, a, X, Y):
        '''Fits action a's model on inputs X and targets Y.'''
        model = self.fq_models[a]
        model.fit(X, Y)

class KNNRegressor(QFRegressor):
    '''Q-function regressor that keeps one scikit-learn KNeighborsRegressor per action.'''

    def initialize(self, n_neighbors=3):
        '''Creates a fresh KNN model for every action and marks the regressor as unfitted.'''
        from sklearn.neighbors import KNeighborsRegressor
        models = {}
        for action in self.mdp.actions:
            models[action] = KNeighborsRegressor(n_neighbors=n_neighbors)
        self.fq_models = models
        self.fitted = False

    def fq_q_value(self, s, a):
        '''Returns the prediction of action a's model for the single state s.'''
        features = np.array(s).reshape(1, -1)   # one sample, one row
        prediction = self.fq_models[a].predict(features)
        return prediction[0]

    def fit(self, a, X, Y):
        '''Fits action a's model on inputs X and targets Y.'''
        model = self.fq_models[a]
        model.fit(X, Y)

def get_state_grid(x_divisions, v_divisions, mdp):
    '''
    Builds an evenly spaced grid over the MountainCar state space,
    [mdp.min_position, mdp.max_position] x [-mdp.max_speed, mdp.max_speed].

    Points are ordered position-major: all velocities for the first
    position, then all velocities for the second position, and so on.

    Args:
    - x_divisions : int - number of grid points along the position axis
    - v_divisions : int - number of grid points along the velocity axis
    - mdp : MountainCar
    Returns:
    - grid : list(state)
    '''
    positions = np.linspace(mdp.min_position, mdp.max_position, x_divisions)
    return [
        (pos, vel)
        for pos in positions
        for vel in np.linspace(-mdp.max_speed, mdp.max_speed, v_divisions)
    ]

### Question
Compute the value function from the Q function.

For reference, our solution is **3** line(s) of code.

In [None]:
def compute_value_from_q(action_space, q_function, state):
    """Given the action space, a q_function and a state, compute the value 
    for this state using the q_function.

    The value of a state is the largest Q value over all available actions.

    Args:
    - action_space : tuple of actions
    - q_function : (state, action) -> q_value : float
    - state : state
    Returns:
    - value : float - the value of this state
    Raises:
    - ValueError : if action_space is empty
    """
    if not action_space:
        raise ValueError("action_space must contain at least one action")
    return max(q_function(state, action) for action in action_space)

### Tests

In [None]:
def value_from_q_problem_test():
    """Checks that the state value is the best Q value wherever that value sits."""
    actions = (0, 1, 2)
    action_scores = [-3, 1, 5.]
    test_state = 3

    def q(state, action):
        assert(state == test_state)
        assert action in actions
        return action_scores[action]

    def rotate(arr):
        # Move the first score to the end of the list.
        arr.append(arr.pop(0))

    # The maximum score is 5 no matter which action currently holds it.
    for shift in range(3):
        if shift:
            rotate(action_scores)
        assert(compute_value_from_q(actions, q, test_state) == 5.)

value_from_q_problem_test()

print('Tests passed.')

## Greedy Policy from Q


### Question
Write the greedy policy given a Q function.

For reference, our solution is **6** line(s) of code.

In [None]:
def greedy_policy_from_q(action_space, q_function, state):
    """Given the action space, a q_function and a state, compute the action
    taken by the greedy policy at this state, under the q_function.

    The greedy action is the one with the largest Q value; ties go to the
    action that appears first in action_space.

    Args:
    - action_space : tuple of actions
    - q_function : (state, action) -> q_value : float
    - state : state

    Return:
    - action

    Raises:
    - ValueError : if action_space is empty
    """
    if not action_space:
        raise ValueError("action_space must contain at least one action")
    return max(action_space, key=lambda action: q_function(state, action))

### Tests

In [None]:
def greedy_policy_from_q_test():
    """Checks that the greedy policy follows the best score as it moves between actions."""
    actions = (0, 1, 2)
    action_scores = [-3, 1, 5.]
    test_state = 3

    def q(state, action):
        assert(state == test_state)
        assert action in actions
        return action_scores[action]

    def rotate(arr):
        # Move the first score to the end of the list.
        arr.append(arr.pop(0))

    # Each rotation shifts the best score one action to the left.
    for shift, best_action in enumerate((2, 1, 0)):
        if shift:
            rotate(action_scores)
        assert(greedy_policy_from_q(actions, q, test_state) == best_action)

greedy_policy_from_q_test()

print('Tests passed.')

## Epsilon Greedy Policy from Q


### Question
Write the epsilon greedy policy given a Q function. With probability epsilon, the policy should pick a random action, and with probability 1-epsilon, pick the greedy action. You may use the `greedy_policy_from_q` function. You may find `rng.choice` and `rng.random` useful.

For reference, our solution is **6** line(s) of code.

In addition to all the utilities defined at the top of the Colab notebook, the following functions are available in this question environment: `greedy_policy_from_q`. You may not need to use all of them.

In [None]:
def epsilon_greedy_policy_from_q(action_space, q_function, state, eps, rng):
    """Given the action space, a q_function and a state, compute the action
    taken by an epsilon-greedy policy at this state, under the q_function.

    With probability eps a uniformly random action is returned; otherwise
    the action with the largest Q value is returned (ties go to the action
    that appears first in action_space).

    Args:
    - action_space : tuple of actions
    - q_function : (state, action) -> q_value : float
    - state : state
    - eps : [0, 1]
    - rng : np.random.Generator

    Return:
    - action
    """
    if rng.random() < eps:
        # Index into the tuple so the caller gets back the original action
        # object rather than a NumPy-converted copy.
        return action_space[rng.integers(len(action_space))]
    return max(action_space, key=lambda action: q_function(state, action))

### Tests

In [None]:
def epsilon_greedy_policy_from_q_test():
    """Checks the empirical action frequencies of the epsilon-greedy policy.

    Each non-greedy action should be chosen with probability eps/3 and the
    greedy action with probability eps/3 + (1 - eps). Every count must lie
    within three binomial standard deviations of its expectation.
    """
    actions = (0, 1, 2)
    action_scores = [-3, 1, 5.]
    test_state = 3
    rng = np.random.default_rng(42)
    eps = 0.7
    N = 1000

    def q(state, action):
        assert(state == test_state)
        assert action in actions
        return action_scores[action]

    runs = [epsilon_greedy_policy_from_q(actions, q, test_state, eps, rng) for _ in range(N)]
    counts = [sum(1 for j in runs if j == i) for i in actions]

    def binary_std(p):
        return (p * (1-p) * N)**.5

    p_explore = eps / len(actions)
    # The greedy action is picked on exploit steps and on a third of the
    # explore steps, so its tolerance must use its own probability.
    p_greedy = p_explore + (1 - eps)
    assert(np.abs(counts[0] - N * p_explore) < binary_std(p_explore) * 3)
    assert(np.abs(counts[1] - N * p_explore) < binary_std(p_explore) * 3)
    assert(np.abs(counts[2] - N * p_greedy) < binary_std(p_greedy) * 3)
    assert(sum(counts) == N)

epsilon_greedy_policy_from_q_test()

print('Tests passed.')

## Sampling points in grid


### Question
Next, we'd want to sample points in the state space on which to perform Bellman backups. The following function samples the state space in an evenly spaced grid, and returns the sampled points and their corresponding transitions for every possible action.

In [None]:
def sample_grid_points(x_divisions, v_divisions, mdp):
    '''
    Samples one transition for every (grid state, action) pair.

    The grid has x_divisions times v_divisions points spread across
    [mdp.min_position, mdp.max_position] x [-mdp.max_speed, mdp.max_speed]
    and comes from get_state_grid. For a terminal grid state the recorded
    transition is a zero-reward self-loop; otherwise mdp.state is set to the
    grid state and mdp.sim_transition is used to sample the outcome.

    Note that mdp.state is overwritten while sampling.

    Args:
    - x_divisions : int
    - v_divisions : int
    - mdp : MountainCar
    Return:
    - memory : list(tuple (state, action, reward, next_state))
    '''
    transitions = []
    for state in get_state_grid(x_divisions, v_divisions, mdp):
        for action in mdp.actions:
            if mdp.terminal(state):
                transitions.append((state, action, 0., state))
                continue
            mdp.state = state   # start the simulated step from the grid point
            reward, next_state = mdp.sim_transition(action)
            transitions.append((state, action, reward, next_state))
    return transitions

### Tests

In [None]:
def sample_grid_points_test():
    """Checks sample_grid_points against precomputed noise-free transitions."""
    mc = MountainCar()
    mc.force_noise = 0.
    results = sample_grid_points(3, 4, mc)
    my_results = [
        ((-1.2, -0.07), 0, -1.0, (-1.2, 0)),
        ((-1.2, -0.07), 1, -1.0, (-1.2, 0)),
        ((-1.2, -0.07), 2, -1.0, (-1.2, 0)),
        ((-1.2, -0.023333333333333338), 0, -1.0, (-1.2, 0)),
        ((-1.2, -0.023333333333333338), 1, -1.0, (-1.2, 0)),
        ((-1.2, -0.023333333333333338), 2, -1.0, (-1.1091437292497965, 0.009085627075020343)),
        ((-1.2, 0.02333333333333333), 0, -1.0, (-0.8424770625831298, 0.035752293741687015)),
        ((-1.2, 0.02333333333333333), 1, -1.0, (-0.7424770625831298, 0.04575229374168702)),
        ((-1.2, 0.02333333333333333), 2, -1.0, (-0.6424770625831299, 0.05575229374168701)),
        ((-1.2, 0.07), 0, -1.0, (-0.4999999999999999, 0.07)),
        ((-1.2, 0.07), 1, -1.0, (-0.4999999999999999, 0.07)),
        ((-1.2, 0.07), 2, -1.0, (-0.4999999999999999, 0.07)),
        ((-0.30000000000000004, -0.07), 0, -1.0, (-1.0, -0.07)),
        ((-0.30000000000000004, -0.07), 1, -1.0, (-1.0, -0.07)),
        ((-0.30000000000000004, -0.07), 2, -1.0, (-1.0, -0.07)),
        ((-0.30000000000000004, -0.023333333333333338), 0, -1.0, (-0.7887358254009995, -0.04887358254009995)),
        ((-0.30000000000000004, -0.023333333333333338), 1, -1.0, (-0.6887358254009995, -0.03887358254009995)),
        ((-0.30000000000000004, -0.023333333333333338), 2, -1.0, (-0.5887358254009996, -0.028873582540099946)),
        ((-0.30000000000000004, 0.02333333333333333), 0, -1.0, (-0.32206915873433284, -0.002206915873433281)),
        ((-0.30000000000000004, 0.02333333333333333), 1, -1.0, (-0.22206915873433283, 0.007793084126566721)),
        ((-0.30000000000000004, 0.02333333333333333), 2, -1.0, (-0.12206915873433283, 0.017793084126566723)),
        ((-0.30000000000000004, 0.07), 0, -1.0, (0.1445975079323339, 0.044459750793233395)),
        ((-0.30000000000000004, 0.07), 1, -1.0, (0.24459750793233392, 0.0544597507932334)),
        ((-0.30000000000000004, 0.07), 2, -1.0, (0.3445975079323339, 0.06445975079323339)),
        ((0.6, -0.07), 0, -1.0, (-0.10000000000000009, -0.07)),
        ((0.6, -0.07), 1, -1.0, (-0.04319947632672838, -0.06431994763267283)),
        ((0.6, -0.07), 2, -1.0, (0.0568005236732716, -0.05431994763267284)),
        ((0.6, -0.023333333333333338), 0, -1.0, (0.3234671903399383, -0.027653280966006166)),
        ((0.6, -0.023333333333333338), 1, -1.0, (0.42346719033993835, -0.017653280966006164)),
        ((0.6, -0.023333333333333338), 2, -1.0, (0.5234671903399384, -0.007653280966006166)),
        ((0.6, 0.02333333333333333), 0, 0.0, (0.6, 0.02333333333333333)),
        ((0.6, 0.02333333333333333), 1, 0.0, (0.6, 0.02333333333333333)),
        ((0.6, 0.02333333333333333), 2, 0.0, (0.6, 0.02333333333333333)),
        ((0.6, 0.07), 0, 0.0, (0.6, 0.07)),
        ((0.6, 0.07), 1, 0.0, (0.6, 0.07)),
        ((0.6, 0.07), 2, 0.0, (0.6, 0.07)),
    ]

    def recur_match(a, b, fn):
        # Compare nested tuples leaf by leaf. A mismatch on any leaf must
        # propagate upwards, otherwise every pair of tuples would "match".
        if isinstance(a, tuple):
            assert isinstance(b, tuple)
            assert len(a) == len(b)
            return all(recur_match(x, y, fn) for x, y in zip(a, b))
        return fn(a, b)

    def close(x, y):
        return np.abs(x - y) < 1e-6

    # One transition per (grid state, action) pair is expected.
    assert(len(results) == len(my_results))
    for r in results:
        assert(any(recur_match(dr, r, close) for dr in my_results))

sample_grid_points_test()

print('Tests passed.')

## Sampling points from policy


### Question
Another way to sample points is to collect them by rolling out trajectories from a policy. Implement a function that samples several trajectories from a given policy and concatenates the trajectories together to produce a list of (state, action, reward, next_state) tuples.

For reference, our solution is **8** line(s) of code.

In [None]:
def sample_policy_points(policy, traj_length, num_traj, mdp):
    """Produce samples in the state space by rolling out a policy for
    traj_length steps for num_traj rollouts. Use `mdp.sim_episode`
    to obtain rollouts.

    The rollouts are concatenated in the order they were generated.

    Args:
    - policy : state -> action
    - traj_length : int  - length of rollout
    - num_traj : int - number of trajectories to rollout
    - mdp : MountainCar
    Return:
    - memory : [tuple (state, action, reward, next_state)]
    """
    memory = []
    for _ in range(num_traj):
        memory.extend(mdp.sim_episode(policy, max_iters=traj_length))
    return memory

### Tests

In [None]:
def sample_policy_points_test():
    """Checks sample_policy_points against rollouts from an identically seeded MDP."""
    traj_length, num_traj = 5, 2
    policy = lambda s: int(1 + np.sign(s[1]))

    # Reference: the same number of episodes drawn straight from
    # MountainCar.sim_episode, using the same seed as the MDP under test.
    reference_mdp = MountainCar(rng=np.random.default_rng(3))
    expected = []
    for _ in range(num_traj):
        expected.extend(reference_mdp.sim_episode(policy, max_iters=traj_length))

    mc = MountainCar(rng=np.random.default_rng(3))
    results = sample_policy_points(policy, traj_length, num_traj, mc)

    def flatten(transition):
        s, a, r, s_prime = transition
        return np.array([s[0], s[1], a, r, s_prime[0], s_prime[1]], dtype=float)

    assert(len(results) == len(expected))
    for got, want in zip(results, expected):
        assert(len(got) == 4)
        assert(np.allclose(flatten(got), flatten(want), atol=1e-6))

sample_policy_points_test()

print('Tests passed.')

## Fitted Q Visualization


### Question
Complete the `fitted_Q_learn` function given in the colab notebook. You're now ready to conduct fitted Q learning on the Mountain Car Problem! You may use either sampling method (grid or policy), and either regression method (`KNNRegressor` or `NeuralRegressor`).

In [None]:

import matplotlib
from matplotlib import pyplot as plt
from matplotlib import animation
from IPython import display as display
matplotlib.rc('animation', html='jshtml')

def visualize_qf(qf_regressor):
    '''
    Draws the value function of a fitted Q regressor as a heat map, with
    position on the horizontal axis and velocity on the vertical axis.

    The value is evaluated on a 50 x 50 grid over the MDP's state bounds.

    Args:
    - qf_regressor : QFRegressor
    '''
    min_x, max_x = qf_regressor.mdp.min_position, qf_regressor.mdp.max_position
    min_v, max_v = -qf_regressor.mdp.max_speed, qf_regressor.mdp.max_speed
    # Rows follow increasing velocity, columns follow increasing position.
    vf = np.array([
          [
              qf_regressor.fq_value((x, v))
            for x in np.linspace(min_x, max_x, 50)
          ]
        for v in np.linspace(min_v, max_v, 50)
    ])
    # Row 0 holds the lowest velocity, so it has to be drawn at the bottom;
    # with the default origin ('upper') the picture would be flipped relative
    # to the tick labels implied by `extent`.
    im = plt.imshow(vf, extent=(min_x, max_x, min_v, max_v), aspect='auto', origin='lower')
    plt.colorbar(im)

def visualize_traj(traj):
    '''
    Visualizes a trajectory. Call with the output of MountainCar.sim_episode

    The car is drawn as a red dot on the curve y = sin(3x). Each frame uses
    the position stored in the start state of one transition.

    Args:
    - traj : [tuple (state, action, reward, next_state)]
    '''
    # based off https://github.com/mpatacchiola/dissecting-reinforcement-learning/blob/master/environments/mountain_car.py#L105
    mode = 'jupyter'
    file_path='./mountain_car.mp4'

    # Plot init
    fig = plt.figure()
    ax = fig.add_subplot(111, autoscale_on=False, xlim=(-1.2, 0.6), ylim=(-1.1, 1.1))
    ax.grid(False)  # disable the grid
    x_sin = np.linspace(start=-1.2, stop=0.6, num=100)
    y_sin = np.sin(3 * x_sin)
    ax.plot(x_sin, y_sin)  # plot the sine wave
    dot, = ax.plot([], [], 'ro')
    time_text = ax.text(0.05, 0.9, '', transform=ax.transAxes)
    _position_list = [s[0][0] for s in traj]
    _delta_t = .6

    def _init():
        dot.set_data([], [])
        time_text.set_text('')
        return dot, time_text

    def _animate(i):
        x = _position_list[i]
        y = np.sin(3 * x)
        # set_data expects sequences of coordinates, so the single point is
        # wrapped in one-element lists.
        dot.set_data([x], [y])
        time_text.set_text("Time: " + str(np.round(i*_delta_t, 1)) + "s" + '\n' + "Frame: " + str(i))
        return dot, time_text

    ani = animation.FuncAnimation(fig, _animate, np.arange(1, len(_position_list)),
                                    blit=True, init_func=_init, repeat=True, interval=_delta_t * 1000)

    if mode == 'gif':
        ani.save(file_path, writer='imagemagick', fps=int(1/_delta_t))
    elif mode == 'mp4':
        ani.save(file_path, fps=int(1/_delta_t), writer='avconv', codec='libx264')
    elif mode == 'jupyter':
        video = ani.to_jshtml()
        html = display.HTML(video)
        display.display(html)
        plt.close()


def fitted_Q_learn(mdp, sampler, qf_regressor, iters):
    '''
    Takes in a MountainCar instance, a sampling method, a fitted q
    regression method, and the number of iterations. Runs fitted Q
    learning with that many iterations.

    Every iteration draws a fresh batch of transitions, builds one
    regression target per transition, and refits one model per action.
    The greedy policy is evaluated and printed after each iteration, and
    the value function is plotted every PRINT_EPOCH iterations.

    Args:
    - mdp : MountainCar
    - sampler : (mdp : MountainCar) -> memory : [tuple (state, action, reward, next_state)]
    - qf_regressor : QFRegressor
    - iters : int
    '''
    PRINT_EPOCH = 6
    qf_regressor.initialize()

    for it in range(iters):
        Xd = {a: [] for a in mdp.actions}
        Yd = {a: [] for a in mdp.actions}
        memory = sampler(mdp)
        for (s, a, r, s_prime) in memory:
            if it == 0 or mdp.terminal(s):
                # No model has been fitted yet on the first iteration, and
                # nothing follows a terminal state, so the target is just
                # the immediate reward.
                v = r
            else:
                # Bellman backup with the current value estimate of s'.
                v = r + mdp.discount_factor * qf_regressor.fq_value(s_prime)
            Xd[a].append(s)
            Yd[a].append(np.array([v]))
        for a in mdp.actions:
            X = np.vstack(Xd[a])
            Y = np.vstack(Yd[a])
            Y = Y[:, 0]   # flatten the column of targets to 1-D
            qf_regressor.fit(a, X, Y)
        qf_regressor.fitted = True
        print(f'Iteration {it}:  {mdp.evaluate(n_play=10, traj_length=100, policy=qf_regressor.fq_greedy)}')
        if it % PRINT_EPOCH == PRINT_EPOCH-1:
            visualize_qf(qf_regressor)
            plt.show()

This shows an example of running fitted Q learning using KNNRegressor with policy sampling. Try it with different regressors, sampling methods and sampling parameters!

In [None]:
def run_fitted_q_example_1():
    '''
    Example: fitted Q learning with a KNNRegressor, collecting samples by
    rolling out the regressor's epsilon-greedy policy. Prints the expected
    reward of the greedy policy, animates one greedy episode and plots the
    learned value function.
    '''
    iterations, rollout_length, rollouts, epsilon = 10, 40, 20, 0.4
    car = MountainCar()
    regressor = KNNRegressor(car)

    def explore(state):
        # Behaviour policy used to collect training samples.
        return regressor.fq_epsilon_greedy(state, epsilon)

    collect = partial(sample_policy_points, explore, rollout_length, rollouts)
    fitted_Q_learn(mdp=car, sampler=collect, qf_regressor=regressor, iters=iterations)

    print('expected reward =', car.evaluate(100, 100, regressor.fq_greedy))
    greedy_rollout = car.sim_episode(policy=regressor.fq_greedy, max_iters=100)
    visualize_traj(greedy_rollout)
    plt.show()
    visualize_qf(regressor)
    plt.show()
def run_fitted_q_example_2():
    '''
    Example: fitted Q learning with a KNNRegressor, collecting samples on a
    15 x 15 grid over the state space. Prints the expected reward of the
    greedy policy, animates one greedy episode and plots the learned value
    function.
    '''
    iterations = 10
    grid_shape = (15, 15)   # (position divisions, velocity divisions)
    car = MountainCar()
    regressor = KNNRegressor(car)

    collect = partial(sample_grid_points, *grid_shape)
    fitted_Q_learn(mdp=car, sampler=collect, qf_regressor=regressor, iters=iterations)

    print('expected reward =', car.evaluate(100, 100, regressor.fq_greedy))
    greedy_rollout = car.sim_episode(policy=regressor.fq_greedy, max_iters=100)
    visualize_traj(greedy_rollout)
    plt.show()
    visualize_qf(regressor)
    plt.show()

## Imports and Utilities
**Note**: these imports and functions are available in catsoop. You do not need to copy them in.

In [None]:
from collections import defaultdict
from math import sqrt, log
import abc
import numpy as np
import functools


class MDP:
    """Base interface for a Markov Decision Process.

    Subclasses provide the state and action sets, the reward function and
    the transition distribution; the remaining members have defaults.
    """

    @property
    @abc.abstractmethod
    def state_space(self):
        """The set of states of this MDP."""
        raise NotImplementedError("Override me")

    @property
    @abc.abstractmethod
    def action_space(self):
        """The set of actions of this MDP."""
        raise NotImplementedError("Override me")

    @property
    def temporal_discount_factor(self):
        """The discount factor gamma; 1 unless overridden."""
        return 1.

    @property
    def horizon(self):
        """The horizon H; infinite unless overridden."""
        return float("inf")

    def state_is_terminal(self, state):
        """Tell whether `state` is a terminal (done) state.

        The base implementation treats no state as terminal.

        Args:
            state: A state.

        Returns:
            state_is_terminal : A bool.
        """
        return False

    @abc.abstractmethod
    def get_reward(self, state, action, next_state=None):
        """Return the (deterministic) reward for taking `action` in `state`.

        Args:
            state: A current state.
            action: An action.
            next_state: Optional. A next state.

        Returns:
            reward : Single time step reward.
        """
        raise NotImplementedError("Override me")

    @abc.abstractmethod
    def get_transition_distribution(self, state, action):
        """Return the distribution over next states.

        How the distribution is represented depends on the MDP, e.g. on
        whether its states are discrete or continuous.

        Args:
            state: A current state.
            action: An action.

        Returns:
            next_state_distribution: Distribution over next states.
        """
        raise NotImplementedError("Override me")

    def sample_next_state(self, state, action, rng=np.random):
        """Draw one next state from the transition distribution.

        Subclasses may override this when the explicit distribution is too
        large to enumerate.

        Args:
            state: A state from the state space.
            action: An action from the action space.
            rng: A random number generator.

        Returns:
            next_state: A sampled next state from the state space.
        """
        distribution = self.get_transition_distribution(state, action)
        # Split the (state, probability) pairs into two parallel tuples.
        candidates, weights = zip(*distribution.items())
        chosen = rng.choice(len(candidates), p=weights)
        return candidates[chosen]


class POMDP(MDP):
    """A partially observable Markov decision process (POMDP)."""

    @property
    @abc.abstractmethod
    def observation_space(self):
        """Representation of the POMDP observation space.
        """
        raise NotImplementedError("Override me")

    # This is an ordinary instance method (it takes `self`), so it is marked
    # with `abstractmethod`; `abstractclassmethod` is deprecated and would
    # turn it into a classmethod.
    @abc.abstractmethod
    def get_observation_distribution(self, next_state, action):
        """Return a distribution over the observations.

        The form of this distribution will vary, e.g., depending
        on whether the MDP has discrete or continuous observation
        spaces.

        Args:
            next_state: The next state.
            action: The action taken.

        Returns:
            observation_distribution: Distribution over the observation.
        """
        raise NotImplementedError("Override me")


class LambdaMDP(MDP):
    """An MDP assembled from plain values and callables.

    The constructor receives the state set, the action set and the functions
    that define the dynamics; see its documentation for the expected shapes.
    """

    def __init__(self, state_space, action_space, state_is_terminal_fn, get_reward_fn, get_transition_distribution_fn, temporal_discount_factor=1.0):
        """
        Build an MDP out of a set of function definitions.

        Args:
            state_space: The set of possible states.
            action_space: The set of possible actions.
            state_is_terminal_fn: A callable state_is_terminal_fn(state) -> bool
                telling whether a state is terminal.
            get_reward_fn: A callable get_reward_fn(state, action, next_state) -> float
                giving the reward of a (s, a, s') triple.
            get_transition_distribution_fn: A callable
                get_transition_distribution_fn(state, action) -> distribution of the next state.
                The returned distribution must be discrete.
            temporal_discount_factor: A float, the temporal discount factor of the MDP.
        """
        super().__init__()
        # Values exposed through the read-only properties below.
        self.state_space_v = state_space
        self.action_space_v = action_space
        self.temporal_discount_factor_v = temporal_discount_factor
        # Callables stored under the method names of the MDP interface, so
        # that attribute lookup on the instance finds them.
        self.state_is_terminal = state_is_terminal_fn
        self.get_reward = get_reward_fn
        self.get_transition_distribution = get_transition_distribution_fn

    @property
    def state_space(self):
        """The state set passed to the constructor."""
        return self.state_space_v

    @property
    def action_space(self):
        """The action set passed to the constructor."""
        return self.action_space_v

    @property
    def temporal_discount_factor(self):
        """The discount factor passed to the constructor."""
        return self.temporal_discount_factor_v


class DiscreteDistribution(object):
    """A discrete distribution, represented as a dictionary."""

    eps = 1e-6

    def __init__(self, prob_dict):
        """Wrap a probability dictionary as a discrete distribution.

        The dictionary may be "sparse": entries that are left out are
        treated as having zero probability.

        Even when the random variable lives in a continuous space (e.g. the
        real numbers), a "discrete distribution" can be defined on it: a
        distribution that puts mass on finitely many points only, such as
        {0: 0.5, 1: 0.5} on R. Every value missing from prob_dict implicitly
        has zero probability.

        Example:

        ```
        p = DiscreteDistribution({'x': 0.0, 'y': 0.6, 'z': 0.4})
        print(p.p('x'))  # 0.0
        for x in p:  # iterate over the set of possible values.
            print(x, p.p(x))  # should print y 0.6 z 0.4
        for x, p_x in p.items():  # just like iterating over a Python dict.
            print(x, p_x)  # should print y 0.6 z 0.4
        ```
        Zero-probability values are skipped during iteration.

        Args:
            prob_dict: A dictionary mapping elements of the domain to a
                float. It may be sparse and should sum up to one (so that
                it is a valid distribution).
        """
        self.prob_dict = prob_dict

    def __iter__(self):
        """Iterate over the support set."""
        for value in self.support():
            yield value

    def support(self):
        """Iterate over the support set of the distribution, i.e. the
        values with a non-zero probability mass.
        """
        for value, _ in self.items():
            yield value

    def items(self):
        """Iterate over the distribution as (x, p(x)) pairs, skipping the
        zero-probability entries of prob_dict.
        """
        for value, mass in self.prob_dict.items():
            if not v_is_positive(mass):
                continue
            yield value, mass

    def p(self, value):
        """Evaluate the probability of a value.

        Args:
            value: An object in the domain of the distribution.

        Returns:
            p: A float, p(value). Values that are not in prob_dict have
                probability zero.
        """
        return self.prob_dict.get(value, 0.)

    def renormalize(self):
        """Rescale the probabilities so that they sum up to 1.

        Returns:
            self
        """
        total = sum(self.prob_dict.values())
        assert total > 0, 'Degenerated probability distribution.'
        rescaled = {}
        for value, mass in self.prob_dict.items():
            rescaled[value] = mass / total
        self.prob_dict = rescaled
        return self

    def check_normalization(self):
        """Check that the prob dict is normalized (sums up to 1 within eps)."""
        tolerance = type(self).eps
        total = sum(self.prob_dict.values())
        assert 1 - tolerance < total < 1 + tolerance

    def max(self):
        """Return argmax_x p(x).

        Returns:
            arg_max: The most probable object; ties are broken by the
                larger object.
        """
        best_value, _ = max(self.prob_dict.items(), key=lambda pair: (pair[1], pair[0]))
        return best_value

    def draw(self, rng=None):
        """Sample one value; uses np.random when no rng is given."""
        generator = np.random if rng is None else rng
        values = list(self.prob_dict)
        masses = list(self.prob_dict.values())
        index = generator.choice(len(values), p=masses)
        return values[index]

    def __str__(self):
        return str(self.prob_dict)

    def as_tuple(self):
        """Return the support as a tuple of (p(x), x) pairs, sorted by x."""
        ordered = sorted(self.support())
        return tuple((self.prob_dict.get(value), value) for value in ordered)

    def __lt__(self, other):
        return self.as_tuple() < other.as_tuple()

    def __gt__(self, other):
        return self.as_tuple() > other.as_tuple()


def v_is_positive(mass):
    """Return True when a probability mass belongs to the support."""
    return mass > 0


def OnehotDiscreteDistribution(obj):
    """Build a DiscreteDistribution that puts all of its mass on ``obj``."""
    point_mass = {obj: 1.0}
    return DiscreteDistribution(point_mass)


def UniformDiscreteDistribution(support):
    """Create a DiscreteDistribution that is uniform over ``support``.

    Every distinct object x in ``support`` gets p(x) = 1 / (number of distinct
    objects). Repeated entries are counted once, so the result always sums
    to 1, and ``support`` may be any iterable (including one-shot iterators).

    Args:
        support: An iterable of hashable objects.

    Returns:
        A DiscreteDistribution built from the uniform probability dict.
    """
    # dict.fromkeys drops duplicates while keeping first-occurrence order, and
    # materializes the iterable so its length can be taken safely.
    values = list(dict.fromkeys(support))
    return DiscreteDistribution({x: 1 / len(values) for x in values})

# Our RobotChargingPOMDP

class RobotChargingPOMDP(POMDP):
    """The robot-charging POMDP used throughout this homework.

    States are 0, 1, 2 plus the absorbing terminal state 'T'. Actions are
    look ('l0', 'l1', 'l2'), move ('m01', 'm12', 'm20'), charge ('c') and
    'nop'. Observations are 0 or 1.
    """

    # Default model parameters, used as the constructor defaults.
    DEF_MOVE_SUCCESS = 0.8
    DEF_OBS_IF_THERE = 0.9
    DEF_OBS_IF_NOT_THERE = 0.4
    DEF_C_MOVE = 0.5
    DEF_C_LOOK = 0.1
    DEF_GAMMA = 0.9

    def __init__(
        self,
        p_move_success=DEF_MOVE_SUCCESS,
        p_obs_if_there=DEF_OBS_IF_THERE,
        p_obs_if_not_there=DEF_OBS_IF_NOT_THERE,
        c_move=DEF_C_MOVE,
        c_look=DEF_C_LOOK,
        gamma=DEF_GAMMA
    ):
        """
        Create the Robot Charging POMDP.

        Args:
            p_move_success (float): probability that a move action succeeds.
            p_obs_if_there (float): probability of observing 1 when looking at a location with the charger.
            p_obs_if_not_there (float): probability of observing 1 when looking at a location without a charger.
            c_move (float): cost of a move action.
            c_look (float): cost of a look action.
            gamma (float): temporal discount factor.
        """
        super().__init__()
        self.gamma = gamma
        self.c_move, self.c_look = c_move, c_look
        self.p_move_success = p_move_success
        self.p_obs_if_there = p_obs_if_there
        self.p_obs_if_not_there = p_obs_if_not_there

    @property
    def state_space(self):
        """
        Three "normal" states: 0, 1, 2, indicating the position of the charger.
        One "terminal" state T, which is absorbing. Executing the charge
        action 'c' deterministically transitions to T.
        """
        return {0, 1, 2, 'T'}

    @property
    def action_space(self):
        """The set of action names.

        lx:  look(x)
        mxy: move(start=x, target=y)
        c:   charge
        nop: no operation
        """
        return {'l0', 'l1', 'l2', 'm01', 'm12', 'm20', 'c', 'nop'}

    @property
    def observation_space(self):
        """The two possible observations, 0 and 1."""
        return {0, 1}

    @property
    def temporal_discount_factor(self):
        """The discount factor ``gamma`` given at construction."""
        return self.gamma

    def state_is_terminal(self, state):
        """Return True iff ``state`` is the terminal state 'T'."""
        return state == 'T'

    def get_reward(self, state, action, next_state=None):
        """Return the reward for taking ``action`` in ``state``.

        'nop' is free; 'c' yields 10 in state 0 and -100 otherwise; a move
        costs ``c_move``; any other action (a look) costs ``c_look``.
        ``next_state`` is not used.
        """
        if action == 'nop':
            return 0
        if action == 'c':
            return 10 if state == 0 else -100
        if action.startswith('m'):
            return -self.c_move
        # Remaining actions are the look actions.
        return -self.c_look

    def get_transition_distribution(self, state, action):
        """Return the distribution over next states for ``(state, action)``.

        Charging always leads to 'T'. A move 'mxy' taken in state x reaches
        y with probability ``p_move_success`` and stays in x otherwise. In
        every other case the state does not change.
        """
        if action == 'c':
            return OnehotDiscreteDistribution('T')
        if action.startswith('m'):
            start = int(action[1])
            target = int(action[2])
            if state == start:
                p_success = self.p_move_success
                return DiscreteDistribution({target : p_success, start : 1 - p_success})
        return OnehotDiscreteDistribution(state)

    def get_observation_distribution(self, next_state, action):
        """Return the distribution over observations after ``action`` lands in ``next_state``.

        Only look actions are informative; every other action yields
        observation 0 with probability 1.
        """
        if not action.startswith('l'):
            return OnehotDiscreteDistribution(0)
        looked_at = int(action[1])
        if next_state == looked_at:
            p_one = self.p_obs_if_there
        else:
            p_one = self.p_obs_if_not_there
        return DiscreteDistribution({0: 1 - p_one, 1: p_one})



def bellman_backup(s, V, mdp):
    """Perform a one-step Bellman backup for state ``s``.

    The mdp is assumed to be infinite or indefinite horizon (that is,
    mdp.horizon is inf); this is asserted.

    Args:
        s: A state.
        V: A dict, V[state] -> value.
        mdp: An MDP.

    Returns:
        vs: The best one-step lookahead value over all actions, or -inf when
            the action space is empty.
    """

    assert mdp.horizon == float("inf")

    def lookahead(action):
        # Expected immediate reward plus discounted value of the successor.
        expected = 0.
        for next_state, prob in mdp.get_transition_distribution(s, action).items():
            reward = mdp.get_reward(s, action, next_state)
            expected += prob * (reward + mdp.temporal_discount_factor * V[next_state])
        return expected

    best = -float("inf")
    for action in mdp.action_space:
        best = max(lookahead(action), best)
    return best


def value_iteration(mdp, max_num_iters=1000, change_threshold=1e-4):
    """Run value iteration for a certain number of iterations or until
    the max change between iterations is below a threshold.

    You can assume that the mdp is either infinite or indefinite
    horizon (that is, mdp.horizon is inf).

    Args:
        mdp: An MDP.
        max_num_iters: An int representing the maximum number of
            sweeps to run before giving up. With a value <= 0 no sweep is
            run and the all-zero initial values are returned.
        change_threshold: A float used to determine when value iteration
            has converged and it is safe to terminate.

    Returns:
        V:  A dict, V[state] -> value.
        it: The zero-based index of the last sweep that was run (0 when no
            sweep was run).
    """

    # Initialize V to all zeros
    V = {s: 0. for s in mdp.state_space}

    # Pre-bind the loop variable so the return below is well-defined even when
    # the loop body never executes (max_num_iters <= 0).
    it = 0
    for it in range(max_num_iters):
        next_V = {}
        max_change = 0.
        for s in mdp.state_space:
            if mdp.state_is_terminal(s):
                # Terminal states are pinned to zero value.
                next_V[s] = 0.
            else:
                next_V[s] = bellman_backup(s, V, mdp)
            max_change = max(abs(next_V[s] - V[s]), max_change)
        V = next_V
        if max_change < change_threshold:
            break
    return V, it


def qsa_from_vs(mdp, V):
    """Derive the action-value table Q(s, a) from a state-value table V(s).

    Args:
        mdp: An MDP.
        V: A dict, V[state] -> value. Typically, this is computed by value_iteration.

    Returns:
        Q: A dict, Q[state, action] -> value. Every action in a terminal
            state has value 0.
    """

    table = {}
    for state in mdp.state_space:
        terminal = mdp.state_is_terminal(state)
        for action in mdp.action_space:
            if terminal:
                table[state, action] = 0
                continue
            # One-step expectation of reward plus discounted successor value.
            expected = 0.
            for next_state, prob in mdp.get_transition_distribution(state, action).items():
                reward = mdp.get_reward(state, action, next_state)
                expected += prob * (reward + mdp.temporal_discount_factor * V[next_state])
            table[state, action] = expected
    return table


def expectimax_search(initial_state, mdp, horizon, return_Q=False):
    """Use expectimax search to determine a next action.

    Note that we're just computing the single next action to
    take, we do not need to store the entire partial V.

    Horizon is given as a separate argument so that we can use
    expectimax search with receding horizon control, for example,
    even if mdp.horizon is inf.

    The search is a plain recursion: V(s, h) is recomputed every time it is
    reached, nothing is memoized.

    Args:
        initial_state: A state in the mdp.
        mdp (MDP): An MDP.
        horizon (int): An int horizon.
        return_Q (bool): A boolean value. If true, the Q values at the root
            are returned in addition to the action.

    Returns:
        action: An action in the mdp. Among equally good actions, the first
            one in sorted order is returned.
        Q: A dict mapping each action to its Q value at the root state
            (only when return_Q is True).
    """
    # Short aliases for the pieces of the MDP used by the recursion below.
    # Actions are sorted so that the iteration order is deterministic.
    A = sorted(mdp.action_space)
    R = mdp.get_reward
    P = mdp.get_transition_distribution
    gm = mdp.temporal_discount_factor
    ts = mdp.state_is_terminal

    # V(s, h): best achievable value from state s at depth h. The recursion
    # bottoms out with value 0 at the horizon or at a terminal state.
    def V(s, h):
        if h == horizon or ts(s):
            return 0
        return max(Q(s, a, h) for a in A)

    # Q(s, a, h): expected reward of taking a in s at depth h, plus the
    # discounted value of the resulting next state.
    def Q(s, a, h):
        psa = P(s, a)
        # psa is the distribution over next states; iterating it yields each
        # next state ns, and psa.p(ns) is its probability.
        return sum(psa.p(ns) * (R(s, a, ns) + gm * V(ns, h+1)) for ns in psa)

    Q_values = {a: Q(initial_state, a, 0) for a in A}
    if return_Q:
        return max(A, key=Q_values.get), Q_values
    return max(A, key=Q_values.get)


def transition_update(pomdp, belief, action):
    """Push a belief through the transition model: p(s') = sum_s p(s) p(s' | s, action).

    Args:
        pomdp (POMDP): A POMDP object.
        belief (DiscreteDistribution): A distribution over the current state s.
        action: The action to be executed.

    Returns:
        updated_belief (DiscreteDistribution): A distribution over the next state s'.
    """
    # Start every state of the POMDP at zero mass, then accumulate.
    next_mass = dict.fromkeys(pomdp.state_space, 0.0)
    for state in belief:
        weight = belief.p(state)
        successors = pomdp.get_transition_distribution(state, action)
        for next_state, prob in successors.items():
            next_mass[next_state] += prob * weight
    # The accumulated masses already sum to one in exact arithmetic; the
    # renormalization only guards against floating-point drift.
    return DiscreteDistribution(next_mass).renormalize()


def observation_update(pomdp, belief, action, observation):
    """Condition a predicted belief on an observation using Bayes' rule.

    p(s' | o, a) is proportional to p(s' | a) * p(o | s', a).

    Args:
        pomdp (POMDP): A POMDP object.
        belief (DiscreteDistribution): The distribution over the next state: p(s' | a).
            Typically, this is the output of the transition_update() function.
        action: The action taken.
        observation: The observation.

    Returns:
        posterior (DiscreteDistribution): The normalized belief over the next state s'.
    """
    weights = dict.fromkeys(pomdp.state_space, 0.0)
    for state in belief:
        prior = belief.p(state)
        likelihood = pomdp.get_observation_distribution(state, action).p(observation)
        weights[state] = prior * likelihood
    return DiscreteDistribution(weights).renormalize()


def belief_filter(pomdp, belief, action, observation):
    """Update a belief after taking an action and receiving an observation.

    The update runs in two stages:
        1. Prediction: from the current belief p(s) and the action a, compute
            p(s' | a) with transition_update.
        2. Correction: given the observation o, drawn from the POMDP's
            observation model p(o | s', a), compute p(s' | o, a) with
            observation_update (Bayes' rule).

    Args:
        pomdp (POMDP): A POMDP object.
        belief (DiscreteDistribution): The belief about the agent's current state.
        action: The action taken.
        observation: The observation.

    Returns:
        next_belief: The belief about the next state, taking into account
            both the action and the observation of this step.
    """
    predicted = transition_update(pomdp, belief, action)
    return observation_update(pomdp, predicted, action=action, observation=observation)



## Belief-Space MDP


### Question
In this section, you will implement a function `create_belief_mdp` that transforms a POMDP into a belief-space MDP.
    We have provided the basic skeleton for you. In particular, you only need to implement the `get_reward` and `get_transition_distribution`
    functions for the belief-space MDP.  You can use the function `belief_filter(pomdp, b, a, o)`, which is already defined; the implementation of that function is available in the colab.

For reference, our solution is **81** line(s) of code.

In [None]:
def create_belief_mdp(pomdp):
    """Wrap a POMDP as an MDP whose states are beliefs.

    Args:
        pomdp: The input POMDP object.

    Returns:
        belief_mdp: The constructed belief-space MDP (a LambdaMDP built from
            the functions defined below).
    """
    def state_is_terminal(belief):
        """Terminal test for the belief-space MDP.

        A belief is terminal iff every state with positive probability in it
        is a terminal state of the POMDP.

        Args:
            belief: A DiscreteDistribution of the state.

        Returns:
            is_terminal: Whether the current belief is a "terminal" belief.
        """
        return not any(
            p > 0 and not pomdp.state_is_terminal(state)
            for state, p in belief.items()
        )

    def get_reward(belief, action, next_belief=None):
        """Compute the expected reward function for the belief-space MDP.

        You only need to implement the case where the original reward function
        depends only on the state and the action (but not the next state).

        In this case, the reward function of the belief-space MDP is a function
        of belief and action only, not of next_belief.

        In general (where the reward function is a function of state, action,
        and next state), computing the expected reward also requires
        marginalizing over the next-state distribution (which is next_belief).

        Args:
            belief: A DiscreteDistribution of the state.
            action: An action.
            next_belief: A DiscreteDistribution of the next state. Should be ignored (see above).

        Returns:
            reward: the expected reward at this step.
        """
        raise NotImplementedError()

    def get_transition_distribution(belief, action):
        """Compute the transition distribution for an input belief and an action.

        The output is a distribution over beliefs, that is, a distribution
        over distributions. Because the observation space is finite, the set
        of possible next beliefs is finite as well, so a DiscreteDistribution
        can still represent the distribution over the next belief.

        Args:
            belief: A DiscreteDistribution of the state.
            action: An action.

        Returns:
            next_belief_distribution: A DiscreteDistribution over the possible
                next beliefs.
        """
        raise NotImplementedError()

    # Assemble the belief-space MDP from the functions defined above.
    return LambdaMDP(
        state_space=None,  # Not specified explicitly: the belief space is continuous.
        action_space=pomdp.action_space,
        state_is_terminal_fn=state_is_terminal,
        get_reward_fn=get_reward,
        get_transition_distribution_fn=get_transition_distribution,
        temporal_discount_factor=pomdp.temporal_discount_factor
    )

### Tests

In [None]:
def test1_create_belief_mdp():
    """Check the root action and Q values of a 4-step expectimax search on the belief MDP."""
    belief_mdp = create_belief_mdp(RobotChargingPOMDP())
    start_belief = DiscreteDistribution({0 : .4, 1: .3, 2: .3})
    best_action, q_values = expectimax_search(start_belief, belief_mdp, 4, return_Q=True)

    assert best_action == 'l1'
    expected = {'l0': -0.1, 'l1': 0.255914, 'l2': -0.1, 'm12': -0.471128, 'm01': -0.5, 'm20': -0.031586, 'c': -56.0, 'nop': 0.0}
    for action, value in expected.items():
        assert action in q_values and np.allclose(value, q_values[action])

test1_create_belief_mdp()

print('Tests passed.')

## Receding Horizon Control


### Utilities

A simple implementation of the Receding Horizon Control (RHC).
**Note**: these imports and functions are available in catsoop. You do not need to copy them in.

In [None]:
def receding_horizon_control(pomdp, h=4, search_algo=expectimax_search):
    """Receding Horizon Control (RHC).

    Args:
        pomdp (POMDP): The input POMDP problem.
        h (int): The receding horizon.
        search_algo (Callable): The search routine, called as
            search_algo(belief, belief_mdp, horizon=h).

    Returns:
        policy (Callable): A function policy(belief) -> action, mapping from a belief state to the action to take.
    """

    # The belief-space MDP is built once and shared by all policy calls.
    planning_mdp = create_belief_mdp(pomdp)

    def policy(belief):
        """The RHC policy: run the search algorithm with the fixed horizon
            from the given belief and return what it returns.

        Args:
            belief (DiscreteDistribution): The current belief.

        Returns:
            action: The next action to take.
        """
        chosen = search_algo(belief, planning_mdp, horizon=h)
        return chosen

    return policy


def simulate(pomdp, initial_belief, policy, n=4, real_s=None):
    """Simulate a policy on a POMDP and print a trace of the execution.

    Specifically, the function commands the environment for at most n
    timesteps, stopping early when the real state becomes terminal.
    We will keep track of two variables.
        - robot_b, which is the current belief.
        - real_s: the "true" state of the world.

    The global NumPy random state is seeded with 0 on every call so that the
    execution is deterministic. Nothing is returned.

    Args:
        pomdp (POMDP): The input POMDP problem.
        initial_belief (DiscreteDistribution): a distribution of the state (the initial belief).
        policy (Callable): a function that maps a belief to the next action.
        n (int): The maximum number of simulation steps.
        real_s: The initial real_s, can be None, in which case it will be sampled from initial_belief.
    """

    import numpy.random as npr
    npr.seed(0)  # to determinize the execution.

    robot_b = initial_belief
    if real_s is None:
        real_s = robot_b.draw()

    print('Robot belief:', robot_b)
    print('Real state:', real_s)
    print('')

    for t in range(n):
        print('Step', t)
        # The policy picks the action from the robot's belief only.
        a = policy(robot_b)

        print('  Executing:', a)
        # The world evolves from the true state, which the robot cannot see.
        real_s = pomdp.get_transition_distribution(real_s, a).draw()
        print('  Real State:', real_s)
        if pomdp.state_is_terminal(real_s):
            print('Terminated.')
            break
        o = pomdp.get_observation_distribution(real_s, a).draw()
        print('  Observation:', o)
        # Fold the action and the observation into the robot's belief.
        robot_b = belief_filter(pomdp, robot_b, a, o)
        print('  Robot belief:', robot_b)

### Question
Use the RHC implementation provided in the colab and answer the questions in Catsoop.  **Note that the simulation function sets the random seed so that your results will be deterministic.**

You can run the following code to visualize the execution of your RHC-Expectimax policy.

In [None]:
# Demo: RHC with plain expectimax search on the robot-charging POMDP.
pomdp = RobotChargingPOMDP(gamma=0.9)
# Initial belief with most of the mass (0.9) on state 2.
b0 = DiscreteDistribution({0: 0.03, 1: 0.07, 2: 0.9})
# Plan with a receding horizon of 4.
policy = receding_horizon_control(pomdp, 4, search_algo=expectimax_search)
# Simulate up to 10 steps, with the true initial state fixed to 2.
simulate(pomdp, b0, policy=policy, n=10, real_s=2)

## Receding Horizon Control with Most-Likely-Observation


### Question
Complete the RHC-MLO implementation provided in the colab and answer the questions in Catsoop.
The only new requirement is to implement `expectimax_search_mlo`, which, instead of computing the
expected reward over all possible observations, considers only the most-likely observation.<br />
**HINT**: You only need very MINIMAL changes to the original `expectimax_search` code.<br />
**HINT**: You may find the `DiscreteDistribution.max()` function useful.

After implementing `expectimax_search_mlo`, use the following code snippets (in colab) to simulate RHC-MLO, and answer questions in Catsoop.
Specifically, we will use `gamma = 0.9` and initial belief $b = (0.4, 0.3, 0.3)$ (the same one we used in the Most Likely State problem).  Note that we are specifying the actual initial state to be 2.


For reference, our solution is **21** line(s) of code.

In addition to all the utilities defined at the top of the Colab notebook, the following functions are available in this question environment: `create_belief_mdp`. You may not need to use all of them.

In [None]:
def expectimax_search_mlo(initial_state, belief_mdp, horizon, return_Q=False):
    """Use expectimax search with most-likely observations to determine a next action.

    Note that we're just computing the single next action to
    take, we do not need to store the entire partial V.

    Horizon is given as a separate argument so that we can use
    expectimax search with receding horizon control, for example,
    even if belief_mdp.horizon is inf.

    The skeleton mirrors expectimax_search; only Q is left to implement.

    Args:
        initial_state: A state in the belief_mdp.
        belief_mdp: An MDP.
        horizon: An int horizon.
        return_Q: A boolean value. If true, the Q values at the root are
            returned in addition to the action.

    Returns:
        action: An action in the belief_mdp.
        Q: A dict mapping each action to its Q value at the root state
            (only when return_Q is True).
    """
    # Short aliases for the pieces of the MDP used by the recursion below.
    A = sorted(belief_mdp.action_space)
    R = belief_mdp.get_reward
    P = belief_mdp.get_transition_distribution
    gm = belief_mdp.temporal_discount_factor
    ts = belief_mdp.state_is_terminal

    # V(s, h): best Q value at depth h; 0 at the horizon or at a terminal state.
    def V(s, h):
        if h == horizon or ts(s):
            return 0
        return max(Q(s, a, h) for a in A)

    def Q(s, a, h):
        # TODO: Your code here.
        raise NotImplementedError()

    Q_values = {a: Q(initial_state, a, 0) for a in A}
    if return_Q:
        return max(A, key=Q_values.get), Q_values
    return max(A, key=Q_values.get)

### Tests

In [None]:
def test1_mlo():
    """Check the RHC-MLO policy (horizon 4) on three reference beliefs."""
    pomdp = RobotChargingPOMDP(gamma=0.9)
    policy = receding_horizon_control(pomdp, 4, search_algo=expectimax_search_mlo)

    cases = [
        ({0: 0.4, 1: 0.3, 2: 0.3}, 'm20'),
        ({0: 0.64, 1: 0.3, 2: 0.05999999999999998, 'T': 0.0}, 'l0'),
        ({0: 0.8, 1: 0.16666666666666666, 2: 0.03333333333333331}, 'l1'),
    ]
    for prob_dict, expected_action in cases:
        belief = DiscreteDistribution(prob_dict)
        assert policy(belief) == expected_action

test1_mlo()

print('Tests passed.')

You can run the following code to visualize the execution of your RHC-Expectimax-MLO policy.

In [None]:
# Demo: RHC with the most-likely-observation variant of expectimax search.
pomdp = RobotChargingPOMDP(gamma=0.9)
# Initial belief b = (0.4, 0.3, 0.3) over states 0, 1, 2.
b0 = DiscreteDistribution({0: 0.4, 1: 0.3, 2: 0.3})
# Plan with a receding horizon of 4.
policy = receding_horizon_control(pomdp, 4, search_algo=expectimax_search_mlo)
# Simulate up to 10 steps, with the true initial state fixed to 2.
simulate(pomdp, b0, policy=policy, n=10, real_s=2)

## QMDP (Optional)


### Question
Complete the QMDP implementation provided in the colab and answer the questions in catsoop. Here you can use the value function
computed by the `value_iteration` algorithm provided and focus on computing the policy.

For reference, our solution is **21** line(s) of code.

In [None]:
def qmdp(pomdp):
    """QMDP algorithm.
    This function takes a POMDP as input and outputs a policy function that maps a belief to the action to take.

    The value function and the Q values are computed once, when this function
    is called; the returned policy closes over them.

    Args:
        pomdp (POMDP): A POMDP.

    Returns:
        policy (Callable): A function policy(belief) -> action, mapping from a belief state to the action to take.
    """
    V, _ = value_iteration(pomdp)  # run value iteration on the underlying MDP.
    Q = qsa_from_vs(pomdp, V)   # constructs Q values from the value function; Q[state, action] -> value.

    def policy(belief):
        """The QMDP policy.

        Args:
            belief (DiscreteDistribution): The belief about the current state.

        Returns:
            action: The argmax action computed based on QMDP.
        """
        # TODO: Your code here.
        raise NotImplementedError()

    return policy

### Tests

In [None]:
def test1_qmdp():
    """Check the QMDP policy on three reference beliefs."""
    policy = qmdp(RobotChargingPOMDP())

    cases = [
        ({0: 0.999, 1: 0.001, 2: 0.0}, 'c'),
        ({0: 0.0, 1: 0.6, 2: 0.4}, 'm12'),
        ({0: 0.1, 1: 0.33, 2: 0.57}, 'm20'),
    ]
    for prob_dict, expected_action in cases:
        belief = DiscreteDistribution(prob_dict)
        assert policy(belief) == expected_action

test1_qmdp()

print('Tests passed.')

You can run the following code to visualize the execution of your QMDP policy (Hint: QMDP is not a good strategy for this case).

In [None]:
# Demo: the QMDP policy on the robot-charging POMDP.
pomdp = RobotChargingPOMDP(gamma=0.9)
# Initial belief with most of the mass (0.9) on state 2.
b0 = DiscreteDistribution({0: 0.03, 1: 0.07, 2: 0.9})
policy = qmdp(pomdp)
# Simulate up to 10 steps, with the true initial state fixed to 2.
simulate(pomdp, b0, policy=policy, n=10, real_s=2)