In [28]:
from abc import ABC, abstractmethod
from collections import defaultdict
import random
from typing import List, Dict, DefaultDict
from gym.spaces import Space
from gym.spaces.utils import flatdim
import numpy as np

In [None]:
def calculate_greedy_policy(policy, rewards, values, valid_actions, gamma=0.9):
    """Improve policy (take greedy actions) with respect to rewards and state values.

  Args:
      policy (np.array): Policy giving the probability of taking each action from each state.
      rewards (np.array): Rewards corresponding to reaching the next state from each state (so r(s,s') here rather than the usual r(s,a)).
      values (np.array): State values evaluated for current policy.
      gamma (float, optional): Discount factor. Defaults to 0.9.

  Returns:
      policy (np.array): Improved (greedy) policy.
  """

    num_states, num_actions = policy.shape # 2, 3
    greedy_policy = np.zeros_like(policy)
    state_action_values = np.nan*policy
    for action in range(num_actions):
        for state in range(num_states):
            if valid_actions[state, action]==1:
                state_action_values[state][action] = 0.0
                # With probability 0.9, next state = action with corresponding reward. With probability 0.1, next state = state with no reward.
                next_state_probabilities = {action:0.9, state:0.1}
                # Often the environment handles this next state computation (transition dynamics). But to use complete dynamic programming we must know 
                # the transition probablities.
        for next_state in next_state_probabilities.keys():
          # Note the expectation is now only over next state probabilities (environment dynamics) since we want the value for each valid state and action.
            state_action_values[state][action] += next_state_probabilities[next_state]*(rewards[state][next_state]+gamma*values[next_state])
            
    
    greedy_action = np.nanargmax(state_action_values[state]) # Argmax ignoring invalid actions with nan value
  
    for action in range(num_actions):
        greedy_policy[state][action] = 1 if action == greedy_action else 0

    print(f'State action values with previous policy:\n {state_action_values}\n')
    print(f'Greedy policy after policy improvement:\n {greedy_policy}')
    return greedy_policy

In [30]:
class Agent(ABC):
    """Base class for Q-Learning agent

    **ONLY CHANGE THE BODY OF THE act() FUNCTION**

    """

    def __init__(
        self,
        action_space: Space,
        obs_space: Space,
        gamma: float,
        epsilon: float,
        **kwargs
    ):
        """Constructor of base agent for Q-Learning

        Initializes basic variables of the Q-Learning agent
        namely the epsilon, learning rate and discount rate.

        :param action_space (int): action space of the environment
        :param obs_space (int): observation space of the environment
        :param gamma (float): discount factor (gamma)
        :param epsilon (float): epsilon for epsilon-greedy action selection

        :attr n_acts (int): number of actions
        :attr q_table (DefaultDict): table for Q-values mapping (OBS, ACT) pairs of observations
            and actions to respective Q-values
        """

        self.action_space = action_space
        self.obs_space = obs_space
        self.n_acts = flatdim(action_space)

        self.epsilon: float = epsilon
        self.gamma: float = gamma

        self.q_table: DefaultDict = defaultdict(lambda: 0)

    def act(self, obs: int) -> int:
        """Implement the epsilon-greedy action selection here

        :param obs (int): received observation representing the current environmental state
        :return (int): index of selected action
        """
        ### PUT YOUR CODE HERE ###
        act_vals = [self.q_table[(obs, act)] for act in range(self.n_acts)]
        max_val = max(act_vals)
        max_acts = [idx for idx, act_val in enumerate(act_vals) if act_val == max_val]

        final_return = 0
        if random.random() < self.epsilon:
            final_return = random.randint(0, self.n_acts - 1)
        else:
            final_return = random.choice(max_acts)
            
        return final_return
        
        ### RETURN AN ACTION HERE ###
#         return -1

    @abstractmethod
    def schedule_hyperparameters(self, timestep: int, max_timestep: int):
        """Updates the hyperparameters

        This function is called before every episode and allows you to schedule your
        hyperparameters.

        :param timestep (int): current timestep at the beginning of the episode
        :param max_timestep (int): maximum timesteps that the training loop will run for
        """
        ...

    @abstractmethod
    def learn(self):
        ...




In [53]:
class QLearningAgent(Agent):
    """Agent using the Q-Learning algorithm"""

    def __init__(self, alpha: float, **kwargs):
        """Constructor of QLearningAgent

        Initializes some variables of the Q-Learning agent, namely the epsilon, discount rate
        and learning rate alpha.

        :param alpha (float): learning rate alpha for Q-learning updates
        """

        super().__init__(**kwargs)
        self.alpha: float = alpha

    def learn(
        self, obs: int, action: int, reward: float, n_obs: int, done: bool
    ) -> float:
        """Updates the Q-table based on agent experience

        **YOU MUST IMPLEMENT THIS FUNCTION FOR Q2**

        :param obs (int): received observation representing the current environmental state
        :param action (int): index of applied action
        :param reward (float): received reward
        :param n_obs (int): received observation representing the next environmental state
        :param done (bool): flag indicating whether a terminal state has been reached
        :return (float): updated Q-value for current observation-action pair
        """
        ### PUT YOUR CODE HERE ###
#         raise NotImplementedError("Needed for Q2")
        def flatten(actions) :
            new_actions = [] 
            for action in actions :
                if type(action) == list :
                    new_actions += action
                elif type(action) == int :
                    new_actions.append(action)
            return new_actions

        def get_actions(possible_actions) :
            if len(possible_actions) == 1 :
                return possible_actions
            pairs = []
            for action in possible_actions[0]:
                for action2 in possible_actions[1]:
                    pairs.append(flatten([action, action2]))
            new_possible_actions = [pairs] + possible_actions[2 : ]
            possible_action_vectors = get_actions(new_possible_actions)
            return possible_action_vectors

        n_action = (flatdim(env.action_space))
        possible_actions = [list(range(k, (k + 1))) for k in range(n_action)]
        action_poss = get_actions(possible_actions)[0][0]

        max_Q = [self.q_table[(n_obs, a)] for a in action_poss]
        
        target_value = reward + self.gamma * (1 - done) * max(max_Q)
        self.q_table[(obs, action)] += self.alpha * (
            target_value - self.q_table[(obs, action)]
        )
        
        return self.q_table[(obs, action)]

    def schedule_hyperparameters(self, timestep: int, max_timestep: int):
        """Updates the hyperparameters

        **DO NOT CHANGE THE PROVIDED SCHEDULING WHEN TESTING PROVIDED HYPERPARAMETER PROFILES IN Q2**

        This function is called before every episode and allows you to schedule your
        hyperparameters.

        :param timestep (int): current timestep at the beginning of the episode
        :param max_timestep (int): maximum timesteps that the training loop will run for
        """
        self.epsilon = 1.0 - (min(1.0, timestep / (0.20 * max_timestep))) * 0.99


class MonteCarloAgent(Agent):
    """Agent using the Monte-Carlo algorithm for training"""

    def __init__(self, **kwargs):
        """Constructor of MonteCarloAgent

        Initializes some variables of the Monte-Carlo agent, namely epsilon,
        discount rate and an empty observation-action pair dictionary.

        :attr sa_counts (Dict[(Obs, Act), int]): dictionary to count occurrences observation-action pairs
        """
        super().__init__(**kwargs)
        self.sa_counts = {}

    def learn(
        self, obses: List[int], actions: List[int], rewards: List[float]
    ) -> Dict:
        """Updates the Q-table based on agent experience

        **YOU MUST IMPLEMENT THIS FUNCTION FOR Q2**

        :param obses (List(int)): list of received observations representing environmental states
            of trajectory (in the order they were encountered)
        :param actions (List[int]): list of indices of applied actions in trajectory (in the
            order they were applied)
        :param rewards (List[float]): list of received rewards during trajectory (in the order
            they were received)
        :return (Dict): A dictionary containing the updated Q-value of all the updated state-action pairs
            indexed by the state action pair.
        """
        updated_values = {}
#         print(obses, actions, rewards)

        N = len(obses)
        T = len(rewards)
            
        # generate all combinations
        all_pairs = []
        for i in range(N):
            for j in range(N):
                # ensure each pair is unique
                if [obses[i],actions[j]] not in all_pairs:
                    all_pairs.append([obses[i],actions[j]])

        returns_sa = np.zeros(T)
        
#         # what is the return????
#         for elem in all_pairs:
#             returns_sa[elem] = 
#             index = all_pairs.index(elem)
#             average = sum(rewards[0:index+1]) / len(rewards[0:index+1])
#             updated_values[index] = average        
            
        # what is the return????
        for elem in all_pairs:
            index = all_pairs.index(elem)
            G = 0
            G = sum(rewards[0:index])
            average = G / (index+1)
            updated_values[index] = average  
        
        ### PUT YOUR CODE HERE ###
#         raise NotImplementedError("Needed for Q2")
        return updated_values

    def schedule_hyperparameters(self, timestep: int, max_timestep: int):
        """Updates the hyperparameters

        **DO NOT CHANGE THE PROVIDED SCHEDULING WHEN TESTING PROVIDED HYPERPARAMETER PROFILES IN Q2**

        This function is called before every episode and allows you to schedule your
        hyperparameters.

        :param timestep (int): current timestep at the beginning of the episode
        :param max_timestep (int): maximum timesteps that the training loop will run for
        """
        self.epsilon = 1.0 - (min(1.0, timestep / (0.9 * max_timestep))) * 0.8

In [11]:
import gym
from tqdm import tqdm

from constants import EX2_QL_CONSTANTS as CONSTANTS
# from agents import QLearningAgent
from utils import evaluate

CONFIG = {
    "eval_freq": 1000, # keep this unchanged
    "alpha": 0.05,
    "epsilon": 0.9,
    "gamma": 0.99,
}
CONFIG.update(CONSTANTS)


def q_learning_eval(
        env,
        config,
        q_table,
        render=False,
        output=True):
    """
    Evaluate configuration of Q-learning on given environment when initialised with given Q-table

    :param env (gym.Env): environment to execute evaluation on
    :param config (Dict[str, float]): configuration dictionary containing hyperparameters
    :param q_table (Dict[(Obs, Act), float]): Q-table mapping observation-action to Q-values
    :param render (bool): flag whether evaluation runs should be rendered
    :param output (bool): flag whether mean evaluation performance should be printed
    :return (float, float): mean and standard deviation of returns received over episodes
    """
    eval_agent = QLearningAgent(
        action_space=env.action_space,
        obs_space=env.observation_space,
        gamma=config["gamma"],
        alpha=config["alpha"],
        epsilon=0.0,
    )
    eval_agent.q_table = q_table
    return evaluate(env, eval_agent, config["eval_eps_max_steps"], config["eval_episodes"], render)


def train(env, config, output=True):
    """
    Train and evaluate Q-Learning on given environment with provided hyperparameters

    :param env (gym.Env): environment to execute evaluation on
    :param config (Dict[str, float]): configuration dictionary containing hyperparameters
    :param output (bool): flag if mean evaluation results should be printed
    :return (float, List[float], List[float], Dict[(Obs, Act), float]):
        total reward over all episodes, list of means and standard deviations of evaluation
        returns, final Q-table
    """
    agent = QLearningAgent(
        action_space=env.action_space,
        obs_space=env.observation_space,
        gamma=config["gamma"],
        alpha=config["alpha"],
        epsilon=config["epsilon"],
    )

    step_counter = 0
    max_steps = config["total_eps"] * config["eps_max_steps"]

    total_reward = 0
    evaluation_return_means = []
    evaluation_negative_returns = []

    for eps_num in tqdm(range(1, config["total_eps"]+1)):
        obs = env.reset()
        episodic_return = 0
        t = 0

        while t < config["eps_max_steps"]:
            agent.schedule_hyperparameters(step_counter, max_steps)
            act = agent.act(obs)
            n_obs, reward, done, _ = env.step(act)
            agent.learn(obs, act, reward, n_obs, done)

            t += 1
            step_counter += 1
            episodic_return += reward

            if done:
                break

            obs = n_obs

        total_reward += episodic_return

        if eps_num > 0 and eps_num % config["eval_freq"] == 0:
            mean_return, negative_returns = q_learning_eval(env, config, agent.q_table)
            tqdm.write(f"EVALUATION: EP {eps_num} - MEAN RETURN {mean_return}")
            evaluation_return_means.append(mean_return)
            evaluation_negative_returns.append(negative_returns)

    return total_reward, evaluation_return_means, evaluation_negative_returns, agent.q_table


if __name__ == "__main__":
    env = gym.make(CONFIG["env"])
    total_reward, _, _, q_table = train(env, CONFIG)

 11%|█▏        | 1130/10000 [00:03<00:36, 244.82it/s]

EVALUATION: EP 1000 - MEAN RETURN -100.0


 21%|██        | 2071/10000 [00:05<00:38, 208.54it/s]

EVALUATION: EP 2000 - MEAN RETURN -99.32


 31%|███       | 3068/10000 [00:08<00:27, 248.09it/s]

EVALUATION: EP 3000 - MEAN RETURN -90.512


 43%|████▎     | 4269/10000 [00:09<00:09, 590.01it/s]

EVALUATION: EP 4000 - MEAN RETURN -58.076


 53%|█████▎    | 5330/10000 [00:11<00:05, 932.39it/s] 

EVALUATION: EP 5000 - MEAN RETURN -26.808


 64%|██████▍   | 6380/10000 [00:11<00:02, 1305.64it/s]

EVALUATION: EP 6000 - MEAN RETURN -3.332


 73%|███████▎  | 7304/10000 [00:12<00:02, 1223.88it/s]

EVALUATION: EP 7000 - MEAN RETURN 4.28


 84%|████████▍ | 8413/10000 [00:13<00:01, 1536.74it/s]

EVALUATION: EP 8000 - MEAN RETURN 6.136


 93%|█████████▎| 9333/10000 [00:13<00:00, 1357.80it/s]

EVALUATION: EP 9000 - MEAN RETURN 7.948


100%|██████████| 10000/10000 [00:14<00:00, 691.03it/s] 

EVALUATION: EP 10000 - MEAN RETURN 7.82





In [54]:
import gym

from constants import EX2_MC_CONSTANTS as CONSTANTS
# from agents import MonteCarloAgent
from utils import evaluate
from tqdm import tqdm

CONFIG = {
    "eval_freq": 5000, # keep this unchanged
    "epsilon": 0.9,
    "gamma": 0.99,
}
CONFIG.update(CONSTANTS)



def monte_carlo_eval(
        env,
        config,
        q_table,
        render=False):
    """
    Evaluate configuration of MC on given environment when initialised with given Q-table

    :param env (gym.Env): environment to execute evaluation on
    :param config (Dict[str, float]): configuration dictionary containing hyperparameters
    :param q_table (Dict[(Obs, Act), float]): Q-table mapping observation-action to Q-values
    :param render (bool): flag whether evaluation runs should be rendered
    :return (float, float): mean and standard deviation of returns received over episodes
    """
    eval_agent = MonteCarloAgent(
        action_space=env.action_space,
        obs_space=env.observation_space,
        gamma=CONFIG["gamma"],
        epsilon=0.0,
    )
    eval_agent.q_table = q_table
    return evaluate(env, eval_agent, config["eval_eps_max_steps"], config["eval_episodes"], render)


def train(env, config):
    """
    Train and evaluate MC on given environment with provided hyperparameters

    :param env (gym.Env): environment to execute evaluation on
    :param config (Dict[str, float]): configuration dictionary containing hyperparameters
    :return (float, List[float], List[float], Dict[(Obs, Act), float]):
        returns over all episodes, list of means and standard deviations of evaluation
        returns, final Q-table, final state-action counts
    """
    agent = MonteCarloAgent(
        action_space=env.action_space,
        obs_space=env.observation_space,
        gamma=config["gamma"],
        epsilon=config["epsilon"],
    )

    step_counter = 0
    max_steps = config["total_eps"] * config["eps_max_steps"]

    total_reward = 0
    evaluation_return_means = []
    evaluation_negative_returns = []

    for eps_num in tqdm(range(1, config["total_eps"] + 1)):
        obs = env.reset()

        t = 0
        episodic_return = 0

        obs_list, act_list, rew_list = [], [], []
        while t < config["eps_max_steps"]:
            agent.schedule_hyperparameters(step_counter, max_steps)
            ##################
            act = agent.act(obs)

            n_obs, reward, done, _ = env.step(act)

            obs_list.append(obs)
            rew_list.append(reward)
            act_list.append(act)

            t += 1
            step_counter += 1
            episodic_return += reward

            if done:
                break

            obs = n_obs

        agent.learn(obs_list, act_list, rew_list)
        total_reward += episodic_return

        if eps_num > 0 and eps_num % config["eval_freq"] == 0:
            mean_return, negative_returns = monte_carlo_eval(env, config, agent.q_table)
            tqdm.write(f"EVALUATION: EP {eps_num} - MEAN RETURN {mean_return}")
            evaluation_return_means.append(mean_return)
            evaluation_negative_returns.append(negative_returns)

    return total_reward, evaluation_return_means, evaluation_negative_returns, agent.q_table


if __name__ == "__main__":
    env = gym.make(CONFIG["env"])
    total_reward, _, _, q_table = train(env, CONFIG)

  5%|▌         | 5076/100000 [00:14<09:26, 167.45it/s]

EVALUATION: EP 5000 - MEAN RETURN -384.42


 10%|█         | 10105/100000 [00:28<09:11, 162.94it/s]

EVALUATION: EP 10000 - MEAN RETURN -391.756


 15%|█▌        | 15100/100000 [00:41<08:23, 168.47it/s]

EVALUATION: EP 15000 - MEAN RETURN -389.42


 20%|██        | 20068/100000 [00:55<09:07, 145.93it/s]

EVALUATION: EP 20000 - MEAN RETURN -392.32


 25%|██▌       | 25066/100000 [01:09<08:32, 146.29it/s]

EVALUATION: EP 25000 - MEAN RETURN -387.822


 30%|███       | 30094/100000 [01:23<06:51, 170.03it/s]

EVALUATION: EP 30000 - MEAN RETURN -388.746


 35%|███▌      | 35111/100000 [01:37<06:33, 164.87it/s]

EVALUATION: EP 35000 - MEAN RETURN -392.102


 40%|████      | 40082/100000 [01:51<05:59, 166.85it/s]

EVALUATION: EP 40000 - MEAN RETURN -390.736


 45%|████▌     | 45100/100000 [02:05<05:47, 157.89it/s]

EVALUATION: EP 45000 - MEAN RETURN -389.426


 50%|█████     | 50058/100000 [02:19<06:37, 125.76it/s]

EVALUATION: EP 50000 - MEAN RETURN -391.256


 55%|█████▌    | 55108/100000 [02:33<04:13, 176.99it/s]

EVALUATION: EP 55000 - MEAN RETURN -391.39


 60%|██████    | 60052/100000 [02:46<04:54, 135.48it/s]

EVALUATION: EP 60000 - MEAN RETURN -388.508


 65%|██████▌   | 65096/100000 [03:00<03:42, 156.98it/s]

EVALUATION: EP 65000 - MEAN RETURN -393.226


 70%|███████   | 70044/100000 [03:14<03:41, 135.31it/s]

EVALUATION: EP 70000 - MEAN RETURN -389.858


 75%|███████▌  | 75079/100000 [03:28<02:15, 183.44it/s]

EVALUATION: EP 75000 - MEAN RETURN -393.368


 80%|████████  | 80147/100000 [03:42<01:42, 192.99it/s]

EVALUATION: EP 80000 - MEAN RETURN -393.87


 85%|████████▌ | 85075/100000 [03:56<01:23, 178.10it/s]

EVALUATION: EP 85000 - MEAN RETURN -388.392


 90%|█████████ | 90078/100000 [04:09<01:08, 145.05it/s]

EVALUATION: EP 90000 - MEAN RETURN -387.368


 95%|█████████▌| 95109/100000 [04:23<00:27, 178.46it/s]

EVALUATION: EP 95000 - MEAN RETURN -389.514


100%|██████████| 100000/100000 [04:37<00:00, 360.79it/s]

EVALUATION: EP 100000 - MEAN RETURN -392.46



