In [None]:
!pip install gym==0.21

In [None]:
import gym
import numpy as np
from itertools import product

## Introduction to Gym ##
Gym is a Python library that gathers many MDPs (https://www.gymlibrary.dev/). It provides an abstract MDP class with a `step()` function that lets an RL agent observe a tuple `(next_state, reward, is_terminal, info)`.

In [None]:
## How to instantiate an MDP with gym: the inverted pendulum example

env = gym.make("Pendulum-v1")
# env = gym.make("Acrobot-v1")
# env = gym.make("CartPole-v1")
# env = gym.make("LunarLander-v2")

try:
    # Initialize the MDP in a state.
    state = env.reset()

    for _ in range(500):
        # Take a random action in the MDP and observe next_s, r, done, _.
        next_state, reward, done, info = env.step(env.action_space.sample())
        # Rendering of the environment
        env.render()

        # When the episode ends, start a new one; otherwise the observed next state
        # becomes the current state for the following step.
        state = env.reset() if done else next_state
finally:
    # Always release the rendering window, even if the loop is interrupted or render() fails.
    env.close()

### Exercise 1
It is important to note that the Pendulum is a continuous MDP: its State space is a hypervolume in $\mathbb{R}^3$ and its Action space is a subset of $\mathbb{R}$.  
Read the gym documentation (https://www.gymlibrary.dev/api/spaces/) and print the State and Action spaces of the inverted Pendulum MDP. 

In [None]:
# Exercise 1 template: every `...` below is a placeholder to replace
# (the cell is not valid Python until `env. ...` is completed).
# TODO(exercise): pick the attribute of `env` that holds the State space.
state_space = env. ...
for i in range (state_space.shape[0]):
    # TODO(exercise): fill in the lower and upper bound of the i-th state feature.
    print("The "+str(i)+"th feature of the State space has values between "+str(...) + " and "+ str(...))
    
# TODO(exercise): pick the attribute of `env` that holds the Action space.
action_space = env. ...
# TODO(exercise): fill in the lower and upper bound of the torque.
print("The agent can actuate the pendulum engin with torque between " +str(...) + " and " + str(...))

### Exercise 2: Gym wrappers
It is possible to use wrappers for gym environments in order to change what the agent will observe or, for example, to get from a continuous Action space to a discrete one.  
For example, if the agent can take a continuous action between 0 and 1, one can write an action wrapper so that the agent instead chooses between 2 discrete actions: the first discrete action maps to the continuous action 0 and the second one maps to the continuous action 1.  
Read the documentation https://www.gymlibrary.dev/api/wrappers/ and complete the code for the DiscreteActionWrapper. 

In [None]:
class DiscreteActionWrapper(gym.Wrapper):
    """Expose a discrete set of actions on top of a continuous-action environment.

    The agent passes an index to `step`; the index is looked up in
    `discrete_action_to_continuous` and the resulting continuous action is
    forwarded to the wrapped environment.

    Args:
        env: the environment to wrap.
        number_of_discr_actions: requested number of discrete actions (stored as `self.n`).
    """

    def __init__(self, env: gym.Env, number_of_discr_actions = 2):
        super().__init__(env)
        self.n = number_of_discr_actions

        # TODO(exercise): build the lookup table. Entry i must be the continuous
        # action that discrete action i stands for (self.n entries in total).
        self.discrete_action_to_continuous = ...

        # Publish the discrete space as `action_space`: this is the attribute agents
        # (and `env.action_space.sample()`) read, so it must describe what `step` accepts.
        self.action_space = gym.spaces.Discrete(len(self.discrete_action_to_continuous))
        # Alias kept for code that already reads the previous attribute name.
        self.discrete_action_space = self.action_space

    def step(self, action):
        """Run one step of the wrapped env with the continuous action behind index `action`.

        Returns whatever the wrapped environment's `step` returns.
        """
        # the agent inputs a discrete action, we need to get the corresponding continuous action
        a = self.discrete_action_to_continuous[action]
        # we use the continuous action in the original continuous-action MDP to get an observation
        return self.env.step(a)
    
# Build the continuous Pendulum and show its action space, then wrap it and show it again.
env = gym.make("Pendulum-v1")
print(env.action_space)
env = DiscreteActionWrapper(env, number_of_discr_actions=2)
print(env.action_space)

# Report the torque behind each of the two discrete actions.
print("The first discrete action corresponds to a torque of: ", env.discrete_action_to_continuous[0], sep="")
print("The second discrete action corresponds to a torque of: ", env.discrete_action_to_continuous[1], sep="")

### We can use wrappers to change the State space of an MDP
For example, the State space of the Pendulum is cosine of the angle × sine of the angle × angular velocity, i.e. $(\cos\theta, \sin\theta, \dot\theta) \in \mathbb{R}^3$. But we can change it to angle × angular velocity, i.e. $(\theta, \dot\theta) \in \mathbb{R}^2$.

In [None]:
class AngleWrapper(gym.ObservationWrapper):
    """Turn the Pendulum observation (cos, sin, angular velocity) into (angle, angular velocity).

    The angle is recovered from the cosine/sine pair of the incoming observation,
    so it always lies in [-pi, pi] and matches the declared `observation_space`.
    Reading the environment's internal `.state` instead would not give that
    guarantee, because the raw angle is not wrapped back into [-pi, pi].

    Args:
        env: environment whose observations are laid out as
            [cos(angle), sin(angle), angular velocity].
    """

    def __init__(self, env):
        super().__init__(env)
        # The angle is in radian between -pi and pi, and the angular velocity is between -8 and 8.
        # Bounds are created directly in float32 (the Box dtype) so no precision-lowering cast is needed.
        low = np.array([-np.pi, -8], dtype=np.float32)
        high = np.array([np.pi, 8], dtype=np.float32)
        self.observation_space = gym.spaces.Box(low, high, dtype=np.float32)

    def observation(self, obs):
        """Convert one [cos, sin, angular velocity] observation to [angle, angular velocity]."""
        cos_theta, sin_theta, angular_velocity = obs
        # arctan2(sin, cos) returns the angle in [-pi, pi].
        theta = np.arctan2(sin_theta, cos_theta)
        return np.array([theta, angular_velocity], dtype=np.float32)
    
# Raw Pendulum: show its observation space and one initial observation.
env = gym.make("Pendulum-v1")
print("Continuous State space with 3 continuous features: ", env.observation_space)
initial_observation = env.reset()
print(initial_observation)

# Same environment seen through AngleWrapper.
env = AngleWrapper(env)
print("Continuous State space with 2 continuous features: ", env.observation_space)
initial_observation = env.reset()
print(initial_observation)

### Exercise 3: Grid wrapper for the Pendulum
Implement the GridWrapper for the Pendulum environment to get from a continuous State space to a discrete State space.

In [None]:
class GridWrapper(gym.ObservationWrapper):
    """Observation wrapper that exposes a discrete State space for a 2-feature environment.

    After construction, `observation_space` is `Discrete(grid_size * grid_size)`.
    The discretization itself is the subject of Exercise 3: the `...` placeholders
    below are still to be implemented.
    """
    def __init__(self, env, grid_size = 40):
        """Check the feature count, store the grid size and declare the discrete State space.

        Args:
            env: environment to wrap; the assertion below requires
                `observation_space.shape[0] == 2`.
            grid_size: stored as `self.grid_size`; the declared discrete space has
                `grid_size * grid_size` states.
        """
        super().__init__(env)
        assert self.observation_space.shape[0] == 2, "Environment does not have 2 continuous features."
        self.grid_size = grid_size
        
        # TODO(exercise): prepare whatever `observation` needs to discretize the features
        # (presumably grid_size bins per feature, given the Discrete size below — confirm).
        ...

        self.observation_space = gym.spaces.Discrete(self.grid_size * self.grid_size)

    def observation(self, obs):
        """Map the observation `obs` to a discrete state (to be implemented)."""
        # TODO(exercise): compute and return the discrete state for `obs`;
        # it should be a member of `self.observation_space`.
        ...
        return ...

### Homework  
1) Try Q-learning and Sarsa on the Pendulum with discrete states and discrete actions (try different epsilons, different taus, different grid sizes, ...).   
2) Plot the Q-functions of some policies.