# Q-Learning Tutorial: Discretizing a Continuous Space
(Provided by Sentdex - YouTube)

In [8]:
import gym 
import cv2 
import numpy as np 
import time 

In [9]:
# Show the ID of every environment currently registered with gym, so a valid
# name can be picked for gym.make() below.
list(gym.envs.registry.keys())

['CartPole-v0',
 'CartPole-v1',
 'MountainCar-v0',
 'MountainCarContinuous-v0',
 'Pendulum-v1',
 'Acrobot-v1',
 'LunarLander-v2',
 'LunarLanderContinuous-v2',
 'BipedalWalker-v3',
 'BipedalWalkerHardcore-v3',
 'CarRacing-v2',
 'Blackjack-v1',
 'FrozenLake-v1',
 'FrozenLake8x8-v1',
 'CliffWalking-v0',
 'Taxi-v3',
 'Reacher-v2',
 'Reacher-v4',
 'Pusher-v2',
 'Pusher-v4',
 'InvertedPendulum-v2',
 'InvertedPendulum-v4',
 'InvertedDoublePendulum-v2',
 'InvertedDoublePendulum-v4',
 'HalfCheetah-v2',
 'HalfCheetah-v3',
 'HalfCheetah-v4',
 'Hopper-v2',
 'Hopper-v3',
 'Hopper-v4',
 'Swimmer-v2',
 'Swimmer-v3',
 'Swimmer-v4',
 'Walker2d-v2',
 'Walker2d-v3',
 'Walker2d-v4',
 'Ant-v2',
 'Ant-v3',
 'Ant-v4',
 'Humanoid-v2',
 'Humanoid-v3',
 'Humanoid-v4',
 'HumanoidStandup-v2',
 'HumanoidStandup-v4']

In [10]:
# Create the Acrobot environment. render_mode='rgb_array' is requested because
# the training loop collects the values returned by env.render() and replays
# them as images through OpenCV.
env = gym.make("Acrobot-v1", render_mode='rgb_array')

In [11]:
class Discretizer():
    """Map continuous observations onto a fixed grid of integer bins.

    A Q-table over continuous data is infeasible because each observation
    component can take infinitely many values. This class splits every
    component of the observation vector into the same fixed number of
    equal-width intervals ("bins") between the bounds of the environment's
    observation space, so each observation can be turned into a tuple of
    bin indices.

    Note: the number of distinct discrete states is the product of the bin
    counts of all observation components.

    Attributes:
        observation_low: lower bound of every observation component.
        observation_interval_size: width of one bin per component.
        obs_bins_splits: list with the bin count of every component.
        np_obs_bin_max: largest valid bin index per component.
    """

    def __init__(self, env, default_bins_per_observation = 20) -> None:
        """Compute the bin layout from ``env.observation_space``.

        Args:
            env: environment whose ``observation_space`` exposes ``shape``,
                ``low`` and ``high``; the space must be one-dimensional.
            default_bins_per_observation: number of bins used for every
                observation component.

        Raises:
            AssertionError: if the observation space is not one-dimensional.
            ValueError: if the resulting bin widths are not finite and
                positive (unbounded or degenerate bounds, or a bin count
                that is not positive).
        """
        assert len(env.observation_space.shape) == 1, "Invalid shape, could not discretize"

        low = env.observation_space.low
        high = env.observation_space.high

        # One bin count per observation component
        # (e.g. two components -> [20, 20]).
        bins_per_component = [default_bins_per_observation] * len(high)

        # Width of a single bin for every component.
        interval_size = (high - low) / bins_per_component

        # Infinite, NaN or zero widths would make every later bin lookup
        # meaningless, so fail loudly here instead of silently mis-binning.
        if not np.all(np.isfinite(interval_size) & (interval_size > 0)):
            raise ValueError(
                "Cannot discretize: observation bounds must be finite with "
                "high > low, and the bin count must be positive"
            )

        # Keep the lower bound on the instance so that lookups do not depend
        # on whatever a global ``env`` happens to refer to at call time.
        self.observation_low = low
        self.observation_interval_size = interval_size
        self.obs_bins_splits = bins_per_component

        self.np_obs_bin_max = np.array(bins_per_component) - 1

    def get_discrete_state(self, state):
        """Return the bin indices of ``state`` as a tuple.

        Args:
            state: observation vector with one entry per component.

        Returns:
            Tuple of integer bin indices, usable directly as an index into
            a Q-table. Values outside the bounds are clamped to the first or
            last bin; a value exactly on the upper bound lands in the last
            bin.
        """
        # Offset from the lower bound, measured in bin widths.
        bin_position = (state - self.observation_low) / self.observation_interval_size
        bin_position = np.clip(bin_position, 0, self.np_obs_bin_max)
        return tuple(bin_position.astype(int))  # tuple so it can index the q_table


In [12]:
class QAgent():
    """Tabular Q-learning agent with greedy action selection.

    The agent keeps a Q-table indexed by (discrete state..., action). When a
    discretizer is supplied, observations are converted to bin-index tuples
    before they are used as table indices.
    """

    def __init__(self, env, discretizer=None) -> None:
        """Build the Q-table for ``env``.

        Args:
            env: environment exposing ``observation_space.shape`` and
                ``action_space.n``; it is stored and used by ``train``.
            discretizer: optional object exposing ``obs_bins_splits`` (list
                of bin counts) and ``get_discrete_state(state)``.
        """
        # Default state-space dimensions come straight from the environment.
        observation_space = list(env.observation_space.shape)

        # With a discretizer the table is sized by its bin counts instead.
        self.discretizer = discretizer
        if discretizer is not None:
            observation_space = discretizer.obs_bins_splits

        # Tracking variables.
        self.num_actions = env.action_space.n
        self.observation_space = observation_space

        # Q-table: one axis per state dimension plus a final action axis,
        # initialised with random values in [0, 2).
        self.q_table = np.random.uniform(low=0, high=2, size=(self.observation_space + [self.num_actions]))

        # Save the environment so training does not rely on a global.
        self.env = env

    def _to_index(self, state):
        """Convert an observation to the value used to index the Q-table."""
        if self.discretizer is not None:
            return self.discretizer.get_discrete_state(state)
        # NOTE(review): without a discretizer the raw observation is passed
        # through unchanged, as before; ``train`` concatenates this value
        # with a tuple, so it only works if the environment already returns
        # index tuples - confirm before relying on this path.
        return state

    def playback(self, frames, episode):
        """Show the recorded frames of one episode in an OpenCV window.

        Args:
            frames: list of RGB images; it is emptied by this call.
            episode: episode index, used in the window title.
        """
        window_title = "Last Play - {}".format(episode)
        try:
            for frame in frames:
                # OpenCV expects BGR channel order.
                cv2.imshow(window_title, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
                cv2.waitKey(30)
        finally:
            # Release the frames and close the window even when playback is
            # interrupted (e.g. by KeyboardInterrupt).
            frames.clear()
            cv2.destroyAllWindows()

    def train(self, episodes, dr=.95, lr=.001, play_every=100):
        """Run Q-learning on the stored environment.

        Args:
            episodes: number of episodes to run.
            dr: discount rate applied to the best future Q-value.
            lr: learning rate of the Q-value update.
            play_every: every ``play_every``-th episode (starting with the
                first) is announced, recorded and replayed via ``playback``.
        """
        # Use the environment given to the constructor, not a global.
        env = self.env

        for i in range(episodes):
            # NOTE(review): the same seed is passed on every reset, which
            # presumably makes every episode start from the same initial
            # state - confirm this is intended.
            state, _ = env.reset(seed=42)
            discrete_state = self._to_index(state)
            terminate = False
            truncate = False

            render = i % play_every == 0
            frames = []
            if render:
                print(f'Episode {i + 1}')

            while not terminate and not truncate:
                # Greedy action for the current state.
                action = np.argmax(self.q_table[discrete_state])
                new_state, reward, terminate, truncate, info = env.step(action)
                new_discrete_state = self._to_index(new_state)

                # No action is ever taken from a terminal state, so its table
                # entries keep their random initial values; the target must
                # not bootstrap from them. Truncation is only a time limit,
                # so the future estimate is still used there.
                if terminate:
                    max_future_q = 0.0
                else:
                    max_future_q = np.max(self.q_table[new_discrete_state])

                # Q-value of the action just taken in the current state.
                state_action = discrete_state + (action,)
                current_q = self.q_table[state_action]

                # Bellman equation
                new_q = (1 - lr) * current_q + lr * (reward + dr * max_future_q)

                # Update q_table
                self.q_table[state_action] = new_q

                # The new state becomes the current state.
                discrete_state = new_discrete_state

                if render:
                    frames.append(env.render())

            if render:
                self.playback(frames, i)


In [13]:
# Use 8 bins per observation component (instead of the default 20) and build
# the agent's Q-table from that bin layout.
dis = Discretizer(env, default_bins_per_observation=8)
agent = QAgent(env, dis)

In [14]:
# Train for 10000 episodes with learning rate 0.1; every 1000th episode
# (starting with the first) is recorded and replayed in a window.
agent.train(episodes=10000, lr=.1, play_every=1000)

Episode 1


KeyboardInterrupt: 