In [27]:
import  gym 
import numpy as np 
from IPython.display import Video
import os
import imageio

In [28]:
# Gym environment id; also used to name the output folder and files.
ENV_NAME = "MountainCar-v0"

# Folder that receives everything this notebook writes (training videos, model file).
PATH = f'Models/QLearn/{ENV_NAME}/'
os.makedirs(PATH, exist_ok=True)  # no error if the folder already exists

# Target path for the saved model, inside PATH.
MODEL_FILE = os.path.join(PATH, ENV_NAME + ".pth")

In [29]:
# 'rgb_array' is used so that env.render() hands back frames we can collect into a video.
# (With the 'human' render mode you don't have to call env.render() yourself.)
env = gym.make(id=ENV_NAME, render_mode='rgb_array')

In [30]:
# Discretisation: the observation space is continuous, so each observation
# dimension is cut into BUCKET_SIZE equally wide intervals ("buckets").
BUCKET_SIZE = 20  # number of intervals per observation dimension

# Number of values in one observation (2 here: position and velocity).
NUM_OBSERVATIONS_PER_STATE = env.observation_space.shape[0]

# Bucket count for every observation dimension (ex: position - 20, velocity - 20).
UNIQUE_OBSERVATION = [BUCKET_SIZE for _ in range(NUM_OBSERVATIONS_PER_STATE)]

# Width of a single bucket along each dimension: (high - low) / bucket count.
observation_range = env.observation_space.high - env.observation_space.low
observation_interval_size = observation_range / UNIQUE_OBSERVATION

# Number of discrete actions the agent can choose from; here it can be read
# from the environment via `.n`, otherwise you have to know it beforehand.
ACTION_SPACE_SIZE = env.action_space.n

In [31]:
# Q-table: one axis per discretised observation dimension plus a final axis
# holding one Q-value per action. Entries start as uniform random values in [-2, 0).
q_table_shape = tuple(UNIQUE_OBSERVATION) + (ACTION_SPACE_SIZE,)
q_table = np.random.uniform(low=-2, high=0, size=q_table_shape)
print(q_table.shape)

(20, 20, 3)


In [32]:
# Q-learning hyperparameters
DISCOUNT_RATE = 0.95   # weight given to estimated future rewards
LEARNING_RATE = 0.1    # fraction of each new estimate blended into the stored Q-value
NUM_EPISODES = 25_000  # number of training episodes

# A training video is written every SAVE_VIDEO_EVERY episodes.
SAVE_VIDEO_EVERY = 5_000

mastered = 0  # not referenced by the cells shown here

In [33]:
def get_discrete_state(state, low=None, interval_size=None, bucket_counts=None):
    """Map a continuous observation to its tuple of Q-table bucket indices.

    Args:
        state: Array-like observation, one value per observation dimension.
        low: Lower bound of each dimension. Defaults to
            ``env.observation_space.low``.
        interval_size: Width of one bucket per dimension. Defaults to the
            notebook-level ``observation_interval_size``.
        bucket_counts: Number of buckets per dimension. Defaults to the
            notebook-level ``UNIQUE_OBSERVATION``.

    Returns:
        A tuple of integer bucket indices, each within ``[0, bucket_count - 1]``,
        so it can be used directly to index into ``q_table``.
    """
    if low is None:
        low = env.observation_space.low
    if interval_size is None:
        interval_size = observation_interval_size
    if bucket_counts is None:
        bucket_counts = UNIQUE_OBSERVATION

    # Offset from the lower bound, measured in bucket widths, truncated to an integer.
    bucket_indices = ((np.asarray(state) - low) / interval_size).astype(int)

    # An observation sitting exactly on the upper bound would otherwise land in
    # bucket `bucket_count` (one past the end of the table), and a value below
    # the lower bound could yield a negative index that silently wraps around.
    highest_bucket = np.asarray(bucket_counts) - 1
    return tuple(np.clip(bucket_indices, 0, highest_bucket))  # tuple so we can index into q_table

In [34]:

# training
for i in range(NUM_EPISODES):
    # Record a video on every SAVE_VIDEO_EVERY-th episode (episode 0 included).
    render = i % SAVE_VIDEO_EVERY == 0
    frames = []  # rendered frames of this episode; stays empty when not recording

    state, _ = env.reset()
    discrete_state = get_discrete_state(state)
    terminate = False
    truncate = False
    total_reward = 0

    while not terminate and not truncate:
        # Always act greedily with respect to the current Q-table.
        # NOTE(review): there is no epsilon-style exploration here; the only
        # source of variety is the random initialisation of q_table.
        action = np.argmax(q_table[discrete_state])
        new_state, reward, terminate, truncate, info = env.step(action)
        new_discrete_state = get_discrete_state(new_state)
        total_reward = total_reward + reward

        # Q-value of the action we just took in the state we took it from.
        current_q = q_table[discrete_state + (action,)]

        if terminate:
            # The episode really ended: there is no future to account for, so
            # the target is the immediate reward only. Bootstrapping here would
            # mix in arbitrary (randomly initialised) Q-values.
            target = reward
        else:
            # Normal step, or a time-limit truncation where the task itself
            # would continue: bootstrap from the best next-state Q-value.
            max_future_q = np.max(q_table[new_discrete_state])
            target = reward + DISCOUNT_RATE * max_future_q

        # Bellman update: move the stored value a LEARNING_RATE fraction toward the target.
        new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * target
        q_table[discrete_state + (action,)] = new_q

        # The new state becomes the current state for the next step.
        discrete_state = new_discrete_state

        if render:
            frames.append(env.render())

    if render:
        print(f'Episode : {i + 1}/{NUM_EPISODES} | Cumlative Reward : {total_reward}')
        video_path = os.path.join(PATH, f'{ENV_NAME}_train_video_ep_{i}.mp4')
        imageio.mimsave(video_path, frames, fps=20)

env.close()


  if not isinstance(terminated, (bool, np.bool8)):


Episode : 1/25000 | Cumlative Reward : -200.0




Episode : 5001/25000 | Cumlative Reward : -199.0




Episode : 10001/25000 | Cumlative Reward : -113.0




Episode : 15001/25000 | Cumlative Reward : -159.0




Episode : 20001/25000 | Cumlative Reward : -114.0


In [35]:
# Show the most recently saved training video (video_path is set by the
# training loop each time it writes a video).
last_training_video = Video(video_path)
last_training_video