In [1]:
import gym
from pathlib import Path

# !pip install Box2D
from stable_baselines3 import DQN, PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor 

# Local imports
from utils import file_exists, evaluate_model, eval_env_random_actions

# 1. Set up

In [2]:
# --- Run configuration ---
env_name = 'LunarLander-v2'
num_steps = 250_000          # training budget, passed to model.learn as total_timesteps
directory_path = 'models'    # folder that holds the saved agents

# Make sure the output folder is there (no error if it already exists).
Path(directory_path).mkdir(parents=True, exist_ok=True)

# Saved-agent path: <directory_path>/<env_name>_<num_steps>
model_file_name = Path(directory_path) / f"{env_name}_{num_steps}"

# Environment instance used for the random-action baseline.
env = gym.make(env_name)

# Inspect the spaces. Action ids, as noted by the notebook author:
#   0 - do nothing
#   1 - fire left engine
#   2 - fire down engine
#   3 - fire right engine
print(env.action_space)
print(env.observation_space)

Discrete(4)
Box([-inf -inf -inf -inf -inf -inf -inf -inf], [inf inf inf inf inf inf inf inf], (8,), float32)


## 1.1 Test random actions

In [3]:
# Baseline before any training: hand the environment to the local `utils`
# helper with rendering switched off (presumably it samples random actions,
# per its name — its implementation is not shown in this notebook).
# NOTE(review): the recorded output lists episodes 1-9, yet the summary line
# reports "Num episodes: 10", and the printed mean equals the average of the
# nine listed scores — the helper's episode loop/count looks off by one;
# confirm in utils.
eval_env_random_actions(env, render=False)

Episode: 1
	Score: -175.5151312340881
Episode: 2
	Score: -116.07473452116533
Episode: 3
	Score: -45.38174852619592
Episode: 4
	Score: -133.72219742010205
Episode: 5
	Score: -80.77731764816896
Episode: 6
	Score: -160.156771603122
Episode: 7
	Score: -81.19053058611736
Episode: 8
	Score: -250.1056524695289
Episode: 9
	Score: -129.76576653781902


	Mean reward: -130.29887228292307 Num episodes: 10


# 2. Build and Train a model

In [4]:
# Instantiate the agent
# The factory builds its own environment instead of closing over the global
# `env`: that name is rebound to the DummyVecEnv on this very line, so a
# factory capturing it would hand back the vectorized wrapper if it were ever
# called again. Wrapping in Monitor records true episode returns/lengths,
# which is what evaluate_policy expects from an evaluation environment.
env = DummyVecEnv([lambda: Monitor(gym.make(env_name))])

# 'MlpPolicy' fits the 8-dimensional vector observation printed in the set-up
# cell; 'CnnPolicy' is the stable-baselines3 policy id for image observations.
model = PPO('MlpPolicy', env, verbose=0)

In [5]:
# Train the agent for `num_steps` timesteps.
# The model is built with verbose=0, so no training log is shown; with
# verbose=1 the log includes explained_variance, where values close to 1
# mean the value function predicts the returns well.
model.learn(total_timesteps=num_steps)

<stable_baselines3.ppo.ppo.PPO at 0x7fb5691043a0>

# 3. Save and reload

In [6]:
# Save the agent
# Persist the trained agent at `model_file_name` (models folder + environment
# id + step count, assembled in the set-up cell).
model.save(model_file_name)
# Drop the in-memory agent so the following load cell demonstrates that the
# saved file is enough to restore it.
del model  # delete trained model to demonstrate loading

## Load the trained agent

In [7]:
# Load the trained agent
# Restore the PPO agent from `model_file_name` and attach the current `env`.
# NOTE: if you have loading issues, you can pass `print_system_info=True`
# to compare the system on which the model was trained vs the current one, e.g.
# model = PPO.load(model_file_name, env=env, print_system_info=True)
model = PPO.load(model_file_name, env=env)

# 4. Evaluate

In [12]:
# Evaluate the agent
# Run the loaded policy for 10 evaluation episodes on `env`; the call returns
# the mean and the standard deviation of the per-episode reward.
mean_reward, std_reward = evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
)

# Report both figures rounded to two decimals.
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:50.98 +/- 113.14


In [11]:
# Enjoy trained agent
# Render the loaded policy acting deterministically for a fixed number of
# steps. The loop sits in try/finally so the render window is released even
# if rendering, prediction or stepping raises part-way through.
n_render_steps = 500

obs = env.reset()
try:
    for _step in range(n_render_steps):
        env.render()
        action, _states = model.predict(obs, deterministic=True)
        # `env` is a vectorized environment, which resets a finished
        # sub-environment on its own, so `dones` needs no handling here.
        obs, rewards, dones, info = env.step(action)
finally:
    # No extra reset before closing: its observation would never be used.
    env.close()

## 