In [12]:
import gym
import numpy as np
from pathlib import Path

# !pip install Box2D
from stable_baselines3 import DQN, PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor 

# https://www.gymlibrary.ml/environments/box2d/lunar_lander/

# Set up

In [13]:
# Create folder to save models
directory_path = 'models'
Path(directory_path).mkdir(parents=True, exist_ok=True)

# Create environment
env_name = 'LunarLander-v2'
env = gym.make(env_name)

num_steps = 1_000_000
model_file_name = Path(directory_path, env_name + '_' + str(num_steps))
print(env.action_space)
# 0- Do nothing
# 1- Fire left engine
# 2- Fire down engine
# 3- Fire right engine
print(env.observation_space)

Discrete(4)
Box(-inf, inf, (8,), float32)


## Simulate with random actions

In [14]:
def simulate_random_actions():
    episodes = 10
    all_rewards = []
    for episode in range(1, episodes):
        state = env.reset() # Restart the agent at the beginning
        done = False # If the agent has completed the level
        score = 0 # Called score not return cause it's python
        while not done:
            env.render()
            random_action = env.action_space.sample() # Do random actions
            state, reward, done, info = env.step(random_action) 
            score += reward
        print(score)
        all_rewards.append(score)
    env.reset()   
    env.close()
    print(f"Mean reward:{np.mean(all_rewards)} Num episodes:{episodes}")

    
simulate_random_actions()

-373.99599499172973
-220.12457561345704
-80.67169289337485
-110.90974397551449
-237.43484428571057
-145.65194554215913
-151.1645465025929
-118.54428424318786
-136.51127909573785
Mean reward:-175.00098968260716 Num episodes:10


# Build and Train a model

In [4]:
# Instantiate the agent
env = gym.make(env_name)
env = DummyVecEnv([lambda: env])
model = PPO(
    policy = 'MlpPolicy',
    env = env,
    n_steps = 1024,
    batch_size = 64,
    n_epochs = 4,
    gamma = 0.999,
    gae_lambda = 0.98,
    ent_coef = 0.01,
    verbose = 0)

In [5]:
# Train the agent
model.learn(total_timesteps = num_steps)

<stable_baselines3.ppo.ppo.PPO at 0x1057d1990>

# Save and reload

In [7]:
# Save the agent
model.save(model_file_name)

## Load the trained agent

In [4]:
# del model  # delete trained model to demonstrate loading
# # Load the trained agent
model = PPO.load(model_file_name, env=env)

# Evaluate

In [5]:
# Evaluate the agent
mean_reward, std_reward = evaluate_policy(model,  env , n_eval_episodes=10)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")



mean_reward:237.54 +/- 68.15


In [6]:
# Enjoy trained agent
env = gym.make(env_name)
obs = env.reset()
for _ in range(500):
    env.render()
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, info = env.step(action)
env.reset()   
env.close()

2023-04-04 16:08:56.528 Python[54347:5152463] ApplePersistenceIgnoreState: Existing state will not be touched. New state will be written to /var/folders/jt/2mhs0z991rj19zp1t8m4q9nw0000gq/T/org.python.python.savedState


## 