In [1]:
import gym
import numpy as np
from pathlib import Path

# !pip install Box2D
from stable_baselines3 import DQN, PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor 

# Set up

In [2]:
# Create folder to save models
directory_path = 'models'
Path(directory_path).mkdir(parents=True, exist_ok=True)

# Create environment
env_name = 'SpaceInvaders-v0'
env = gym.make(env_name)

num_steps = 500_000
model_file_name = Path(directory_path, env_name + '_' + str(num_steps))
print(env.action_space)
# Discrete(6)
print(env.observation_space)
# box

Discrete(6)
Box([[[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 ...

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]], [[[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 ...

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 

## Simulate with random actions

In [11]:
def simulate_random_actions(render=False):
    episodes = 10
    all_rewards = []
    for _ in range(1, episodes):
        state = env.reset() # Restart the agent at the beginning
        done = False # If the agent has completed the level
        score = 0 # Called score not return cause it's python
        while not done:
            if render:
                env.render()
            random_action = env.action_space.sample() # Do random actions
            _, reward, done, _ = env.step(random_action) 
            score += reward
        all_rewards.append(score)
        print(score)
    env.reset()   
    env.close()
    print(f"Mean reward:{np.mean(all_rewards)} Num episodes:{episodes}")

# Build and Train a model

In [4]:
# Instantiate the agent
env = gym.make(env_name)
env = Monitor(env)
env = DummyVecEnv([lambda: env])

model = PPO('MlpPolicy', env, verbose=0) # CNNPolicy

In [5]:
# Train the agent
model.learn(total_timesteps = num_steps)

<stable_baselines3.ppo.ppo.PPO at 0x7fc6c1c73d00>

# Save and reload

In [6]:
# Save the agent
model.save(model_file_name)

## Load the trained agent

In [7]:
# del model  # delete trained model to demonstrate loading
# # model = DQN.load("dqn_lunar", env=env, print_system_info=True)
# model = PPO.load(model_file_name, env=env)

# 4. Evaluate

In [10]:
# Evaluate the agent
mean_reward, std_reward = evaluate_policy(model,  env , n_eval_episodes=10)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:7.00 +/- 5.57


In [None]:
# Enjoy trained agent
obs = env.reset()
done = True
while done:
    env.render()
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, done, info = env.step(action)
env.reset()   
env.close()

# 