In [1]:
import gym
import numpy as np
from pathlib import Path

# !pip install Box2D
from stable_baselines3 import SAC
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor 

# Set up

In [2]:
# Make sure the folder that will hold saved models exists
directory_path = 'models'
Path(directory_path).mkdir(exist_ok=True, parents=True)

# Build the Gym environment used by the cells below
env_name = 'Pendulum-v0'
env = gym.make(env_name)

# Training budget in timesteps (short demo run; raise to 5_000_000 for a long run).
# The budget is also embedded in the saved model's file name.
num_steps = 5_000
model_file_name = Path(directory_path) / f'{env_name}_{num_steps}'

# Show the action space, then the observation space, one per line
print(env.action_space, env.observation_space, sep='\n')

Box([-2.], [2.], (1,), float32)
Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)


## Simulate with random actions

In [3]:
def simulate_random_actions(render=False, episodes=10, environment=None):
    """Play episodes with randomly sampled actions and print the mean score.

    Args:
        render: When True, call ``render()`` before every step.
        episodes: Number of complete episodes to play (must be >= 1).
        environment: Gym-style environment exposing ``reset``, ``step``,
            ``render``, ``close`` and ``action_space.sample``. When None, the
            notebook-level ``env`` is used.

    Raises:
        ValueError: If ``episodes`` is smaller than 1.

    Side effects:
        Prints the mean episode score and the number of episodes played.
        The environment is closed before returning (even if a step fails),
        so it has to be re-created before it can be used again.
    """
    if episodes < 1:
        raise ValueError(f"episodes must be >= 1, got {episodes}")
    if environment is None:
        environment = env  # fall back to the notebook-level environment

    all_rewards = []
    try:
        # range(episodes), not range(1, episodes): the latter played one
        # episode fewer than the count reported in the summary line below.
        for _ in range(episodes):
            environment.reset()  # start a fresh episode
            done = False
            score = 0  # undiscounted sum of rewards for this episode
            while not done:
                if render:
                    environment.render()
                random_action = environment.action_space.sample()
                _, reward, done, _ = environment.step(random_action)
                score += reward
            all_rewards.append(score)
        environment.reset()
    finally:
        # Always release the environment (and any render window).
        environment.close()
    print(f"Mean reward:{np.mean(all_rewards)} Num episodes:{episodes}")
# Run the random-action baseline once; render defaults to False, so nothing is drawn
simulate_random_actions()

Mean reward:-1162.129798711533 Num episodes:10


# Build and train a model

In [4]:
# Instantiate the agent.
# The factory builds its own environment, so it does not depend on the global
# `env` name that is rebound on this very line. Wrapping in Monitor records
# per-episode reward and length, which evaluate_policy expects in order to
# report unmodified episode statistics.
env = DummyVecEnv([lambda: Monitor(gym.make(env_name))])
model = SAC('MlpPolicy', env, train_freq=1, gradient_steps=2, verbose=0)

In [5]:
# Run training for the configured number of timesteps
model.learn(total_timesteps=num_steps)

<stable_baselines3.sac.sac.SAC at 0x7fa45182b4c0>

# Save and reload

In [6]:
# Save the trained agent under model_file_name (the path assembled in the set-up cell)
model.save(model_file_name)

## Load the trained agent

In [7]:
# Optional round-trip check (disabled): uncomment both lines to drop the
# in-memory agent and load it back from model_file_name, re-attached to env.
# del model
# model = SAC.load(model_file_name, env=env)

# Evaluate

In [8]:
# Score the agent over 10 evaluation episodes and report mean +/- std of the episode reward
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")



mean_reward:-169.69 +/- 97.87


In [9]:
# Watch the agent act greedily (deterministic=True) for one rendered episode.
obs = env.reset()
done = False
try:
    # NOTE(review): env is the DummyVecEnv created in the instantiation cell,
    # so `done` presumably comes back as a one-element array; `not done`
    # relies on there being exactly one sub-environment.
    while not done:
        env.render()
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, done, info = env.step(action)
    env.reset()
finally:
    # Close the environment even if rendering or stepping raised, so the
    # render window is not left open.
    env.close()

2022-05-11 15:11:11.305 Python[28006:905107] ApplePersistenceIgnoreState: Existing state will not be touched. New state will be written to (null)


## 