plotting_and_normalization.py

import numpy as np
from tqdm import tqdm

from mushroom_rl.algorithms.policy_search import REINFORCE
from mushroom_rl.approximators.parametric import LinearApproximator
from mushroom_rl.approximators.regressor import Regressor
from mushroom_rl.core import Core, Logger
from mushroom_rl.environments import LQR
from mushroom_rl.policy import StateStdGaussianPolicy
from mushroom_rl.rl_utils.optimizers import AdaptiveOptimizer
from mushroom_rl.rl_utils.preprocessors import MinMaxPreprocessor
from mushroom_rl.utils.callbacks import PlotDataset
"""
This script shows how to use preprocessors and plot callback.
"""
tqdm.monitor_interval = 0


def experiment(n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    logger = Logger('plot_and_norm_example', results_dir=None)
    logger.strong_line()
    logger.info('Plotting and normalization example')

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5., episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)
    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)
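    # Note: StateStdGaussianPolicy draws actions from a Gaussian whose mean
    # and standard deviation are both state-dependent; here each is a linear
    # function of the observation, with the std weights initialized to 2.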

    # Agent
    optimizer = AdaptiveOptimizer(eps=.01)
    algorithm_params = dict(optimizer=optimizer)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)
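    # Note: AdaptiveOptimizer adapts the size of each policy-gradient update
    # based on eps rather than applying a fixed learning rate (an assumption
    # about its step rule; see the MushroomRL docs for the exact formula).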

    # normalization preprocessor
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)
    agent.add_core_preprocessor(prepro)
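    # Once registered on the agent, the preprocessor is applied to every
    # observation before it reaches the policy; MinMaxPreprocessor rescales
    # observations into a fixed range using the observation-space bounds in
    # mdp_info (assumed behavior; see the MushroomRL preprocessor docs).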

    # plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)
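    # Passed to the Core as callback_step below, PlotDataset updates live
    # plots of the collected data at every step; obs_normalized=True tells
    # the plotter that observations arrive already normalized.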

    # Train
    core = Core(agent, mdp, callback_step=plotter)

    # training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        J = np.mean(dataset.discounted_return)
        logger.epoch_info(n + 1, J=J)


if __name__ == '__main__':
    experiment(n_epochs=10, n_iterations=10, ep_per_run=100)
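
# Running this script directly trains REINFORCE on the 2-dimensional LQR task
# for 10 epochs of 10 * 100 episodes each, logging the mean discounted return
# J after every epoch while the dataset plots update live.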