In [1]:
%matplotlib inline

from pyvirtualdisplay import Display
from gym.wrappers import Monitor

import matplotlib
import numpy as np
import gym
import torch
from DDPG_Agent import DdpgAgent
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
import pyglet

import matplotlib.animation as animation
import matplotlib.pyplot as plt

gym.logger.set_level(40)
from collections import deque
from GymWrapper import ImgWrapper
from util import layer_init
from Noise import ActionNoise

env = None
is_image = False
if is_image:
    env = ImgWrapper(gym.make('Pendulum-v0'), gray_scale=True)
else:
    env = gym.make('Pendulum-v0')
#検証用にシードを固定する
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
env.seed(42)

num_episode = 5000  # 学習エピソード数（学習に時間がかかるので短めにしています）
memory_size = 100000  # replay bufferの大きさ
initial_memory_size = 10000  # 最初に貯めるランダムな遷移の数
# ログ用の設定
episode_rewards = []
num_average_epidodes = 10
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
writer = SummaryWriter(log_dir='./runs')
max_steps = env.spec.max_episode_steps  # エピソードの最大ステップ数
agent = DdpgAgent(env.observation_space,
                  env.action_space,
                  device,
                  batch_size = 64,
                  memory_size=memory_size,
                  writer=writer,
                  is_image = is_image)

# 最初にreplay bufferにランダムな行動をしたときのデータを入れる
state = env.reset()
for step in range(initial_memory_size):
    action = env.action_space.sample() # ランダムに行動を選択 
    next_state, reward, done, _ = env.step(action)
    agent.memory.add(state, action, reward, done)
    state = env.reset() if done else next_state
print('%d Data collected' % (initial_memory_size))

for episode in range(num_episode):
    state = env.reset()
    episode_reward = 0
    noise = ActionNoise(env.action_space.shape[0])
    for t in range(max_steps):
        action = agent.get_action(state, noise, t)  #  行動を選択
        next_state, reward, done, _ = env.step(action * env.action_space.high)
        episode_reward += reward
        agent.memory.add(state, action, reward, done)
        agent.update()  # actorとcriticを更新
        state = next_state
        if done:
            break
    episode_rewards.append(episode_reward)
    writer.add_scalar("reward", episode_reward, episode)
    if episode % 20 == 0:
        print("Episode %d finished | Episode reward %f" % (episode, episode_reward))
        sum_reward = 0.0
        for k in range(50):
            state = env.reset()
            done = False
            step = 0
            while not done and step < max_steps:
                step += 1
                action = agent.get_action(state)  #  行動を選択
                next_state, reward, done, _ = env.step(action * env.action_space.high)
                sum_reward += reward
                state = next_state
        print("Episode %d finished | Average reward %f" % (episode, sum_reward / 50))

# 累積報酬の移動平均を表示
moving_average = np.convolve(episode_rewards, np.ones(num_average_epidodes)/num_average_epidodes, mode='valid')
plt.plot(np.arange(len(moving_average)),moving_average)
plt.title('DDPG: average rewards in %d episodes' % num_average_epidodes)
plt.xlabel('episode')
plt.ylabel('rewards')
plt.show()

env.close()

10000 Data collected
Episode 0 finished | Episode reward -1534.367660
Episode 0 finished | Average reward -1246.041005
loss/critic 0.9218184947967529
loss/actor 12.491300582885742
loss/critic 0.6930734515190125
loss/actor 15.819133758544922
loss/critic 0.6767975687980652
loss/actor 20.09617042541504
loss/critic 1.2657431364059448
loss/actor 27.29192352294922
Episode 20 finished | Episode reward -1132.803531
Episode 20 finished | Average reward -1440.696501
loss/critic 0.4763389825820923
loss/actor 30.72571563720703
loss/critic 1.1937825679779053
loss/actor 38.67076873779297
loss/critic 0.8999608755111694
loss/actor 41.87420654296875
loss/critic 1.2929565906524658
loss/actor 46.78300476074219
Episode 40 finished | Episode reward -1466.030796
Episode 40 finished | Average reward -1412.640073
loss/critic 0.6344180703163147
loss/actor 52.198089599609375
loss/critic 1.0720306634902954
loss/actor 59.16401290893555
loss/critic 0.6462262868881226
loss/actor 59.24747848510742
loss/critic 2.1654

Episode 480 finished | Average reward -167.045454
loss/critic 1.8633116483688354
loss/actor 25.75960922241211
loss/critic 0.3595045208930969
loss/actor 18.07027816772461
loss/critic 0.42204660177230835
loss/actor 17.088184356689453
loss/critic 0.4920065402984619
loss/actor 19.111093521118164
Episode 500 finished | Episode reward -122.382928
Episode 500 finished | Average reward -157.760209
loss/critic 0.8735018968582153
loss/actor 11.40994930267334
loss/critic 0.666684627532959
loss/actor 19.620346069335938
loss/critic 0.6897068619728088
loss/actor 15.01485824584961
loss/critic 0.5286005139350891
loss/actor 8.325299263000488
Episode 520 finished | Episode reward -120.565080
Episode 520 finished | Average reward -150.653989
loss/critic 0.4266456663608551
loss/actor 18.75041961669922
loss/critic 0.524056077003479
loss/actor 11.819513320922852
loss/critic 0.6108249425888062
loss/actor 9.664100646972656
loss/critic 0.32520121335983276
loss/actor 5.1766767501831055
Episode 540 finished | Ep

Episode 960 finished | Episode reward -128.437784
Episode 960 finished | Average reward -199.721197
loss/critic 0.30768483877182007
loss/actor 1.8846532106399536
loss/critic 0.5321428179740906
loss/actor 6.563823699951172
loss/critic 0.32113543152809143
loss/actor 3.785130500793457
loss/critic 0.4256055951118469
loss/actor 3.290717840194702
Episode 980 finished | Episode reward -372.796312
Episode 980 finished | Average reward -132.055528
loss/critic 0.3380940854549408
loss/actor 3.3096935749053955
loss/critic 0.4138662815093994
loss/actor 2.5006422996520996
loss/critic 0.43100666999816895
loss/actor 1.2917671203613281
loss/critic 0.44916436076164246
loss/actor 3.5561985969543457
Episode 1000 finished | Episode reward -124.100458
Episode 1000 finished | Average reward -183.656364
loss/critic 0.4459836184978485
loss/actor 5.723635196685791
loss/critic 0.459855318069458
loss/actor 3.2910847663879395
loss/critic 0.35270410776138306
loss/actor 3.4542236328125
loss/critic 0.3075803518295288

KeyboardInterrupt: 