In [None]:
%matplotlib inline

from pyvirtualdisplay import Display
from gym.wrappers import Monitor

import matplotlib
import numpy as np
import gym
import torch
from TD3_Agent import Td3Agent
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
import pyglet

import matplotlib.animation as animation
import matplotlib.pyplot as plt

# Raise gym's logger threshold to 40 (gym.logger.ERROR) so that gym's
# warnings and info messages are not printed into the notebook output.
gym.logger.set_level(40)
from collections import deque
from GymWrapper import ImgWrapper
from util import layer_init
from Noise import ActionNoise

# Build the environment. With is_image=True the observations come from the
# project's ImgWrapper (grayscale); otherwise the plain Pendulum env is used.
SEED = 42
is_image = False
if is_image:
    env = ImgWrapper(gym.make('Pendulum-v0'), gray_scale=True)
else:
    env = gym.make('Pendulum-v0')

# Fix the seeds so that runs can be compared against each other.
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
env.seed(SEED)
# In the classic gym API the action space owns a separate random generator
# that env.seed() does not touch. The replay-buffer warm-up draws from
# env.action_space.sample(), so it has to be seeded explicitly as well.
env.action_space.seed(SEED)

# --- Logging ---------------------------------------------------------------
episode_rewards = []            # per-episode return, appended during training
num_average_epidodes = 10       # window length of the moving average plot
writer = SummaryWriter(log_dir='./runs')

# --- Training schedule -----------------------------------------------------
num_episode = 5000                       # number of training episodes
max_steps = env.spec.max_episode_steps   # maximum number of steps per episode

# --- Replay buffer ---------------------------------------------------------
memory_size = 100000            # capacity of the replay buffer
initial_memory_size = 10000     # random transitions collected before training

# --- Agent -----------------------------------------------------------------
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

agent = Td3Agent(
    env.observation_space,
    env.action_space,
    device,
    batch_size=64,
    memory_size=memory_size,
    writer=writer,
    is_image=is_image,
)

# Pre-fill the replay buffer with transitions produced by uniformly random
# actions, before any learning takes place.
state = env.reset()
for step in range(initial_memory_size):
    raw_action = env.action_space.sample()  # random action in environment units
    next_state, reward, done, _ = env.step(raw_action)
    # The training loop stores the agent's un-scaled action and multiplies it
    # by action_space.high only when stepping the environment. Store the random
    # action in those same normalised units so that every transition in the
    # buffer uses one action scale.
    action = raw_action / env.action_space.high
    agent.memory.add(state, action, reward, done)
    state = env.reset() if done else next_state
print('%d Data collected' % (initial_memory_size))

# Main training loop: one environment episode per iteration, with a parameter
# update after every environment step and a periodic evaluation pass.
EVAL_INTERVAL = 20   # run an evaluation pass every EVAL_INTERVAL training episodes
EVAL_EPISODES = 50   # number of rollouts averaged in one evaluation pass

for episode in range(num_episode):
    state = env.reset()
    episode_reward = 0
    for t in range(max_steps):
        action = agent.get_action(state)  # select an action
        # The agent's action is scaled by the action-space upper bound before
        # it is applied; the un-scaled action is what goes into the buffer.
        next_state, reward, done, _ = env.step(action * env.action_space.high)
        episode_reward += reward
        agent.memory.add(state, action, reward, done)
        agent.update()  # update the actor and the critic
        state = next_state
        if done:
            break
    episode_rewards.append(episode_reward)
    writer.add_scalar("reward", episode_reward, episode)

    if episode % EVAL_INTERVAL == 0:
        print("Episode %d finished | Episode reward %f" % (episode, episode_reward))
        sum_reward = 0.0
        for k in range(EVAL_EPISODES):
            state = env.reset()
            # `done` must be cleared for every rollout. If it is cleared only
            # once before this loop, the first finished rollout leaves it True,
            # the remaining rollouts execute zero steps, and the printed value
            # is a single return divided by EVAL_EPISODES.
            done = False
            step = 0
            while not done and step < max_steps:
                step += 1
                # NOTE(review): this uses the same get_action call as training;
                # if it adds exploration noise the evaluation is noisy too — confirm.
                action = agent.get_action(state)  # select an action
                next_state, reward, done, _ = env.step(action * env.action_space.high)
                sum_reward += reward
                state = next_state
        print("Episode %d finished | Average reward %f" % (episode, sum_reward / EVAL_EPISODES))

# Plot the moving average of the per-episode return over a window of
# num_average_epidodes episodes.
moving_average = np.convolve(
    episode_rewards,
    np.ones(num_average_epidodes) / num_average_epidodes,
    mode='valid',
)
plt.plot(np.arange(len(moving_average)), moving_average)
# The agent trained above is a Td3Agent, so label the figure accordingly.
plt.title('TD3: average rewards in %d episodes' % num_average_epidodes)
plt.xlabel('episode')
plt.ylabel('rewards')
plt.show()

# Flush and close the TensorBoard writer so that all logged scalars reach disk,
# then release the environment.
writer.close()
env.close()

10000 Data collected
Episode 0 finished | Episode reward -1652.418807
Episode 0 finished | Average reward -28.500828
loss/critic 0.035965412855148315
loss/actor 7.832014083862305
loss/critic 0.07680290937423706
loss/actor 10.49902629852295
