In [1]:
%matplotlib inline

from pyvirtualdisplay import Display
from gym.wrappers import Monitor

import matplotlib
import numpy as np
import gym
import torch
from PPG_Agent import PpgAgent
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
import pyglet

import matplotlib.animation as animation
import matplotlib.pyplot as plt

gym.logger.set_level(40)
from collections import deque
from GymWrapper import ImgWrapper
from util import layer_init
from Noise import ActionNoise

# Build the LunarLanderContinuous-v2 environment; when is_image is enabled the
# base environment is additionally wrapped in ImgWrapper with gray_scale=True.
is_image = False
env = gym.make('LunarLanderContinuous-v2')
if is_image:
    env = ImgWrapper(env, gray_scale=True)

# Fix every seed to one shared value so that runs can be compared for verification.
SEED = 26
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
env.seed(SEED)

# --- Training hyperparameters ---
num_episode = 2500    # number of training episodes (kept short because training is slow)
memory_size = 400000  # size of the replay buffer

# --- Logging ---
episode_rewards = []       # one cumulative reward per finished training episode
num_average_epidodes = 25  # window length of the moving average plotted after training
writer = SummaryWriter(log_dir='./runs')

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Maximum number of steps per episode, taken from the environment spec.
max_steps = env.spec.max_episode_steps

agent = PpgAgent(
    env.observation_space,
    env.action_space,
    device,
    memory_size=memory_size,
    writer=writer,
    is_image=is_image,
)

# Knobs of the loop below, named instead of being repeated as bare literals.
eval_interval = 100   # evaluate on every episode index divisible by this
eval_episodes = 50    # number of evaluation roll-outs per evaluation round
reward_scale = 100.0  # rewards are divided by this before being stored in memory

for episode in range(num_episode):
    state = env.reset()
    episode_reward = 0
    # Created fresh for every episode; nothing in this loop reads it afterwards.
    noise = ActionNoise(env.action_space.shape[0])
    final_step = max_steps - 1
    for t in range(max_steps):
        # Select an action for the current state.
        action, log_pis = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward
        # On the last permitted step the stored done flag is forced to False,
        # whatever the environment reported.
        done_masked = done if t != final_step else False
        agent.memory.add(state, action, reward / reward_scale, done_masked, log_pis)
        # Update the actor and the critic.
        agent.update(next_state)
        state = next_state
        if done:
            break

    episode_rewards.append(episode_reward)
    writer.add_scalar("reward", episode_reward, episode)
    # Per-episode console output is disabled; the reward is only sent to the writer.

    if episode % eval_interval != 0:
        continue

    # Periodic evaluation: persist the memory, then average the undiscounted
    # return over a fixed number of roll-outs in the same environment.
    agent.memory.save_memory()
    eval_total = 0.0
    for eval_idx in range(eval_episodes):
        state = env.reset()
        for eval_step in range(max_steps):
            # Select an action for the current state.
            action, log_pis = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            eval_total += reward
            state = next_state
            if done:
                break
    print("Episode %d finished | Average reward %f" % (episode, eval_total / eval_episodes))

# Plot the moving average of the per-episode cumulative reward.
#
# The averaging window is clamped to the number of recorded episodes:
#   * np.convolve raises ValueError when given an empty sequence, and
#   * with mode='valid' and fewer samples than the kernel length, every output
#     value is just the sum of all rewards divided by the kernel length, which
#     is not a moving average.
# Both cases occur when training is stopped early.
window = min(num_average_epidodes, len(episode_rewards))
if window > 0:
    moving_average = np.convolve(episode_rewards, np.ones(window) / window, mode='valid')
    plt.plot(np.arange(len(moving_average)), moving_average)
    plt.title('PPG: average rewards in %d episodes' % window)
    plt.xlabel('episode')
    plt.ylabel('rewards')
    plt.show()
else:
    # Nothing was recorded, so there is nothing to average or draw.
    moving_average = np.array([])
    print("No finished episodes to plot.")

# Flush and close the TensorBoard writer so pending events reach disk,
# then release the environment.
writer.close()
env.close()

Episode 0 finished | Average reward -233.710662
Episode 100 finished | Average reward -757.753038


KeyboardInterrupt: 