In [5]:
import gym
import time
import zlib
import torch
import pickle
import random
import numpy as np
import torch.optim as optim
import torch.nn.functional as F
from Agent import Agent
from Logger import Logger
from Observer import Observer
from ReplayBuffer import ReplayBuffer
from collections import namedtuple, deque
from gym.wrappers import AtariPreprocessing, Monitor

# 設定
BUFFER_SIZE = 1000000
BATCH_SIZE = 32
GAMMA = 0.99
INITIAL_EPS = 1.0
FINAL_EPS = 0.1
DECAY_EPS = 0.01
LEARNING_RATE = 0.00025
UPDATE_POLICY_FREQ = 4
UPDATE_TARGET_FREQ = 10000
REPLAY_START_SIZE = 50000
TOTAL_STEPS = 1000000
EVALUATION_FREQ = 10000
TOTAL_EVALUATION_STEPS = 5000
MAX_EVALUATION_STEPS = 108000
N_STEP = 3

seed = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

random.seed(seed)
torch.manual_seed(seed)

Transition = namedtuple("Transition", ("state", "action", "next_state", "reward", "expo"))

# 環境構築
ENV = "BreakoutNoFrameskip-v4"
env = gym.make(ENV)
env.seed(seed)
env = AtariPreprocessing(env, noop_max=30, frame_skip=4, screen_size=84, grayscale_obs=True)
n_actions = env.action_space.n
env = Observer(env=env, device=device, seed=seed)

# リプレイバッファ
replay_buffer = ReplayBuffer(capacity=BUFFER_SIZE, n_step=N_STEP, gamma=GAMMA, 
                             Transition=Transition, device=device)
logger = Logger(seed=seed)

# ネットワーク構築
h, w = 84, 84

policy_net = Agent(h=h, w=w, n_actions=n_actions, gamma=GAMMA, initial_eps=INITIAL_EPS, final_eps=FINAL_EPS, 
                   decay_eps=DECAY_EPS, replay_buffer=replay_buffer, seed=seed, batch_size=BATCH_SIZE, 
                   device=device).to(device)
target_net = Agent(h=h, w=w, n_actions=n_actions, gamma=GAMMA, initial_eps=INITIAL_EPS, final_eps=FINAL_EPS, 
                   decay_eps=DECAY_EPS, replay_buffer=replay_buffer, seed=seed, batch_size=BATCH_SIZE, 
                   device=device).to(device)

target_net.load_state_dict(policy_net.state_dict())
policy_net.train()
target_net.eval()
optimizer = optim.Adam(policy_net.parameters(), lr=LEARNING_RATE, eps=0.00015)

# 訓練開始
print(device)

env.reset()        
steps_done = 0
start = time.time()
best_score = -1000000000

while steps_done < TOTAL_STEPS:
            
    lives = 5
    done = False
    frames = deque([env.reset()] * 4, maxlen=4)
    state = torch.cat([frame for frame in frames], axis=1).to(device)
            
    while not done:
        
        action = policy_net.policy(state, True)
        next_state, reward, done, info = env.step(action)
        frames.append(next_state)
        next_state = torch.cat([frame for frame in frames], axis=1).to(device)
        
        if lives != info["lives"]:
            lives = info["lives"]
            replay_buffer.push((state, action, None, reward, torch.tensor(N_STEP)))
        else:
            replay_buffer.push((state, action, next_state, reward, torch.tensor(N_STEP)))

        state = next_state
        
        # ネットワーク更新
        if len(replay_buffer.buffer) > REPLAY_START_SIZE:
            if steps_done % UPDATE_POLICY_FREQ == 0:
                policy_net.update_network(target_net, optimizer)
            if steps_done % UPDATE_TARGET_FREQ == 0:
                target_net.load_state_dict(policy_net.state_dict())
                
        # 評価
        if steps_done % EVALUATION_FREQ == 0:
            policy_net.eval()
            eval_rewards = []
            eval_steps_done = 0 
            eval_env = gym.make(ENV)
            eval_env = AtariPreprocessing(eval_env, noop_max=30, frame_skip=4, screen_size=84, grayscale_obs=True)
            eval_env = Observer(eval_env, device=device, seed=seed+steps_done//EVALUATION_FREQ)
            while eval_steps_done < TOTAL_EVALUATION_STEPS:
                eval_done = False
                eval_episode_reward = 0
                eval_episode_steps_done = 0
                eval_frames = deque([eval_env.reset()] * 4, maxlen=4)
                eval_state = torch.cat([eval_frame for eval_frame in eval_frames], axis=1).to(device)
                while (not eval_done) and (eval_episode_steps_done < MAX_EVALUATION_STEPS):
                    eval_action = policy_net.policy(eval_state, False)
                    eval_next_state, eval_reward, eval_done, eval_info = eval_env.step(eval_action)
                    eval_frames.append(eval_next_state)
                    eval_state = torch.cat([eval_frame for eval_frame in eval_frames], axis=1).to(device)
                    eval_steps_done += 1
                    eval_episode_steps_done += 1
                    eval_episode_reward += eval_reward.item()
                eval_rewards.append(eval_episode_reward)
            eval_average_score = sum(eval_rewards) / len(eval_rewards)
            if best_score < eval_average_score:
                logger.save(policy_net, path_or_buf="logs/best_" + ENV + ".pkl")
                best_score = eval_average_score
            logger.write(sum(eval_rewards) / len(eval_rewards))
            eval_env.close()
            policy_net.train()
            print("{:.2f} % has done.".format(steps_done / TOTAL_STEPS * 100))
            
        steps_done += 1

logger.save(policy_net, path_or_buf="logs/" + ENV + ".pkl")
env.close()

cpu
0.00 % has done.
1.00 % has done.
2.00 % has done.


KeyboardInterrupt: 

In [2]:
import gym
import torch
from Logger import Logger
from Observer import Observer
from collections import namedtuple, deque
from gym.wrappers import AtariPreprocessing, Monitor

ENV = "BreakoutNoFrameskip-v4"
seed = 0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#Transition = namedtuple("Transition", ("state", "action", "next_state", "reward"))

# ビデオ録画
env = gym.make(ENV)
env = AtariPreprocessing(env, noop_max=0, frame_skip=4, screen_size=84, grayscale_obs=True)
env = Monitor(env, "Video", force=True)
env = Observer(env=env, device=device, seed=seed)

logger = Logger()
model = logger.load(path_or_buf="logs/breakout.pkl")
#print(model)

done = False
state = env.reset()
frames = deque([state] * 4, maxlen=4)
state = torch.cat([frame for frame in frames], axis=1).to(device)
while not done:
    action = model.policy(state)
    next_state, reward, done, info = env.step(action)
    frames.append(next_state)
    next_state = torch.cat([frame for frame in frames], axis=1).to(device)
    state = next_state
env.close()
print("Video has recorded")

Video has recorded


In [16]:
import torch
import torch.nn.functional as F

f = lambda x : x

states = ()
for i in range(9):
    if i % 3 == 2:
        states += (None,)
    else:
        states += (torch.tensor(i + 1, dtype=torch.float32).unsqueeze(0),)
print("states :", states)

non_final_mask = torch.tensor(tuple(map(lambda s : s is not None, states)), dtype=torch.bool)
print("non_final_mask :", non_final_mask)

non_final_next_states = torch.cat([s for s in states if s is not None])
print("non_final_next_states :", non_final_next_states)

state_action_values = torch.tensor([0 for i in range(9)])
state_action_values = state_action_values.T.unsqueeze(1)
print("state_action_values :", state_action_values)
next_state_values = torch.zeros(9)
next_state_values[non_final_mask] = f(non_final_next_states)
next_state_values = next_state_values.T.unsqueeze(1)
print("next_state_values :", next_state_values)
expected_state_action_values = 0.99 * next_state_values + 1
expected_state_action_values = expected_state_action_values.T
print("expected_state_action_values :", expected_state_action_values)
loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)
print(loss)

states : (tensor([1.]), tensor([2.]), None, tensor([4.]), tensor([5.]), None, tensor([7.]), tensor([8.]), None)
non_final_mask : tensor([ True,  True, False,  True,  True, False,  True,  True, False])
non_final_next_states : tensor([1., 2., 4., 5., 7., 8.])
state_action_values : tensor([[0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0]])
next_state_values : tensor([[1.],
        [2.],
        [0.],
        [4.],
        [5.],
        [0.],
        [7.],
        [8.],
        [0.]])
expected_state_action_values : tensor([[1.9900, 2.9800, 1.0000, 4.9600, 5.9500, 1.0000, 7.9300, 8.9200, 1.0000]])
tensor(3.4700)


  loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)


In [None]:
import gym
from gym.wrappers import AtariPreprocessing

env = gym.make("BreakoutNoFrameskip-v4")
env = AtariPreprocessing(env, noop_max=30, frame_skip=4, screen_size=84, 
                         terminal_on_life_loss=True, grayscale_obs=True)

state = env.reset()
print(env.unwrapped.get_action_meanings())
env.close()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from Logger import Logger

logs = pd.read_csv("logs/logs.csv")
x = range(len(logs["reward"]))
y = logs["reward"]

plt.plot(x, y, label="reward")
plt.legend()
plt.show()

logger = Logger("logs", "logs_2")

for i in range(len(logs) // 5):
    R = logs[i * 5 : i * 5 + 5]["reward"].sum()
    logger.write(R)
    
logs["reward"].sum()

In [10]:
from collections import deque

dq = deque([i for i in range(4)], maxlen=4)
print(len(dq), dq[-1])
dq.popleft()
print(len(dq), dq[-1])
dq.popleft()
print(len(dq), dq[-1])
dq.popleft()
print(len(dq), dq[0], dq[-1])

4 3
3 3
2 3
1 3 3


In [22]:
import gym

ENV = "BreakoutNoFrameskip-v4"
env = gym.make(ENV)

ans = 0
done = False
state = env.reset()

while not done:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
    ans += reward
print(ans)
env.close()

0.0


In [3]:
import torch
import numpy as np

a = torch.tensor(np.array(-2.0))
print(a)
a.clamp_(-1, 1)
print(a)

tensor(-2., dtype=torch.float64)
tensor(-1., dtype=torch.float64)
