In [1]:
import gym
import time
import zlib
import torch
import pickle
import random
import numpy as np
import torch.optim as optim
import torch.nn.functional as F
from Agent import Agent
from Logger import Logger
from Observer import Observer
from ReplayBuffer import ReplayBuffer
from collections import namedtuple, deque
from gym.wrappers import AtariPreprocessing, Monitor

# Settings
BUFFER_SIZE = 1000000        # capacity handed to ReplayBuffer
BATCH_SIZE = 32              # passed to both Agents; also scales the Adam epsilon
GAMMA = 0.99                 # passed to the Agents as `gamma`
INITIAL_EPS = 1.0            # passed to the Agents as `initial_eps`
FINAL_EPS = 0.1              # passed to the Agents as `final_eps`
DECAY_EPS = 0.01             # passed to the Agents as `decay_eps`
LEARNING_RATE = 0.00025      # Adam learning rate
UPDATE_POLICY_FREQ = 4       # call update_network every this many environment steps
UPDATE_TARGET_FREQ = 10000   # copy policy weights into the target net every this many steps
REPLAY_START_SIZE = 50000    # no updates until the buffer holds more than this many transitions
TOTAL_STEPS = 1000000        # environment-step budget for the training loop

seed = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every global RNG this notebook imports. NumPy is included so that any
# NumPy-based sampling in the helper modules is reproducible as well.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Record type stored in the replay buffer; the training loop stores None as
# next_state for transitions that ended with done=True.
Transition = namedtuple("Transition", ("state", "action", "next_state", "reward"))

# Environment construction
ENV = "BreakoutNoFrameskip-v4"

# Options for gym's Atari preprocessing wrapper (frame skipping, 84x84
# grayscale frames, random no-ops on reset, done reported on life loss).
atari_options = dict(
    noop_max=30,
    frame_skip=4,
    screen_size=84,
    terminal_on_life_loss=True,
    grayscale_obs=True,
)

env = gym.make(ENV)
env.seed(seed)  # seed the raw environment before any wrapper is applied
env = AtariPreprocessing(env, **atari_options)
n_actions = env.action_space.n  # read before the project-local Observer wraps the env
env = Observer(env=env, device=device, seed=seed)

# Replay buffer and reward logger
replay_buffer = ReplayBuffer(capacity=BUFFER_SIZE, Transition=Transition)
logger = Logger()

# Network construction
h, w = 84, 84

# Both networks are built from exactly the same constructor arguments.
agent_kwargs = dict(
    h=h,
    w=w,
    n_actions=n_actions,
    gamma=GAMMA,
    initial_eps=INITIAL_EPS,
    final_eps=FINAL_EPS,
    decay_eps=DECAY_EPS,
    replay_buffer=replay_buffer,
    seed=seed,
    batch_size=BATCH_SIZE,
    device=device,
)
policy_net = Agent(**agent_kwargs).to(device)
target_net = Agent(**agent_kwargs).to(device)

# Start the target network as an exact copy of the policy network; only the
# policy network is put in training mode and handed to the optimizer.
target_net.load_state_dict(policy_net.state_dict())
policy_net.train()
target_net.eval()
optimizer = optim.Adam(policy_net.parameters(), lr=LEARNING_RATE, eps=0.01/BATCH_SIZE)

# Start training
print(device)

steps_done = 0
start = time.time()
cnt = 0  # index of the next 10 % progress milestone to report

# Number of done signals that make up one game before the env is reset.
# NOTE(review): presumably the 5 lives of Breakout, since the environment is
# built with terminal_on_life_loss=True -- confirm.
LIVES_PER_GAME = 5

try:
    while steps_done < TOTAL_STEPS:

        lives = 0
        first_frame = env.reset()
        episode_reward = 0

        # Rolling window of the 4 most recent frames, pre-filled with the first one.
        frames = deque([first_frame] * 4, maxlen=4)
        state = torch.cat(list(frames), dim=1).to(device)

        # Also stop on the step budget so training does not overrun TOTAL_STEPS
        # while a game is still in progress.
        while lives < LIVES_PER_GAME and steps_done < TOTAL_STEPS:

            action = policy_net.policy(state)
            frame, reward, done, info = env.step(action)
            frames.append(frame)
            next_state = torch.cat(list(frames), dim=1).to(device)
            episode_reward += reward.item()

            if done:
                # Terminal transition: next_state is stored as None.
                replay_buffer.push((state, action, None, reward), Transition)
                logger.write(episode_reward)
                episode_reward = 0
                lives += 1
            else:
                replay_buffer.push((state, action, next_state, reward), Transition)

            # Carry the stacked frames forward instead of rebuilding them next step.
            state = next_state
            steps_done += 1

            # Learn only once the buffer holds more than REPLAY_START_SIZE transitions.
            if len(replay_buffer.buffer) > REPLAY_START_SIZE:
                if steps_done % UPDATE_POLICY_FREQ == 0:
                    policy_net.update_network(target_net, optimizer)
                if steps_done % UPDATE_TARGET_FREQ == 0:
                    target_net.load_state_dict(policy_net.state_dict())

            # Report elapsed wall-clock seconds at every 10 % of the step budget.
            if steps_done >= cnt * (TOTAL_STEPS // 10):
                print("{} % has done".format(cnt * 10), time.time() - start)
                cnt += 1

    # Save only after the loop has finished normally.
    logger.save(policy_net)
finally:
    # Release the environment even if training is interrupted or fails.
    env.close()

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\fooma\Anaconda3\envs\PyTorch\lib\site-packages\IPython\core\interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\fooma\AppData\Local\Temp/ipykernel_4528/2241326178.py", line 4, in <module>
    import torch
  File "C:\Users\fooma\Anaconda3\envs\PyTorch\lib\site-packages\torch\__init__.py", line 721, in <module>
    import torch.utils.data
  File "C:\Users\fooma\Anaconda3\envs\PyTorch\lib\site-packages\torch\utils\data\__init__.py", line 38, in <module>
    from torch.utils.data.dataloader_experimental import DataLoader2
  File "C:\Users\fooma\Anaconda3\envs\PyTorch\lib\site-packages\torch\utils\data\dataloader_experimental.py", line 11, in <module>
    from torch.utils.data.datapipes.iter import IterableWrapper
  File "C:\Users\fooma\Anaconda3\envs\PyTorch\lib\site-packages\torch\utils\data\datapipes\__init__.py", line 1, in <module>
    from . import iter
  File "C:\Users\

TypeError: object of type 'NoneType' has no len()

In [None]:
import gym
import torch
from collections import deque
from Logger import Logger
from Observer import Observer
from gym.wrappers import AtariPreprocessing, Monitor

ENV = "BreakoutNoFrameskip-v4"
seed = 0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Video recording
# The wrappers form ONE chain (Observer -> Monitor -> AtariPreprocessing -> env).
# Wrapping the same base env with Monitor and Observer separately and stepping
# both would apply every action twice and feed the policy Monitor's raw
# observation instead of Observer's output.
env = gym.make(ENV)
env = AtariPreprocessing(env, noop_max=0, frame_skip=4, screen_size=84, grayscale_obs=True)
obs = Monitor(env, "Video", force=True)
env = Observer(env=obs, device=device, seed=seed)
logger = Logger()
model = logger.load()
print(model)

try:
    # Feed the policy the same input the training loop uses: the 4 most recent
    # frames concatenated along dim 1.
    frame = env.reset()
    frames = deque([frame] * 4, maxlen=4)
    done = False
    while not done:
        state = torch.cat(list(frames), dim=1).to(device)
        action = model.policy(state)
        frame, reward, done, info = env.step(action)
        frames.append(frame)
finally:
    # Close the outer wrapper, then the Monitor explicitly so the recording is
    # finalized even if Observer.close() does not forward the call.
    env.close()
    obs.close()

In [None]:
import torch
import torch.nn.functional as F

# Toy walk-through of the DQN target/loss computation on a batch of 9
# transitions in which every third next state is terminal (None).


def f(x):
    """Stand-in for the target network: returns its input unchanged."""
    return x


states = tuple(
    None if i % 3 == 2 else torch.tensor(i + 1, dtype=torch.float32).unsqueeze(0)
    for i in range(9)
)
print("states :", states)

# True where a next state exists, False for terminal transitions.
non_final_mask = torch.tensor([s is not None for s in states], dtype=torch.bool)
print("non_final_mask :", non_final_mask)

non_final_next_states = torch.cat([s for s in states if s is not None])
print("non_final_next_states :", non_final_next_states)

# Float column vector of shape (9, 1); an integer tensor is not a valid input
# for the loss below.
state_action_values = torch.zeros(len(states), 1)
print("state_action_values :", state_action_values)

# Terminal transitions keep a next-state value of 0.
next_state_values = torch.zeros(len(states))
next_state_values[non_final_mask] = f(non_final_next_states)
next_state_values = next_state_values.unsqueeze(1)
print("next_state_values :", next_state_values)

# The target keeps the (9, 1) shape of the prediction. Transposing it to (1, 9)
# would make smooth_l1_loss broadcast both operands to (9, 9) and average over
# all 81 pairs instead of the 9 matching ones.
expected_state_action_values = 0.99 * next_state_values + 1
print("expected_state_action_values :", expected_state_action_values)
assert state_action_values.shape == expected_state_action_values.shape
loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)
print(loss)

In [None]:
import gym
from gym.wrappers import AtariPreprocessing

# Build the preprocessed Breakout environment, reset it once, and print the
# action names reported by the unwrapped environment.
preprocessing_options = dict(
    noop_max=30,
    frame_skip=4,
    screen_size=84,
    terminal_on_life_loss=True,
    grayscale_obs=True,
)
env = AtariPreprocessing(gym.make("BreakoutNoFrameskip-v4"), **preprocessing_options)

state = env.reset()
action_meanings = env.unwrapped.get_action_meanings()
print(action_meanings)
env.close()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from Logger import Logger

# Load the logged rewards and plot them against their row index.
logs = pd.read_csv("logs/logs.csv")
y = logs["reward"]
x = range(len(y))

fig, ax = plt.subplots()
ax.plot(x, y, label="reward")
ax.legend()
plt.show()

logger = Logger("logs", "logs_2")

# Sum the rewards over consecutive windows of 5 rows and write each sum to the
# second log; rows left over after the last full window are ignored.
full_windows = len(logs) // 5
for window_start in range(0, full_windows * 5, 5):
    R = y.iloc[window_start:window_start + 5].sum()
    logger.write(R)

# Total reward over all rows (shown as the cell's output).
logs["reward"].sum()