An AI that quickly learns to play Atari games using DQN (originally written for Pong; this notebook is configured for Breakout). Original code by Maxim Lapan, Packt Publishing.

In [31]:
from lib import wrappers
from lib import dqn_model

from gym.wrappers.monitoring.video_recorder import VideoRecorder
from gym.wrappers import Monitor
import argparse
import time
import numpy as np
import collections

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.tensorboard import SummaryWriter


# --- Environment / stopping criterion ---------------------------------------
# Environment id handed to wrappers.make_env() in the main script.
DEFAULT_ENV_NAME = "Breakout-v0"
# Training stops once the mean reward over the last 100 games reaches this.
MEAN_REWARD_BOUND = 440

# --- Replay buffer -----------------------------------------------------------
# Capacity of the ExperienceBuffer deque.
REPLAY_SIZE = 10_000
# No optimisation step is taken until the buffer holds this many transitions.
REPLAY_START_SIZE = 10_000
# Number of transitions drawn per optimisation step.
BATCH_SIZE = 32

# --- Optimisation ------------------------------------------------------------
# Discount factor applied to the next-state value when building the TD target.
GAMMA = 0.99
# Initial Adam learning rate.
LEARNING_RATE = 1e-4
# Lower bound (eta_min) of the cosine-annealing schedule.
LR_MIN = 1e-6
# Length of one cosine cycle (T_0) before a warm restart.
ITERATIONS_BEFORE_WR = 10_000
# NOTE(review): not referenced by any code visible in this file -- confirm
# whether it is still needed.
WEIGHT_DECAY = 1.0
# The target network is re-synced from the online network every N frames.
SYNC_TARGET_FRAMES = 1_000

# --- Exploration schedule ----------------------------------------------------
# Epsilon decreases linearly from EPSILON_START by frame_idx divided by
# EPSILON_DECAY_LAST_FRAME, and is clamped from below at EPSILON_FINAL.
EPSILON_START = 1.0
EPSILON_FINAL = 0.01
EPSILON_DECAY_LAST_FRAME = 150_000


# A single recorded environment transition. Field order matters: the replay
# buffer unzips stored tuples positionally into
# (state, action, reward, done, new_state).
Experience = collections.namedtuple(
    'Experience',
    ['state', 'action', 'reward', 'done', 'new_state'],
)


class ExperienceBuffer:
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Backed by a ``collections.deque`` with ``maxlen=capacity``, so once the
    buffer is full every append discards the oldest stored transition.
    """

    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        """Store one transition (a 5-item tuple) at the newest end."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns a 5-tuple of NumPy arrays
        ``(states, actions, rewards, dones, next_states)``; rewards are
        float32 and dones are uint8. Sampling is without replacement, so
        NumPy raises ``ValueError`` if ``batch_size`` exceeds the number of
        stored transitions.
        """
        picked = np.random.choice(len(self.buffer), batch_size,
                                  replace=False)
        # Transpose the list of transitions into one column per field.
        columns = list(zip(*(self.buffer[i] for i in picked)))
        states, actions, rewards, dones, next_states = columns

        state_arr = np.array(states)
        action_arr = np.array(actions)
        reward_arr = np.array(rewards, dtype=np.float32)
        done_arr = np.array(dones, dtype=np.uint8)
        next_state_arr = np.array(next_states)
        return state_arr, action_arr, reward_arr, done_arr, next_state_arr


class Agent:
    """Plays an environment epsilon-greedily and records every transition.

    The agent owns the current observation and the running episode reward.
    Each call to :meth:`play_step` performs exactly one environment step and
    appends the resulting ``Experience`` to ``exp_buffer``.
    """

    def __init__(self, env, exp_buffer):
        """
        Args:
            env: environment exposing ``reset()``, ``step(action)`` returning
                a 4-tuple ``(obs, reward, done, info)``, and
                ``action_space.sample()``.
            exp_buffer: object with an ``append(experience)`` method.
        """
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        """Begin a new episode: reset the environment and the reward tally."""
        # Use the agent's own environment rather than a module-level global,
        # so the class works no matter what (if anything) is named `env`
        # at module scope.
        self.state = self.env.reset()
        self.total_reward = 0.0

    @torch.no_grad()
    def play_step(self, net, epsilon=0.0, device="cpu"):
        """Take one epsilon-greedy step and store the transition.

        Args:
            net: callable mapping a batch of observations (a tensor) to
                per-action Q-values.
            epsilon: probability of picking a random action instead of the
                greedy one.
            device: device the observation tensor is moved to before it is
                passed to ``net``.

        Returns:
            The total reward of the episode if this step finished it,
            otherwise ``None``.
        """
        done_reward = None

        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            # Add a leading batch dimension. np.asarray is used instead of
            # np.array(..., copy=False) because the latter raises under
            # NumPy 2 whenever a copy is required, which is always the case
            # when wrapping the state in a list.
            state_a = np.asarray([self.state])
            state_v = torch.tensor(state_a).to(device)
            q_vals_v = net(state_v)
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        # do step in the environment
        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward

        exp = Experience(self.state, action, reward,
                         is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward


def calc_loss(batch, net, tgt_net, device="cpu", gamma=None):
    """Compute the one-step DQN mean-squared TD loss for a sampled batch.

    Args:
        batch: 5-tuple ``(states, actions, rewards, dones, next_states)`` of
            array-likes with matching first dimension.
        net: online network; gradients flow through its Q-value for the
            action that was taken.
        tgt_net: target network; evaluated under ``torch.no_grad()``.
        device: device all tensors are moved to.
        gamma: discount factor. ``None`` (the default) uses the module-level
            ``GAMMA`` constant.

    Returns:
        Scalar tensor: MSE between ``Q(s, a)`` and
        ``r + gamma * max_a' Q_tgt(s', a')``, where the bootstrap term is
        zeroed for transitions flagged as done.
    """
    if gamma is None:
        gamma = GAMMA
    states, actions, rewards, dones, next_states = batch

    # np.asarray avoids a copy when the input already is an ndarray and,
    # unlike np.array(..., copy=False), does not raise under NumPy 2 when a
    # copy turns out to be necessary.
    states_v = torch.tensor(np.asarray(states)).to(device)
    next_states_v = torch.tensor(np.asarray(next_states)).to(device)
    # gather() requires an int64 index; NumPy integer arrays may be int32
    # depending on platform and NumPy version, so cast explicitly.
    actions_v = torch.as_tensor(actions, dtype=torch.int64).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.as_tensor(dones, dtype=torch.bool).to(device)

    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        # Computed under no_grad, so the result is already detached.
        next_state_values = tgt_net(next_states_v).max(1)[0]
        # No bootstrapping from states that ended the episode.
        next_state_values[done_mask] = 0.0

    expected_state_action_values = next_state_values * gamma + rewards_v
    return nn.MSELoss()(state_action_values,
                        expected_state_action_values)


if __name__ == "__main__":
    # Training entry point. Everything stays at module scope (not wrapped in
    # a function) so that `env`, `net`, etc. remain module-level names, as
    # they were originally.
    import os

    # Prefer the GPU but fall back to the CPU instead of crashing when CUDA
    # is not available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    env = wrappers.make_env(DEFAULT_ENV_NAME)
    # Optional episode recording:
    # env = Monitor(env, "videos", force=True)

    writer = None
    try:
        net = dqn_model.DQN(env.observation_space.shape,
                            env.action_space.n).to(device)
        tgt_net = dqn_model.DQN(env.observation_space.shape,
                                env.action_space.n).to(device)
        epsilon = EPSILON_START

        # To continue from a saved checkpoint with little exploration:
        # state = torch.load('saves/PongNoFrameskip-v4-best_20.dat', map_location=lambda stg,_:stg)
        # net.load_state_dict(state)
        # tgt_net.load_state_dict(state)
        # epsilon = 0.02
        # EPSILON_DECAY_LAST_FRAME = 100
        # EPSILON_START = 0.01
        # EPSILON_FINAL = 0.01

        # Optional manual video capture:
        # video_recorder = VideoRecorder(env, 'videos/Recording-' + str(DEFAULT_ENV_NAME) + '.mp4', enabled=True)

        # device.type is "cuda" on a GPU run, which reproduces the original
        # "-cuda<env>" run suffix; on CPU the suffix reflects the real device.
        writer = SummaryWriter(comment="-" + device.type + DEFAULT_ENV_NAME)
        print(net)

        buffer = ExperienceBuffer(REPLAY_SIZE)
        agent = Agent(env, buffer)

        optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
        scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0=ITERATIONS_BEFORE_WR, eta_min=LR_MIN)

        # torch.save() does not create missing directories.
        os.makedirs("saves", exist_ok=True)

        total_rewards = []
        frame_idx = 0
        ts_frame = 0
        ts = time.time()
        start_time = ts
        best_m_reward = None
        while True:
            frame_idx += 1
            # Linear epsilon decay, clamped at EPSILON_FINAL.
            epsilon = max(EPSILON_FINAL, EPSILON_START -
                          frame_idx / EPSILON_DECAY_LAST_FRAME)

            reward = agent.play_step(net, epsilon, device=device)
            # video_recorder.capture_frame()
            # env.render()
            if reward is not None:
                # An episode just finished: log stats, maybe checkpoint.
                total_rewards.append(reward)
                speed = (frame_idx - ts_frame) / (time.time() - ts)
                ts_frame = frame_idx
                ts = time.time()
                m_reward = np.mean(total_rewards[-100:])
                print("%d: done %d games, reward %.3f, "
                      "eps %.2f, speed %.2f f/s, total time %.2fs" % (
                    frame_idx, len(total_rewards), m_reward, epsilon,
                    speed, (time.time() - start_time)
                ))
                print("Current learning rate: " + str(scheduler.get_last_lr()))
                writer.add_scalar("epsilon", epsilon, frame_idx)
                writer.add_scalar("speed", speed, frame_idx)
                writer.add_scalar("reward_100", m_reward, frame_idx)
                writer.add_scalar("reward", reward, frame_idx)
                if best_m_reward is None or best_m_reward < m_reward:
                    torch.save(net.state_dict(), "saves/" + DEFAULT_ENV_NAME +
                               "-best_%.0f.dat" % m_reward)
                    if best_m_reward is not None:
                        print("Best reward updated %.3f -> %.3f" % (
                            best_m_reward, m_reward))
                        # NOTE(review): no video is recorded while the
                        # recorder lines are commented out; only the model
                        # weights were saved just above.
                        print("Saved video.")
                        # video_recorder.close()
                        # video_recorder.enabled = False

                    best_m_reward = m_reward
                if m_reward >= MEAN_REWARD_BOUND:
                    print("Solved in %d frames!" % frame_idx)
                    break

            # Keep collecting experience until the replay buffer is warm.
            if len(buffer) < REPLAY_START_SIZE:
                continue

            if frame_idx % SYNC_TARGET_FRAMES == 0:
                tgt_net.load_state_dict(net.state_dict())

            optimizer.zero_grad()
            batch = buffer.sample(BATCH_SIZE)
            loss_t = calc_loss(batch, net, tgt_net, device=device)
            loss_t.backward()
            optimizer.step()
            scheduler.step(frame_idx)
    finally:
        # Runs on normal completion, on errors and on KeyboardInterrupt, so
        # an interrupted run no longer leaves the writer or environment open
        # (and no pre-emptive env.close() is needed before `env` exists).
        if writer is not None:
            writer.close()
        # env._close_video_recorder()
        env.close()

DQN(
  (conv): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=3136, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=4, bias=True)
  )
)
70: done 1 games, reward 0.000, eps 1.00, speed 463.21 f/s, total time 0.15s
Current learning rate: [0.0001]
136: done 2 games, reward 0.500, eps 1.00, speed 409.81 f/s, total time 0.31s
Current learning rate: [0.0001]
Best reward updated 0.000 -> 0.500
Saved video.
200: done 3 games, reward 0.667, eps 1.00, speed 402.15 f/s, total time 0.47s
Current learning rate: [0.0001]
Best reward updated 0.500 -> 0.667
Saved video.
246: done 4 games, reward 0.500, eps 1.00, speed 172.66 f/s, total time 0.74s
Current learning rate: [0.0001]
339: done 5 games, reward 1.000, eps 1.00, sp

KeyboardInterrupt: 