In [0]:
import zipfile
# Unpack Archive.zip into the current working directory (an empty path is
# resolved relative to the cwd).
# NOTE(review): presumably this provides the `common` package and `model.py`
# imported by later cells -- confirm against the archive contents.
with zipfile.ZipFile("Archive.zip", mode="r") as archive:
    archive.extractall(path="")

In [3]:
!pip install tensorboardX

Installing collected packages: tensorboardX
Successfully installed tensorboardX-1.9


In [0]:
class obj(object):
    """Attribute-style view over a dictionary.

    Every key of ``d`` becomes an attribute. Nested dictionaries are wrapped
    recursively, and dictionaries found inside a list or tuple are wrapped as
    well. Lists and tuples are both stored as plain lists.
    """

    def __init__(self, d):
        def _wrap(value):
            # Only dictionaries are converted; everything else is kept as-is.
            return obj(value) if isinstance(value, dict) else value

        for key, value in d.items():
            if isinstance(value, (list, tuple)):
                converted = [_wrap(item) for item in value]
            else:
                converted = _wrap(value)
            setattr(self, key, converted)

In [0]:
import argparse
import torch

    # Basic Arguments
# Run configuration. The dictionary is converted to an attribute-style object
# (`parms`) that the rest of the notebook passes around in place of argparse
# results.
d = {
    'seed': 1122,
    'batch_size': 32,
    'no_cuda': False,  # Set to True to force CPU even when CUDA is available

    # Training Arguments
    'max_frames': 1400000,     # Number of environment steps in the training loop
    'buffer_size': 100000,     # Capacity passed to the replay buffer
    'update_target': 1000,     # Frames between target-network updates
    'train_freq': 1,           # Run an optimization step every `train_freq` frames
    'gamma': 0.99,             # Discount factor
    'learning_start': 10000,   # Buffer size that must be exceeded before training
    'eps_start': 1.0,          # Passed to epsilon_scheduler
    'eps_final': 0.01,         # Passed to epsilon_scheduler
    'eps_decay': 30000,        # Passed to epsilon_scheduler

    # Algorithm Arguments
    'double': True,              # Enable Double-Q learning
    'dueling': True,             # Enable dueling network
    'noisy': True,               # Enable noisy network
    'prioritized_replay': True,  # Enable prioritized experience replay
    'c51': True,                 # Enable categorical DQN
    'multi_step': 1,             # Number of steps accumulated per stored transition

    'Vmin': -10,            # Minimum value of support for c51
    'Vmax': 10,             # Maximum value of support for c51
    'num_atoms': 51,        # Number of atoms for c51
    'alpha': 0.6,           # Alpha value for prioritized replay
    'beta_start': 0.4,      # Start value of beta for prioritized replay
    'beta_frames': 100000,  # End frame of beta schedule for prioritized replay
    'sigma_init': 0.4,      # Sigma initialization value for NoisyNet

    # Environment Arguments
    'env': 'PongNoFrameskip-v4',  # Environment name
    'episode_life': 1,  # Whether env has episode life (1) or not (0)
    'clip_rewards': 1,  # Whether env clips rewards (1) or not (0)
    'frame_stack': 1,   # Whether env stacks frames (1) or not (0)
    'scale': 0,         # Whether env scales (1) or not (0)

    # Evaluation Arguments
    'load_model': None,            # Pretrained model name to load (state dict)
    'save_model': 'model',         # Pretrained model name to save (state dict)
    'evaluate': None,              # Evaluate only
    'render': None,                # Render evaluation agent
    'evaluation_interval': 10000,  # Frames for evaluation interval

    # Optimization Arguments
    'lr': 1e-4,  # Learning rate
}

# Derive the device from both hardware availability and the `no_cuda` switch,
# so that setting no_cuda=True really forces CPU execution.
d['cuda'] = torch.cuda.is_available() and not d['no_cuda']
d['device'] = torch.device("cuda" if d['cuda'] else "cpu")

parms = obj(d)

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import gym
import time, os
from tensorboardX import SummaryWriter

from common.utils import create_log_dir, print_args, set_global_seeds
from common.wrappers import make_atari, wrap_atari_dqn
#from arguments import get_args
#from train import train
#from test import test



In [7]:
# Echo the run configuration (produces the "Options" listing in the cell output).
print_args(parms)
# Log directory for this run; it is handed to the TensorBoard writer and later
# reused as the destination of the exported scalars.
log_dir = create_log_dir(parms)
writer = SummaryWriter(log_dir)

# Build the environment named by parms.env, then wrap it.
# NOTE(review): presumably the episode_life / clip_rewards / frame_stack / scale
# options in parms select which wrappers are applied -- confirm in common.wrappers.
env = make_atari(parms.env)
env = wrap_atari_dqn(env, parms)

# Seed the global RNGs and the (already wrapped) environment with the same seed.
set_global_seeds(parms.seed)
# Kept as the last expression so the notebook displays what env.seed returns.
env.seed(parms.seed)

                          Options
                          seed: 1122
                          batch_size: 32
                          no_cuda: False
                          max_frames: 1400000
                          buffer_size: 100000
                          update_target: 1000
                          train_freq: 1
                          gamma: 0.99
                          learning_start: 10000
                          eps_start: 1.0
                          eps_final: 0.01
                          eps_decay: 30000
                          double: True
                          dueling: True
                          noisy: True
                          prioritized_replay: True
                          c51: True
                          multi_step: 1
                          Vmin: -10
                          Vmax: 10
                          num_atoms: 51
                          alpha: 0.6
                          beta_start: 0.4
                       

[1122, 1711756444]

# Train

In [0]:
import torch
import torch.optim as optim
import torch.nn.functional as F

import time, os
import numpy as np
from collections import deque

from common.utils import epsilon_scheduler, beta_scheduler, update_target, print_log, load_model, save_model
from model import DQN
from common.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer

def compute_td_loss(current_model, target_model, replay_buffer, optimizer, args, beta=None):
    """
    Sample one minibatch, compute the weighted loss and take one optimizer step.

    Both heads are handled here: the scalar Q-value loss (Huber / smooth L1)
    when ``args.c51`` is false, and the categorical cross-entropy loss against
    the projected target distribution when ``args.c51`` is true.

    With ``args.prioritized_replay`` the batch is drawn with ``beta`` and the
    absolute per-sample losses (plus a small constant) are written back to the
    buffer as new priorities; otherwise all sample weights are one.

    Returns the scalar loss tensor used for the backward pass.
    """
    prioritized = args.prioritized_replay
    if prioritized:
        batch = replay_buffer.sample(args.batch_size, beta)
        state, action, reward, next_state, done, weights, indices = batch
    else:
        state, action, reward, next_state, done = replay_buffer.sample(args.batch_size)
        weights = torch.ones(args.batch_size)

    # Move the whole batch onto the configured device as tensors.
    device = args.device
    state = torch.FloatTensor(np.float32(state)).to(device)
    next_state = torch.FloatTensor(np.float32(next_state)).to(device)
    action = torch.LongTensor(action).to(device)
    reward = torch.FloatTensor(reward).to(device)
    done = torch.FloatTensor(done).to(device)
    weights = torch.FloatTensor(weights).to(device)

    if args.c51:
        # Distribution of the action that was actually taken.
        q_dist = current_model(state)
        action = action.unsqueeze(1).unsqueeze(1).expand(args.batch_size, 1, args.num_atoms)
        q_dist = q_dist.gather(1, action).squeeze(1)
        # Keep probabilities away from 0 and 1 before taking the log.
        q_dist.data.clamp_(0.01, 0.99)

        target_dist = projection_distribution(
            current_model, target_model, next_state, reward, done,
            target_model.support, target_model.offset, args,
        )

        per_sample_loss = -(target_dist * q_dist.log()).sum(1)
        priority_eps = 1e-6
    else:
        q_values = current_model(state)
        target_next_q_values = target_model(next_state)

        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

        if args.double:
            # Double-Q: pick the action with the online net, score it with the target net.
            greedy_actions = current_model(next_state).max(1)[1].unsqueeze(1)
            next_q_value = target_next_q_values.gather(1, greedy_actions).squeeze(1)
        else:
            next_q_value = target_next_q_values.max(1)[0]

        discount = args.gamma ** args.multi_step
        expected_q_value = reward + discount * next_q_value * (1 - done)

        per_sample_loss = F.smooth_l1_loss(q_value, expected_q_value.detach(), reduction='none')
        priority_eps = 1e-5

    if prioritized:
        prios = torch.abs(per_sample_loss) + priority_eps
    loss = (per_sample_loss * weights).mean()

    optimizer.zero_grad()
    loss.backward()
    if prioritized:
        replay_buffer.update_priorities(indices, prios.data.cpu().numpy())
    optimizer.step()

    return loss


def projection_distribution(current_model, target_model, next_state, reward, done, support, offset, args):
    """
    Build the categorical (C51) target distribution for a batch.

    The target network's next-state distribution for the greedy action is
    shifted by the Bellman update ``reward + gamma**n * z * (1 - done)`` and
    projected back onto the fixed support by linear interpolation between the
    two neighbouring atoms.

    Args:
        current_model: online network; only used to pick the greedy action
            when ``args.double`` is true.
        target_model: network whose output is indexed as
            (batch, action, atom) probabilities.
        next_state: batch of next states fed to the networks.
        reward, done: 1-D tensors with one entry per sample.
        support: 1-D tensor of atom values.
        offset: integer tensor added to the atom indices before the result is
            flattened (presumably ``row * num_atoms`` per sample -- confirm
            where ``target_model.offset`` is built).
        args: needs ``Vmin``, ``Vmax``, ``num_atoms``, ``gamma``, ``double``
            and optionally ``multi_step`` (defaults to 1).

    Returns:
        Tensor of shape (batch, num_atoms) holding the projected distribution.
    """
    delta_z = float(args.Vmax - args.Vmin) / (args.num_atoms - 1)
    # n-step returns must bootstrap with gamma**n, matching the scalar-Q branch
    # of compute_td_loss. For multi_step == 1 this is plain gamma.
    discount = args.gamma ** getattr(args, "multi_step", 1)

    target_next_q_dist = target_model(next_state)

    # Greedy action by expected value: online net for Double-Q, target net otherwise.
    if args.double:
        next_q_dist = current_model(next_state)
        next_action = (next_q_dist * support).sum(2).max(1)[1]
    else:
        next_action = (target_next_q_dist * support).sum(2).max(1)[1]

    next_action = next_action.unsqueeze(1).unsqueeze(1).expand(target_next_q_dist.size(0), 1, target_next_q_dist.size(2))
    target_next_q_dist = target_next_q_dist.gather(1, next_action).squeeze(1)

    reward = reward.unsqueeze(1).expand_as(target_next_q_dist)
    done = done.unsqueeze(1).expand_as(target_next_q_dist)
    support = support.unsqueeze(0).expand_as(target_next_q_dist)

    Tz = reward + discount * support * (1 - done)
    Tz = Tz.clamp(min=args.Vmin, max=args.Vmax)
    b = (Tz - args.Vmin) / delta_z
    l = b.floor().long()
    u = b.ceil().long()

    lower_weight = u.float() - b
    upper_weight = b - l.float()
    # When b falls exactly on an atom, l == u and both interpolation weights
    # are zero, which would silently drop that probability mass. Assign the
    # full mass to that atom instead.
    lower_weight = lower_weight + (l == u).float()

    target_dist = target_next_q_dist.clone().zero_()
    target_dist.view(-1).index_add_(0, (l + offset).view(-1), (target_next_q_dist * lower_weight).view(-1))
    target_dist.view(-1).index_add_(0, (u + offset).view(-1), (target_next_q_dist * upper_weight).view(-1))

    return target_dist

def multi_step_reward(rewards, gamma):
    """Return the discounted sum ``sum_i rewards[i] * gamma**i``.

    ``rewards`` is iterated in order, with the first element undiscounted.
    An empty sequence yields ``0.0``.
    """
    total = 0.
    for step, step_reward in enumerate(rewards):
        total = total + step_reward * gamma ** step
    return total

In [0]:
 
# Online network (the one being optimized) and the target network used for
# bootstrapping inside compute_td_loss.
current_model = DQN(env, parms).to(parms.device)
target_model = DQN(env, parms).to(parms.device)

if parms.noisy:
    current_model.update_noisy_modules()
    target_model.update_noisy_modules()

# Optionally resume from a checkpoint, but only if the file actually exists.
if parms.load_model and os.path.isfile(parms.load_model):
    load_model(current_model, parms)

# Schedules evaluated once per frame inside the training loop.
epsilon_by_frame = epsilon_scheduler(parms.eps_start, parms.eps_final, parms.eps_decay)
beta_by_frame = beta_scheduler(parms.beta_start, parms.beta_frames)

replay_buffer = (
    PrioritizedReplayBuffer(parms.buffer_size, parms.alpha)
    if parms.prioritized_replay
    else ReplayBuffer(parms.buffer_size)
)

# Sliding windows over the last `multi_step` states, rewards and actions.
state_deque, reward_deque, action_deque = (
    deque(maxlen=parms.multi_step) for _ in range(3)
)

optimizer = optim.Adam(current_model.parameters(), lr=parms.lr)

# Per-interval statistics plus running totals for the current episode.
reward_list, length_list, loss_list = [], [], []
episode_reward = episode_length = 0

prev_time = time.time()
prev_frame = 1

# Main training loop: one environment step per iteration, with frame_idx
# running from 1 to parms.max_frames inclusive.
state = env.reset()
for frame_idx in range(1, parms.max_frames + 1):
        if parms.render:
            env.render()

        # With noisy networks enabled, sample_noise() is called on both
        # networks every frame.
        if parms.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(torch.FloatTensor(state).to(parms.device), epsilon)

        next_state, reward, done, _ = env.step(action)
        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        # Once the window holds multi_step entries (or the episode ends), store
        # one transition: the oldest buffered state/action, the discounted sum
        # of the buffered rewards, and the newest next_state.
        # NOTE(review): when an episode ends with multi_step > 1, only the
        # oldest buffered state is pushed; the newer ones are dropped by the
        # clear() calls below.
        if len(state_deque) == parms.multi_step or done:
            n_reward = multi_step_reward(reward_deque, parms.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state, np.float32(done))

        state = next_state
        episode_reward += reward
        episode_length += 1

        # End of episode: log its totals, reset the environment, the running
        # counters and the n-step windows.
        if done:
            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            writer.add_scalar("data/episode_reward", episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            episode_reward, episode_length = 0, 0
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        # Optimize only after the buffer holds more than learning_start
        # transitions, and then every train_freq frames.
        if len(replay_buffer) > parms.learning_start and frame_idx % parms.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer, optimizer, parms, beta)
            loss_list.append(loss.item())
            writer.add_scalar("data/loss", loss.item(), frame_idx)

        # Periodic target-network update.
        if frame_idx % parms.update_target == 0:
            update_target(current_model, target_model)

        # Periodic progress report and checkpoint; the interval statistics are
        # reset afterwards.
        if frame_idx % parms.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list, length_list, loss_list)
            reward_list.clear(), length_list.clear(), loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(current_model, parms)

# Final checkpoint after the last frame.
save_model(current_model, parms)

# Dump the logged scalars next to the TensorBoard event files, then release resources.
writer.export_scalars_to_json(os.path.join(log_dir, "all_scalars.json"))
writer.close()
# NOTE(review): the Test cell further down calls env.reset() on this same env
# after it has been closed here -- confirm the environment supports that.
env.close()


# Test

In [10]:
import torch
import torch.optim as optim

import os
from common.utils import load_model
from model import DQN

from PIL import Image

# Evaluation: build a fresh network, switch it to eval mode and load weights
# through load_model, then play a single episode with epsilon = 0.
current_model = DQN(env, parms).to(parms.device)
current_model.eval()

load_model(current_model, parms)

episode_reward, episode_length = 0, 0

state = env.reset()
frames = []  # one PIL image per step, consumed by the GIF export cell
done = False
while not done:
    if parms.render:
        env.render()
    # Capture the frame shown before the action is applied.
    frames.append(Image.fromarray(env.render(mode='rgb_array')))
    action = current_model.act(torch.FloatTensor(state).to(parms.device), 0.)

    next_state, reward, done, _ = env.step(action)

    state = next_state
    episode_reward += reward
    episode_length += 1

print("Test Result - Reward {} Length {}".format(episode_reward, episode_length))
    

Test Result - Reward 20.0 Length 1669


In [0]:
# Write the captured evaluation frames as an animated GIF.
# The animation is saved starting from the first captured frame; creating a
# blank Image.new(...) as the base image would put a black frame first.
if not frames:
    raise ValueError("no frames captured; run the evaluation cell first")

with open('openai_gym.gif', 'wb') as f:  # change the path if necessary
    first_frame, later_frames = frames[0], frames[1:]
    first_frame.save(f, format='GIF', save_all=True, append_images=later_frames)

<img id="gif" src=""/>

<!-- Point the image at the generated GIF; the timestamp query string makes each load use a fresh URL. -->
<script>document.getElementById("gif").src = "openai_gym.gif?" + new Date().getTime();</script>