In [1]:
import gym
import ptan
import argparse
import numpy as np
import torch
import torch.optim as optim
from tensorboardX import SummaryWriter
from lib import dqn_model, common

In [2]:
# Hyperparameters of the prioritized replay buffer.
# PRIO_REPLAY_ALPHA is passed to PrioReplayBuffer as prob_alpha: the exponent applied to the
# stored priorities when they are converted into sampling probabilities.
PRIO_REPLAY_ALPHA = 0.6
# beta is the exponent of the importance-sampling weights; the training loop increases it
# linearly from BETA_START to 1.0 over the first BETA_FRAMES frames and then keeps it at 1.0.
BETA_START = 0.4
BETA_FRAMES = 100000

In [3]:
class PrioReplayBuffer:
    """Fixed-capacity ring buffer of transitions with prioritized sampling.

    Transitions are pulled from an experience source and stored in a list that
    is overwritten in ring order once it reaches ``buf_size`` entries. A
    parallel float32 array keeps one priority per slot; sampling draws slots
    with probability proportional to ``priority ** prob_alpha``.
    """

    def __init__(self, exp_source, buf_size, prob_alpha=0.6):
        """Create an empty buffer.

        Args:
            exp_source: iterable yielding transitions; a single iterator over
                it is kept so that successive populate() calls continue where
                the previous one stopped.
            buf_size: maximum number of transitions kept at any time.
            prob_alpha: exponent applied to priorities when they are turned
                into sampling probabilities.
        """
        self.exp_source_iter = iter(exp_source)
        self.prob_alpha = prob_alpha
        self.capacity = buf_size
        # Index of the slot that the next incoming transition will occupy.
        self.pos = 0
        self.buffer = []
        self.priorities = np.zeros((buf_size, ), dtype=np.float32)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def populate(self, count):
        """Pull ``count`` transitions from the experience source into the buffer.

        While the buffer is not full the transition is appended; once it is
        full the slot at ``self.pos`` (the oldest entry) is overwritten. Every
        new transition receives the largest priority present when the call
        started (1.0 for an empty buffer), so it is likely to be sampled at
        least once before its priority is refined.
        """
        max_prio = self.priorities.max() if self.buffer else 1.0
        for _ in range(count):
            sample = next(self.exp_source_iter)
            if len(self.buffer) < self.capacity:
                self.buffer.append(sample)
            else:
                self.buffer[self.pos] = sample
            self.priorities[self.pos] = max_prio
            self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta=0.4):
        """Draw a prioritized batch.

        Args:
            batch_size: number of transitions to draw (with replacement).
            beta: exponent of the importance-sampling weights.

        Returns:
            A tuple ``(samples, indices, weights)``: the drawn transitions,
            their slot indices (needed later by update_priorities()) and a
            float32 array of weights normalised so that the largest is 1.

        Raises:
            ValueError: if the buffer holds no transitions yet.
        """
        total = len(self.buffer)
        if total == 0:
            raise ValueError("cannot sample from an empty replay buffer; call populate() first")

        # Only the filled slots take part in sampling. Slicing by the number
        # of stored transitions covers both the partially filled and the full
        # buffer (where the slice is the whole array).
        prios = self.priorities[:total]
        # The power creates a new array, so the in-place normalisation below
        # does not touch the stored priorities.
        probs = prios ** self.prob_alpha
        probs /= probs.sum()

        indices = np.random.choice(total, batch_size, p=probs)
        samples = [self.buffer[idx] for idx in indices]

        # Importance-sampling weights compensate for the non-uniform sampling.
        weights = (total * probs[indices]) ** (-beta)
        weights /= weights.max()
        return samples, indices, np.array(weights, dtype=np.float32)

    def update_priorities(self, batch_indices, batch_priorities):
        """Store new priorities for the slots of a processed batch.

        The caller is expected to pass the indices returned by sample() and
        one priority per index (the training loop uses the per-sample loss).
        If an index occurs more than once, the last priority given wins.
        """
        for idx, prio in zip(batch_indices, batch_priorities):
            self.priorities[idx] = prio
    
    

In [4]:
#MSELoss in PyTorch does not support per-sample weighting: MSE is a regression loss, while sample weighting is
#mostly used in classification problems. Therefore we compute the squared error ourselves and apply the weights.
def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"):
    """Compute the importance-weighted squared-error DQN loss for a batch.

    Args:
        batch: transitions, unpacked by ``common.unpack_batch`` into states,
            actions, rewards, done flags and next states.
        batch_weights: one importance-sampling weight per transition.
        net: online network, called on the states.
        tgt_net: target network, called on the next states.
        gamma: discount factor applied to the next-state value.
        device: device the tensors are moved to before the forward passes.

    Returns:
        A tuple ``(loss, priorities)``: the mean of the weighted squared
        errors (used for back-propagation) and the per-sample weighted squared
        errors plus 1e-5 (used as new replay priorities; the small constant
        keeps every priority strictly positive).
    """
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.BoolTensor(dones).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    # Q-value of the action that was actually taken in each state.
    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    # The bootstrap target is a constant with respect to the optimised
    # parameters, so it is computed without autograd tracking. This avoids
    # building a graph for the target network and makes the in-place masking
    # below safe.
    with torch.no_grad():
        next_state_values = tgt_net(next_states_v).max(1)[0]
        # Terminal transitions have no future value.
        next_state_values[done_mask] = 0.0
        expected_state_action_values = next_state_values * gamma + rewards_v

    # Weighted squared error, computed by hand so that each sample can carry
    # its own weight.
    losses_v = batch_weights_v * (state_action_values - expected_state_action_values) ** 2
    return losses_v.mean(), losses_v + 1e-5

In [5]:
if __name__ == "__main__":
    # Load the Pong hyperparameters and parse the command line. parse_known_args() is used so
    # that extra arguments injected by the notebook kernel do not abort the run.
    params = common.HYPERPARAMS['pong']
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=True, action="store_true", help="Enable cuda")
    args, unknown = parser.parse_known_args()
    # --cuda defaults to True, so CUDA must be checked explicitly; otherwise the script would
    # fail on machines without a usable GPU. Fall back to the CPU in that case.
    device = torch.device("cuda" if args.cuda and torch.cuda.is_available() else "cpu")

    # Create the environment and wrap it with the PTAN DQN wrappers.
    env = gym.make(params['env_name'])
    env = ptan.common.wrappers.wrap_dqn(env)

    # TensorBoard writer for this run; closed in the finally clause below so that pending
    # events are flushed even when training is interrupted.
    writer = SummaryWriter(comment="-" + params['run_name'] + "-prio-replay")
    try:
        # Online network, sized from the observation shape and the number of actions.
        net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
        # Target network wrapper; it is synchronized with the online network periodically below.
        tgt_net = ptan.agent.TargetNet(net)

        # Epsilon-greedy action selection; the tracker adjusts epsilon from the frame index.
        selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
        epsilon_tracker = common.EpsilonTracker(selector, params)
        agent = ptan.agent.DQNAgent(net, selector, device=device)

        # One-step experience source feeding the prioritized replay buffer defined above.
        exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1)
        buffer = PrioReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA)

        # Optimizer and frame counter.
        optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])
        frame_idx = 0

        beta = BETA_START

        # Each iteration advances the frame counter by one and pulls one transition from the
        # experience source into the replay buffer (overwriting the oldest one when full).
        with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
            while True:
                frame_idx += 1
                buffer.populate(1)
                epsilon_tracker.frame(frame_idx)

                # beta grows linearly from BETA_START to 1.0 over the first BETA_FRAMES frames
                # and is clamped at 1.0 afterwards.
                beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES)

                # Total rewards of episodes finished since the last call; usually empty. When
                # the reward tracker returns a true value, training stops.
                new_rewards = exp_source.pop_total_rewards()
                if new_rewards:
                    writer.add_scalar("beta", beta, frame_idx)
                    if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon):
                        break

                # Wait until the buffer holds enough transitions before training starts.
                if len(buffer) < params['replay_initial']:
                    continue

                # Sample a prioritized batch (transitions, their buffer indices and their
                # importance-sampling weights), compute the weighted loss, back-propagate it
                # and write the per-sample losses back as the new priorities.
                optimizer.zero_grad()
                batch, batch_indices, batch_weights = buffer.sample(params['batch_size'], beta)
                loss_v, sample_prios_v = calc_loss(batch, batch_weights, net, tgt_net.target_model,
                                                   params['gamma'], device=device)
                loss_v.backward()
                optimizer.step()

                buffer.update_priorities(batch_indices, sample_prios_v.data.cpu().numpy())

                # Periodically synchronize the target network with the online network.
                if frame_idx % params['target_net_sync'] == 0:
                    tgt_net.sync()
    finally:
        writer.close()

1080: done 1 games, mean reward -20.000, speed 211.83 f/s, eps 0.99
2345: done 2 games, mean reward -19.000, speed 233.76 f/s, eps 0.98
3127: done 3 games, mean reward -19.667, speed 210.77 f/s, eps 0.97
4153: done 4 games, mean reward -20.000, speed 174.16 f/s, eps 0.96
5051: done 5 games, mean reward -20.000, speed 162.26 f/s, eps 0.95
5899: done 6 games, mean reward -20.167, speed 164.75 f/s, eps 0.94
6825: done 7 games, mean reward -20.143, speed 163.06 f/s, eps 0.93
7776: done 8 games, mean reward -20.000, speed 161.70 f/s, eps 0.92
8594: done 9 games, mean reward -20.111, speed 162.22 f/s, eps 0.91
9493: done 10 games, mean reward -20.200, speed 161.92 f/s, eps 0.91
10311: done 11 games, mean reward -20.273, speed 93.85 f/s, eps 0.90
11236: done 12 games, mean reward -20.250, speed 57.93 f/s, eps 0.89
12059: done 13 games, mean reward -20.308, speed 55.39 f/s, eps 0.88
13021: done 14 games, mean reward -20.286, speed 54.96 f/s, eps 0.87
13826: done 15 games, mean reward -20.333, 

227360: done 119 games, mean reward -13.620, speed 33.10 f/s, eps 0.02
230640: done 120 games, mean reward -13.440, speed 32.34 f/s, eps 0.02
233254: done 121 games, mean reward -13.320, speed 33.15 f/s, eps 0.02
237167: done 122 games, mean reward -13.130, speed 33.07 f/s, eps 0.02
240952: done 123 games, mean reward -12.910, speed 32.85 f/s, eps 0.02
244338: done 124 games, mean reward -12.750, speed 33.59 f/s, eps 0.02
247305: done 125 games, mean reward -12.570, speed 32.29 f/s, eps 0.02
250614: done 126 games, mean reward -12.280, speed 32.66 f/s, eps 0.02
254471: done 127 games, mean reward -12.030, speed 32.46 f/s, eps 0.02
257741: done 128 games, mean reward -11.890, speed 32.42 f/s, eps 0.02
261147: done 129 games, mean reward -11.700, speed 32.35 f/s, eps 0.02
265596: done 130 games, mean reward -11.530, speed 32.57 f/s, eps 0.02
269309: done 131 games, mean reward -11.310, speed 32.75 f/s, eps 0.02
272520: done 132 games, mean reward -11.160, speed 38.96 f/s, eps 0.02
276138

507755: done 237 games, mean reward 15.930, speed 38.40 f/s, eps 0.02
509759: done 238 games, mean reward 16.190, speed 39.80 f/s, eps 0.02
511499: done 239 games, mean reward 16.310, speed 39.09 f/s, eps 0.02
513289: done 240 games, mean reward 16.470, speed 38.42 f/s, eps 0.02
515436: done 241 games, mean reward 16.610, speed 38.41 f/s, eps 0.02
517190: done 242 games, mean reward 16.810, speed 38.35 f/s, eps 0.02
519133: done 243 games, mean reward 17.050, speed 39.15 f/s, eps 0.02
520932: done 244 games, mean reward 17.180, speed 39.19 f/s, eps 0.02
522665: done 245 games, mean reward 17.350, speed 39.12 f/s, eps 0.02
524815: done 246 games, mean reward 17.480, speed 38.88 f/s, eps 0.02
526847: done 247 games, mean reward 17.570, speed 38.80 f/s, eps 0.02
529073: done 248 games, mean reward 17.610, speed 39.23 f/s, eps 0.02
530890: done 249 games, mean reward 17.700, speed 37.98 f/s, eps 0.02
532701: done 250 games, mean reward 17.760, speed 40.14 f/s, eps 0.02
534487: done 251 gam