In [50]:
import numpy as np
import ptan
import gym
import torch.nn as nn
import torch
from lib.dqn_model import DQN
from lib.common import unpack_batch, batch_generator
from typing import List, Optional, Tuple, Any

In [51]:
# Build the unwrapped Pong environment and apply a single randomly sampled
# action, just to look at the shape of a raw observation.
env = gym.make("PongNoFrameskip-v4")
env.reset()

# Keep the sampled action around: the next cell replays it on the wrapped env.
random_step = env.action_space.sample()
obser, reward, is_done, _ = env.step(random_step)

print(obser.shape)

(210, 160, 3)


In [52]:
# Rebuild the environment, this time passed through ptan's DQN wrapper stack
# (further wrappers live in ptan's 'wrappers.py' module).
env = ptan.common.wrappers.wrap_dqn(gym.make("PongNoFrameskip-v4"))
env.reset()

# Replay the action sampled in the previous cell.
obser, reward, is_done, _ = env.step(random_step)
print(type(obser))

# Materialise the returned observation object as an ndarray so its shape can be read.
obser = np.array(obser)
print(obser.shape)
print(env.observation_space)
# Takeaway: the wrapper changes what an observation looks like compared with the raw env.

<class 'ptan.common.wrappers.LazyFrames'>
(4, 84, 84)
Box(4, 84, 84)


In [53]:
# Create the online network and wrap it to obtain the target network
net = DQN(env.observation_space.shape, env.action_space.n)
tgt_net = ptan.agent.TargetNet(net)

# Create an agent from the network and an epsilon-greedy selector (epsilon=1.0)
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1.0)
agent = ptan.agent.DQNAgent(net, selector)

# ExperienceSourceFirstLast generates the transitions; every item printed below
# carries the fields (state, action, reward, last_state).
# No steps_count is passed here (presumably ptan defaults to single-step
# transitions — confirm); a steps_count=4 variant is built further down.
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=0.99)

# Create a replay buffer that holds two transitions
buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=2)

# Fill the buffer with two transitions, then sample a batch of two
buffer.populate(2)
batch = buffer.sample(2)
for experience in batch:
    print(experience, "\n")


ExperienceFirstLast(state=<ptan.common.wrappers.LazyFrames object at 0x7fdb89aa7a58>, action=2, reward=0.0, last_state=<ptan.common.wrappers.LazyFrames object at 0x7fdb89aa7518>) 

ExperienceFirstLast(state=<ptan.common.wrappers.LazyFrames object at 0x7fdb89aa7518>, action=5, reward=0.0, last_state=<ptan.common.wrappers.LazyFrames object at 0x7fdb88951278>) 



In [54]:
# Split the sampled batch into one array per transition field.
states, actions, rewards, dones, next_states = unpack_batch(batch)
print(states.shape)  # leading axis is the batch; the remaining three axes are one observation

# Wrap every array in a tensor; the done flags become a boolean mask.
states_v, next_states_v, actions_v, rewards_v = (
    torch.tensor(field) for field in (states, next_states, actions, rewards)
)
done_mask = torch.BoolTensor(dones)

print(states_v.shape)
print(next_states_v.shape)

# The DQN expects input shaped (batch_size, 4, 84, 84)
out = net(states_v)
print(out)
print(actions_v, actions_v.shape)
print(done_mask)

(2, 4, 84, 84)
torch.Size([2, 4, 84, 84])
torch.Size([2, 4, 84, 84])
tensor([[-0.0326,  0.0243,  0.0364,  0.0376, -0.0048, -0.0450],
        [-0.0326,  0.0243,  0.0363,  0.0377, -0.0048, -0.0449]],
       grad_fn=<AddmmBackward>)
tensor([2, 5]) torch.Size([2])
tensor([False, False])


In [55]:
# gather() needs an index tensor with as many dimensions as `out`, so append a
# trailing axis to the action indices.
actions_v_un = actions_v.unsqueeze(-1)
print(actions_v_un)
print(actions_v_un.shape)

# For every row, pick the network output belonging to the action that was taken
# (the 1 selects dimension 1 of `out`).
state_action_vals = out.gather(1, actions_v_un)
print(state_action_vals)
# Only the printed copy is squeezed; state_action_vals keeps its trailing axis.
print(state_action_vals.squeeze(-1))

# We are simply selecting the state-action values using the squeeze, unsqueeze
# and gather operations.

tensor([[2],
        [5]])
torch.Size([2, 1])
tensor([[ 0.0364],
        [-0.0449]], grad_fn=<GatherBackward>)
tensor([ 0.0364, -0.0449], grad_fn=<SqueezeBackward1>)


' We are simply selecting the state-action values using squeeze, unsqueeze and gather options '

In [56]:
with torch.no_grad():  # no gradient bookkeeping: we only read values from the target network
    # Run the target network once and reuse the result for both the printout
    # and the max, instead of doing a second forward pass.
    next_q_vals = tgt_net.target_model(next_states_v)
    print(next_q_vals)
    next_state_vals = next_q_vals.max(1)[0]  # max along dimension 1; [0] keeps the values
    print(next_state_vals)
    # Zero the entries selected by the done mask
    next_state_vals[done_mask] = 0.0
    print(next_state_vals)
    print(next_state_vals.detach())

bellman_vals = next_state_vals.detach() * 0.99 + rewards_v  # 0.99 -> gamma value



tensor([[-0.0326,  0.0243,  0.0363,  0.0377, -0.0048, -0.0449],
        [-0.0325,  0.0243,  0.0364,  0.0376, -0.0049, -0.0449]])
tensor([0.0377, 0.0376])
tensor([0.0377, 0.0376])
tensor([0.0377, 0.0376])


In [57]:
# state_action_vals comes straight out of gather() with a trailing axis of size 1,
# whereas bellman_vals is one-dimensional. Fed as-is, MSELoss broadcasts the pair
# to a square matrix and averages over every prediction/target combination, so
# drop the trailing axis to compare element by element.
print(nn.MSELoss()(state_action_vals.squeeze(-1), bellman_vals))

tensor(0.0034, grad_fn=<MseLossBackward>)


In [60]:
# Rebuild the experience source, this time asking for steps_count=4.
# NOTE(review): presumably each sample's reward then aggregates four steps and
# last_state lies four steps after state, which would mean a Bellman target
# should discount by gamma**4 rather than gamma — confirm against ptan.
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=0.99, steps_count=4)

# Create a replay buffer that holds five transitions
buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=5)

# Fill the buffer with five transitions, then sample a batch of five
buffer.populate(5)
batch = buffer.sample(5)
for experience in batch:
    print(experience, "\n")


ExperienceFirstLast(state=<ptan.common.wrappers.LazyFrames object at 0x7fdb8a2314e0>, action=5, reward=0.0, last_state=<ptan.common.wrappers.LazyFrames object at 0x7fdb88951390>) 

ExperienceFirstLast(state=<ptan.common.wrappers.LazyFrames object at 0x7fdb8a2314a8>, action=5, reward=0.0, last_state=<ptan.common.wrappers.LazyFrames object at 0x7fdb889512b0>) 

ExperienceFirstLast(state=<ptan.common.wrappers.LazyFrames object at 0x7fdb89a8f780>, action=2, reward=0.0, last_state=<ptan.common.wrappers.LazyFrames object at 0x7fdb88951978>) 

ExperienceFirstLast(state=<ptan.common.wrappers.LazyFrames object at 0x7fdb89a8f898>, action=1, reward=0.0, last_state=<ptan.common.wrappers.LazyFrames object at 0x7fdb88951780>) 

ExperienceFirstLast(state=<ptan.common.wrappers.LazyFrames object at 0x7fdb88951390>, action=5, reward=0.0, last_state=<ptan.common.wrappers.LazyFrames object at 0x7fdb88951208>) 



In [59]:
# Break the current batch into per-field arrays.
# NOTE(review): the recorded output shows a leading dimension of 2 and this cell's
# execution count is lower than the preceding cell's — re-run in order.
states, actions, rewards, dones, next_states = unpack_batch(batch)
print(states.shape)  # leading axis is the batch; the remaining three axes are one observation

# Tensor versions of each field; the done flags become a boolean mask.
done_mask = torch.BoolTensor(dones)
rewards_v = torch.tensor(rewards)
actions_v = torch.tensor(actions)
states_v, next_states_v = torch.tensor(states), torch.tensor(next_states)

print(states_v.shape)
print(next_states_v.shape)

# Forward pass of the online network on the batch of states
out = net(states_v)
print(out)
print(actions_v, actions_v.shape)
print(done_mask)

(2, 4, 84, 84)
torch.Size([2, 4, 84, 84])
torch.Size([2, 4, 84, 84])
tensor([[-0.0323,  0.0241,  0.0362,  0.0375, -0.0050, -0.0450],
        [-0.0319,  0.0240,  0.0361,  0.0377, -0.0049, -0.0448]],
       grad_fn=<AddmmBackward>)
tensor([2, 1]) torch.Size([2])
tensor([False, False])


Observation: the batch generated with steps_count=4 gives output similar to that of the single-step batch.
Shouldn't there be a difference in the values of 