# DQN

In [None]:
from collections import deque
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.autograd as autograd
import math, random

In [None]:
# True when a CUDA device is available; read by Variable() at call time.
USE_CUDA = torch.cuda.is_available()
print(USE_CUDA)


def Variable(*args, **kwargs):
    """Wrap the arguments in ``autograd.Variable``, on the GPU when ``USE_CUDA`` is set.

    All positional and keyword arguments are forwarded unchanged to
    ``autograd.Variable``. ``USE_CUDA`` is looked up each time the function
    is called, so a later re-assignment of the flag takes effect.
    """
    wrapped = autograd.Variable(*args, **kwargs)
    if USE_CUDA:
        return wrapped.cuda()
    return wrapped

In [None]:
class QLearner(nn.Module):
    """Convolutional Q-network with an epsilon-greedy action selector.

    The network maps a batch of observations shaped
    ``(batch, *env.observation_space.shape)`` to one Q-value per action
    (``env.action_space.n`` outputs).

    Args:
        env: Environment exposing ``observation_space.shape`` and
            ``action_space.n``.
        num_frames: Stored on the instance; not used by the network itself.
        batch_size: Stored on the instance; not used by the network itself.
        gamma: Stored on the instance; not used by the network itself.
        replay_buffer: Stored on the instance; not used by the network itself.
    """

    def __init__(self, env, num_frames, batch_size, gamma, replay_buffer):
        super(QLearner, self).__init__()

        self.batch_size = batch_size
        self.gamma = gamma
        self.num_frames = num_frames
        self.replay_buffer = replay_buffer
        self.env = env
        self.input_shape = self.env.observation_space.shape
        self.num_actions = self.env.action_space.n

        # Convolutional feature extractor; input channels come from the
        # first entry of the observation shape.
        self.features = nn.Sequential(
            nn.Conv2d(self.input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        # Fully connected head producing one Q-value per action.
        self.fc = nn.Sequential(
            nn.Linear(self.feature_size(), 512),
            nn.ReLU(),
            nn.Linear(512, self.num_actions)
        )

    def forward(self, x):
        """Return Q-values of shape ``(batch, num_actions)`` for a batch ``x``."""
        x = self.features(x)
        x = x.view(x.size(0), -1)  # flatten everything except the batch axis
        return self.fc(x)

    def feature_size(self):
        """Return the flattened size of the conv output for one observation."""
        # Probe with a zero observation on the same device as the conv
        # weights; no gradients are needed for a shape query.
        device = next(self.features.parameters()).device
        with torch.no_grad():
            probe = torch.zeros(1, *self.input_shape, device=device)
            return self.features(probe).view(1, -1).size(1)

    def act(self, state, epsilon):
        """Choose an action epsilon-greedily.

        Args:
            state: A single observation (no batch axis), convertible to a
                float32 array.
            epsilon: Exploration probability. A uniformly random action is
                returned unless ``random.random() > epsilon``.

        Returns:
            int: Index of the chosen action. Both branches return a plain
            Python ``int`` so the result can be passed to the environment
            and stored in a replay buffer without holding a tensor.
        """
        if random.random() > epsilon:
            # Place the input on whatever device the network lives on rather
            # than relying on a global CUDA flag.
            device = next(self.parameters()).device
            batch = torch.as_tensor(np.float32(state), device=device).unsqueeze(0)
            # Action selection is pure inference: do not build a graph.
            with torch.no_grad():
                q_value = self(batch)
            return int(q_value.argmax(dim=1).item())
        return random.randrange(self.env.action_space.n)
        
def compute_td_loss(model, batch_size, gamma, replay_buffer):
    """Compute the one-step temporal-difference loss on a sampled mini-batch.

    Args:
        model: Network mapping a batch of states to per-action Q-values of
            shape ``(batch, num_actions)``.
        batch_size: Number of transitions requested from ``replay_buffer``.
        gamma: Discount factor applied to the bootstrapped next-state value.
        replay_buffer: Object whose ``sample(batch_size)`` returns
            ``(state, action, reward, next_state, done)`` with the batch as
            the leading axis of each field.

    Returns:
        A scalar tensor: the mean squared difference between ``Q(s, a)`` and
        ``r + gamma * max_a' Q(s', a') * (1 - done)``. Gradients flow only
        through ``Q(s, a)``.
    """
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    # Put the batch on the same device as the model's parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    state = torch.as_tensor(np.asarray(state, dtype=np.float32), device=device)
    next_state = torch.as_tensor(np.asarray(next_state, dtype=np.float32), device=device)
    # int() accepts both Python ints and 0-dim tensors stored as actions.
    action = torch.as_tensor([int(a) for a in action], dtype=torch.long, device=device)
    reward = torch.as_tensor(np.asarray(reward, dtype=np.float32), device=device)
    done = torch.as_tensor(np.asarray(done, dtype=np.float32), device=device)

    # Q(s, a) for the actions actually taken. gather keeps the result attached
    # to the autograd graph; rebuilding it via NumPy would cut the graph and
    # leave the model without gradients.
    q_value = model(state)
    current = q_value.gather(1, action.unsqueeze(1)).squeeze(1)

    # The bootstrap target is treated as a constant, uses the per-sample
    # maximum over actions, and is zeroed for terminal transitions.
    with torch.no_grad():
        next_q_value = model(next_state).max(dim=1)[0]
        target = reward + gamma * next_q_value * (1.0 - done)

    # Plain mean squared error: a square root here has an undefined gradient
    # when the error is exactly zero.
    return torch.mean((current - target) ** 2)


class ReplayBuffer(object):
    """Fixed-capacity store of transitions backed by a ``deque``.

    Once ``capacity`` entries are held, appending a new transition drops the
    oldest one.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` transitions."""
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition.

        ``state`` and ``next_state`` get a leading axis of length 1 so that
        sampled transitions can be concatenated into a batch.
        """
        state = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)

        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Args:
            batch_size: Number of transitions to draw; must be at least 1.

        Returns:
            A 5-tuple ``(state, action, reward, next_state, done)`` where
            ``state`` and ``next_state`` are arrays with the batch as the
            leading axis and the other three are lists of length
            ``batch_size``.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        batch = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one sequence per field.
        state, action, reward, next_state, done = zip(*batch)
        return (
            np.concatenate(state),
            list(action),
            list(reward),
            np.concatenate(next_state),
            list(done),
        )

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)




In [None]:
# Scratch check on a fixed 32x6 tensor (presumably a batch of Q-values, one
# row per sample and one column per action -- confirm): print the maximum of
# row 1 using two equivalent ways of indexing that row.
aa = torch.tensor([
    [ 0.0616, -1.5050, -1.2542,  1.0467, -1.6583, -0.1956],
    [ 0.1444, -1.4500, -1.2947,  1.0930, -1.7403, -0.2409],
    [ 0.0342, -1.5723, -1.2624,  1.1574, -1.6852, -0.0689],
    [ 0.0517, -1.5105, -1.2464,  1.1025, -1.6326, -0.1251],
    [ 0.0201, -1.5424, -1.2026,  1.1654, -1.7247, -0.1792],
    [ 0.0877, -1.4870, -1.2158,  1.0617, -1.6393, -0.1004],
    [ 0.0881, -1.4855, -1.2736,  1.1042, -1.5769, -0.1997],
    [ 0.1076, -1.4818, -1.2331,  1.0620, -1.6394, -0.1209],
    [ 0.1458, -1.5549, -1.1906,  1.1144, -1.6783, -0.0942],
    [ 0.0267, -1.5309, -1.3258,  1.0811, -1.6813, -0.1498],
    [ 0.0187, -1.5255, -1.3182,  1.0463, -1.6692, -0.0902],
    [ 0.0642, -1.6106, -1.2788,  1.0466, -1.6537, -0.0168],
    [-0.0481, -1.5233, -1.2429,  1.0540, -1.6389, -0.1518],
    [-0.0229, -1.4506, -1.2753,  0.9652, -1.6470, -0.2397],
    [ 0.0108, -1.5544, -1.1406,  1.1781, -1.7532, -0.1468],
    [ 0.1174, -1.5528, -1.2626,  1.0589, -1.6256, -0.1757],
    [ 0.0499, -1.5252, -1.2493,  1.1104, -1.6591, -0.0693],
    [ 0.0906, -1.5743, -1.2251,  1.0825, -1.6204, -0.0840],
    [ 0.0260, -1.5498, -1.2999,  1.1594, -1.6374, -0.1158],
    [ 0.0567, -1.4924, -1.2527,  1.0692, -1.6544, -0.1253],
    [ 0.0356, -1.5186, -1.3110,  1.1103, -1.7097, -0.0847],
    [ 0.0463, -1.5214, -1.2331,  1.0992, -1.6376, -0.1251],
    [ 0.0372, -1.4996, -1.2329,  1.0936, -1.6099, -0.0969],
    [ 0.0567, -1.5007, -1.2477,  1.0955, -1.6199, -0.1252],
    [ 0.0213, -1.5079, -1.2459,  1.1022, -1.6472, -0.1064],
    [ 0.0600, -1.4948, -1.2267,  1.0692, -1.6599, -0.1107],
    [ 0.0256, -1.5184, -1.2491,  1.1132, -1.6322, -0.1291],
    [ 0.1561, -1.5188, -1.2421,  1.1181, -1.6393, -0.1787],
    [ 0.0355, -1.5439, -1.2369,  1.1085, -1.6272, -0.1101],
    [ 0.0417, -1.4993, -1.3280,  1.0897, -1.6234, -0.1404],
    [ 0.0097, -1.5227, -1.2437,  1.0888, -1.6143, -0.1501],
    [ 0.0409, -1.6244, -1.4683,  1.1526, -1.8225, -0.1008],
])

# aa[1] and aa[1, :] select the same row, so both lines print the same value.
for selected_row in (aa[1], aa[1, :]):
    print(max(selected_row))

# run DQN Pong

In [None]:
from Wrapper.layers import *
from Wrapper.wrappers import make_atari, wrap_deepmind, wrap_pytorch
import math, random
import gym
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd 
import torch.nn.functional as F
# Same check as the earlier USE_CUDA assignment in this file; presumably
# repeated so this section can run without the cells above -- confirm.
USE_CUDA = torch.cuda.is_available()
# from dqn import QLearner, compute_td_loss, ReplayBuffer

In [None]:
# Build the Pong environment by applying the three project wrappers in order:
# make_atari, then wrap_deepmind, then wrap_pytorch.
env_id = "PongNoFrameskip-v4"
env = wrap_pytorch(wrap_deepmind(make_atari(env_id)))

In [None]:
# plot observation of env

In [None]:
# num_frames = 1000000
# batch_size = 32
# gamma = 0.99
    
# replay_initial = 10000
# replay_buffer = ReplayBuffer(100000)
# model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
# optimizer = optim.Adam(model.parameters(), lr=0.00001)
# if USE_CUDA:
#     model = model.cuda()

# epsilon_start = 1.0
# epsilon_final = 0.01
# epsilon_decay = 30000
# epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

# losses = []
# all_rewards = []
# episode_reward = 0

# state = env.reset()

In [None]:
# Smoke test: fill a small replay buffer by stepping the environment with the
# (untrained) model's epsilon-greedy policy.
num_frames = 10
batch_size = 3
gamma = 0.99

epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 30000
epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

# The buffer has to exist before the model is built, because QLearner takes
# it as a constructor argument.
replay_buffer = ReplayBuffer(100)
model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
if USE_CUDA:
    # Keep the model on the same device as the inputs built during act().
    model = model.cuda()

state = env.reset()

for frame_idx in range(100):
    epsilon = epsilon_by_frame(frame_idx)
    action = model.act(state, epsilon)
    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)
    # Advance the rollout; start a new episode once the current one ends.
    state = env.reset() if done else next_state



In [None]:
from collections import namedtuple

# Named view of one replay-buffer entry; the field order matches the tuples
# appended by ReplayBuffer.push.
Transition = namedtuple('Transition',
                        ['state', 'action', 'reward', 'next_state', 'done'])

# Draw raw transition tuples straight from the buffer's deque, then transpose
# them so that each Transition field holds that field for the whole batch.
transitions = random.sample(replay_buffer.buffer, batch_size)
batch = Transition(*zip(*transitions))

In [None]:
# --- Training configuration -------------------------------------------------
num_frames = 1_000_000
batch_size = 32
gamma = 0.99

# NOTE: the buffer capacity equals replay_initial, so len(replay_buffer) can
# reach replay_initial but never exceed it.
replay_initial = 10000
replay_buffer = ReplayBuffer(10000)

# --- Model and optimizer ----------------------------------------------------
model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
optimizer = optim.Adam(model.parameters(), lr=1e-5)
if USE_CUDA:
    model = model.cuda()

# --- Exploration schedule ---------------------------------------------------
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 30000


def epsilon_by_frame(frame_idx):
    """Return epsilon for ``frame_idx``: exponential decay from start to final."""
    span = epsilon_start - epsilon_final
    return epsilon_final + span * math.exp(-1. * frame_idx / epsilon_decay)


# --- Bookkeeping ------------------------------------------------------------
losses, all_rewards = [], []
episode_reward = 0

state = env.reset()

In [None]:
# Main DQN loop: act epsilon-greedily, store the transition, and once the
# replay buffer holds enough data take one optimizer step per frame.
for frame_idx in range(1, num_frames + 1):

    epsilon = epsilon_by_frame(frame_idx)
    action = model.act(state, epsilon)

    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward
    if done:
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0

    # Use >= rather than >: the buffer is a bounded deque, so when its
    # capacity equals replay_initial its length can reach but never exceed
    # that value, and a strict comparison would never start training.
    buffer_ready = len(replay_buffer) >= replay_initial
    if buffer_ready:
        loss = compute_td_loss(model, batch_size, gamma, replay_buffer)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Store a plain float so the history does not keep tensors alive.
        losses.append(loss.item())

    if frame_idx % 10000 == 0:
        if buffer_ready:
            print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses)))
            recent_rewards = all_rewards[-10:]
            # Skip the reward line until at least one episode has finished;
            # the mean of an empty list is NaN and triggers a warning.
            if recent_rewards:
                print('Last-10 average reward: %f' % np.mean(recent_rewards))
        else:
            print('#Frame: %d, preparing replay buffer' % frame_idx)
            print(len(replay_buffer))