In [5]:

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from collections import deque
import random
import gym
import matplotlib
import matplotlib.pyplot as plt

# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [6]:
class DQN(nn.Module):

    def __init__(self, n_observations, n_actions,epsilon):
        super(DQN, self).__init__()
        self.nA = n_actions
        self.epsilon = epsilon
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)
        
    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)
    
    def action(self,state):        
        A = np.ones(self.nA, dtype=float) * (self.epsilon/self.nA)
        temp = self.forward(state)
        best_action = temp.max(1)[1].data[0].item()
        A[best_action] += 1.0 - self.epsilon
        return A


In [7]:
class replay_buffer(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = deque(maxlen=self.capacity)

    def storeX(self, state,action,GValue):
        state = state
        GValue = GValue
        action = action
        self.memory.append([state,action,GValue])
        
    def store(self, n_tuple,state,action):
        self.memory.append([state,action,n_tuple])
#         state = state
#         action = action
#         reward = reward
#         done = done
#         b_prob = b_prob
#         sigma = sigma
#         self.memory.append([state,action,reward,done,b_prob,sigma])

    def clear(self):
        self.memory.clear()
        
    def sample(self, batch_size):
        batch = random.sample(self.memory, batch_size)
        state,action, n_tuple= zip(* batch)
        return np.concatenate(state, 0),action, n_tuple

    def __len__(self):
        return len(self.memory)


In [22]:
def train(b_net,pi_net,buffer,optimizer,batch_size,count,soft_update,gamma):
    if len(buffer) < batch_size:
        return
    
    state,action, n_tuple = buffer.sample(batch_size)
    stateQ = torch.tensor(state, dtype=torch.float32, device=device)
    actionQ = torch.tensor(action, dtype=torch.long, device=device)
    storedG = []
    for tuples in n_tuple:
        state_t1 = tuples[0][0]
        action_t1 = tuples[0][1]
        q_t1 = pi_net.forward(state_t1).detach().squeeze(0)
        G = q_t1[action_t1]
        for state,action,reward,b_prob,sigma,done in tuples:
            if done:
                G = reward
            else:

                s_k = state
                a_k = action
                r_k = reward
                bProb_k = b_prob
                sigma = sigma

                pi_prob = pi_net.action(s_k)
                action = np.random.choice(actions,p=pi_prob)
                piProb_k = pi_prob[action]
                pho_k = piProb_k / bProb_k
                q_k = pi_net.forward(s_k).detach().squeeze(0)

                VBar = np.sum([(pi_prob[a]) * q_k[a] for a in range(action_dim)])
                G = r_k + gamma * ((sigma * pho_k) + ((1-sigma) * pi_prob[a_k])) * (G - q_k[a_k])+ gamma * VBar
        storedG.append(G)
    
    storedG = torch.tensor(storedG, dtype=torch.float32, device=device)
    
    q_values = b_net.forward(stateQ)
    q_value = q_values.gather(1, actionQ.unsqueeze(1)).squeeze(1)

    loss = (storedG - q_value).pow(2)
    print(loss)
    loss = loss.mean()
    print(loss)
    print("spacing")
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if count % soft_update == 0:
        pi_net.load_state_dict(b_net.state_dict())

In [23]:
def chooseSigma():
     return np.random.uniform(0, 1)

In [24]:
episode_durations = []
def plot_durations(show_result=False):
    plt.figure(1)
    durations_t = torch.tensor(episode_durations, dtype=torch.float)
    if show_result:
        plt.title('Result')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())
    # Take 100 episode averages and plot them too
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())

In [25]:
gamma = 0.99
learning_rate = 1e-3
batch_size = 5
soft_update = 50
capacity = 10000
exploration = 10

episode = 600
n_step = 4
render = False
count = 0

buffer = replay_buffer(1000)

env = gym.make('CartPole-v1')
observation_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
actions = list(range(0,env.action_space.n))

b_epsilon = 0.3
pi_epsilon = 0.1
b_net = DQN(observation_dim,action_dim,b_epsilon).to(device)
pi_net = DQN(observation_dim,action_dim,pi_epsilon).to(device)
b_net.load_state_dict(pi_net.state_dict())

optimizer = torch.optim.Adam(b_net.parameters(), lr=learning_rate)
for i in range(episode):
    
    T = np.inf
    t = 0
    tau = 0
    stored_actions = {}
    stored_states = {}
    stored_rewards = {}
    stored_bProb = {}
    stored_sigma={}
    stored_G = {}
    stored_done = {}
    
    state = env.reset()
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
    b_prob = b_net.action(state)
    pi_prob = pi_net.action(state)
    action = np.random.choice(actions,p=b_prob)
    b_actionProb = b_prob[action]

    stored_actions[0] = action
    stored_states[0] = state
    stored_bProb[0] = b_actionProb
    while True:
    
        count = count + 1
        
        if t < T:
            state, reward, done, info = env.step(action)
            state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
            stored_states[t+1] = state
            stored_rewards[t+1] = reward
            stored_done[t+1] = done
            if done:
                print(T)
                T = t + 1
                episode_durations.append(T)

                
            else:
                b_prob = b_net.action(state)

                
                action = np.random.choice(actions,p=b_prob)
                b_actionProb = b_prob[action]
                sigma = chooseSigma()
                
                stored_actions[t+1] = action
                stored_sigma[t+1] = sigma
                stored_bProb[t+1] = b_actionProb
                
        tau = t - n_step + 1
        if tau >= 0:
            
            n_tuple = []
            for k in range(min(t+1, T), tau, -1):
                
                s_k = stored_states[k]
                done_k = stored_done[k]
                r_k = stored_rewards[k]
                if k == T:
                    a_k = 0
                    bProb_k = 0
                    sigma_k = 0
                    
                else:
                    a_k = stored_actions[k]
                    bProb_k = stored_bProb[k]
                    sigma_k = stored_sigma[k]
                n_tuple.append([s_k,a_k,r_k,bProb_k,sigma_k,done_k])      
            s_tau= stored_states[tau]
            a_tau = stored_actions[tau]
            buffer.store(n_tuple,s_tau,a_tau)
        
        if i > exploration:
            train(b_net,pi_net,buffer,optimizer,batch_size,count,soft_update,gamma )
        if tau == (T-1):
            
            break
        else:
            t = t + 1

print('Complete')
plot_durations(show_result=True)

plt.savefig('results.png')
plt.show()


inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
tensor([  2.7005,  16.1784, 362.8735, 244.2822, 964.0118],
       grad_fn=<PowBackward0>)
tensor(318.0093, grad_fn=<MeanBackward0>)
spacing
tensor([16.2058,  1.9479,  3.5736,  2.4181,  0.2577], grad_fn=<PowBackward0>)
tensor(4.8806, grad_fn=<MeanBackward0>)
spacing
tensor([16.1952, 18.1836, 24.0917, 14.4646, 15.8485], grad_fn=<PowBackward0>)
tensor(17.7567, grad_fn=<MeanBackward0>)
spacing
tensor([ 12.5340, 101.6134, 949.1942,  16.5151,  17.4255],
       grad_fn=<PowBackward0>)
tensor(219.4565, grad_fn=<MeanBackward0>)
spacing
tensor([ 5.2263, 15.1749, 16.7740, 14.9756,  0.5697], grad_fn=<PowBackward0>)
tensor(10.5441, grad_fn=<MeanBackward0>)
spacing
tensor([  3.1067, 346.1738,   8.7291,   1.4976,  11.9913],
       grad_fn=<PowBackward0>)
tensor(74.2997, grad_fn=<MeanBackward0>)
spacing
tensor([5.1685e-01, 1.3108e+01, 1.6342e+01, 4.7521e+01, 7.2564e+02],
       grad_fn=<PowBackward0>)
tensor(160.6255, grad_fn=<MeanBackward0>)
spacing
tensor(

tensor([656.7851, 116.9909,   2.6423,   2.6362,   2.8035],
       grad_fn=<PowBackward0>)
tensor(156.3716, grad_fn=<MeanBackward0>)
spacing
tensor([  3.4539,  15.1685,   3.9466, 148.0540, 342.7986],
       grad_fn=<PowBackward0>)
tensor(102.6843, grad_fn=<MeanBackward0>)
spacing
tensor([  2.1634, 183.7885,   0.5781,  16.5005,  14.3755],
       grad_fn=<PowBackward0>)
tensor(43.4812, grad_fn=<MeanBackward0>)
spacing
inf
tensor([158.7585,   1.0193,   2.1856,   1.9441,   2.9111],
       grad_fn=<PowBackward0>)
tensor(33.3637, grad_fn=<MeanBackward0>)
spacing
tensor([ 2.6282,  1.2751, 10.8410, 30.4160, 13.9951], grad_fn=<PowBackward0>)
tensor(11.8311, grad_fn=<MeanBackward0>)
spacing
tensor([159.8011,   3.6611,  77.3692, 176.3230,  41.1297],
       grad_fn=<PowBackward0>)
tensor(91.6568, grad_fn=<MeanBackward0>)
spacing
tensor([139.3149,   9.5836,  45.7509,  17.6863,   4.7586],
       grad_fn=<PowBackward0>)
tensor(43.4189, grad_fn=<MeanBackward0>)
spacing
tensor([9.2301e-02, 8.3433e+01, 2

tensor([ 7.0305,  0.1625, 81.8707, 71.2079,  0.6093], grad_fn=<PowBackward0>)
tensor(32.1762, grad_fn=<MeanBackward0>)
spacing
tensor([  0.5286,   2.9534,  87.6107,  28.9664, 236.3484],
       grad_fn=<PowBackward0>)
tensor(71.2815, grad_fn=<MeanBackward0>)
spacing
tensor([7.6311e+00, 6.3092e+01, 7.1292e+01, 2.7744e+03, 1.5662e+00],
       grad_fn=<PowBackward0>)
tensor(583.5983, grad_fn=<MeanBackward0>)
spacing
tensor([ 1.2309, 77.2012,  0.2146,  2.9742, 64.5893], grad_fn=<PowBackward0>)
tensor(29.2420, grad_fn=<MeanBackward0>)
spacing
tensor([8.2141e+01, 8.5742e+01, 2.0413e-03, 7.7662e+01, 2.1675e+00],
       grad_fn=<PowBackward0>)
tensor(49.5429, grad_fn=<MeanBackward0>)
spacing
tensor([1.4458e-02, 1.9675e+00, 4.9388e+01, 7.0054e+01, 6.3158e+01],
       grad_fn=<PowBackward0>)
tensor(36.9165, grad_fn=<MeanBackward0>)
spacing
inf
tensor([  0.2063,   1.3091,   0.7659,  58.1733, 180.9317],
       grad_fn=<PowBackward0>)
tensor(48.2772, grad_fn=<MeanBackward0>)
spacing


KeyboardInterrupt: 