In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import numpy as np
import random
import gym 
import os
import time
import glob 
import import_ipynb

In [2]:
# Batch-norm hyper-parameters shared by every BatchNorm1d layer in this notebook.
# NOTE(review): torch.nn.BatchNorm1d weights the *new* batch statistic by
# `momentum` (running = (1 - momentum) * running + momentum * batch), so 0.99
# lets the running statistics follow the latest batch almost entirely.
# Confirm this is intended rather than 0.01.
MOMENTUM = 0.99
EPSILON = 1e-6  # added to the variance inside batch norm for numerical stability

# Run on the GPU whenever CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Replay buffer

In [3]:
# The strategy is to append each (S, A, R, S', done) transition to the buffer, dropping the oldest one once the buffer is full.
class ReplayBuffer:
    """Fixed-capacity store of (state, action, reward, next_state, done) transitions.

    Once the capacity ``dim`` is reached, each new transition overwrites the
    oldest stored one in place (ring buffer).  This keeps ``add`` O(1) instead
    of paying for ``list.pop(0)`` on every step.  As a consequence
    ``self.buffer`` is in chronological order only until it fills up for the
    first time; sampling is uniform, so this does not affect training.
    """

    def __init__(self, dim):
        """Create an empty buffer that retains at most ``dim`` transitions."""
        self.buffer = []
        self.dim = dim
        # Index of the oldest stored transition once the buffer is full,
        # i.e. the slot that the next ``add`` will overwrite.
        self._next_idx = 0

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def add(self, state, action, reward, next_state, done):
        """Store one transition, evicting the oldest one when the buffer is full."""
        if self.dim <= 0:
            # A buffer without capacity never retains anything.
            return

        sarsd = (state, action, reward, next_state, done)
        if len(self.buffer) < self.dim:
            # Still filling up: the oldest element stays at index 0.
            self.buffer.append(sarsd)
            return

        # Full: overwrite the oldest slot and advance the write position.
        # Wrapping on len(self.buffer) keeps the index an int even if ``dim``
        # was given as a float.
        self.buffer[self._next_idx] = sarsd
        self._next_idx = (self._next_idx + 1) % len(self.buffer)

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly at random, with replacement.

        Returns:
            Five numpy arrays, each with leading dimension ``batch_size``:
            states, actions, rewards, next states and done flags.
        """
        batch_index = np.random.choice(len(self.buffer), batch_size)
        batch_samples = [self.buffer[idx] for idx in batch_index]
        # Transpose the list of transitions into one sequence per field.
        b_state, b_action, b_reward, b_nstate, b_ndone = zip(*batch_samples)
        return (np.array(b_state), np.array(b_action), np.array(b_reward),
                np.array(b_nstate), np.array(b_ndone))

In [4]:
def play_and_record(start_state, agent, env, exp_replay, n_steps=1):
    """Let ``agent`` act in ``env`` for ``n_steps`` steps and log every transition.

    Starting from ``start_state``, each step asks the agent for q-values,
    samples an action from them, applies it to the environment and stores the
    resulting (state, action, reward, next_state, done) tuple in
    ``exp_replay``.  When an episode ends the environment is reset and play
    continues from the fresh state.

    Returns:
        (sum of the rewards collected, state the environment is left in)
    """
    state = start_state
    total_reward = 0

    for _ in range(n_steps):
        # The agent works on batches, so wrap the single state in a list.
        action = agent.sample_actions(agent.get_qvalues([state]))[0]

        # env.step yields (next_state, reward, done, info); info is unused.
        next_state, reward, done, _ = env.step(action)

        total_reward += reward
        exp_replay.add(state, action, reward, next_state, done)

        # Start a new episode as soon as the current one terminates.
        state = env.reset() if done else next_state

    return total_reward, state

## Dense Neural network

In [5]:
"""
This class defines one building block (a dense layer) of the neural network.
"""
class Dense(nn.Module):
    """One fully connected block: linear layer, optional batch norm, optional ReLU.

    Args:
        cin: number of input features.
        cout: number of output features.
        batch_norm: if True, apply ``nn.BatchNorm1d`` after the linear layer.
        activate: if True, finish with a ReLU; if False the output is linear.

    The linear layer keeps PyTorch's default weight initialisation.
    """

    def __init__(self, cin, cout, batch_norm=False, activate=True):
        super().__init__()
        self.cin = cin
        self.cout = cout
        self.activate = activate

        self.linear = nn.Linear(self.cin, self.cout)
        if batch_norm:
            self.bn = nn.BatchNorm1d(cout, eps=EPSILON, momentum=MOMENTUM)
        else:
            self.bn = None

    def forward(self, x):
        """Apply linear -> (batch norm) -> (ReLU) to ``x`` and return the result."""
        x = self.linear(x)
        if self.bn is not None:
            # Batch statistics are undefined for a single sample, so batch norm
            # is skipped for a batch of one *while training*.  In eval mode the
            # layer normalises with its running statistics, which is valid for
            # any batch size, so it must still be applied there to keep
            # single-state inference consistent with batched inference.
            single_sample_training = self.training and x.shape[0] == 1
            if not single_sample_training:
                x = self.bn(x)
        if self.activate:
            x = torch.relu(x)
        return x

In [6]:
class DenseNet(nn.Module):
    """Feed-forward network assembled from ``Dense`` blocks.

    ``config.num_hiddens`` lists the layer widths from input to output.
    Every block except the last ends with a ReLU; the last block is linear.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        widths = config.num_hiddens

        # An input batch-norm layer is created (and therefore part of the
        # module's parameters) but forward() deliberately does not apply it.
        self.bn = nn.BatchNorm1d(widths[0], eps=EPSILON, momentum=MOMENTUM)

        # Hidden blocks connect widths[i-1] -> widths[i] for i = 1 .. len-2.
        blocks = []
        for i in range(1, len(widths) - 1):
            blocks.append(Dense(widths[i - 1], widths[i]))
        # Output block: no activation.
        blocks.append(Dense(widths[-2], widths[-1], activate=False))

        self.layers = nn.Sequential(*blocks)

    def forward(self, x):
        """Pass ``x`` through the stacked blocks; the input batch norm is bypassed."""
        return self.layers(x)

## Loss function

In [7]:
## Input: a batch of (S, A, R, S', done) transitions

def compute_td_loss(agent, target_network, states, actions, rewards, next_states, done_flags,
                    gamma=0.99, device=device):
    """Mean squared one-step TD error of ``agent`` against ``target_network``.

    The regression target for each transition is

        reward + gamma * max_a' Q_target(next_state, a') * (1 - done)

    and the loss is the mean over the batch of
    (Q_agent(state, action) - target) ** 2.

    Args:
        agent: network being trained; called as ``agent(states)``.
        target_network: network providing the bootstrap values; called as
            ``target_network(next_states)``.
        states, actions, rewards, next_states, done_flags: one batch of
            transitions (numpy arrays, lists or tensors).  ``actions`` are
            used as integer indices into the agent's output along dim 1.
        gamma: discount factor.
        device: device the batch tensors are placed on.

    Returns:
        Scalar loss tensor.  Gradients flow only through ``agent``.
    """
    # Move the batch onto the requested device with the dtypes the loss needs.
    states = torch.as_tensor(states, device=device, dtype=torch.float)
    actions = torch.as_tensor(actions, device=device, dtype=torch.long)
    rewards = torch.as_tensor(rewards, device=device, dtype=torch.float)
    next_states = torch.as_tensor(next_states, device=device, dtype=torch.float)
    # as_tensor performs the bool -> float cast itself, so done_flags does not
    # have to be a numpy array.
    done_flags = torch.as_tensor(done_flags, device=device, dtype=torch.float)

    # Q-values of every action in the current states (agent network);
    # presumably (batch, n_actions) - the original notes mention 32x2.
    predicted_qvalues = agent(states)

    # Keep only the q-value of the action that was actually taken.
    predicted_qvalues_for_actions = predicted_qvalues[range(len(actions)), actions]

    # The target is a constant for the optimiser: compute it without building
    # an autograd graph through the target network.
    with torch.no_grad():
        predicted_next_qvalues = target_network(next_states)
        next_state_values, _ = torch.max(predicted_next_qvalues, dim=1)
        # Terminal transitions (done == 1) do not bootstrap from the next state.
        target_qvalues_for_actions = rewards + gamma * next_state_values * (1 - done_flags)

    loss = torch.mean((predicted_qvalues_for_actions - target_qvalues_for_actions) ** 2)

    return loss