# Train DDQN for trailer reversing
Here we instantiate a simulation of the trailer system and train a Double Deep Q-Network (DDQN) agent to control it.

Imports

In [None]:
# Import dependencies
import torch
import numpy as np
from numpy import array
import gym
from collections import namedtuple
from dqn_model import DoubleQLearningModel, ExperienceReplay
from IPython.core.debugger import set_trace
import dqn_model

#Local files. 
from visualize_combination_code import *
from Simulate_combination_code import *
from utility_functions import *

#dqn_model.test_calculate_q_targets(calculate_q_targets)

In [None]:
# Torch device handed to the DDQN model and the replay buffer constructors below.
# CPU should be enough, but feel free to play around with this if you want to.
device = torch.device("cpu")

In [None]:
def eps_greedy_policy(q_values, eps):
    '''
    Build the action distribution of an epsilon-greedy policy.

    Every action receives the exploration mass eps / num_actions; the greedy
    action (first arg-max of the Q-values) additionally receives 1 - eps.

    :param q_values: set of Q-values of shape (num actions,)
    :param eps: probability of taking a uniform random action
    :return: policy of shape (num actions,)
    '''
    num_actions = len(q_values)
    greedy_action = np.argmax(q_values)

    # Uniform exploration share for every action.
    probabilities = np.full(num_actions, eps, dtype=float) / num_actions

    # Remaining mass goes to the greedy action. The evaluation order
    # (p + 1) - eps is kept deliberately so rounding matches exactly.
    probabilities[greedy_action] = probabilities[greedy_action] + 1 - eps

    return probabilities

In [None]:
def calc_q_and_take_action(ddqn, state, eps):
    '''
    Calculate Q-values for current state, and take an action according to an epsilon-greedy policy.
    Inputs:
        ddqn   - DDQN model. An object holding the online / offline Q-networks, and some related methods.
        state  - Current state. Numpy array, shape (1, num_states).
        eps    - Exploration parameter.
    Returns:
        q_online_curr   - Q(s,a) for current state s, as returned by ddqn.online_model.
                          Torch tensor without gradient history, shape (1, num_actions).
        curr_action     - Index of the selected action in [0, num_actions), sampled from
                          the epsilon-greedy policy. Integer.
    '''
    # ddqn.online_model takes a float tensor of shape (batch_size, num_states)
    # and outputs Q-values of shape (batch_size, num_actions).
    state_tensor = torch.as_tensor(state, dtype=torch.float32)

    # This forward pass is only used to pick an action; gradients are computed
    # later on mini-batches sampled from the replay buffer, so skip autograd.
    with torch.no_grad():
        q_online_curr = ddqn.online_model(state_tensor)

    # Flatten the singleton batch dimension to get a (num_actions,) vector.
    q_np = q_online_curr.detach().cpu().numpy().reshape(-1)

    action_probs = eps_greedy_policy(q_np, eps)

    # Sample among as many actions as the network outputs instead of a
    # hard-coded [0, 1]; np.random.choice requires len(a) == len(p).
    curr_action = np.random.choice(len(action_probs), p=action_probs)

    return q_online_curr, curr_action

In [None]:
def calculate_q_targets(q1_batch, q2_batch, r_batch, nonterminal_batch, gamma=.99):
    '''
    Calculates the Double-DQN Q target used for the loss:

        Y = r                                   if s' is terminal
        Y = r + gamma * Q2(s', argmax_a Q1(s', a))   otherwise

    The inputs are left untouched; a new tensor is returned.

    : param q1_batch: Batch of Q(s', a) from online network. FloatTensor, shape (N, num actions)
    : param q2_batch: Batch of Q(s', a) from target network. FloatTensor, shape (N, num actions)
    : param r_batch: Batch of rewards. FloatTensor, shape (N,)
    : param nonterminal_batch: Batch of booleans, with False elements if state s' is terminal and True otherwise. BoolTensor, shape (N,)
    : param gamma: Discount factor, float.
    : return: Q target. FloatTensor, shape (N,)
    '''
    # Action selection uses the online network (no gradient through argmax).
    greedy_actions = torch.argmax(q1_batch.detach(), dim=1, keepdim=True)

    # Action evaluation uses the target network: pick Q2(s', a*) per row.
    next_values = q2_batch.gather(1, greedy_actions).squeeze(1)

    nonterminal_mask = torch.as_tensor(nonterminal_batch, dtype=torch.bool)

    # torch.where builds a fresh tensor, so r_batch is never modified in place
    # (the rewards may be reused by the caller / replay buffer).
    return torch.where(nonterminal_mask, r_batch + gamma * next_values, r_batch)

In [None]:
def sample_batch_and_calculate_loss(ddqn, replay_buffer, batch_size, gamma):
    '''
    Sample mini-batch from replay buffer, and compute the mini-batch loss
    Inputs:
        ddqn          - DDQN model. An object holding the online / offline Q-networks, and some related methods.
        replay_buffer - Replay buffer object (from which samples will be drawn)
        batch_size    - Batch size
        gamma         - Discount factor
    Returns:
        Mini-batch loss, on which .backward() will be called to compute gradient.
    '''
    # Sample a minibatch of transitions from replay buffer
    curr_state, curr_action, reward, next_state, nonterminal = replay_buffer.sample_minibatch(batch_size)

    # ddqn.online_model & ddqn.offline_model are Pytorch modules for online / offline Q-networks, which take the state
    # as input, and output the Q-values for all actions.
    # Input shape (batch_size, num_states). Output shape (batch_size, num_actions).

    # The target is treated as a constant in the loss, so the next-state
    # forward passes do not need an autograd graph.
    with torch.no_grad():
        q_online_next = ddqn.online_model(next_state)
        q_offline_next = ddqn.offline_model(next_state)
        q_target = calculate_q_targets(q_online_next, q_offline_next, reward, nonterminal, gamma=gamma)

    # Only the current-state Q-values of the online network carry gradients.
    q_online_curr = ddqn.online_model(curr_state)

    loss = ddqn.calc_loss(q_online_curr, q_target.detach(), curr_action)

    return loss

In [None]:
###Add a debug copy for troubleshooting
#There is some debug in the per loop printout in the code from the course repository. 

def train_loop_ddqn(ddqn, env, replay_buffer, num_episodes, enable_visualization=False, batch_size=64, gamma=.94,
                    max_episode_steps=200, solved_reward=195):
    '''
    Run the DDQN training loop: interact with the environment using an
    epsilon-greedy policy, store transitions in the replay buffer and take one
    optimizer step per environment step once the buffer holds more than 1000
    samples.

    Inputs:
        ddqn                 - DDQN model. Must provide optimizer and update_target_network(),
                               plus whatever calc_q_and_take_action / sample_batch_and_calculate_loss use.
        env                  - Environment. Used through reset() -> state,
                               step(action) -> (state, reward, done, info) and, optionally, render().
        replay_buffer        - Replay buffer providing add(transition) and buffer_length.
        num_episodes         - Maximum number of episodes to run.
        enable_visualization - If True, env.render() is called every step.
        batch_size           - Mini-batch size for each training step.
        gamma                - Discount factor.
        max_episode_steps    - Step count at which an episode end is treated as a time-limit
                               truncation rather than a terminal state (default 200, as before).
        solved_reward        - Training stops early once the running average of episodic
                               rewards exceeds this value (default 195, as before).
    Returns:
        R_buffer - List with the total (undiscounted) reward of every episode.
        R_avg    - List with the exponential running average of R_buffer.
    '''
    Transition = namedtuple("Transition", ["s", "a", "r", "next_s", "t"])
    eps = 1.          # Initial exploration rate
    eps_end = .1      # Lower bound for the exploration rate
    eps_decay = .001  # Linear decrease of eps per episode
    tau = 1000        # Number of training steps between target-network updates
    cnt_updates = 0
    R_buffer = []
    R_avg = []
    for i in range(num_episodes):
        state = env.reset()  # Initial state
        state = state[None, :]  # Add singleton dimension, to represent as batch of size 1.
        finish_episode = False
        ep_reward = 0  # "Episodic reward", i.e. the total reward for episode, disregarding discount factor.
        steps = 0
        while not finish_episode:
            if enable_visualization:
                env.render()
            steps += 1

            # Take one step in environment. No need to compute gradients,
            # we will just store transition to replay buffer, and later sample a whole batch
            # from the replay buffer to actually take a gradient step.
            _, curr_action = calc_q_and_take_action(ddqn, state, eps)
            new_state, reward, finish_episode, _ = env.step(curr_action)
            new_state = new_state[None, :]

            # Assess whether terminal state was reached.
            # The episode may end due to having reached the step limit, but we should not regard
            # this as reaching the terminal state, and hence not disregard Q(s',a) from the Q target.
            # https://arxiv.org/abs/1712.00378
            nonterminal_to_buffer = (not finish_episode) or steps == max_episode_steps

            # Store experienced transition to replay buffer
            replay_buffer.add(Transition(s=state, a=curr_action, r=reward, next_s=new_state, t=nonterminal_to_buffer))

            state = new_state
            ep_reward += reward

            # If replay buffer contains more than 1000 samples, perform one training step
            if replay_buffer.buffer_length > 1000:
                loss = sample_batch_and_calculate_loss(ddqn, replay_buffer, batch_size, gamma)
                ddqn.optimizer.zero_grad()
                loss.backward()
                ddqn.optimizer.step()

                cnt_updates += 1
                if cnt_updates % tau == 0:
                    ddqn.update_target_network()

        eps = max(eps - eps_decay, eps_end)  # decrease epsilon
        R_buffer.append(ep_reward)

        # Running average of episodic rewards (total reward, disregarding discount factor)
        if R_avg:
            R_avg.append(.05 * ep_reward + .95 * R_avg[-1])
        else:
            R_avg.append(ep_reward)

        if i % 30 == 0:
            print('Episode: {:d}, Total Reward (running avg): {:4.0f}'.format(i, R_avg[-1]))

        # Once the running average exceeds the threshold, the task is considered solved
        if R_avg[-1] > solved_reward:
            break
    return R_buffer, R_avg

Instantiate a system model

In [None]:
## System initial conditions.
# Plain float literals are used here: the np.float alias was removed in
# NumPy 1.24 and raises AttributeError on current versions.
truck_translation = array([18.0, 5.0])
# NOTE(review): rotations are presumably given in degrees - confirm against Simulate_combination.
truck_rotation = 0
first_trailer_rotation = 20
second_trailer_rotation = 20
destination_translation = array([4.5, 5])
destination_rotation = 180
number_trailers = 2
step_size = 1e-2

# Geometry constants. They are not passed to Simulate_combination in this
# cell; kept because other cells or the local modules may read them.
rotation_center_truck = array([2.5, 1])
rotation_center_first_trailer = array([1.5, 1])
rotation_center_second_trailer = array([2.5, 1])

truck_shape = array([7, 2])
first_trailer_shape = array([2, 2])
second_trailer_shape = array([9, 2])

# Create a simulation given the initial conditions provided.
truck = Simulate_combination(
    truck_translation,
    truck_rotation,
    first_trailer_rotation,
    second_trailer_rotation,
    destination_translation,
    destination_rotation,
    number_trailers,
    step_size,
)

Train.

In [None]:
# The environment is the `truck` simulation instantiated above
# (the original gym alternative was: env = gym.make("CartPole-v0")).

# Rendering switch. Does not work in all environments.
enable_visualization = False

# Discrete action set: full turn left, straight, full turn right.
actions = (-1, 0, 1)
num_actions = len(actions)

# State layout: cartesian position x, y and angle relative to the global coordinate system.
initState = (10, 10, 0)
num_states = len(initState)

# Training hyperparameters.
num_episodes = 1200
batch_size = 128
gamma = .94
learning_rate = 1e-4

# Object holding our online / offline Q-Networks.
ddqn = DoubleQLearningModel(device, num_states, num_actions, learning_rate)

# Replay buffer storing experience tuples <s,a,r,s',t> gathered from the
# environment, from which training mini-batches are drawn.
replay_buffer = ExperienceReplay(device, num_states)

# Train.
R, R_avg = train_loop_ddqn(
    ddqn,
    truck,
    replay_buffer,
    num_episodes,
    enable_visualization=enable_visualization,
    batch_size=batch_size,
    gamma=gamma,
)