# Train DDQN for trailer reversing
Here we instantiate a simulation of the trailer system, and train a DDQN agent to control it.  

Imports

In [None]:
# Import dependencies
import torch
import numpy as np
from numpy import array
import gym
from collections import namedtuple
from dqn_model import DoubleQLearningModel, ExperienceReplay
from IPython.core.debugger import set_trace
import dqn_model
from bicycleEnv import *

#Local files. 
from visualize_combination_code import *
from Simulate_combination_code import *
from utility_functions import *

#import DDQNhelpers
#from DDQNhelpers import *
#dqn_model.test_calculate_q_targets(calculate_q_targets)

In [None]:
# Torch device handle that is passed to the DDQN model and the replay buffer further down.
# CPU should be enough, but feel free to play around with this if you want to.
device = torch.device("cpu")

# Environment
Define the environment that the agent will learn to control.

In [None]:
# --- System parameters and initial conditions ---

# Length from the rear axis to the front axis.
L = 2

# Sample interval in seconds.
Ts = 0.01

# Initial state: position x, position y, and heading.
initState = (0, 0, 0)

# The truck is modelled by the bicycle environment for now.
truck = BicycleEnv(L, Ts, initState)

# Actions
The DDQN framework and all helper functions here assume a structure where actions are defined as integers 

$a_i \in \{0,1,\ldots , N_a-1  \}$

This appears to make array operations within the framework simpler and more lightweight. These actions are then mapped to something useful the agent can do, in this case a tuple containing velocity and steering angle:

$a_i \mapsto \langle v_i, \delta_i \rangle$

The map is contained within the environment. 



In [None]:
# Look up one entry (index 2) of the environment's action map and print the
# velocity and steering angle stored in it.
result = truck.action_map[2]
# NOTE(review): `lala` is not read by any other cell visible here; presumably a leftover, confirm before removing.
lala = truck.action_map[2].vel
print("Vel:" ,result.vel,"Steering:", result.steeringRad)

Instantiate a system model and run a simulation for a sanity check. 

In [None]:

# Trajectory log of the sanity-check run: one entry is appended per simulation step.
truck_pos_x = []
truck_pos_y = []
truck_angle = []

# Index into truck.action_map; the same action is applied at every step.
action = 2

print("Simulation for velocity", truck.action_map[action].vel, "m/s and steering", np.rad2deg(truck.action_map[action].steeringRad)   )

# Built-in int is used instead of np.int, a deprecated alias that newer NumPy releases no longer provide.
for step_number in range(int(1e3)):
    # Only the returned state is needed here; the other two return values are ignored.
    state, _, _ = truck.step(action)

    # The state is logged as (x, y, heading), matching initState.
    truck_pos_x.append(state[0])
    truck_pos_y.append(state[1])
    truck_angle.append(state[2])

# Reset the environment before it is used for training the network.
truck.reset()



Animate data for a sanity check

In [None]:
from animateRectangle import* 

%matplotlib notebook 

#The previous code was put in a separate class, try it here. 
#%matplotlib tk 
# Small square figure that hosts the animation of the logged trajectory.
fig = plt.figure()
fig.set_dpi(100)
fig.set_size_inches(3, 3)
B = 0  # NOTE(review): the meaning of B is defined by animateRectangle, confirm there.

rectAnim = animateRectangle(fig, B, L, truck_pos_x, truck_pos_y, truck_angle)

# Ts is in seconds, so Ts*1000 is the sample interval in milliseconds
# (presumably the frame interval expected by animate; confirm).
anim = rectAnim.animate(Ts*1000)

# Enable the grid before showing the figure, so it is also applied when
# plt.show() blocks until the window is closed.
plt.grid()
plt.show()

In [None]:
def calculate_q_targets(q1_batch, q2_batch, r_batch, nonterminal_batch, gamma=.99):
    '''
    Calculates the Double-DQN Q target used for the loss.

    The greedy next action is selected with the online network (q1_batch) and
    its value is read from the target network (q2_batch):

        Y = r                                          if s' is terminal
        Y = r + gamma * Q2(s', argmax_a Q1(s', a))     otherwise

    : param q1_batch: Batch of Q(s', a) from online network. FloatTensor, shape (N, num actions)
    : param q2_batch: Batch of Q(s', a) from target network. FloatTensor, shape (N, num actions)
    : param r_batch: Batch of rewards. FloatTensor, shape (N,). Not modified by this function.
    : param nonterminal_batch: Batch of booleans, with False elements if state s' is terminal and True otherwise. BoolTensor, shape (N,)
    : param gamma: Discount factor, float.
    : return: Q target. FloatTensor, shape (N,). A new tensor without autograd history.
    '''
    # Action selection: index of the largest online Q-value in each row.
    greedy_actions = q1_batch.detach().argmax(dim=1)

    # Action evaluation: pick, per row, the target-network value of that action.
    next_values = q2_batch.detach().gather(1, greedy_actions.unsqueeze(1)).squeeze(1)

    # Work on a copy: assigning into r_batch itself would silently overwrite
    # the caller's reward batch.
    q_targets = r_batch.detach().clone()

    # Add the discounted value for all states that aren't terminal.
    q_targets[nonterminal_batch] = q_targets[nonterminal_batch] + gamma * next_values[nonterminal_batch]

    return q_targets

In [None]:

def eps_greedy_policy(q_values, eps):
    '''
    Builds the action distribution of an epsilon-greedy policy.

    Every action receives probability eps / (num actions); the action with the
    largest Q-value additionally receives the remaining 1 - eps.

    :param q_values: Q-values, one per action, shape (num actions,)
    :param eps: probability of taking a uniform random action
    :return: action probabilities, shape (num actions,)
    '''
    greedy_action = np.argmax(q_values)
    num_actions = len(q_values)

    # Exploration mass, shared uniformly by all actions.
    action_probs = np.ones(num_actions) * eps / num_actions

    # Exploitation mass goes on top of the greedy action's share.
    action_probs[greedy_action] = action_probs[greedy_action] + 1 - eps

    return action_probs

def calc_q_and_take_action(ddqn, state, eps):
    '''
    Calculate Q-values for current state, and take an action according to an epsilon-greedy policy.
    Inputs:
        ddqn   - DDQN model. An object holding the online / offline Q-networks, and some related methods.
        state  - Current state. Numpy array, shape (1, num_states).
        eps    - Exploration parameter (probability of a uniformly random action).
    Returns:
        q_online_curr   - Q(s,a) for current state s, as returned by ddqn.online_model.
                          Tensor without autograd history; presumably shape (1, num_actions).
        curr_action     - Selected action index in 0..num_actions-1, sampled from the
                          epsilon-greedy policy. Integer.
    '''
    # FYI:
    # ddqn.online_model & ddqn.offline_model
    # are Pytorch modules for online / offline Q-networks, which take the state as input,
    # and output the Q-values for all actions.
    # Input shape (batch_size, num_states). Output shape (batch_size, num_actions).

    # The network expects a float tensor; the constructor copies the incoming state.
    state_tensor = torch.FloatTensor(state)

    # Acting only needs the forward pass. Gradients are computed later on
    # mini-batches sampled from the replay buffer, so no graph is built here.
    with torch.no_grad():
        q_online_curr = ddqn.online_model(state_tensor)

    # Cast to numpy and remove the extra (batch) dimension.
    q_np = q_online_curr.detach().numpy().squeeze()

    action_probs = eps_greedy_policy(q_np, eps)

    # The actions are the indexes 0,...,num_actions-1, exactly as many as the Q-values.
    curr_action = np.random.choice(len(q_np), p=action_probs)

    return q_online_curr, curr_action


In [None]:
def sample_batch_and_calculate_loss(ddqn, replay_buffer, batch_size, gamma):
    '''
    Sample mini-batch from replay buffer, and compute the mini-batch loss
    Inputs:
        ddqn          - DDQN model. An object holding the online / offline Q-networks, and some related methods.
        replay_buffer - Replay buffer object (from which samples will be drawn)
        batch_size    - Batch size
        gamma         - Discount factor
    Returns:
        Mini-batch loss, on which .backward() will be called to compute gradient.
    '''
    # Sample a minibatch of transitions from replay buffer
    curr_state, curr_action, reward, next_state, nonterminal = replay_buffer.sample_minibatch(batch_size)

    # FYI:
    # ddqn.online_model & ddqn.offline_model are Pytorch modules for online / offline Q-networks, which take the state
    # as input, and output the Q-values for all actions.
    # Input shape (batch_size, num_states). Output shape (batch_size, num_actions).

    # The Q target is treated as a constant in the loss, so the next-state
    # forward passes and the target computation do not need an autograd graph.
    with torch.no_grad():
        q_online_next = ddqn.online_model(next_state)
        q_offline_next = ddqn.offline_model(next_state)
        q_target = calculate_q_targets(q_online_next, q_offline_next, reward, nonterminal, gamma)

    # Only this forward pass must be differentiable: it is the prediction being trained.
    q_online_curr = ddqn.online_model(curr_state)

    loss = ddqn.calc_loss(q_online_curr, q_target.detach(), curr_action)

    return loss

In [None]:
def train_loop_ddqn( env, ddqn, replay_buffer, num_episodes, enable_visualization=False, batch_size=64, gamma=.94):
    '''
    Train the DDQN agent on the environment with an epsilon-greedy behaviour policy.

    Inputs:
        env                  - Environment. reset() returns the initial state and step(action)
                               returns (new_state, reward, finish_episode).
        ddqn                 - DDQN model holding the online / offline Q-networks and the optimizer.
        replay_buffer        - Replay buffer that stores transitions and serves mini-batches.
        num_episodes         - Maximum number of episodes to run.
        enable_visualization - If True, env.render() is called at every step.
        batch_size           - Mini-batch size for each training step.
        gamma                - Discount factor.
    Returns:
        R_buffer    - List with the total (undiscounted) reward of every episode.
        R_avg       - List with the running average of the episodic rewards.
        trainingLog - List with one episodeLog namedtuple (Px, Py, angleRad, action) per episode.
    '''
    Transition = namedtuple("Transition", ["s", "a", "r", "next_s", "t"])
    episodeLogTuple = namedtuple("episodeLog", ["Px", "Py","angleRad", "action"])
    trainingLog = []

    # Probability of taking a random action: starts at eps, decreases by
    # eps_decay after every episode, and never drops below eps_end.
    eps = 1.
    eps_end = 0.1
    eps_decay = .001
    tau = 1000       # Number of gradient updates between two target-network updates.
    maxSteps = 200   # Upper limit on the number of steps in one episode.
    cnt_updates = 0
    R_buffer = []
    R_avg = []
    for i in range(num_episodes):
        state = env.reset() # Initial state
        state = state[None,:] # Add singleton dimension, to represent as batch of size 1.
        finish_episode = False # Initialize
        ep_reward = 0 # Initialize "Episodic reward", i.e. the total reward for episode, when disregarding discount factor.
        steps = 0

        episodeLog = episodeLogTuple(Px=[], Py=[], angleRad=[], action=[])

        while not finish_episode and steps < maxSteps:

            if enable_visualization:
                env.render() # disable enable_visualization if you cannot render the environment on your system
            steps += 1

            # Take one step in environment. No need to compute gradients,
            # we will just store transition to replay buffer, and later sample a whole batch
            # from the replay buffer to actually take a gradient step.
            _, curr_action = calc_q_and_take_action(ddqn, state, eps)

            new_state, reward, finish_episode = env.step(curr_action) # take one step in the environment

            # Log the pose reached and the action that led to it.
            episodeLog.Px.append(new_state[0])
            episodeLog.Py.append(new_state[1])
            episodeLog.angleRad.append(new_state[2])
            episodeLog.action.append(curr_action)

            new_state = new_state[None,:]

            # Assess whether terminal state was reached.
            # The episode may end due to having reached maxSteps, but that is a time limit and not a
            # terminal state, so Q(s',a) must not be dropped from the Q target in that case.
            # https://arxiv.org/abs/1712.00378
            nonterminal_to_buffer = (not finish_episode) or steps == maxSteps

            # Store experienced transition to replay buffer
            replay_buffer.add(Transition(s=state, a=curr_action, r=reward, next_s=new_state, t=nonterminal_to_buffer))

            state = new_state
            ep_reward += reward

            # If replay buffer contains more than 1000 samples, perform one training step
            if replay_buffer.buffer_length > 1000:

                loss = sample_batch_and_calculate_loss(ddqn, replay_buffer, batch_size, gamma)
                ddqn.optimizer.zero_grad()
                loss.backward()
                ddqn.optimizer.step()

                cnt_updates += 1
                if cnt_updates % tau == 0:
                    ddqn.update_target_network()
        #########################
        ## End of episode
        eps = max(eps - eps_decay, eps_end) # decrease epsilon
        R_buffer.append(ep_reward)

        trainingLog.append(episodeLog)

        # Running average of episodic rewards (total reward, disregarding discount factor)
        if i > 0:
            R_avg.append(.05 * R_buffer[i] + .95 * R_avg[i-1])
        else:
            R_avg.append(R_buffer[i])

        print('Episode: {:d}, Total Reward (running avg): {:4.0f}'.format( i, R_avg[-1]))

        # If running average > 195 (close to 200), the task is considered solved.
        # NOTE(review): this threshold presumably stems from a 200-step benchmark; confirm it suits this reward.
        # The early exit returns the same three values as the normal exit, so callers can always unpack three.
        if R_avg[-1] > 195:
            return R_buffer, R_avg, trainingLog
    return R_buffer, R_avg, trainingLog

Train.

In [None]:
# The environment `truck` was created above and is reused here.

# Rendering during training is switched off; it does not work in all environments.
enable_visualization = False

# Problem dimensions.
# TODO: the number of actions is hardcoded for now, do something more fancy later.
num_actions = 3
num_states = len(truck.initState)

# Training hyperparameters.
num_episodes = 5
batch_size = 128
gamma = .94
learning_rate = 1e-4

# Object holding our online / offline Q-networks.
ddqn = DoubleQLearningModel(device, num_states, num_actions, learning_rate)

# Replay buffer storing the experience tuples <s,a,r,s',t> gathered from the
# environment; mini-batches for training are sampled from it.
replay_buffer = ExperienceReplay(device, num_states)

# Train.
R, R_avg, trainingLog = train_loop_ddqn(
    truck,
    ddqn,
    replay_buffer,
    num_episodes,
    enable_visualization=enable_visualization,
    batch_size=batch_size,
    gamma=gamma,
)

In [None]:
# Bare expression: the notebook displays the function object, which only confirms that calculate_q_targets is defined.
calculate_q_targets


In [None]:
#for episode in train_log: 
%matplotlib auto

#print(type(episode))
#plt.plot( episode[pos_x], episode[pos_y] )

def printEpisode(episode, i ):
    '''
    Plot one logged episode with matplotlib's pyplot interface.

    :param episode: episode log with the fields Px and Py (logged x and y positions)
    :param i: episode index, used in the legend label
    '''
    xs, ys = episode.Px, episode.Py

    # Start point: blue circle.
    plt.plot(xs[0], ys[0], 'bo')

    # Whole trajectory, labelled with the episode index; refresh the legend.
    plt.plot(xs, ys, label="Episode " + str(i))
    plt.gca().legend()

    # End point: black cross.
    plt.plot(xs[-1], ys[-1], 'kx')
    
    
    
# Overlay every logged episode in one plot.
for i, episode in enumerate(trainingLog):
    printEpisode(episode, i)

# Open a new, empty figure for any plots that follow.
plt.figure()

#for episode in train_log: 
#plt.plot(episode.steering_angle)
    