State: 1-D array of image + coordinates of previous bbox
Action: Left, None, Right (encoded in the code as 0, 1 and 2 respectively)

In this second phase, we pass a 1-D array of the image, in which one cell indicates the position of the object. Along with this, we include the coordinates (in this case, just the position along the 1-D array) of the previous bounding box in the state. 

Thus, we have a very simple network of this form:


       i1             
                         left_prob
       i2       
                         none_prob
       ..       
                         right_prob
       i10       
       
       prev_bbox
       
Each of the inputs is connected to each of the outputs. The outputs consist of the probabilities of going left, staying at the same position, and going right (in the code below these are sigmoid-squashed Q-values, one per action). The chosen action then gives the direction in which the previous bounding box should move in order to coincide with the current actual bounding box.

This method trains on trajectories for which labels for only the first and the last frame are provided. After each episode terminates, the reward is given by +1 if the final bounding box coincides with the actual bounding box, and -1 otherwise. For all intermediate steps, which do not have a label associated with them, the reward is set to 0.

In [1]:
import numpy as np

In [2]:
# Q-learning hyper-parameters.
alpha = 0.01    # step size applied to every weight update
gamma = 0.9     # discount factor applied to the next state's Q-value
epsilon = 0.1   # threshold compared against a uniform draw when selecting actions

# Environment dimensions.
num_actions = 3      # 0 = move left, 1 = stay, 2 = move right
num_positions = 10   # number of cells in the 1-D image

In [3]:
# W is the weight matrix of the single-layer network: one row per input
# (num_positions image cells plus the previous-box coordinate) and one column
# per action. For a fresh run it would be initialised randomly instead:
#W = np.random.rand(num_positions + 1, num_actions)

# Resume from the weights stored in '7000.npy' (the training loop below starts
# counting at iteration 7000 and writes files named '<iteration>.npy').
W = np.load('7000.npy')

Generate random trajectories and perform Q-learning (training resumes at iteration 7000 from the saved weights and runs up to iteration 100000)

In [None]:
def feedforward(s, W):
    """Evaluate the single-layer network on state ``s``.

    Computes the linear map ``s . W`` and squashes every entry through the
    logistic sigmoid, giving one value in (0, 1) per column of ``W``.
    """
    pre_activation = np.dot(s, W)
    denominator = 1.0 + np.exp(-1.0 * pre_activation)
    return 1.0 / denominator

def epsilon_greedy(actions, eps=None):
    """Select an action index with an epsilon-greedy policy.

    With probability ``eps`` a uniformly random action is explored; otherwise
    the action with the largest value is exploited.

    Args:
        actions: action values, one entry per action. A (1, n) row such as an
            ``np.matrix`` result is accepted and treated as n actions.
        eps: exploration probability. When omitted, the module-level
            ``epsilon`` is used.

    Returns:
        Integer index of the selected action.
    """
    if eps is None:
        eps = epsilon
    # Flatten first: len() of a (1, n) matrix is 1, which would restrict the
    # random draw to action 0.
    values = np.asarray(actions).ravel()
    # Explore on the rare branch (probability eps), exploit on the common one.
    if np.random.rand() < eps:
        return np.random.randint(0, len(values))
    return np.argmax(values)

def backpropagate(gradients):
    """Apply one descent step ``W <- W - alpha * gradients`` to the global weights.

    Args:
        gradients: array-like with the same shape as ``W`` (an ``np.matrix``
            is accepted).

    Side effects:
        Rebinds the module-level ``W`` to a new array; the previous array
        object is left untouched.
    """
    global W
    # Subtracting an np.matrix would turn W itself into an np.matrix, after
    # which feedforward() returns a (1, n) matrix instead of a 1-D vector and
    # len()/indexing on the Q-values no longer mean "per action". Keep W a
    # plain ndarray.
    W = np.asarray(W - alpha * gradients)

In [None]:
# Train the Q-network on randomly generated trajectories, continuing the
# iteration count from 7000 and saving the weights every 1000 iterations.
for i in xrange(7000, 100000):
    if (i + 1) % 10 == 0:
        print(i + 1)

    # --- Simulate the object: a bounded random walk over the 1-D image. ---
    obj_start = np.random.randint(0, num_positions)
    obj = obj_start
    trajectory = []
    for _ in xrange(10):
        a = np.random.randint(0, num_actions)
        # Re-draw moves that would push the object off either end.
        while (obj == 0 and a == 0) or (obj == num_positions - 1 and a == 2):
            a = np.random.randint(0, num_actions)
        obj += a - 1  # action 0 -> -1, 1 -> 0, 2 -> +1
        trajectory.append(obj)

    # --- Roll the tracker out over the trajectory with the current policy. ---
    states = []
    actions = []
    prev_box = obj_start
    j = 0
    while j < len(trajectory):
        # State = one-hot object position followed by the previous box coordinate.
        current_state = np.zeros(num_positions + 1)
        current_state[trajectory[j]] = 1
        current_state[-1] = prev_box

        # ravel() keeps the Q-values a flat vector even if W is an np.matrix.
        current_qvalues = np.asarray(feedforward(current_state, W)).ravel()
        current_action = epsilon_greedy(current_qvalues)
        current_box = prev_box + current_action - 1

        if current_box < 0 or current_box >= num_positions:
            # Out-of-bounds move: lower the Q-value of the offending action and
            # restart the roll-out from the first frame.
            penalty = np.zeros(num_actions)
            penalty[current_action] = 1
            backpropagate(np.outer(current_state, penalty))
            states = []
            actions = []
            prev_box = obj_start
            j = 0
        else:
            states.append(current_state)
            actions.append(current_action)
            prev_box = current_box
            j += 1

    # --- Learn from the finished roll-out. ---
    # Only the last frame is labelled: +1 if the final box sits on the object,
    # -1 otherwise. Every intermediate transition has reward 0.
    reward = 1 if prev_box == trajectory[-1] else -1

    # Frozen copy of the weights so every target in this sweep is computed
    # from the same network.
    old_W = np.array(W)

    last = len(trajectory) - 1
    for j in xrange(last, -1, -1):
        current_state = states[j]
        current_action = actions[j]
        current_qvalues = np.asarray(feedforward(current_state, old_W)).ravel()

        if j == last:
            # Terminal transition: the target is the episode reward itself.
            target = reward
        else:
            # The stored successor already holds the next frame together with
            # the box produced by current_action, so nothing is mutated here.
            next_qvalues = np.asarray(feedforward(states[j + 1], old_W)).ravel()
            target = gamma * np.max(next_qvalues)

        # Only the action actually taken carries an error. backpropagate()
        # subtracts alpha * gradients, so (Q - target) moves Q towards the
        # target. The sigmoid slope is omitted, as in the penalty update; it
        # is positive and therefore only rescales the step.
        td_error = np.zeros(num_actions)
        td_error[current_action] = current_qvalues[current_action] - target
        backpropagate(np.outer(current_state, td_error))

    if (i + 1) % 1000 == 0:
        np.save(str(i + 1) + '.npy', W)

  


7010
7020
7030
7040
7050
7060
7070
7080
7090
7100
7110
7120
7130
7140
7150
7160
7170
7180
7190
7200
7210
7220
7230
7240
7250
7260
7270
7280
7290
7300
7310
7320
7330
7340
7350
7360
7370
7380
7390
7400
7410
7420
7430
7440
7450
7460
7470
7480
7490
7500
7510
7520
7530
7540
7550
7560
7570
7580
7590
7600
7610
7620
7630
7640
7650
7660
7670
7680
7690
7700
7710
7720
7730
7740
7750
7760
7770
7780
7790
7800
7810
7820
7830
7840
7850
7860
7870
7880
7890
7900
7910
7920
7930
7940
7950
7960
7970
7980
7990
8000
8010
8020
8030
8040
8050
8060
8070
8080
8090
8100
8110
8120
8130
8140
8150
8160
8170
8180
8190
8200
8210
8220
8230
8240
8250
8260
8270
8280
8290
8300
8310
8320
8330
8340
8350
8360
8370
8380
8390
8400
8410
8420
8430
8440
8450
8460
8470
8480
8490
8500
8510
8520
8530
8540
8550
8560
8570
8580
8590
8600
8610
8620
8630
8640
8650
8660
8670
8680
8690
8700
8710
8720
8730
8740
8750
8760
8770
8780
8790
8800
8810
8820
8830
8840
8850
8860
8870
8880
8890
8900
8910
8920
8930
8940
8950
8960
8970
8980
8990
9000
