## Testing saved weights

In [1]:
import numpy as np
import os

In [2]:
num_positions = 10
num_actions = 3
gamma = 0.9
alpha = 0.01
epsilon = 0.1

In [3]:
weights = {}

for f in os.listdir('QL_IV_checkpoints/'):
    weights[f.split('.')[0]] = np.load('QL_IV_checkpoints/' + f)

In [4]:
def feedforward(s, W):
    return 1.0 / (1.0 + np.exp(-1.0 * np.dot(s, W)))

def epsilon_greedy(actions):
    if np.random.rand() <= epsilon:
        return np.argmax(actions)
    return np.random.randint(0, len(actions))

In [5]:
correct = {}
total = 0

for f in weights:
    correct[f] = 0.0

for i in xrange(10000):
    obj_start = np.random.randint(0, num_positions)
    obj_start = [0 if x != obj_start else 1 for x in xrange(num_positions)]
    obj = obj_start

    trajectory = []

    # Generate a single trajectory
    for j in xrange(10):
        a = np.random.randint(0, num_actions)
        obj_coord = obj.index(1)
        while (obj_coord == 0 and a == 0) or (obj_coord == num_positions - 1 and a == 2):
            a = np.random.randint(0, num_actions)
        obj_coord = obj_coord - 1 if a == 0 else obj_coord if a == 1 else obj_coord + 1
        obj = [0 if x != obj_coord else 1 for x in xrange(num_positions)]
        trajectory.append(obj)

    for f in weights:
        W = weights[f]
        prev_box = obj_start
        j = 0

        # Feedforward for all steps in the trajectory
        while j < len(trajectory):
            current_state = trajectory[j] + prev_box

            # Evaluate the Q-network to get the Q-values, and on the basis of that, select an action, and
            # consequently calculate the current box coordinates from the action on the previous box coordinates
            current_qvalues = feedforward(current_state, W)
            current_action = epsilon_greedy(current_qvalues)
            prev_box_coord = prev_box.index(1)
            current_box_coord = prev_box_coord - 1 if current_action == 0 else prev_box_coord if current_action == 1 else prev_box_coord + 1
            current_box = [0 if x != current_box_coord else 1 for x in xrange(num_positions)]

            # If the current box coordinates are invalid (out of bounds), we set a penalty for them and backpropagate.
            # Otherwise, we add the new state and action to their respective arrays.
            if current_box_coord < 0 or current_box_coord >= num_positions:
                break
            else:
                prev_box = current_box
                j += 1

        last_box = current_box

        if last_box == trajectory[-1]:    
            correct[f] += 1

    total += 1

for f in weights:
    print f + ' : ' + str(correct[f] * 1.0 / total)

  


qlIV_18000 : 0.1026
qlIV_8000 : 0.095
qlIV_15000 : 0.0942
qlIV_7000 : 0.0977
qlIV_9000 : 0.0977
qlIV_3000 : 0.0924
qlIV_5000 : 0.0896
qlIV_14000 : 0.0927
qlIV_2000 : 0.0922
qlIV_6000 : 0.0947
qlIV_19000 : 0.0931
qlIV_17000 : 0.0905
qlIV_4000 : 0.0963
qlIV_12000 : 0.0932
qlIV_10000 : 0.0927
qlIV_16000 : 0.0963
qlIV_11000 : 0.0952
qlIV_13000 : 0.0922
qlIV_1000 : 0.0883
