## Testing saved weights

In [1]:
import numpy as np
import os

In [2]:
num_positions = 10
num_actions = 3
gamma = 0.9
alpha = 0.01
epsilon = 0.1

In [4]:
weights = {}

for f in os.listdir('../QL_III_checkpoints/'):
    weights[f.split('.')[0]] = np.load('../QL_III_checkpoints/' + f)

In [5]:
def feedforward(s, W):
    return 1.0 / (1.0 + np.exp(-1.0 * np.dot(s, W)))

def epsilon_greedy(actions):
    if np.random.rand() <= epsilon:
        return np.argmax(actions)
    return np.random.randint(0, len(actions))

def backpropagate(gradients):
    global W
    W = W - alpha * gradients

In [6]:
correct = {}
total = 0

for f in weights:
    correct[f] = 0.0

for i in xrange(10000):
    obj_start = np.random.randint(0, num_positions)
    obj = obj_start

    trajectory = []

    # Generate a single trajectory
    for j in xrange(10):
        a = np.random.randint(0, num_actions)
        while (obj == 0 and a == 0) or (obj == num_positions - 1 and a == 2):
            a = np.random.randint(0, num_actions)
        obj = obj - 1 if a == 0 else obj if a == 1 else obj + 1
        trajectory.append(obj)

    for f in weights:
        W = weights[f]
        prev_box = obj_start
        j = 0
        # Feedforward for all steps in the trajectory
        while j < len(trajectory):
            current_state = np.zeros(num_positions + 1)
            current_state[-1] = prev_box
            current_state[trajectory[j]] = 1

            # Evaluate the Q-network to get the Q-values, and on the basis of that, select an action, and
            # consequently calculate the current box coordinates from the action on the previous box coordinates
            current_qvalues = feedforward(current_state, W)
            current_action = epsilon_greedy(current_qvalues)
            current_box = prev_box - 1 if current_action == 0 else prev_box if current_action == 1 else prev_box + 1

            # If the current box coordinates are invalid (out of bounds), we set a penalty for them and backpropagate.
            # Otherwise, we add the new state and action to their respective arrays.
            if current_box < 0 or current_box >= num_positions:
                break
            else:
                prev_box = current_box
                j += 1

        last_box = current_box

        if last_box == trajectory[-1]:    
            correct[f] += 1
            
        
    total += 1
    
for f in weights:
    print f + ' : ' + str(correct[f] * 1.0 / total)

                

  


5000 : 0.0981
6000 : 0.0958
4000 : 0.0955
2000 : 0.0976
3000 : 0.0953
7000 : 0.0967
8000 : 0.1022
1000 : 0.0984
