In [28]:
# Reinforcement learning in 1d maze
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt

# explain the theory behind the solution
print('My theoretic solution to the linear maze problem:')
print('')
print('Neural Network:')
print('\t- input neurons = 5 (since the maze has 5 states)')
print('\t- hidden layers = 0 (not needed in this problem)')
print('\t- output neurons = 3 (since we are predicting one action from a set of 3 possible actions)')
print('')
print('Loss Function:')
print('E(St, At) = (Rt + gamma * max(Q(St+1, a; w) - Q(St, At; w))^2')
print('\t- St = state at time t')
print('\t- At = action at time t')
print('\t- Rt = immediate reward received from the environment at time t')
print('\t- w = weights = network parameters')
print('\t- gamma = 0.5')
print('\t- (Rt + gamma * max(Q(St+1, a; w)) = actual value of the network')
print('\t- Q(St, At; w) = estimated / predicted value of the network')
print('')
print('Loss function explained:')
print('The loss value given a state and an action = (actual value - estimated value)^2')
print('')
print('Training the model:')
print('\t- I train the model by feeding it with a one-hot vector, e.g. [0,0,1,0,0] to represent a state.')
print('')
print('Evaluating the model:')
print('\t- I evaluate the model by feeding the network with each possible state')
print('\t- This gives us the optimal move (the highest value in the output) based on states and rewards')
print('')

# load dataset
states = np.array([0, 1, 2, 3, 4]).reshape((1, 5))
rewards = np.array([1, 0, 0, 0, 2]).reshape((1, 5))
actions = np.array([0, 1, 2]).reshape(1, 3)

# Set learning parameters
num_iterations = 10000
discount_factor = 0.7
learning_rate = 0.5

# establish the feed-forward part of the network
inputs = tf.placeholder(shape=[1, 5], dtype=tf.float32)
W = tf.Variable(tf.random_uniform([5, 3], 0.1, 0.9))
y = tf.matmul(inputs, W)
predict = tf.argmax(y, 1)

# obtain the loss by taking the sum of squares difference between the actual value and predicted Q values
y_ = tf.placeholder(shape=[1, 3], dtype=tf.float32)
loss = tf.reduce_sum(tf.square((discount_factor * y_) - y))
train_step = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)

# train the network
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    identity_matrix = np.identity(5)
    
    for i in range(num_iterations):
        # random state:
        rand_index = random.randint(0, 4)
        a, allQ = sess.run([predict, y], feed_dict={inputs: identity_matrix[rand_index: rand_index+1]})
        curr_action = a[0]
        
        # perform 5 steps from the random start state:
        for j in range(5):
            # obtain a new state and reward from the environment based on the predicted action
            if curr_action == 0 and rand_index-1 >= 0:
                # action = left
                rand_index -= 1
                new_state = identity_matrix[rand_index: rand_index+1]
                reward = rewards[0][rand_index]
                
            elif curr_action == 2 and rand_index+1 <= 4:
                # action = right
                rand_index += 1
                new_state = identity_matrix[rand_index: rand_index+1]
                reward = rewards[0][rand_index]
            else:
                # action = stay
                new_state = identity_matrix[rand_index: rand_index+1]
                reward = rewards[0][rand_index]
            
            # obtain the Q values from feeding the new state through the network
            a, new_Q = sess.run([predict, y], feed_dict={inputs: new_state})
            
            # obtain the maximum new_Q value
            max_new_Q = np.max(new_Q)
            curr_action = a[0]
            
            # set target value
            target_Q = allQ
            target_Q[0, curr_action] = reward + learning_rate * max_new_Q
            
            # train the network using the target and predicted Q values
            _, W1 = sess.run([train_step, W], feed_dict={
                inputs: identity_matrix[rand_index: rand_index+1],
                y_: target_Q
            })

    print('########## Evaluate #############')
    for i in range(5):
        one_hot_vector = np.zeros((1, 5))
        one_hot_vector[0][i] = 1
        print('STATE {0}:'.format(i))
        print('')
        print('Input:')
        print(one_hot_vector)
        print('')
        print('Output:')
        eval_a, eval_q_out = sess.run([predict, y], feed_dict={inputs: one_hot_vector})
        print('Action: {0}'.format(eval_a[0]))
        print('Q values: {0}'.format(eval_q_out))
        print('\n')

My theoretic solution to the linear maze problem:

Neural Network:
	- input neurons = 5 (since the maze has 5 states)
	- hidden layers = 0 (not needed in this problem)
	- output neurons = 3 (since we are predicting one action from a set of 3 possible actions)

Loss Function:
E(St, At) = (Rt + gamma * max(Q(St+1, a; w) - Q(St, At; w))^2
	- St = state at time t
	- At = action at time t
	- Rt = immediate reward received from the environment at time t
	- w = weights = network parameters
	- gamma = 0.5
	- (Rt + gamma * max(Q(St+1, a; w)) = actual value of the network
	- Q(St, At; w) = estimated / predicted value of the network

Loss function explained:
The loss value given a state and an action = (actual value - estimated value)^2

Training the model:
	- I train the model by feeding it with a one-hot vector, e.g. [0,0,1,0,0] to represent a state.

Evaluating the model:
	- I evaluate the model by feeding the network with each possible state
	- This gives us the optimal move (the highest valu

########## Evaluate #############
STATE 0:

Input:
[[ 1.  0.  0.  0.  0.]]

Output:
Action: 2
Q values: [[  2.48779497e-28   3.18237197e-28   1.07692301e+00]]


STATE 1:

Input:
[[ 0.  1.  0.  0.  0.]]

Output:
Action: 2
Q values: [[  1.74145643e-28   2.22766033e-28   5.53849433e-15]]


STATE 2:

Input:
[[ 0.  0.  1.  0.  0.]]

Output:
Action: 2
Q values: [[  1.74145643e-28   2.22766033e-28   1.47509653e-24]]


STATE 3:

Input:
[[ 0.  0.  0.  1.  0.]]

Output:
Action: 2
Q values: [[  1.21901952e-28   1.55936216e-28   7.74748307e-27]]


STATE 4:

Input:
[[ 0.  0.  0.  0.  1.]]

Output:
Action: 2
Q values: [[  8.53313655e-29   1.09155354e-28   2.15384603e+00]]


