In [17]:
import numpy as np

In [18]:
# R matrix: R[s, a] is the reward for taking action a in state s.
# Entries >= 0 are the actions available_actions() treats as available; -1 marks the rest.
R = np.matrix([[-1,-1,-1,-1, 0,-1], # (S,A) -> R(0,4) = 0
               [-1,-1,-1, 0,-1,100],# (S,A) -> R(1,3) = 0; R(1,5) = 100
               [-1,-1,-1, 0,-1,-1], # (S,A) -> R(2,3) = 0
               [-1, 0, 0,-1, 0,-1], # (S,A) -> R(3,1) = 0; R(3,2) = 0; R(3,4) = 0
               [ 0,-1,-1, 0,-1,100],# (S,A) -> R(4,0) = 0; R(4,3) = 0; R(4,5) = 100
               [-1,-1,-1,-1,-1,-1]])# (S,A) -> no entry >= 0, so state 5 has no available actions
# NOTE(review): the comment on the last row used to list R(5,1) = 0; R(5,4) = 0; R(5,5) = 100,
# which the data does not contain -- confirm whether the row or the comment was intended.

In [19]:
R

matrix([[ -1,  -1,  -1,  -1,   0,  -1],
        [ -1,  -1,  -1,   0,  -1, 100],
        [ -1,  -1,  -1,   0,  -1,  -1],
        [ -1,   0,   0,  -1,   0,  -1],
        [  0,  -1,  -1,   0,  -1, 100],
        [ -1,  -1,  -1,  -1,  -1,  -1]])

In [20]:
# Q matrix: 6x6 table of learned values, every entry starting at zero
Q = np.matrix(np.zeros((6, 6)))

In [21]:
Q

matrix([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

In [22]:
# Gamma (discount factor applied to the next state's best Q-value; not a learning rate).
gamma = 0.8

In [23]:
# Initial state. (Usually to be chosen at random)
initial_state = 1

In [24]:
current_state_row = R[initial_state,]

In [25]:
current_state_row

matrix([[ -1,  -1,  -1,   0,  -1, 100]])

In [26]:
av_act = np.where(current_state_row >= 0)[1] 

In [27]:
av_act

array([3, 5], dtype=int64)

In [28]:
def available_actions(state):
    """Return the actions that can be taken from `state`.

    An action is available when its entry in row `state` of the reward
    matrix R is greater than or equal to zero.

    Parameters
    ----------
    state : int
        Row index into R.

    Returns
    -------
    numpy.ndarray
        Column indices of the non-negative entries of R[state, ],
        e.g. array([3, 5]) for the row [[-1, -1, -1, 0, -1, 100]].
    """
    rewards_for_state = R[state,]
    # R is a 2-D np.matrix, so np.where yields (row_indices, column_indices);
    # only the column indices identify actions.
    _, action_indices = np.where(rewards_for_state >= 0)
    return action_indices

In [29]:
# Get available actions in the current state
available_act = available_actions(initial_state) # We summon the previous function to obtain av_act

In [30]:
available_act # Actions available from the initial state, read from the reward matrix R (entries >= 0)

array([3, 5], dtype=int64)

In [31]:
# This function chooses at random which action to be performed within the range
# of all the available actions.
def sample_next_action(available_act):
    """Pick one action uniformly at random from `available_act`.

    Parameters
    ----------
    available_act : array-like of int
        Candidate action indices, e.g. the result of available_actions().

    Returns
    -------
    int
        One element of `available_act` as a plain Python int.

    Raises
    ------
    ValueError
        Raised by np.random.choice when `available_act` is empty.
    """
    # Drawing without a size yields a scalar. Drawing with size 1 yields a
    # one-element array, and int() on such an array is deprecated since
    # NumPy 1.25.
    next_action = int(np.random.choice(available_act))
    return next_action

In [32]:
def update(current_state, action, gamma):
    """Apply one Q-learning update to the module-level Q matrix, in place.

    The action index is also used as the index of the next state, so the
    update is:

        Q[current_state, action] = R[current_state, action] + gamma * max(Q[action, :])

    Parameters
    ----------
    current_state : int
        Row of Q and R being updated.
    action : int
        Column of Q and R being updated; also the row of Q that supplies
        the best next-step value.
    gamma : float
        Factor applied to the best next-step value.

    Returns
    -------
    None
        Q is modified in place.
    """
    # Only the maximum VALUE of the next row is needed. Picking a random index
    # among ties and reading it back always gives this same number, so the
    # tie-break (and its deprecated int(array) conversion) is unnecessary.
    max_value = np.max(Q[action, ])

    # Q learning formula
    Q[current_state, action] = R[current_state, action] + gamma * max_value


In [33]:
# Sample next action to be performed
action = sample_next_action(available_act)

In [34]:
action # Changed from expected, since it is a random factor

3

In [35]:
max_index = np.where(Q[action,] == np.max(Q[action,]))[1]
max_index # Column indices where row Q[action, ] attains its maximum; every entry is 0 here, so all six indices tie

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [36]:
Q[action,] # Row of Q indexed by the sampled action (action = 3 above); still all zeros

matrix([[0., 0., 0., 0., 0., 0.]])

In [37]:
np.max(Q[action,]) # All Q-values in this row are equal (zero), so the max is 0.0

0.0

In [38]:
update(initial_state, action, gamma)
Q

matrix([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

In [39]:
current_state = np.random.randint(0, int(Q.shape[0]))
current_state

4

In [40]:
available_act = available_actions(current_state)
available_act

array([0, 3, 5], dtype=int64)

In [41]:
action = sample_next_action(available_act)
action

3

In [44]:
#-------------------------------------------------------------
# Training

# Train over 10 000 iterations. (Re-iterate the process above).
n_states = int(Q.shape[0])
for _ in range(10000):
    # np.random.randint excludes the upper bound, so this samples every state.
    current_state = np.random.randint(0, n_states)
    available_act = available_actions(current_state)
    if available_act.size == 0:
        # A state with no available action has nothing to update. Skipping it
        # explicitly replaces the earlier "- 1", which only worked because the
        # last state happened to be the one without actions.
        continue
    action = int(np.random.choice(available_act))
    update(current_state, action, gamma)

# Normalize the "trained" Q matrix
print("Train Q matrix: ")
print(Q/np.max(Q)*100)

Train Q matrix: 
[[  0.    0.    0.    0.   80.    0. ]
 [  0.    0.    0.   64.    0.  100. ]
 [  0.    0.    0.   64.    0.    0. ]
 [  0.   80.   51.2   0.   80.    0. ]
 [ 64.    0.    0.   64.    0.  100. ]
 [  0.    0.    0.    0.    0.    0. ]]


In [46]:
#-------------------------------------------------------------
#Testing

# Goal state = 5
# Follow the highest Q-value from the start state until the goal is reached.
# Starting from state 1 the selected path is 1 -> 5.

current_state = 1
steps = [current_state]

while current_state != 5:
    # Column indices holding the largest Q-value in the current state's row.
    best_next_states = np.where(Q[current_state, ] == np.max(Q[current_state, ]))[1]

    # Choosing without a size returns a scalar, which covers both the
    # single-candidate and the tied case and avoids calling int() on a
    # one-element array (deprecated since NumPy 1.25).
    next_step_index = int(np.random.choice(best_next_states))

    steps.append(next_step_index)
    current_state = next_step_index

# Print selected sequence of steps
print("Selected path: ")
print(steps)

Selected path: 
[1, 5]
