In [1]:
import numpy as np
from time import time

# R matrix: rewards for the 10 states. R[s, a] is the immediate reward for
# taking action a from state s; the action index doubles as the index of the
# state it leads to (update() reads row `a` of Q as the next state's values).
#   -1  -> move not allowed (available_actions keeps only entries >= 0)
#    0  -> allowed move, no reward
#  100  -> reward for moving into state 9 (from state 5 or state 8)
R = np.matrix([[-1, 0, -1, -1, -1, -1,-1, -1, -1, -1],
            [0, -1, 0, -1, -1, -1,-1, -1, -1, -1],
            [-1, 0, -1, 0, -1, 0, 0, -1, -1, -1],
            [-1, -1, 0, -1, 0, -1,-1, -1, -1, -1],
            [-1, -1, -1, 0, -1, -1,-1, -1, -1, -1],
            [-1, -1, 0, -1, -1, -1,-1, -1, -1, 100],
            [-1, -1, 0, -1, -1, -1,-1, 0, 0, -1],
            [-1, -1, -1, -1, -1, -1,0, -1, -1, -1],
            [-1, -1, -1, -1, -1, -1,0, -1, -1, 100],
            [-1, -1, -1, -1, -1, 0,-1, -1, 0, -1]])
# Q matrix: learned action values, same 10x10 layout as R, initialised to zero
# and filled in place by update().
Q = np.matrix(np.zeros([10,10]))

# Gamma: discount factor that scales the best Q-value of the next state in the
# update rule (it is not a learning rate).
gamma = 0.8

# Initial state. (Usually to be chosen at random)
initial_state = 1

# Wall-clock start. The elapsed time reported at the end is measured from here,
# so it covers the function definitions, the first update and the training loop.
starttime=time()

# List the actions that may be taken from a given state.
def available_actions(state):
    """Return the indices of the allowed actions for ``state``.

    An action is allowed when its entry in row ``state`` of the module-level
    reward matrix ``R`` is non-negative.

    Args:
        state: Row index into ``R``.

    Returns:
        Array of column indices whose reward in that row is >= 0.
    """
    allowed_mask = R[state,] >= 0
    # The row slice stays two-dimensional, so np.where yields (rows, cols);
    # only the column indices identify actions.
    return np.where(allowed_mask)[1]

# Get the actions available in the initial state. The result is kept in the
# module-level name `available_act`, which the training loop reassigns on
# every iteration.
available_act = available_actions(initial_state) 

# Choose, uniformly at random, which of the supplied actions to perform.
def sample_next_action(available_actions_range):
    """Pick one action at random from ``available_actions_range``.

    Args:
        available_actions_range: Non-empty 1-D sequence/array of candidate
            action indices.

    Returns:
        The chosen action as a plain Python ``int``.

    Raises:
        ValueError: Propagated from ``np.random.choice`` when the candidate
            sequence is empty.
    """
    # Sample from the argument itself rather than from a module-level variable,
    # so the function works for whatever candidates the caller passes in.
    # Requesting a single draw (no size argument) yields a scalar, which
    # converts to int without the array-to-scalar conversion NumPy deprecates.
    return int(np.random.choice(available_actions_range))

# Sample the first action to perform from the actions available in the
# initial state.
action = sample_next_action(available_act)

# Apply one Q-learning update to the module-level Q matrix for the transition
# (current_state, action).
def update(current_state, action, gamma):
    """Update ``Q[current_state, action]`` in place using the Q-learning rule.

    The new value is ``R[current_state, action] + gamma * max(Q[action, :])``,
    where row ``action`` of ``Q`` is read as the value row of the state the
    action leads to.

    Args:
        current_state: Row index of the state the action is taken from.
        action: Column index of the action taken (also used as the row index
            of the next state).
        gamma: Discount factor applied to the best next-state value.

    Returns:
        None. The module-level matrix ``Q`` is modified in place.
    """
    # Only the maximum value of the next state's row is needed. When several
    # actions tie for the maximum they all hold this same value, so there is
    # no need to pick one of their indices at random first.
    max_value = np.max(Q[action,])

    # Q learning formula
    Q[current_state, action] = R[current_state, action] + gamma * max_value

# Apply a first Q update for the initial (state, action) pair before the
# training loop starts.
update(initial_state,action,gamma)

#-------------------------------------------------------------------------------
# Training

# Train over 3000 iterations. (Re-iterate the process above).
# Each iteration starts from a state drawn uniformly from all rows of Q,
# samples one of the actions allowed there and applies a single Q update.
for i in range(3000):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state,action,gamma)

# Stop the clock before printing so output formatting is not timed.
endtime=time()
# Normalize the "trained" Q matrix so that its largest entry becomes 100.
# NOTE(review): if no reward was ever propagated, np.max(Q) is 0 and this
# division yields NaNs.
print("Trained Q matrix:")
print(Q/np.max(Q)*100)
# Raw (un-normalized) values, kept for debugging:
#print (Q)

print ('elapsed time is %s' % (endtime-starttime))

Trained Q matrix:
[[   0.     51.2     0.      0.      0.      0.      0.      0.      0.
     0.  ]
 [  40.96    0.     64.      0.      0.      0.      0.      0.      0.
     0.  ]
 [   0.     51.2     0.     51.2     0.     80.     64.      0.      0.
     0.  ]
 [   0.      0.     64.      0.     40.96    0.      0.      0.      0.
     0.  ]
 [   0.      0.      0.     51.2     0.      0.      0.      0.      0.
     0.  ]
 [   0.      0.     64.      0.      0.      0.      0.      0.      0.
   100.  ]
 [   0.      0.     64.      0.      0.      0.      0.     51.2    80.
     0.  ]
 [   0.      0.      0.      0.      0.      0.     64.      0.      0.
     0.  ]
 [   0.      0.      0.      0.      0.      0.     64.      0.      0.
   100.  ]
 [   0.      0.      0.      0.      0.     80.      0.      0.     80.
     0.  ]]
elapsed time is 0.1737508773803711
