### RL Q-Learning Example

In [8]:
# ASCII sketch of the nine locations (R1-R9) and the walls between them.
# The goal is to reach a target location (R6 in this example) from an arbitrary starting location.
print(
    """
    ----------------------------------------------------
    |                                                   | 
    |     R1               R2                    R3     | 
    |                                                   |
    |-------------                          |           |
    |            |                          |           |
    |     R4     |         R5               |    R6     |
    |            |                          |           |
    |            |                           -----------|
    |                                                   |
    |     R7               R8                    R9     |
    |                                                   |
    ----------------------------------------------------
    """
)


    ----------------------------------------------------
    |                                                   | 
    |     R1               R2                    R3     | 
    |                                                   |
    |-------------                          |           |
    |            |                          |           |
    |     R4     |         R5               |    R6*    |
    |            |                          |           |
    |            |                           -----------|
    |                                                   |
    |     R7               R8                    R9     |
    |                                                   |
    ----------------------------------------------------
    


In [32]:
# import numpy
import numpy as np

In [33]:
# Q-Learning hyperparameters (read by get_route)

# alpha: learning rate -- fraction of each temporal-difference error added to Q
alpha = 0.9

# gamma: discount factor -- weight applied to the best Q-value of the next state
gamma = 0.75

In [34]:
# states: location name -> state index (R1 -> 0, R2 -> 1, ..., R9 -> 8)

states = {
    name: index
    for index, name in enumerate(['R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9'])
}

In [35]:
# actions: one action per destination state index (0 through 8)

actions = list(range(9))

In [36]:
# rewards
#
# Move matrix for the floor plan: rewards[i, j] == 1 when the agent may move
# directly from state i to state j, and 0 otherwise. Rows and columns follow the
# state indices (R1 -> 0, ..., R9 -> 8). Every passage is two-way, so the matrix
# is symmetric; in particular R2 <-> R5 must be present in both rows, otherwise
# the lower locations cannot be reached from R1/R2/R3/R6.

rewards = np.array([[0,1,0,0,0,0,0,0,0],   # R1 -> R2
                    [1,0,1,0,1,0,0,0,0],   # R2 -> R1, R3, R5
                    [0,1,0,0,0,1,0,0,0],   # R3 -> R2, R6
                    [0,0,0,0,0,0,1,0,0],   # R4 -> R7
                    [0,1,0,0,0,0,0,1,0],   # R5 -> R2, R8
                    [0,0,1,0,0,0,0,0,0],   # R6 -> R3
                    [0,0,0,1,0,0,0,1,0],   # R7 -> R4, R8
                    [0,0,0,0,1,0,1,0,1],   # R8 -> R5, R7, R9
                    [0,0,0,0,0,0,0,1,0]])  # R9 -> R8

In [37]:
# Inverse of `states`: state index -> location name

state_to_location = {index: name for name, index in states.items()}

In [38]:
# Define the actions (state indices 0 through 8)
# NOTE: this repeats the earlier `actions` cell; get_route does not read `actions`.
actions = [index for index in range(9)]

In [39]:
"""
write a function that 
INPUTS
- start point
- end point
OUTPUT
- optimal route for reaching the end location from the starting location
"""

'\nwrite a function that \nINPUTS\n- start point\n- end point\nOUTPUT\n- optimal route for reaching the end location from the starting location\n'

In [42]:
# the function

def get_route(start_node, end_node, iterations=1000):
    """Train a Q-table towards ``end_node`` and return the greedy route from ``start_node``.

    Reads the module-level ``rewards`` matrix, the ``states`` /
    ``state_to_location`` lookups and the ``gamma`` / ``alpha`` hyperparameters.
    Training draws random (state, move) samples from ``np.random``, so results
    can differ between calls unless the caller seeds NumPy's global generator.

    Parameters
    ----------
    start_node : str
        Name of the starting location (a key of ``states``).
    end_node : str
        Name of the target location (a key of ``states``).
    iterations : int, optional
        Number of random Q-learning updates to perform (default 1000).

    Returns
    -------
    list of str
        Location names from ``start_node`` to ``end_node`` inclusive;
        ``[start_node]`` when both names are equal.

    Raises
    ------
    KeyError
        If ``start_node`` or ``end_node`` is not a key of ``states``.
    RuntimeError
        If following the learned Q-values does not reach ``end_node`` without
        revisiting a state (goal unreachable or Q not yet converged).
    """
    # Resolve both names up front so an unknown location fails before training.
    end_state = states[end_node]
    current_state = states[start_node]

    # Work on a copy so the shared rewards matrix is untouched; the large
    # self-reward marks the requested goal state.
    new_rewards = np.copy(rewards)
    new_rewards[end_state, end_state] = 99

    n_states = new_rewards.shape[0]

    # Allowed moves per state are the positive entries of its row. They do not
    # change during training, so compute them once instead of every iteration.
    allowed_moves = [np.flatnonzero(new_rewards[state] > 0) for state in range(n_states)]

    # Q-Learning
    Q = np.zeros((n_states, n_states))

    for _ in range(iterations):
        sampled_state = np.random.randint(0, n_states)
        options = allowed_moves[sampled_state]
        if options.size == 0:
            # A state with no outgoing move has nothing to learn from.
            continue
        next_state = np.random.choice(options)
        # calculate the temporal difference
        td = (new_rewards[sampled_state, next_state]
              + gamma * Q[next_state].max()
              - Q[sampled_state, next_state])
        # update Q via the Bellman equation
        Q[sampled_state, next_state] += alpha * td

    # Follow the greedy policy from the start state to the goal.
    route = [start_node]
    while current_state != end_state:
        # A route that does not repeat a state holds at most n_states entries;
        # reaching that length without arriving means the policy is cycling.
        if len(route) >= n_states:
            raise RuntimeError(
                "no route found from %r to %r" % (route[0], end_node)
            )
        current_state = int(np.argmax(Q[current_state]))
        route.append(state_to_location[current_state])

    return route

In [44]:
# test: print the learned route from R7 to R1.
# get_route samples from np.random and no seed is set in the cells shown,
# so the printed route is not guaranteed to be identical on every run.
print(get_route('R7', 'R1'))

['R7', 'R8', 'R5', 'R2', 'R1']
