In [12]:
import numpy as np

n_states = 16
n_actions = 4
goal_state = 15

Q_table = np.zeros((n_states,n_actions))

In [13]:
print(Q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [14]:
# A learning rate is a variable by which the model decides how much it should chose to update its weights during the training phase. 
learning_rate = 0.8
# Low discount factor means that I only care about immediate rewards whereas high discount factor means that I care about future rewards 
# A simple example would be if I pick the coin in front of me or I take a risky path that could lead to future rewards
discount_factor = 0.95
# An exploration probability is used to decide wether we will select a random action or the best know action. It is different from discount factor
exploration_prob = 0.2
# An epoch means that the model has passed through an entire phase of learning with all the available training data. 
# If there are 100 training data samples, then one epoch is when your model trains from all 100 once.
epochs = 1000

In [15]:
for epoch in range(epochs):
    current_state = np.random.randint(0,n_states)
    while current_state != goal_state:
        # Here the reason we have this set up in this way is because we try to balance exploration vs exploitation
        # Here we arent manually defining wether to explore or exploit, rather we are deciding it based on a random value 
        # "np.random.rand() < exploration_prob"
        # So if it is less than exploration proabability then explore else exploit.
        
        # there is a difference between exploration probability and discount factor. The exploration probability is used for you to decide wether you will pick a action based on wether you want to explore ( random action ) or exploit ( best known action ) but discount factor is used during the evaulation phase, it is used to evaluate wether your agent will be short sighted (care about immediate rewards) or long sighted (care about future rewards)
        if np.random.rand() < exploration_prob:
            action = np.random.randint(0, n_actions)
        else:
            action = np.argmax(Q_table[current_state])
        # move to the next state, or wrap around -> used in circular queue
        next_state = ( current_state + 1 ) % n_states
        reward = 1 if next_state == goal_state else 0
        
        # So the way the following part of the formula “(reward + discount_factor * np.max(Q_table[next_state]) - Q_table[current_state, action])” works, is that it adds the rewards experienced ( for the current action and selected state ) with the rewards that could be earned in the future by the next state ( it choses the max action ) depending on how much you value immediate rewards or future rewards ( discount factor ) and we subtract that with the currently held assumption of the reward that occured by selecting the current state and the current action.
        Q_table[current_state,action] += learning_rate * (reward + discount_factor * np.max(Q_table[next_state]) - Q_table[current_state, action])
        
        current_state = next_state

In [16]:
# There is a problem in the above code which is that irrespective of the action I do ( after selecting the state ), I always end up in the next state depending on the itteration. 

# So my action does not determine the next state I always go to the next state in the itteration "next_state = ( current_state + 1 ) % n_states"

# another thing is that I do not train my agent to learn about the last state ( 16th state) because according to my current logic the state selection goes sequentially ( 1 then 2 then 3 ) and it goes on until it reaches the goal which is 15 and then it restarts it so the last row never gets executed and hence the model knows nothing about that world. 

In [17]:
# so the Q table contains knowledge about which action in which state yeilds the maximum reward and it is used for the agent to create a model of the world 
print(Q_table)

[[0.48767498 0.48377358 0.39013998 0.46816797]
 [0.51330923 0.51330923 0.51334208 0.51334203]
 [0.54036009 0.54036008 0.5403255  0.54036008]
 [0.56880009 0.5687998  0.56880009 0.56880009]
 [0.59873694 0.59873694 0.59873541 0.59873694]
 [0.63024941 0.63024941 0.63024941 0.63024941]
 [0.66342043 0.66342043 0.66342043 0.66342043]
 [0.6983373  0.6983373  0.6983373  0.6983373 ]
 [0.73509189 0.73509189 0.73509189 0.73509189]
 [0.77378094 0.77378094 0.77378094 0.77378094]
 [0.81450625 0.81450625 0.81450625 0.81450625]
 [0.857375   0.857375   0.857375   0.857375  ]
 [0.9025     0.9025     0.9025     0.9025    ]
 [0.95       0.95       0.95       0.95      ]
 [1.         1.         1.         1.        ]
 [0.         0.         0.         0.        ]]


In [18]:
# this code solved the above issues

n_states = 16
n_actions = 4
goal_state = 15

Q_table = np.zeros((n_states,n_actions))


learning_rate = 0.8
discount_factor = 0.95
exploration_prob = 0.2
epochs = 1000

for epoch in range(epochs):
    current_state = np.random.randint(0,n_states)
    count = 0
    while count != n_states:
        if np.random.rand() < exploration_prob:
            action = np.random.randint(0, n_actions)
        else:
            action = np.argmax(Q_table[current_state])
        
        if action % 2 == 0: next_state = ( current_state + 1 ) % n_states
        else: next_state = current_state - 1
        
        reward = 1 if next_state == goal_state else 0
        
        Q_table[current_state,action] += learning_rate * (reward + discount_factor * np.max(Q_table[next_state]) - Q_table[current_state, action])
        
        current_state = next_state
        count += 1
        
print(Q_table)


[[ 8.35391026  9.25641026  8.35391026  9.25641026]
 [ 7.93621474  8.79358974  7.93621474  8.79358974]
 [ 7.53940367  8.35391026  7.53940012  8.35391026]
 [ 7.16197607  7.93621474  7.16241377  7.93588486]
 [ 6.80430978  7.5369914   6.79861174  7.53940401]
 [ 6.46409651  5.78717628  6.46409638  7.16243381]
 [ 6.80431212  6.13106626  6.80340703  6.67154943]
 [ 7.16243381  6.4640932   7.16243381  6.46409651]
 [ 7.53940401  6.80431212  7.53940401  6.80431212]
 [ 7.93621474  7.16242009  7.93621474  7.16243381]
 [ 8.35391026  7.53940401  8.35391026  7.53940401]
 [ 8.79358974  7.93621474  8.79358974  7.93621474]
 [ 9.25641026  8.35391026  9.25641026  8.35391026]
 [ 9.74358974  8.79358974  9.74358974  8.79358974]
 [10.25641026  9.25641026 10.25641026  9.25641026]
 [ 8.79358974  9.74358974  8.79358974  9.74358974]]
