#### Reinforcement Learning with a Q-Table

* Create a table (2D array) with one row per state and one column per action of the env
* Set q-learn-score = 0 for each action in every state
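* Update the q-learn-score of an action in a state after that action has been taken
* New score = (1 - alpha) * current_state_q_learn_score + alpha * (reward + gamma * next_state_q_learn_score), where next_state_q_learn_score is the highest q-learn-score among the actions of the next state
* alpha is the learning rate, gamma is the discount factor (the weight given to the next state's score)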

In [4]:
import gym

In [5]:
# Build the Taxi-v3 environment with text ("ansi") rendering; `.env` takes the
# environment held inside the object that `gym.make` returns.
env = gym.make("Taxi-v3", render_mode = "ansi").env

# Fixed seed so that re-running this cell produces the same starting layout.
env.reset(seed = 0)

# Keep only the bare environment from here on.
# NOTE(review): presumably this also removes gym's episode step limit, so an
# episode only ends when the environment itself reports completion -- confirm.
env = env.unwrapped

* R[0], G[1], B[2] & Y[3] are stops where Taxi can pick up and drop off riders
* Blue stop is where to pick up the rider
* Purple stop is where to drop off the rider

Calling `reset` with a fixed seed ensures the stops keep the same places when this cell is re-executed

In [6]:
# In "ansi" render mode render() returns the board as text, so it is printed here.
print(env.render())

+---------+
|R: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+




In [7]:
import numpy as np

# Q-table dimensions: one row per state, one column per action.
states, actions = env.observation_space.n, env.action_space.n

# Every (state, action) pair starts with a q-learn-score of 0.
q_table = np.zeros((states, actions))

q_table

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

#### Train the driver

In [8]:
alpha = 0.4  # learning rate: weight given to the new estimate in each update
gamma = 0.6  # discount factor: weight given to the best score of the next state

def train(epsilon = 0.0):
  """Play one training episode and update `q_table` in place.

  After every step the score of the action just taken is replaced by
  (1 - alpha) * current_score + alpha * (reward + gamma * best_next_score).

  Args:
    epsilon: probability of taking a random action instead of the
      highest-scoring one. The default 0.0 keeps action selection purely
      greedy.
  """
  state = env.reset()[0]

  done = False

  while not done:
    if epsilon > 0 and np.random.random() < epsilon:
      # explore: try a random action instead of the best known one
      action = env.action_space.sample()
    else:
      # select the index of action with highest q-learn score in the current state
      action = np.argmax(q_table[state])

    current_state_q_learn_score = q_table[state, action]

    next_state, reward, terminated, truncated, _ = env.step(action)

    # stop on either flag, so a truncated episode cannot keep the loop running
    done = terminated or truncated

    # highest q-learn-score among the actions of the next state
    next_state_q_learn_score = np.max(q_table[next_state])

    q_table[state, action] = (1 - alpha) * current_state_q_learn_score + alpha * (reward + gamma * next_state_q_learn_score)

    state = next_state

In [9]:
# Number of training episodes; every train() call plays one episode to its end.
episodes = 100_000

for i in range(episodes):
  train()

"training finished"

'training finished'

In [10]:
# Show the q-learn-scores stored in the table after training.
q_table

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-2.33785914, -2.34639386, -2.35004636, -2.34639386, -2.27325184,
        -4.        ],
       [-1.99901985, -1.97946063, -1.97912352, -1.97946063, -0.7504    ,
        -4.        ],
       ...,
       [-1.45447014, -1.38927616, -1.45447014, -1.50226579, -4.        ,
        -4.        ],
       [-2.19147423, -2.19821217, -2.19147423, -2.17645319, -4.        ,
        -4.        ],
       [-0.736     , -0.736     , -0.736     ,  1.28      , -4.        ,
        -4.        ]])

#### Evaluate the driver over 100 rides

In [11]:
def ride(record = False, max_steps = 200):
  """Drive one episode, always taking the action with the highest q-learn-score.

  Args:
    record: when True, store one frame (state, action, reward) per step.
    max_steps: upper bound on the number of steps, so a ride whose greedy
      policy never reaches the goal still returns. None disables the bound.
      NOTE(review): 200 is meant to match the step limit Taxi-v3 is
      registered with in gym -- confirm.

  Returns:
    A tuple (epochs, penalties, frames): the number of steps taken, the
    number of steps whose reward was -10, and the recorded frames (an empty
    list unless `record` is True).
  """
  state = env.reset()[0]

  done = False
  epochs = 0
  penalties = 0
  frames = []

  while not done and (max_steps is None or epochs < max_steps):
    # select the index of action with highest q-learn score in the current state
    action = np.argmax(q_table[state])

    state, reward, terminated, truncated, _ = env.step(action)

    # stop on either flag, so a truncated episode cannot keep the loop running
    done = terminated or truncated

    epochs += 1

    if reward == -10:
      penalties += 1

    if record:
      # the stored state is the one reached after taking the action
      frames.append({
        "state": state,
        "action": action,
        "reward": reward
      })

  return epochs, penalties, frames

In [12]:
# Number of evaluation rides and the running totals accumulated over them.
episodes = 100
total_epochs = total_penalties = 0

for _ in range(episodes):
  # the recorded frames are not needed here, so the third value is discarded
  epochs, penalties, _ = ride()
  total_epochs, total_penalties = total_epochs + epochs, total_penalties + penalties

"AVG time taken per episode:", total_epochs / episodes, "AVG penalties per episode:", total_penalties / episodes

('AVG time taken per episode:', 12.35, 'AVG penalties per episode:', 0.0)

#### A ride with the driver

In [18]:
# Take a single ride with recording enabled, keeping its frames for later use.
epochs, penalties, frames = ride(record = True)

"time taken:", epochs, "penalties:", penalties

('time taken:', 9, 'penalties:', 0)

In [19]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames, delay = 0.1):
  """Replay recorded frames one after another in the cell output.

  Args:
    frames: list of dicts with "state", "action" and "reward" keys.
    delay: seconds to pause after each frame.
  """
  # Taxi-v3 action ids: 0 = south, 1 = north, 2 = east, 3 = west,
  # 4 = pick up, 5 = drop off.
  action_names = ["South", "North", "East", "West", "PickUp", "DropOff"]

  for i, frame in enumerate(frames):
    clear_output(wait = True)
    # put the environment into the recorded state so render() draws that frame
    env.s = frame["state"]
    # NOTE(review): the text renderer appears to caption the board with
    # env.lastaction; setting it keeps the caption in step with the frame -- confirm.
    env.lastaction = frame["action"]
    print (env.render())
    print("time step:", i + 1)
    print("state:", frame["state"], "action:", action_names[frame["action"]], "reward:", frame["reward"])
    sleep(delay)

print_frames(frames)

+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

time step: 9
state: 85 action: DropOff reward: 20
