In [3]:
import gym

In [4]:
# Build the Taxi environment with text (ANSI) rendering. `.env` takes the
# environment held inside the object returned by `gym.make`
# (presumably dropping the outermost wrapper - confirm for the installed gym version).
env = gym.make("Taxi-v3", render_mode = "ansi").env

# Fixed seed so the starting layout is the same every time this cell runs.
env.reset(seed = 0)

# Keep the bare environment: the cells below read and write `env.s`, `env.P`
# and `env.encode` directly on it.
env = env.unwrapped

* R[0], G[1], Y[2] & B[3] are the stops where the taxi can pick up and drop off riders
* Blue stop is where to pick up the rider
* Purple stop is where to drop off the rider

Calling `reset` with a fixed seed ensures the stops' positions don't change when the cell is re-executed

In [5]:
# With render_mode "ansi", `render()` returns the board as a string, so it is printed explicitly.
print(env.render())

+---------+
|R: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+




Possible actions: South[0], North[1], East[2], West[3], PickUp[4], DropOff[5]

In [6]:
"number of possbile actions:", env.action_space

('number of possbile actions:', Discrete(6))

Possible states: 25 taxi locations (5*5 grid) * 5 rider locations (at one of the 4 stops or in the taxi) * 4 possible destinations = 500

In [7]:
"number of possible states", env.observation_space

('number of possible states', Discrete(500))

`encode` returns the state index for (taxi row index, taxi column index, the stop to pick up the rider from, the stop to drop them off at)

In [8]:
# Taxi at row 3, column 1; rider waiting at stop index 2; destination is stop index 0.
state = env.encode(3, 1, 2, 0)

state

328

In [9]:
# Force the environment into the state computed above by writing its state attribute directly.
env.s = state

# Show the board for that state.
print (env.render())

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+




Get the outcome of every possible action at the current state: each action maps to a list of (probability, next_state, reward, done) tuples

In [10]:
# Transition table for the current state, keyed by action id.
env.P[env.s]

{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

#### A ride with the driver

In [18]:
# Drive the taxi with randomly sampled actions until the episode ends,
# recording every step so the ride can be replayed afterwards.
done = False
epochs = 0      # number of steps taken
penalties = 0   # number of steps rewarded with -10
frames = []     # one record per step, in the order the steps were taken

while not done:
  # auto select action to perform
  action = env.action_space.sample()

  # `step` reports `terminated` and `truncated` separately; the episode is
  # over as soon as either one is set, so both feed the loop condition.
  state, reward, terminated, truncated, _ = env.step(action)
  done = terminated or truncated

  epochs += 1

  # -10 is the reward attached to the pick-up / drop-off actions in the
  # transition table shown for state 328, i.e. an attempt that did not succeed.
  if reward == -10:
    penalties += 1

  frames.append({
    "state": state,
    "action": action,
    "reward": reward
  })

"time taken:", epochs, "penalties:", penalties

('time taken:', 88, 'penalties:', 27)

In [19]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames, delay = 0.1):
  """Replay recorded steps as a text animation in the cell output.

  Args:
    frames: iterable of dicts with the keys "state", "action" and "reward",
      one per step, in the order the steps were taken.
    delay: seconds to pause after each frame (defaults to the original 0.1).

  Side effects:
    Overwrites `env.s` with each frame's state in turn, so the environment
    is left in the state of the last frame that was shown.
  """
  # Indexed by action id. Id 0 moves the taxi one row down (south) and id 1
  # one row up (north), as the transition table for state 328 shows
  # (0 -> 428, 1 -> 228), so "South" has to come first.
  actions = ["South", "North", "East", "West", "PickUp", "DropOff"]

  for step_number, frame in enumerate(frames, start = 1):
    # wait = True delays clearing until new output arrives, which avoids flicker
    clear_output(wait = True)

    # rendering always draws the environment's current state, so set it first
    env.s = frame["state"]
    print (env.render())

    print("time step:", step_number)
    print("state:", frame["state"], "action:", actions[frame["action"]], "reward:", frame["reward"])
    sleep(delay)

# Replay the ride recorded in `frames`.
print_frames(frames)


+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

time step: 88
state: 0 action: DropOff reward: 20


#### Evaluate the driver after 100 rides

In [20]:
def ride(seed = None):
  """Play one episode with randomly sampled actions.

  Args:
    seed: optional seed forwarded to `env.reset`. The default `None`
      keeps the original unseeded reset.

  Returns:
    Tuple `(epochs, penalties)`: the number of steps taken until the
    episode ended, and how many of those steps were rewarded with -10.
  """
  env.reset(seed = seed)

  epochs = 0
  penalties = 0
  done = False

  while not done:
    # auto select action to perform
    action = env.action_space.sample()

    # `step` reports `terminated` and `truncated` separately; stop on either,
    # otherwise a truncated episode would keep being stepped.
    _, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated

    epochs += 1

    if reward == -10:
      penalties += 1

  return epochs, penalties

In [21]:
# Average the cost of the random driver over a batch of independent rides.
episodes = 100
total_epochs, total_penalties = 0, 0

for _ in range(episodes):
  epochs, penalties = ride()
  total_epochs, total_penalties = total_epochs + epochs, total_penalties + penalties

# the trailing tuple is the cell's displayed result
(
  "AVG time taken per episode:", total_epochs / episodes,
  "AVG penalties per episode:", total_penalties / episodes,
)


('AVG time taken per episode:', 2704.06, 'AVG penalties per episode:', 877.84)