In [20]:
import gymnasium as gym
import random

# Seed every source of randomness the notebook uses so that a fresh
# Restart & Run All reproduces the same starting state and the same
# exploration choices.
SEED = 1234
random.seed(SEED)  # Python's `random` drives the explore/exploit coin flip during training

# New versions keep getting released; if -v3 doesn't work, try -v2 or -v4
# NOTE(review): `.env` unwraps the outermost wrapper returned by gym.make;
# presumably that is the episode step limit, so episodes are not truncated.
streets = gym.make("Taxi-v3", render_mode='ansi').env

# The action space and the environment keep their own random generators,
# which `random.seed` does not touch; seed them explicitly.
streets.action_space.seed(SEED)
streets.reset(seed=SEED)
print(streets.render())

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|[35mY[0m| : |B: |
+---------+




-  R, G, B, and Y are pickup or dropoff locations.
-  The BLUE letter indicates where we need to pick someone up from.
-  The MAGENTA letter indicates where that passenger wants to go to.
-  The solid lines represent walls that the taxi cannot cross.
-  The filled rectangle represents the taxi itself - it's yellow when empty, and green when carrying a passenger.

Our little world here, which we've called "streets", is a 5x5 grid. The state of this world at any time can be defined by:

-  Where the taxi is (one of 5x5 = 25 locations)
-  What the current destination is (4 possibilities)
-  Where the passenger is (5 possibilities: at one of the destinations, or inside the taxi)

So there are a total of 25 x 4 x 5 = 500 possible states that describe our world.

For each state, there are six possible actions:

-  Move South, East, North, or West
-  Pickup a passenger
-  Drop off a passenger

Q-Learning will take place using the following rewards and penalties at each state:

-  A successful drop-off yields +20 points
-  Every time step taken yields a -1 point penalty (unless one of the other rewards applies), so shorter trips score better
-  Picking up or dropping off at an illegal location yields a -10 point penalty

Moving across a wall just isn't allowed at all.

Let's define an initial state, with the taxi at location (4, 3) (the B square), the passenger already inside the taxi (passenger index 4), and the destination at location 3 (B):

In [21]:
# Build a specific state by hand and force the environment into it.
# The arguments to encode() are passed positionally: 4, 3, 4, 3.
taxi_env = streets.unwrapped
initial_state = taxi_env.encode(4, 3, 4, 3)

# Overwrite the environment's current state and clear the last action so the
# rendering reflects only the state chosen here.
taxi_env.s = initial_state
taxi_env.lastaction = None

print(streets.render())

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+




In [22]:
# Look up the environment's transition table for the hand-built state.
# The result has one entry per action (0-5); each entry is a list of
# (probability, next_state, reward, done) tuples.
streets.unwrapped.P[initial_state]

{0: [(1.0, 479, -1, False)],
 1: [(1.0, 379, -1, False)],
 2: [(1.0, 499, -1, False)],
 3: [(1.0, 479, -1, False)],
 4: [(1.0, 479, -10, False)],
 5: [(1.0, 475, 20, True)]}

In [23]:
import numpy as np

# One row per state and one column per action; every estimate starts at zero.
n_states = streets.observation_space.n
n_actions = streets.action_space.n
q_table = np.zeros((n_states, n_actions))
q_table

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]], shape=(500, 6))

In [24]:
# Q-learning hyperparameters.
learning_rate = 0.1      # weight given to each new estimate
discount_factor = 0.6    # weight given to the best value of the next state
exploration = 0.1        # probability of taking a random action
epochs = 10000           # number of training episodes

for episode in range(epochs):
    state, _ = streets.reset()
    done = False

    while not done:
        # Epsilon-greedy: occasionally try a random action, otherwise take
        # the action with the highest current estimate for this state.
        explore = random.uniform(0, 1) < exploration
        action = streets.action_space.sample() if explore else np.argmax(q_table[state])

        next_state, reward, terminated, truncated, _ = streets.step(action)
        done = terminated or truncated

        # Blend the old estimate with the newly observed target.
        old_estimate = q_table[state, action]
        best_next = q_table[next_state].max()
        target = reward + discount_factor * best_next
        q_table[state, action] = (1 - learning_rate) * old_estimate + learning_rate * target

        state = next_state

In [25]:
# Learned estimates for the six actions in the hand-built state; the largest
# entry is the action a greedy (argmax) policy would choose there.
q_table[initial_state]

array([10.83532316,  5.55020289,  5.31479569, 10.81319392,  1.98366985,
       20.        ])

In [26]:
from IPython.display import clear_output
from time import sleep

# Watch the trained agent drive ten trips, always taking the action with the
# highest learned value for the current state.
#
# A purely greedy policy can revisit the same states forever if the table has
# not converged for them, and `streets` was created with `.env` (which unwraps
# the outermost wrapper), so do not rely on the environment to cut an episode
# short. The explicit cap keeps the cell from hanging the notebook.
max_steps_per_trip = 200

for tripnum in range(1, 11):
    state, _ = streets.reset()

    done = False
    steps_taken = 0

    while not done and steps_taken < max_steps_per_trip:
        action = np.argmax(q_table[state])
        next_state, reward, terminated, truncated, _ = streets.step(action)
        done = terminated or truncated
        steps_taken += 1

        # Redraw in place so the output reads as an animation.
        clear_output(wait=True)
        print("Trip number:" + str(tripnum))
        print(streets.render())
        sleep(.5)
        state = next_state

    if not done:
        # Make the failure visible instead of silently moving to the next trip.
        print("Gave up after " + str(max_steps_per_trip) + " steps without finishing the trip")

    sleep(.5)


Trip number:1
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[43mY[0m| : |B: |
+---------+
  (South)



KeyboardInterrupt: 