# Teach a Taxi to pick up and drop off passengers at the right locations with Reinforcement Learning

In [32]:
!pip install gym==0.21

Collecting gym==0.21
  Using cached gym-0.21.0-py3-none-any.whl
Installing collected packages: gym
  Attempting uninstall: gym
    Found existing installation: gym 0.20.0
    Uninstalling gym-0.20.0:
      Successfully uninstalled gym-0.20.0
Successfully installed gym-0.21.0


In [33]:
import pickle, os

import gym
import numpy as np
import pandas as pd

In [41]:
# Create the Taxi-v3 environment used throughout the rest of the notebook.
env = gym.make("Taxi-v3")

In [42]:
# reset() starts an episode and returns the initial observation (a single integer, shown below).
state = env.reset()

In [43]:
state

346

In [44]:
env.observation_space.n

500

In [45]:
env.render()

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | :[43m [0m| : |
|[35mY[0m| : |B: |
+---------+



In [46]:
state

346

<h1>Possible Actions</h1>

The agent has six discrete actions: down/south (0), up/north (1), right/east (2), left/west (3), pick-up (4), and drop-off (5).

In [47]:
# Sizes of the discrete action and observation spaces; they give the Q-table its shape.
n_actions = env.action_space.n
n_states = env.observation_space.n

In [48]:
n_actions

6

In [49]:
n_states

500

In [50]:
# Manually force the underlying environment into state 150.
# `unwrapped` reaches the base environment no matter how many wrappers
# gym.make() applied, whereas `env.env` only peels off a single layer.
# Note: this does not touch the notebook-level `state` variable.
env.unwrapped.s = 150

In [51]:
state

346

In [52]:
# Start a fresh episode; reset() returns the new initial state.
state = env.reset()

In [53]:
state

462

In [54]:
env.render()

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[43mB[0m: |
+---------+



In [55]:
# Action 3 is "left"; step() returns (next_state, reward, done, info).
# The result is only displayed here — it is not stored in `state`.
env.step(3)

(462, -1, False, {'prob': 1.0})

In [56]:
env.render()

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[43mB[0m: |
+---------+
  (West)


In [57]:
state

462

<h1>How well does a completely random policy perform?</h1>

In [58]:
# Fresh episode plus bookkeeping for the random-agent experiment.
state = env.reset()
counter = g = 0   # steps taken so far / cumulative reward so far
reward = None     # nothing observed yet, so the `reward != 20` loop is entered

In [59]:
env.render()

+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
| : : : :[43m [0m|
| | : | : |
|Y| : |B: |
+---------+



In [60]:
# Act uniformly at random until a single step yields a reward of 20.
# NOTE(review): presumably 20 is the reward for a successful drop-off — confirm against the Taxi-v3 docs.
# NOTE(review): `done` is never checked, so stepping continues even if the env signals the episode ended.
while reward != 20:
    state, reward, done, info = env.step(env.action_space.sample())
    counter += 1  # number of environment steps taken
    g += reward   # cumulative (undiscounted) reward

In [61]:
# Report how many random steps were needed and the reward accumulated along the way.
print(f"Solved in {counter} Steps with a total reward of {g}")

Solved in 8440 Steps with a total reward of -33907


## Let's look at just one episode and see how the Q values change after each step using the formula below

$$Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \max_{a'} Q(s', a') - Q(s, a) \right]$$

In [62]:
# Q-table with one row per state and one column per action, initialised to zero.
Q = np.zeros((n_states, n_actions))

In [63]:
Q

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [64]:
Q.shape

(500, 6)

In [65]:
episodes = 1    # run a single episode
G = 0           # cumulative reward; re-initialised inside the episode loop
alpha = 0.618   # step size applied to each Q-value update

In [66]:
# Run `episodes` greedy episodes, updating Q in place after every step.
for episode in range(1, episodes + 1):
    done = False
    G = 0
    reward = 0
    state = env.reset()
    firstState = state  # remembered so the starting state can be inspected afterwards
    print(f"Initial State = {state}")
    # Keep stepping until a single step pays out a reward of 20.
    while reward != 20:
        # Greedy choice: index of the largest Q-value for the current state.
        action = Q[state].argmax()
        state2, reward, done, info = env.step(action)
        # Move Q[state, action] a fraction `alpha` towards reward + best next-state value.
        td_error = reward + Q[state2].max() - Q[state, action]
        Q[state, action] += alpha * td_error
        G += reward
        state = state2

Initial State = 87


In [67]:
finalState = state

## Let's look at the initial state:

In [68]:
firstState

87

## Let's look at the final state:

In [69]:
finalState

475

In [70]:
Q

array([[ 0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ],
       [ 0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ],
       [ 0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ],
       ...,
       [ 0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ],
       [ 0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ],
       [-1.236, -1.236, -0.618, -0.618, -6.18 , -6.18 ]])

## Let's run over multiple episodes so that we can converge on an optimal policy

In [71]:
# Start the long training run from a blank Q-table and an empty reward log.
episodes = 10000
rewardTracker = []
Q = np.zeros((n_states, n_actions))

In [72]:
G = 0        # cumulative reward; re-initialised at the start of every episode
alpha = 0.1  # step size applied to each Q-value update

In [73]:
# Train for `episodes` episodes with greedy action selection and in-place Q-table updates.
for episode in range(1, episodes + 1):
    done = False
    G, reward = 0, 0
    state = env.reset()
    while not done:
        # Greedy action for the current state (ties resolve to the lowest index).
        action = np.argmax(Q[state])
        state2, reward, done, info = env.step(action)
        # Nudge Q[state, action] towards reward + best value of the next state.
        Q[state, action] += alpha * (reward + np.max(Q[state2]) - Q[state, action])
        G += reward
        state = state2

    # Log the episode return; rewardTracker was created for this but never filled.
    rewardTracker.append(G)

    if episode % 100 == 0:
        print('Episode {} Total Reward: {}'.format(episode, G))

Episode 100 Total Reward: -47
Episode 200 Total Reward: -56
Episode 300 Total Reward: -130
Episode 400 Total Reward: -136
Episode 500 Total Reward: 13
Episode 600 Total Reward: -63
Episode 700 Total Reward: 9
Episode 800 Total Reward: 1
Episode 900 Total Reward: 11
Episode 1000 Total Reward: -65
Episode 1100 Total Reward: 12
Episode 1200 Total Reward: 6
Episode 1300 Total Reward: 7
Episode 1400 Total Reward: 9
Episode 1500 Total Reward: 13
Episode 1600 Total Reward: 8
Episode 1700 Total Reward: 11
Episode 1800 Total Reward: 10
Episode 1900 Total Reward: -6
Episode 2000 Total Reward: 9
Episode 2100 Total Reward: 7
Episode 2200 Total Reward: 10
Episode 2300 Total Reward: 7
Episode 2400 Total Reward: 11
Episode 2500 Total Reward: 5
Episode 2600 Total Reward: 8
Episode 2700 Total Reward: 12
Episode 2800 Total Reward: 8
Episode 2900 Total Reward: 9
Episode 3000 Total Reward: 6
Episode 3100 Total Reward: 10
Episode 3200 Total Reward: 4
Episode 3300 Total Reward: 5
Episode 3400 Total Reward: 

In [74]:
Q.shape

(500, 6)

In [75]:
# Show the learned Q-table as a DataFrame (rows: states, columns: actions).
# pandas is imported in the notebook's import cell rather than mid-notebook.
pd.DataFrame(Q)

Unnamed: 0,0,1,2,3,4,5
0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
1,-3.789128,-3.800000,-3.778527,-3.800000,11.0,-4.0
2,-2.476716,-2.500000,-2.464524,-2.500000,15.0,-3.0
3,-3.055359,-3.000000,-3.020012,-3.000000,12.0,-3.0
4,1.534940,-7.500000,-7.454885,-7.500000,-8.0,-8.0
...,...,...,...,...,...,...
495,0.000000,0.000000,0.000000,0.000000,0.0,0.0
496,-2.500000,-2.512134,-2.500000,-2.455456,-3.0,-3.0
497,-1.100000,-1.081813,-1.100000,-1.132567,-2.0,-2.0
498,-2.200000,-2.246531,-2.200000,-2.310856,-3.0,-3.0


## Now that we have learned the optimal Q values, we have an optimal policy and no longer need to train the agent

In [76]:
# Begin a new episode for the evaluation run.
state = env.reset()
done = None  # not yet finished, so the `done != True` loop below is entered

In [77]:
state

288

In [78]:
# Roll out the learned greedy policy until the environment reports the episode is over.
while not done:
    # Pick the action whose Q-value is largest in the current state.
    action = Q[state].argmax()
    state, reward, done, info = env.step(action)
    env.render()

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |B: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |B: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[42mY[0m| : |B: |
+---------+
  (Pickup)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
|[42m_[0m| : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: | : :G|
| : | : 

In [79]:
# Persist the learned Q-table to disk in the current working directory.
with open("smartTaxi_qTable.pkl", 'wb') as f:
    pickle.dump(Q, f)

In [80]:
# Reload the pickled Q-table into `Qtest`.
# Only unpickle files you trust: pickle can execute arbitrary code on load.
with open("smartTaxi_qTable.pkl", 'rb') as f:
    Qtest = pickle.load(f)