In [6]:
import gym
import numpy as np

env = gym.make("Taxi-v3")
env.s = 89 # set state to 89
env.render()

+---------+
|R: | : :[35mG[0m|
| : |[43m [0m: : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



In [7]:
state_size = env.observation_space.n
action_size = env.action_space.n

q_table = np.zeros((state_size, action_size))

FILE_SAVE = "q_table.npy"
total_episodes = 5000         # Total episodes
total_test_episodes = 100     # Total test episodes
max_steps = 99                # Max steps per episode

learning_rate = 0.7           # Learning rate
discount_rate = 0.95         # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.01             # Exponential decay rate for exploration prob

In [9]:
state_size

500

In [10]:
for episode in range(total_episodes):
    state = env.reset()
    done = False
    for step in range(max_steps):
        epsilon = min(min_epsilon, epsilon*decay_rate)
        # check if we exploit or not
        if np.random.rand() < epsilon:
            # exploit
            action = np.argmax(q_table[state])
        else:
            action = np.random.randint(0, action_size)
        # get the reward and the next state
        new_state, reward, done, _ = env.step(action)
        # update the q_table via Bellman equation
        update = reward + discount_rate*q_table[new_state].max() - q_table[state,action]
        q_table[state,action] = q_table[state,action] + learning_rate*update
        state = new_state
        if done:
            break
    if episode % 1e3 == 0:
        print("done episode ", episode)

done episode  0
done episode  1000
done episode  2000
done episode  3000
done episode  4000


In [40]:
np.save(FILE_SAVE, q_table)

In [None]:
import time
from google.colab import output

env.reset()
rewards = []

for episode in range(total_test_episodes):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    #print("****************************************************")
    #print("EPISODE ", episode)

    for step in range(max_steps):
        # UNCOMMENT IT IF YOU WANT TO SEE OUR AGENT PLAYING
        #env.render()
        # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(q_table[state,:])

        new_state, reward, done, info = env.step(action)
        env.render()
        total_rewards += reward
        print(total_rewards)
        time.sleep(1)
        output.clear()

        if done:
            rewards.append(total_rewards)
            #print ("Score", total_rewards)
            break
        state = new_state
env.close()
print ("Score over time: " +  str(sum(rewards)/total_test_episodes))