In [1]:
#importing necessary libraries

import numpy as np
import gym
import random

In [2]:
# Create the Taxi-v3 environment; every later cell interacts with it through `env`.
env = gym.make("Taxi-v3")

In [3]:
# Render the environment once to preview the text grid that env.render() prints.
env.render()

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+



In [4]:
# Read the sizes of the discrete action and observation spaces;
# the Q-table below is dimensioned from these two numbers.
action_size, state_size = env.action_space.n, env.observation_space.n

print("Action space :", action_size)
print("State size : ", state_size)

Action space : 6
State size :  500


In [5]:
# Build the Q-table: one row per state, one column per action,
# every entry starting at zero.
q_table = np.zeros(shape=(state_size, action_size))
print(q_table)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [6]:
# --- Run lengths ---
total_epochs = 2000        # number of training episodes
total_test_epochs = 100    # number of evaluation episodes
max_steps = 100            # cap on steps taken inside a single episode

# --- Q-learning update ---
lr = 0.81                  # learning rate applied to each Q-value correction
gamma = 0.96               # discount applied to the best next-state value

# --- Epsilon-greedy exploration schedule ---
epsilon = 0.9              # starting probability threshold for a random action
max_epsilon = 1.0          # upper bound used by the decay formula
min_epsilon = 0.01         # floor that epsilon decays towards
decay_rate = 0.1           # exponential decay rate per training episode

In [7]:
# Train the Q-table with epsilon-greedy Q-learning, one episode per iteration.

for epochs in range(total_epochs):

    # Start a fresh episode
    state = env.reset()
    done = False

    # Take at most max_steps transitions in this episode
    for step in range(max_steps):

        # Draw a number in [0, 1] to decide between exploring and exploiting
        exp_exp_tradeoff = random.uniform(0,1)

        # Above epsilon: exploit by taking the best known action for this state
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(q_table[state,:])

        # Otherwise: explore with a uniformly sampled action
        else:
            action = env.action_space.sample()

        # Apply the action and observe the outcome
        new_state, reward, done, info = env.step(action)

        # A terminal state has no future return, so the target must not
        # bootstrap from it. For Taxi-v3 this should leave the learned values
        # unchanged (terminal rows are never updated and stay zero -- TODO
        # confirm), but it keeps the update correct for other environments.
        # An episode that was only cut short by gym's TimeLimit wrapper (which
        # older gym versions flag in `info`) is not truly terminal and still
        # bootstraps.
        terminal = done and not info.get("TimeLimit.truncated", False)
        best_next = 0.0 if terminal else np.max(q_table[new_state, :])

        # Q-learning update:
        # Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
        q_table[state, action] += lr * (reward + gamma * best_next - q_table[state, action])

        # Move on to the next state
        state = new_state

        if done:
            break

    # Decay epsilon exponentially towards min_epsilon so that later episodes
    # explore less and exploit the learned Q-values more
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate*epochs)

In [8]:
# Evaluate the learned greedy policy: play total_test_epochs episodes,
# record each episode's score, and report the average.
reward_list = []

for epochs in range(total_test_epochs):

    # Begin a new episode; the score accumulator restarts from zero each time
    # so that every entry in reward_list is a per-episode score, not a running sum
    state = env.reset()
    done = False
    total_reward = 0
    print('=========================')
    print('EPISODE: ', epochs)

    # Play at most max_steps steps
    for step in range(max_steps):

        env.render()

        # Always take the highest-valued action for the current state
        action = np.argmax(q_table[state,:])

        state, reward, done, info = env.step(action)

        # Accumulate this episode's reward
        total_reward += reward

        if done:
            break

    # Record the score even when the episode ran out of steps, so the
    # average below is taken over every evaluated episode
    reward_list.append(total_reward)
    print('Score: ', total_reward)

env.close()

# Average per-episode score; skipped when no episode was played
if reward_list:
    print('Score Over Time: {}'.format(sum(reward_list)/len(reward_list)))

EPISODE:  0
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : :[43m [0m|
| | : | : |
|Y| : |[35mB[0m: |
+---------+

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|[34;1m[43mR[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|[42mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
|[

  (East)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[42mB[0m: |
+---------+
  (Pickup)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : |[42m_[0m: |
|Y| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[42m_[0m: |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : :[42m_[0m: : |
| | : | : |
|Y| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| :[42m_[0m: : : |
| | : | : |
|Y| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| :[42m_[0m| : : |
| : : : : 

EPISODE:  96
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |B: |
+---------+

+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : :[43m [0m|
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :[34;1m[43mG[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: | : :[42mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Pickup)
+---------+
|[35mR[0m: |