In [7]:
#set imports
import gym
import numpy as np
from numpy import random
import matplotlib.pyplot as plt
from IPython.display import clear_output

In [8]:
env = gym.make('Taxi-v3')

In [9]:
episodes = 11

for episode in range(1, episodes):
    state = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        state, reward, done, info = env.step(env.action_space.sample())
        score += reward
        clear_output(wait = True)
    print('Episode: {}\nScore: {}'.format(episode, score))
env.close()

Episode: 10
Score: -740


In [10]:
#creating Q-Table
actions = env.action_space.n
state = env.observation_space.n

q_table = np.zeros((state, actions))

In [11]:
#q learning params
num_episodes = 10000
max_steps_per_episode = 100

learning_rate = 0.1
discount_rate = 0.99

exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001

rewards_all_episodes = []


In [13]:
# Q-Learning Alg
for episode in range(num_episodes):
    state = env.reset()
    done = False
    rewards_current_episode = 0
    
    for step in range(max_steps_per_episode):
        
        # Exploartion vs Exploitation trade-off
        exploration_threshold = random.uniform(0,1)
        if exploration_threshold > exploration_rate:
            action = np.argmax(q_table[state,:])
        else:
            action = env.action_space.sample()
            
        new_state, reward, done, info = env.step(action)
        
        #update q table
        q_table[state,action] = q_table[state, action] * (1-learning_rate) + learning_rate * (reward + discount_rate * np.max(q_table[new_state,:]))
            
        state = new_state
        
        rewards_current_episode += reward
        
        if done == True:
            break
        
        exploration_rate = min_exploration_rate + \
                            (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)
        
        rewards_all_episodes.append(rewards_current_episode)
        
print("********** Training Finished **********")
        

********** Training Finished **********


In [14]:
q_table

array([[  0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ],
       [ -2.03281873,   0.28866161,  -0.04025509,   0.59282459,
          9.6220697 ,  -7.14733933],
       [  0.7925925 ,   4.98979894,   1.74679221,   1.54609564,
         14.11880599,  -6.1018764 ],
       ...,
       [ -1.49970105,   1.50758142,  -1.57127289,  -1.59153739,
         -8.84464263,  -7.22714369],
       [ -2.91542406,  -2.87047607,  -2.9620968 ,   0.68699843,
        -10.74749365,  -7.60449797],
       [  3.77442993,   2.86954838,  -0.18992349,  18.78315284,
          0.32545457,  -1.89086279]])

In [15]:
#calculate and pring average reward per thousand episodes
rewards_per_thousand_episodes = np.split(np.array(rewards_all_episodes), (num_episodes/1000))
count = 1000

print('Average per thousand episodes')
for r in rewards_per_thousand_episodes:
    print(count, ' : ' , str(sum(r/1000)))
    count += 1000
    

Average per thousand episodes
1000  :  -3955.775000000005
2000  :  -3410.2620000000056
3000  :  -2782.786000000002
4000  :  -2178.5899999999774
5000  :  -1350.4909999999668
6000  :  -348.0239999999623
7000  :  -173.89299999999443
8000  :  -158.11399999999819
9000  :  -157.14399999999821
10000  :  -156.71399999999807


In [24]:
#visualize agent
import time

for episode in range(3):
    state = env.reset()
    done = False
    print('episode is:' + str(episode))
    time.sleep(1)
    
    for step in range(max_steps_per_episode):
        clear_output(wait=True)
        env.render()
        time.sleep(0.4)
        action = np.argmax(q_table[state, :])
        
        new_state, reward, done, info= env.step(action)
        
        if done:
            clear_output(wait=True)
            env.render()
            if reward == 1:
                print('***** Reached Goal *****')
                time.sleep(2)
                clear_output(wait=True)
            else:
                print('***** Failed *****')
                time.sleep(2)
                clear_output(wait=True)
            break
                
        state = new_state
env.close()

                

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)
***** Failed *****
