In [6]:
import gymnasium as gym
import numpy as np
import time
import random
from IPython.display import clear_output

In [7]:

# Build the Taxi-v3 environment with text ("ansi") rendering, then keep the
# object stored in the `.env` attribute of what gym.make returns.
# NOTE(review): presumably `.env` drops the outermost wrapper that gym.make
# adds -- confirm which wrapper that is for the installed gymnasium version.
_made_env = gym.make("Taxi-v3", render_mode="ansi")
env = _made_env.env

# Reset once so there is a state to draw, then print the rendered grid.
env.reset()
print(env.render())


+---------+
|[34;1mR[0m: | : :G|
| : |[43m [0m: : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+




In [42]:
# Single seed for every source of randomness used during training, so that
# running the notebook top to bottom on a fresh kernel reproduces the results.
SEED = 42

# Initialize the q-table with zero values:
# one row per discrete state, one column per action.
q_table = np.zeros([env.observation_space.n, env.action_space.n])

# Hyperparameters
learning_rate = 0.7   # weight given to the new estimate in each Q update
discount_rate = 0.7   # multiplier applied to the best next-state Q-value

num_episodes = 100000
max_steps_per_episode = 1000  # upper bound on steps taken in one episode

# Epsilon-greedy schedule: epsilon starts at `exploration_rate` and is decayed
# exponentially from the max towards the min as episodes progress.
exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.05
exploration_decay_rate = 0.0005

# Random generators: seed each one explicitly.
rng = np.random.default_rng(SEED)   # NumPy generator
random.seed(SEED)                   # stdlib `random` (random.uniform draws)
env.action_space.seed(SEED)         # env.action_space.sample() draws
# Seeding through reset() seeds the environment's own generator; later
# env.reset() calls without a seed continue from that seeded stream.
env.reset(seed=SEED)

In [46]:
# Total reward collected in each episode, in the order the episodes were played.
rewards_all_episodes = []

# Tabular Q-learning with an epsilon-greedy policy.
for episode in range(num_episodes):
    # Fresh episode: reset the environment and the running totals.
    observation, info = env.reset()
    step, rewards_current_episode = 0, 0

    for step in range(max_steps_per_episode):
        # Draw a uniform number between 0 and 1 and compare it to epsilon.
        exploration_rate_threshold = random.uniform(0, 1)

        if exploration_rate_threshold <= exploration_rate:
            # Explore: sample a random action.
            action = env.action_space.sample()
        else:
            # Exploit: take the action with the largest Q-value in this state.
            action = np.argmax(q_table[observation, :])

        # Apply the action and observe the next state (s') and the reward (r).
        new_observation, reward, terminated, truncated, info = env.step(action)

        # Q(s,a) := (1 - lr) * Q(s,a) + lr * (r + gamma * max_a' Q(s',a'))
        best_next_value = np.max(q_table[new_observation, :])
        td_target = reward + discount_rate * best_next_value
        q_table[observation, action] = (
            q_table[observation, action] * (1 - learning_rate)
            + learning_rate * td_target
        )

        rewards_current_episode += reward

        # The next state becomes the current state.
        observation = new_observation

        # Stop stepping once the environment reports the episode is over.
        if terminated or truncated:
            break

    # Decay epsilon exponentially in the episode index, towards the minimum
    # (less and less exploration as training progresses).
    decay_factor = np.exp(-exploration_decay_rate * episode)
    exploration_rate = (
        min_exploration_rate
        + (max_exploration_rate - min_exploration_rate) * decay_factor
    )
    rewards_all_episodes.append(rewards_current_episode)

# Calculate and print the average reward per block of ten thousand episodes.
# The block size lives in one constant and the rewards are sliced on integer
# boundaries, so a trailing partial block (when the number of episodes is not
# a multiple of the block size) is kept and averaged over its own length.
episodes_per_report = 10000
rewards_array = np.array(rewards_all_episodes)
rewards_per_ten_thousand_episodes = [
    rewards_array[start:start + episodes_per_report]
    for start in range(0, len(rewards_array), episodes_per_report)
]
count = 0

print("********Average reward per ten thousand episodes********\n")
for r in rewards_per_ten_thousand_episodes:
    # Label each row with the number of episodes completed so far.
    count += len(r)
    print(count, ": ", str(float(r.mean())))
    


********Average reward per thousand episodes********

10000 :  -64.63250000000065
20000 :  5.241500000000036
30000 :  5.297300000000056
40000 :  5.3533000000000195
50000 :  5.308900000000062
60000 :  5.304100000000031
70000 :  5.34470000000003
80000 :  5.34550000000002
90000 :  5.362300000000025
100000 :  5.30130000000001


In [45]:

# Replay three episodes, always taking the highest-valued action from q_table,
# and animate the text rendering in the cell output.
for episode in range(3):
    observation, info = env.reset()
    print("********EPISODE ", episode+1, "********\n\n\n\n")
    time.sleep(1)

    # At most 25 steps are shown per episode.
    for step in range(25):
        # Redraw the current grid in place of the previous frame.
        clear_output(wait=True)
        print(env.render())
        time.sleep(0.3)

        # Greedy action for the current state.
        action = np.argmax(q_table[observation, :])
        new_observation, reward, terminated, truncated, info = env.step(action)

        # Episode still running: advance to the next state and keep going.
        if not (terminated or truncated):
            observation = new_observation
            continue

        # Episode over: show the final frame and report how it ended.
        clear_output(wait=True)
        print(env.render())
        if terminated:
            print("****You reached the goal!****")
        else:
            print("****Something else happened!****")
        time.sleep(3)
        if not terminated:
            # Only the non-goal ending wipes the output afterwards.
            clear_output(wait=True)
        break
        

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)

****You reached the goal!****
