In [3]:
import gymnasium as gym
import numpy as np
import time
import random
from IPython.display import clear_output

In [14]:
# Build the 4x4 slippery FrozenLake task with text ('ansi') rendering.
# NOTE(review): `.env` takes the object wrapped by whatever gym.make returns
# (presumably dropping the outermost wrapper, e.g. a step limit) - confirm for
# the installed gymnasium version.
# Alternative task kept for experimentation:
#   gym.make("Taxi-v3", render_mode='ansi').env
env = gym.make(
    'FrozenLake-v1',
    desc=None,
    map_name="4x4",
    is_slippery=True,
    render_mode='ansi',
).env

# Put the agent on the start tile and show the initial board.
env.reset()
print(env.render())



[41mS[0mFFF
FHFH
FFFH
HFFG



In [15]:


# Hyperparameters

# -- Training schedule --
num_episodes = 10_000
max_steps_per_episode = 100

# -- Q-learning update --
learning_rate = 0.1    # weight given to the new estimate in each Q update
discount_rate = 0.99   # multiplier applied to the best next-state Q value

# -- Epsilon-greedy exploration --
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_rate = max_exploration_rate   # start fully exploratory
exploration_decay_rate = 0.001



In [16]:

def train_q_learn(learning_rate, discount_rate, num_episodes, max_steps_per_episode, exploration_rate, max_exploration_rate, min_exploration_rate, exploration_decay_rate, *, environment=None, report_every=10000, seed=None):
    """Train a tabular Q-learning agent with an epsilon-greedy policy.

    Parameters
    ----------
    learning_rate : float
        Weight of the new estimate in each Q update.
    discount_rate : float
        Multiplier applied to the best Q value of the next state.
    num_episodes : int
        Number of training episodes to run.
    max_steps_per_episode : int
        Upper bound on the number of steps taken in a single episode.
    exploration_rate : float
        Epsilon used during the first episode only; it is recomputed from the
        decay schedule at the end of every episode.
    max_exploration_rate, min_exploration_rate : float
        Bounds of the exponential epsilon schedule.
    exploration_decay_rate : float
        Decay constant of the schedule
        ``min + (max - min) * exp(-decay * episode)``.
    environment : optional, keyword-only
        Object exposing ``observation_space.n``, ``action_space.n``,
        ``action_space.sample()``, ``reset()`` and ``step(action)``. When
        omitted, the notebook-level ``env`` is used.
    report_every : int, keyword-only
        Size of the episode buckets used for the printed reward averages.
        A final, shorter bucket is reported when ``num_episodes`` is not a
        multiple of this value.
    seed : int or None, keyword-only
        When given, seeds the epsilon draws, the action space sampler and the
        first environment reset so that a run can be repeated.

    Returns
    -------
    numpy.ndarray
        Q-table of shape ``(observation_space.n, action_space.n)``.

    Raises
    ------
    ValueError
        If ``report_every`` is smaller than 1.
    """
    if report_every < 1:
        raise ValueError("report_every must be a positive integer")
    if environment is None:
        # Fall back to the environment created in an earlier notebook cell.
        environment = env

    # One generator drives every epsilon draw.
    rng = np.random.default_rng(seed)
    if seed is not None:
        environment.action_space.seed(seed)

    # Initialize the q-table with zero values
    q_table = np.zeros([environment.observation_space.n, environment.action_space.n])
    rewards_all_episodes = []

    for episode in range(num_episodes):
        # Seed the environment's own randomness once, on the first reset only.
        if seed is not None and episode == 0:
            observation, _ = environment.reset(seed=seed)
        else:
            observation, _ = environment.reset()
        rewards_current_episode = 0

        for _ in range(max_steps_per_episode):
            # Epsilon-greedy: exploit the best known action when the draw
            # exceeds epsilon, otherwise explore with a random action.
            if rng.random() > exploration_rate:
                action = int(np.argmax(q_table[observation]))
            else:
                action = environment.action_space.sample()

            # Take the action (a) and observe the outcome state (s') and reward (r)
            new_observation, reward, terminated, truncated, _ = environment.step(action)

            # Q(s,a) := (1 - lr) * Q(s,a) + lr * [r + gamma * max_a' Q(s',a')]
            best_next_value = np.max(q_table[new_observation])
            q_table[observation, action] = (
                q_table[observation, action] * (1 - learning_rate)
                + learning_rate * (reward + discount_rate * best_next_value)
            )

            rewards_current_episode += reward
            observation = new_observation

            # The episode is over once the environment reports an end state.
            if terminated or truncated:
                break

        # Reduce epsilon (because we need less and less exploration)
        exploration_rate = min_exploration_rate + (
            max_exploration_rate - min_exploration_rate
        ) * np.exp(-exploration_decay_rate * episode)
        rewards_all_episodes.append(rewards_current_episode)

    # Report the mean episode reward for each bucket of `report_every` episodes.
    # Plain slicing (instead of np.split) copes with any episode count,
    # including counts smaller than, or not a multiple of, the bucket size.
    rewards = np.asarray(rewards_all_episodes, dtype=float)
    print(f"********Average reward per {report_every} episodes********\n")
    for start in range(0, len(rewards), report_every):
        bucket = rewards[start:start + report_every]
        print(start + len(bucket), ": ", str(bucket.mean()))
    return q_table
    


In [20]:
def visualise(q_table):
    """Replay three episodes in the notebook output, always acting greedily.

    At every step the current board is rendered, then the action with the
    highest value in ``q_table`` for the current state is taken. When an
    episode ends, the final board is shown together with a message that
    depends on whether the last reward was positive.

    Uses the notebook-level ``env`` and ``max_steps_per_episode``.
    """
    for episode_index in range(3):
        state, _ = env.reset()
        print("********EPISODE ", episode_index+1, "********\n\n\n\n")
        time.sleep(1)

        for _ in range(max_steps_per_episode):
            # Redraw the board in place before acting.
            clear_output(wait=True)
            print(env.render())
            time.sleep(0.3)

            greedy_action = np.argmax(q_table[state,:])
            next_state, reward, terminated, truncated, _ = env.step(greedy_action)

            if not (terminated or truncated):
                # Episode still running: advance and keep stepping.
                state = next_state
                continue

            # Episode finished: show the last frame and the outcome.
            clear_output(wait=True)
            print(env.render())
            reached_goal = reward > 0
            if reached_goal:
                print("****You reached the goal!****")
            else:
                print("****Something else happened!****")
            time.sleep(3)
            if not reached_goal:
                clear_output(wait=True)
            break
        

In [18]:
# Train the agent with the hyperparameters from the configuration cell.
q_table = train_q_learn(
    learning_rate=learning_rate,
    discount_rate=discount_rate,
    num_episodes=num_episodes,
    max_steps_per_episode=max_steps_per_episode,
    exploration_rate=exploration_rate,
    max_exploration_rate=max_exploration_rate,
    min_exploration_rate=min_exploration_rate,
    exploration_decay_rate=exploration_decay_rate,
)


********Average reward per thousand episodes********

10000 :  0.07970000000000126
20000 :  0.1319000000000018
30000 :  0.1355000000000014
40000 :  0.12870000000000215
50000 :  0.13860000000000106
60000 :  0.13470000000000149
70000 :  0.12770000000000226
80000 :  0.13600000000000134
90000 :  0.14690000000000014
100000 :  0.13840000000000108


In [21]:
# Watch the trained greedy policy play a few episodes.
visualise(q_table=q_table)

  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG

****Something else happened!****
