In [1]:
import numpy as np
import time 
import gymnasium as gym

In [2]:
# Create the FrozenLake environment (deterministic moves, rendered in a window)
env = gym.make(
    'FrozenLake-v1',
    render_mode='human',
    is_slippery=False,
)

In [3]:
# Initialize the Q-table with zeros: one row per state, one column per action
q_table = np.zeros(
    shape=(env.observation_space.n, env.action_space.n),
)

In [5]:
# Hyperparameters for the algorithm
EPOCHS = 10_000  # how many episodes the agent will play
ALPHA = 0.8      # the learning rate
GAMMA = 0.9      # the discount factor

In [7]:
# Exploration vs. exploitation parameters

epsilon = 1         # exploration rate (starting value)
min_epsilon = 0.01  # minimum probability of exploration (constant)
max_epsilon = 1     # maximum probability of exploration (constant)

decay_rate = 0.001  # exponential rate at which the exploration probability shrinks

In [11]:
# Epsilon-greedy action selection
def greedy_action_selection(q_table,epsilon,state):
    """Choose an action for `state`, exploring or exploiting based on `epsilon`.

    A single uniform draw from np.random.random() is compared against
    `epsilon`. When `epsilon` is strictly below the draw, the action with the
    largest Q-value in row `state` of `q_table` is returned; otherwise a
    random action is sampled from the module-level `env.action_space`.
    """
    # One draw decides which branch is taken
    draw = np.random.random()
    should_explore = not (epsilon < draw)

    if should_explore:
        # Explore: take a random action from the action space
        return env.action_space.sample()

    # Exploit: index of the highest-valued action in this state's row
    return np.argmax(q_table[state,:])

In [9]:
# Q-learning update for a single (state, action) entry
def compute_next_q_value(old_q_value, next_opt_q_value, reward):
    """Return the updated Q-value for one transition.

    Moves `old_q_value` towards the target `reward + GAMMA * next_opt_q_value`
    by a fraction ALPHA of the difference. ALPHA and GAMMA are read from the
    module-level hyperparameters.
    """
    target = reward + GAMMA*next_opt_q_value
    return old_q_value + ALPHA*(target - old_q_value)

In [10]:
def epsilon_reduction(epsilon, epoch):
    """Return the exploration rate to use after `epoch` finished episodes.

    The rate decays exponentially from `max_epsilon` towards `min_epsilon`:

        min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * epoch)

    Parameters
    ----------
    epsilon : float
        Current exploration rate. Kept so existing calls keep working; the
        schedule depends only on `epoch`, so this value is not used.
    epoch : int
        Number of episodes completed so far.

    Returns
    -------
    float
        The new exploration rate.
    """
    # The exponent uses the fixed decay_rate. Using the current epsilon here
    # (as before) made the schedule feed back on itself and left decay_rate unused.
    epsilon_new = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*epoch)
    return epsilon_new

In [None]:
# Implementing the Q-learning algorithm
# NOTE(review): `env` is created with render_mode='human'; if that draws every
# step, EPOCHS episodes will be slow -- consider training on a non-rendering env.
rewads = []  # total reward collected in each episode (name kept as-is)

for episode in range(EPOCHS):
    initial_state = env.reset()[0]
    total_rewards = 0
    terminate = False
    truncate = False

    # An episode is over once the environment reports termination or truncation
    while not (terminate or truncate):
        # choose an action
        action = greedy_action_selection(q_table, epsilon, initial_state)
        # perform that action; the flags must be stored in the names tested by
        # the loop condition, otherwise the loop never ends
        next_state, reward, terminate, truncate, info = env.step(action)
        # retrieve the old Q-value
        old_q_value = q_table[initial_state,action]
        # best Q-value reachable from the next state
        next_opt_q_value = np.max(q_table[next_state,:])
        # compute the updated Q-value
        new_q_value = compute_next_q_value(old_q_value, next_opt_q_value, reward)

        # update the Q-table
        q_table[initial_state, action] = new_q_value

        # the next state becomes the current state
        initial_state = next_state

        # accumulate the reward received on this step
        total_rewards += reward

    # Epsilon reduction (episodes are counted from 1)
    epoch = episode +1
    epsilon = epsilon_reduction(epsilon ,epoch)
    rewads.append(total_rewards)

env.close()

In [None]:
# Exploiting the Q-table so that the agent can play the game greedily
# The training cell closes `env`, so open a fresh rendering environment here.
env = gym.make('FrozenLake-v1', render_mode = 'human', is_slippery = False)

try:
    state = env.reset()[0]

    # Cap the rollout at 100 steps in case the learned policy never finishes.
    # No explicit env.render() call: in 'human' mode the environment is
    # expected to draw itself on reset()/step().
    for i in range(100):
        # Greedy action: best Q-value in the row of the *current* state
        action = np.argmax(q_table[state, :])
        next_state, reward, terminate, truncate, info = env.step(action)
        time.sleep(.01)

        # Advance to the new state so the next lookup uses the right row
        state = next_state

        if terminate or truncate:
            break
finally:
    # Release the rendering window even if a step fails
    env.close()