# Step 0: Import the dependencies

In [1]:
import numpy as np 
import gym
import random 

# Step 1: Create the environment

In [2]:
# Instantiate the environment registered under the id "FrozenLake-v0";
# the cells below read its spaces and step through it via `env`.
env = gym.make("FrozenLake-v0")

# Step 2: Create the Q-table and initialize it

In [4]:
# Q-table dimensions: one row per discrete state, one column per discrete action.
state_size = env.observation_space.n
action_size = env.action_space.n

In [5]:
# Start from an all-zero table of shape (state_size, action_size);
# the training loop updates its entries in place.
qtable = np.zeros(shape=(state_size, action_size))
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


# Step 3: Create the hyperparameters

In [6]:
# Training schedule
total_episodes = 15000   # number of episodes run by the training loop
max_steps = 99           # upper bound on steps taken within one episode

# Q-learning update
learning_rate = 0.8      # weight given to the TD error in each Q-table update
gamma = 0.95             # discount applied to the best next-state value

# Exploration parameters
max_epsilon = 1.0        # epsilon at the start of the decay schedule
min_epsilon = 0.01       # floor that the decayed epsilon approaches
decay_rate = 0.005       # exponential decay rate, applied per episode index
epsilon = max_epsilon    # current probability of taking a random action

# Step 4: The Q-learning algorithm

In [9]:
# Train the Q-table with epsilon-greedy Q-learning.
# Reads: env, qtable, total_episodes, max_steps, learning_rate, gamma,
#        max_epsilon, min_epsilon, decay_rate.
# Writes: qtable (updated in place), epsilon, rewards.

# Per-episode total reward, collected to report the average at the end.
rewards = []

# Restart the exploration schedule so that re-running this cell does not
# begin with the decayed epsilon left behind by a previous run.
epsilon = max_epsilon

for episode in range(total_episodes):
    # Reset the environment
    state = env.reset()
    done = False
    total_rewards = 0

    for step in range(max_steps):
        # Exploration/exploitation trade-off: draw a number in [0, 1];
        # above epsilon -> exploit the best known action for this state,
        # otherwise -> explore with a randomly sampled action.
        exp_exp_tradeoff = random.uniform(0, 1)

        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Q(s, a) := Q(s, a) + lr * [R + gamma * max_a' Q(s', a') - Q(s, a)]
        td_target = reward + gamma * np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])
        total_rewards += reward

        state = new_state

        # Stop the episode as soon as the environment reports it is over.
        if done:
            break

    # Decay epsilon exponentially from max_epsilon toward min_epsilon,
    # driven by the index of the episode that just finished.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(total_rewards)

print("Score over time: " + str(sum(rewards) / total_episodes))
print(qtable)

Score over time: 0.48593333333333333
[[3.99133680e-02 1.82501304e-01 8.96079987e-02 5.21185212e-02]
 [5.20128005e-03 1.77933759e-02 6.45975520e-03 5.73594992e-02]
 [2.00758941e-02 1.23758749e-02 1.39490888e-02 2.83465531e-02]
 [3.43995372e-03 7.15368032e-03 2.90200626e-03 1.68910587e-02]
 [4.20762234e-01 2.50919233e-02 2.28544619e-02 3.02623736e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.14917934e-02 2.74201359e-06 2.78675406e-05 1.08670944e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [6.43493400e-03 2.48547600e-04 2.03566725e-02 7.43490650e-01]
 [1.95845920e-02 8.41516512e-01 6.43419378e-03 8.88554108e-03]
 [5.79564143e-01 1.14826399e-03 6.56442835e-04 5.77302177e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.18381617e-04 1.20238887e-01 9.35635161e-01 9.76723005e-02]
 [1.38240547e-01 9.98375613e-01 2.29324860e-01 1.70635665e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]]

# Step 5: Use our Q-table to play FrozenLake

In [10]:

# Play a few episodes acting greedily on the learned Q-table (no exploration).
# Reads: env, qtable, max_steps. Prints one summary per episode.
for episode in range(5):
    state = env.reset()
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        # Always take the highest-valued action for the current state.
        action = np.argmax(qtable[state, :])

        new_state, reward, done, info = env.step(action)

        if done:
            # Render only the final frame of the episode.
            env.render()
            # `step` is a zero-based index, so the count of steps taken is step + 1.
            print("Number of steps", step + 1)
            break
        state = new_state
    else:
        # The loop used up max_steps without the environment reporting `done`;
        # say so instead of ending the episode silently.
        print("Episode not finished within", max_steps, "steps")
env.close()

****************************************************
EPISODE  0
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 96
****************************************************
EPISODE  1
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Number of steps 76
****************************************************
EPISODE  2
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 18
****************************************************
EPISODE  3
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 37
****************************************************
EPISODE  4
