In [2]:
import gym
import numpy as np

# Build the FrozenLake environment that the agent will learn in
env = gym.make('FrozenLake-v1')

# Tabular Q-function: one row per state, one column per action, all zeros to start
num_states, num_actions = env.observation_space.n, env.action_space.n
Q = np.zeros(shape=(num_states, num_actions))

# Q-learning hyperparameters
alpha = 0.8    # learning rate
gamma = 0.9    # discount factor
epsilon = 0.2  # exploration probability

# Running tallies of how often select_action explores vs. exploits
explo = explt = 0

def select_action(state):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    A uniform random number is drawn; if it is below the module-level
    ``epsilon`` the action is sampled from ``env.action_space`` and the
    ``explo`` counter is incremented. Otherwise the index of the largest entry
    in ``Q[state]`` is returned and the ``explt`` counter is incremented.
    """
    global explo, explt

    should_explore = np.random.rand() < epsilon
    if not should_explore:
        explt += 1
        # Greedy choice; np.argmax returns the first index on ties.
        return np.argmax(Q[state])

    explo += 1
    return env.action_space.sample()

# Main Q-learning loop: run `num_episodes` episodes, updating Q after every step.
#
# The loop accepts both gym API generations:
#   * gym < 0.26:  reset() -> obs,          step() -> (obs, reward, done, info)
#   * gym >= 0.26: reset() -> (obs, info),  step() -> (obs, reward, terminated, truncated, info)
num_episodes = 10000
for episode in range(num_episodes):
    reset_result = env.reset()
    # Newer gym returns an (observation, info) tuple; older gym returns the bare observation.
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False
    while not done:
        action = select_action(state)

        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            # The episode is over on either a true terminal state or a time-limit cut-off.
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result

        # Temporal-difference update towards reward + gamma * max_a' Q(next_state, a').
        # `state` is never assigned once `done` is set, so rows of Q for states that
        # end an episode are never written and keep their initial values.
        td_target = reward + gamma * np.max(Q[next_state])
        Q[state, action] += alpha * (td_target - Q[state, action])
        state = next_state

# Report the explore/exploit tallies, the learned Q-table and the greedy policy
print(f"Exploration: {explo}")
print(f"Exploitation: {explt}\n")

print("Final Q-value function:")
print(Q)

# Greedy action per state, reshaped to 4x4 (assumes the 16-state default map)
optimal_policy = Q.argmax(axis=1)
print("\nFinal Optimal policy:")
print(optimal_policy.reshape(4, 4))

Exploration: 27098
Exploitation: 108647

Final Q-value function:
[[1.97800358e-02 1.47068297e-01 2.14790115e-02 2.17483824e-02]
 [2.04191460e-02 1.86033009e-02 1.52067067e-02 2.00209028e-02]
 [9.25277560e-03 1.31719946e-02 1.10172363e-02 1.62581999e-02]
 [9.69287978e-03 1.46656660e-02 3.95939414e-03 1.24142029e-02]
 [2.86356162e-01 3.20638831e-03 2.17113485e-03 1.54623308e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.29911927e-02 4.45490168e-03 3.39809242e-03 2.53124377e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.91592698e-01 8.17807750e-03 8.65607864e-04 3.42630368e-01]
 [6.15519965e-02 4.93761394e-01 3.18621015e-02 2.96062872e-01]
 [5.65815378e-01 1.39981975e-02 7.83034627e-03 6.20240224e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [6.10668123e-01 5.16788940e-03 5.06294289e-01 8.81107747e-02]
 [3.81022258e-01 7.88122541e-01 4.20020416e-01 2.4056