In [4]:
import gym
import numpy as np

# Create the FrozenLake environment
env = gym.make('FrozenLake-v1')

# Set the parameters
num_episodes = 10000
learning_rate = 0.1
discount_factor = 0.99
epsilon = 0.1

# Initialize the Q-table
state_space = env.observation_space.n
action_space = env.action_space.n
q_table = np.zeros((state_space, action_space))

# Function to choose an action based on epsilon-greedy strategy
def choose_action(state):
    if np.random.uniform(0, 1) < epsilon:
        # Explore: Choose a random action
        return env.action_space.sample()
    else:
        # Exploit: Choose the action with the highest Q-value
        return np.argmax(q_table[state, :])

# SARSA algorithm
for episode in range(num_episodes):
    state = env.reset()
    action = choose_action(state)
    done = False

    while not done:
        next_state, reward, done, _ = env.step(action)
        next_action = choose_action(next_state)

        # Update the Q-table using the SARSA update rule
        q_table[state, action] += learning_rate * (
                reward + discount_factor * q_table[next_state, next_action] - q_table[state, action])

        state = next_state
        action = next_action

# Print the learned Q-table
print(q_table)

# Close the environment
env.close()


[[0.25876907 0.23879996 0.23803605 0.23096059]
 [0.11866574 0.11403757 0.13300786 0.21365296]
 [0.19503694 0.17000915 0.14639205 0.14812399]
 [0.10365165 0.04539578 0.01606758 0.06036132]
 [0.29205401 0.17407049 0.2264924  0.13592705]
 [0.         0.         0.         0.        ]
 [0.21726968 0.07956192 0.12123048 0.08414007]
 [0.         0.         0.         0.        ]
 [0.2114673  0.23294224 0.18368418 0.34209398]
 [0.18976728 0.38362609 0.31831717 0.25304019]
 [0.42734518 0.26044887 0.35093428 0.15169826]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.1954622  0.34039069 0.50674481 0.42125808]
 [0.49753341 0.58198447 0.60263119 0.692796  ]
 [0.         0.         0.         0.        ]]


In [5]:
import numpy as np 
import gym

env = gym.make('FrozenLake-v1')

num_episodes = 10000
learning_rate = 0.1
discount_factor = 0.99
epsilon = 0.1

state_space = env.observation_space.n
action_space = env.action_space.n
q_table = np.zeros((state_space,action_space))


def choose_action(state):
  if np.random.uniform(0,1) < epsilon:
    return env.action_space.sample()
  else:
    return np.argmax(q_table[state,:])

for episode in range(num_episodes):
  state = env.reset()
  action = choose_action(state)
  done = False

  while not done:
    next_state, reward, done, _ = env.step(action)
    next_action = choose_action(next_state)

    q_table[state,action] += learning_rate*(reward + discount_factor*q_table[next_state,next_action] - q_table[state,action])

    state = next_state
    action = next_action

print(q_table)
env.close() 

[[0.05172114 0.04139756 0.03854514 0.04709246]
 [0.0369702  0.00597369 0.00066492 0.00575708]
 [0.00760772 0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.05783639 0.04667535 0.03240354 0.02572372]
 [0.         0.         0.         0.        ]
 [0.04478876 0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.02236335 0.03171724 0.03974446 0.0706408 ]
 [0.0803139  0.06224499 0.05762617 0.04127075]
 [0.19453813 0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.04793232 0.19959425 0.08629607 0.0938519 ]
 [0.0835237  0.19       0.03393359 0.41602378]
 [0.         0.         0.         0.        ]]
