In [7]:
import gym
import numpy as np
import random

# Seed every random source used by this notebook so that
# Restart Kernel -> Run All reproduces the same search and the same score.
SEED = 42
random.seed(SEED)      # stdlib RNG (random.randint calls in the search code)
np.random.seed(SEED)   # NumPy global RNG (np.random.choice policy sampling)

env = gym.make("FrozenLake-v1", is_slippery=True)  # 4x4 slippery lake
env.reset(seed=SEED)   # seeds the environment's own RNG (slippery transitions)

# Sizes of the discrete observation / action spaces, kept as module-level
# names for the cells below.
n_states = env.observation_space.n
n_actions = env.action_space.n


In [8]:
def simple_hill_climbing(env, episodes=1000):
    """Hill climbing that mutates one state's action per iteration.

    Starts from a uniformly random deterministic policy, then repeatedly
    changes the action of one randomly chosen state and keeps the change
    only when `run_policy` scores the mutated policy strictly higher than
    the best score seen so far.

    Args:
        env: Environment whose `observation_space.n` and `action_space.n`
            give the number of discrete states and actions; it is also
            passed to `run_policy` for scoring.
        episodes: Number of mutations to try.

    Returns:
        1-D integer array with one action index per state.
    """
    # Read the sizes from the environment that was actually passed in rather
    # than from notebook globals, so the function works for any discrete env.
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    policy = np.random.choice(n_actions, n_states)
    if n_actions < 2:
        # Only one possible action per state: there is nothing to mutate.
        return policy

    # Score the starting point; with a baseline of 0 a neighbour that is
    # worse than the initial policy could still be accepted.
    best_reward = run_policy(env, policy)

    for _ in range(episodes):
        new_policy = policy.copy()
        state = random.randint(0, n_states - 1)
        # Shift by 1..n_actions-1 (mod n_actions) so the new action always
        # differs from the current one; re-scoring an unchanged policy only
        # re-samples evaluation noise.
        shift = random.randint(1, n_actions - 1)
        new_policy[state] = (policy[state] + shift) % n_actions

        reward = run_policy(env, new_policy)
        if reward > best_reward:
            policy, best_reward = new_policy, reward

    return policy

def run_policy(env, policy, trials=100):
    """Estimate the average episode return of a deterministic policy.

    Args:
        env: Environment whose `reset()` returns `(observation, info)` and
            whose `step(action)` returns the 5-tuple
            `(observation, reward, terminated, truncated, info)`.
        policy: Sequence indexed by observation that yields the action to
            take in that state.
        trials: Number of episodes to average over.

    Returns:
        Sum of all rewards collected over every episode divided by `trials`.
    """
    total_reward = 0.0
    for _ in range(trials):
        state, _ = env.reset()
        done = False
        while not done:
            state, reward, terminated, truncated, _ = env.step(policy[state])
            # Add every step's reward, not only the final one, so the result
            # is the true episode return even when rewards are not terminal.
            total_reward += reward
            # An episode also ends when the environment truncates it (e.g. a
            # step limit); ignoring that flag keeps stepping a finished
            # episode and may never exit for a policy that never terminates.
            done = terminated or truncated
    return total_reward / trials


In [9]:
def stochastic_hill_climbing(env, episodes=1000):
    """Keep the best of `episodes` independently sampled random policies.

    Every candidate is drawn uniformly at random and does not depend on the
    current best policy, so despite the name this behaves as a pure random
    search rather than a neighbourhood-based climb.

    Args:
        env: Environment whose `observation_space.n` and `action_space.n`
            give the number of discrete states and actions; it is also
            passed to `run_policy` for scoring.
        episodes: Number of random policies to sample and score.

    Returns:
        1-D integer array with one action index per state: the
        highest-scoring policy seen, including the initial one.
    """
    # Use the sizes of the environment that was passed in, not notebook
    # globals that may belong to a different environment.
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    policy = np.random.choice(n_actions, n_states)
    # Score the starting policy so it competes on equal terms with the
    # samples instead of being treated as if it scored 0.
    best_reward = run_policy(env, policy)

    for _ in range(episodes):
        new_policy = np.random.choice(n_actions, n_states)  # Entirely random
        reward = run_policy(env, new_policy)

        if reward > best_reward:
            policy, best_reward = new_policy, reward

    return policy


In [10]:
def steepest_ascent_hill_climbing(env, episodes=1000):
    """Hill climbing that inspects every single-state change per iteration.

    Each iteration builds all policies that differ from the current one in
    exactly one state's action, scores each with `run_policy`, and moves to
    the best-scoring neighbour when a fresh evaluation of it beats the best
    score so far.

    Each iteration calls `run_policy` once per neighbour, i.e.
    n_states * (n_actions - 1) times, plus once more for the winner, so the
    total cost grows quickly with `episodes`.

    Args:
        env: Environment whose `observation_space.n` and `action_space.n`
            give the number of discrete states and actions; it is also
            passed to `run_policy` for scoring.
        episodes: Number of full neighbourhood sweeps to perform.

    Returns:
        1-D integer array with one action index per state.
    """
    # Use the sizes of the environment that was passed in, not notebook
    # globals that may belong to a different environment.
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    policy = np.random.choice(n_actions, n_states)
    best_reward = run_policy(env, policy)

    for _ in range(episodes):
        neighbors = []
        for s in range(n_states):
            for a in range(n_actions):
                if a != policy[s]:
                    new_policy = policy.copy()
                    new_policy[s] = a
                    neighbors.append(new_policy)

        if not neighbors:
            # Single-action space: no neighbour exists and max() would raise
            # on the empty list.
            break

        # Evaluate all neighbors
        best_neighbor = max(neighbors, key=lambda p: run_policy(env, p))
        # max() does not hand back the key value, so the winner is scored
        # again here with a fresh set of episodes before the comparison.
        reward = run_policy(env, best_neighbor)

        if reward > best_reward:
            policy, best_reward = best_neighbor, reward

    return policy


In [None]:
# Run the search. Swap in stochastic_hill_climbing or
# steepest_ascent_hill_climbing here to try the other strategies.
final_policy = simple_hill_climbing(env=env)

# Test the final policy
def evaluate_policy(env, policy, trials=100):
    """Measure how often a policy finishes an episode with a reward.

    Args:
        env: Environment whose `reset()` returns `(observation, info)` and
            whose `step(action)` returns the 5-tuple
            `(observation, reward, terminated, truncated, info)`.
        policy: Sequence indexed by observation that yields the action to
            take in that state.
        trials: Number of evaluation episodes.

    Returns:
        Sum of the final-step rewards over all episodes divided by `trials`.
        When the final reward is 1 on success and 0 otherwise this is the
        success rate.
    """
    success = 0
    for _ in range(trials):
        state = env.reset()[0]
        done = False
        while not done:
            state, reward, terminated, truncated, _ = env.step(policy[state])
            # Stop on truncation (e.g. a step limit) as well as termination;
            # otherwise a policy that never terminates keeps the loop
            # running past the end of the episode.
            done = terminated or truncated
        success += reward
    return success / trials

# Report the score of the learned policy over a fresh batch of episodes.
print(
    "Final Success Rate:",
    evaluate_policy(env, policy=final_policy),
)


  if not isinstance(terminated, (bool, np.bool8)):
