In [None]:
import gymnasium as gym
import numpy as np
from gymnasium import spaces

class SimpleGrid(gym.Env):
    metadata = {"render.modes": []}

    def __init__(self, size=11, start=(0, 0), goal=(5, 5), max_steps=50):
        super().__init__()
        self.size = size
        self.start = np.array(start)
        self.goal = np.array(goal)
        self.max_steps = max_steps

        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Box(
            low=0, high=size - 1, shape=(2,), dtype=np.int32
        )

        self.reset()

    def reset(self, seed=seed):
        self.pos = self.start.copy()
        self.steps = 0
        return self.pos.copy()

    def step(self, action):
        self.steps += 1

        if action == 0:   # up
            self.pos[1] += 1
        elif action == 1: # down
            self.pos[1] -= 1
        elif action == 2: # right
            self.pos[0] += 1
        elif action == 3: # left
            self.pos[0] -= 1

        self.pos = np.clip(self.pos, 0, self.size - 1)

        done = False
        reward = 0.0

        if np.array_equal(self.pos, self.goal):
            reward = 1.0
            done = True
        elif self.steps >= self.max_steps:
            done = True

        return self.pos.copy(), reward, done, {}


In [8]:
def random_rollout_success(env, n_rollouts=1000):
    successes = 0
    for _ in range(n_rollouts):
        obs = env.reset()
        done = False
        while not done:
            action = env.action_space.sample()
            obs, reward, done, _ = env.step(action)
            if reward > 0:
                successes += 1
                break
    return successes / n_rollouts


In [9]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

def ppo_success_probability(
    env_fn,
    n_seeds=10,
    train_steps=50_000,
    eval_episodes=5
):
    successes = 0

    for seed in range(n_seeds):
        env = make_vec_env(env_fn, n_envs=1, seed=seed)

        model = PPO(
            "MlpPolicy",
            env,
            seed=seed,
            verbose=0,
            gamma=0.99,
            n_steps=128,
            ent_coef=0.0
        )

        model.learn(total_timesteps=train_steps)

        # Deterministic evaluation
        eval_env = env_fn()
        success = False
        for _ in range(eval_episodes):
            obs = eval_env.reset()
            done = False
            while not done:
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, done, _ = eval_env.step(action)
                if reward > 0:
                    success = True
                    break

        if success:
            successes += 1

    return successes / n_seeds


In [None]:
import matplotlib.pyplot as plt

def run_experiment():
    k = 30
    grid_size = 21
    start = (10, 10)

    distances = [2, 4, 6, 8, 10]

    R_vals = []
    P_vals = []

    for d in distances:
        goal = (10 + d, 10)

        def env_fn():
            return SimpleGrid(
                size=grid_size,
                start=start,
                goal=goal,
                max_steps=k
            )

        env = env_fn()

        R = random_rollout_success(env, n_rollouts=2000)
        P = ppo_success_probability(env_fn, n_seeds=10)

        R_vals.append(R)
        P_vals.append(P)

        print(f"d={d}: R={R:.4f}, P={P:.4f}")

    plt.scatter(R_vals, P_vals)
    plt.xlabel("R(s, g): Random rollout success")
    plt.ylabel("P(s, g): PPO convergence probability")
    plt.title("PPO vs Random Exploration")
    plt.grid(True)
    plt.show()

run_experiment()


TypeError: SimpleGrid.reset() got an unexpected keyword argument 'seed'