In [1]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

In [2]:
import numpy as np

class InfiniteHorizonEnv(gym.Env):
    """Never-ending 1-D walk whose reward grows with the distance from 0.

    The agent picks one of two actions: action 0 moves the integer state one
    step down, any other action moves it one step up.  The undiscounted reward
    of a step is ``abs(state)`` after the move; the reward actually returned
    is that value scaled by ``gamma ** current_step``.  ``step`` always
    reports ``done=False``, so an episode only ends when the caller stops or
    wraps the environment with a time limit.

    Args:
        gamma: Per-step discount factor applied to the returned reward.
    """

    def __init__(self, gamma=0.99):
        super().__init__()
        # `spaces` was never imported in this notebook, so the bare name raised
        # NameError; go through the already-imported `gym` package instead.
        self.action_space = gym.spaces.Discrete(2)
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf, shape=(1,), dtype=np.float32
        )
        self.state = 0
        self.gamma = gamma  # Discount rate
        self.current_step = 0

    def _observation(self):
        """Return the current state as a float32 array of shape (1,)."""
        return np.array([self.state]).astype(np.float32)

    def step(self, action):
        """Move one step and return ``(observation, reward, done, info)``.

        Args:
            action: 0 decrements the state; any other value increments it.

        Returns:
            A 4-tuple of the new observation, the discounted reward, ``False``
            (the episode never terminates on its own) and an info dict whose
            ``'immediate_reward'`` entry holds the undiscounted reward.
        """
        self.state += -1 if action == 0 else 1
        immediate_reward = abs(self.state)  # Immediate reward without discounting

        # NOTE(review): the reward is already scaled by gamma**t here; a
        # learning algorithm that applies its own discount factor would
        # discount it a second time -- confirm this is intended.
        discounted_reward = immediate_reward * (self.gamma ** self.current_step)
        self.current_step += 1

        info = {'immediate_reward': immediate_reward}  # Keep the immediate reward in the info for reference
        return self._observation(), discounted_reward, False, info

    def reset(self):
        """Put the walk back at state 0, restart the step counter, return the observation."""
        self.state = 0
        self.current_step = 0
        return self._observation()

    def render(self, mode='human'):
        """Print the current state; ``mode`` is accepted but not used."""
        print(f"State: {self.state}")


from gym.envs.registration import register

# Add the environment to gym's registry under the id 'InfiniteHorizonEnv-v0'.
# The entry point uses the 'package.module:ClassName' form.
# NOTE(review): 'utils.env' is not defined in this notebook -- confirm that
# module exists and exposes InfiniteHorizonEnv.
register(
    entry_point='utils.env:InfiniteHorizonEnv',
    id='InfiniteHorizonEnv-v0',
)

In [7]:
def evaluate_policy(env, policy, gamma=0.99, threshold=0.001):
    """Iteratively estimate the state values of a fixed policy.

    NOTE(review): a later cell of this notebook defines `evaluate_policy`
    again with the same logic; keep only one of them.

    Args:
        env: Object exposing ``observation_space.n`` (number of states) and
            ``P[state][action]``, an iterable of
            ``(probability, next_state, reward, done)`` tuples.
        policy: ``policy[state]`` is the sequence of action probabilities for
            that state, indexed by action.
        gamma: Discount factor applied to the value of the next state.
        threshold: Sweeping stops once the largest value change in a full
            sweep is below this number.

    Returns:
        A NumPy array with one estimated value per state.
    """
    n_states = env.observation_space.n
    value_table = np.zeros(n_states)
    converged = False
    while not converged:
        largest_change = 0
        for state in range(n_states):
            # Expected one-step return under the policy, using the values
            # already updated earlier in this sweep (in-place update).
            expected_return = 0
            for action, action_prob in enumerate(policy[state]):
                for state_prob, next_state, reward, _done in env.P[state][action]:
                    expected_return += action_prob * state_prob * (reward + gamma * value_table[next_state])
            largest_change = max(largest_change, np.abs(value_table[state] - expected_return))
            value_table[state] = expected_return
        converged = largest_change < threshold
    return value_table


In [4]:
def evaluate_policy(env, policy, gamma=0.99, threshold=0.001):
    """Iteratively estimate the state values of a fixed policy.

    NOTE(review): an earlier cell of this notebook already defines
    `evaluate_policy` with the same logic; this definition shadows it when
    the cells run top to bottom. Keep only one of them.

    Args:
        env: Object exposing ``observation_space.n`` (number of states) and
            ``P[state][action]``, an iterable of
            ``(probability, next_state, reward, done)`` tuples.
        policy: ``policy[state]`` is the sequence of action probabilities for
            that state, indexed by action.
        gamma: Discount factor applied to the value of the next state.
        threshold: Sweeping stops once the largest value change in a full
            sweep is below this number.

    Returns:
        A NumPy array with one estimated value per state.
    """
    n_states = env.observation_space.n
    value_table = np.zeros(n_states)
    converged = False
    while not converged:
        largest_change = 0
        for state in range(n_states):
            # Expected one-step return under the policy, using the values
            # already updated earlier in this sweep (in-place update).
            expected_return = 0
            for action, action_prob in enumerate(policy[state]):
                for state_prob, next_state, reward, _done in env.P[state][action]:
                    expected_return += action_prob * state_prob * (reward + gamma * value_table[next_state])
            largest_change = max(largest_change, np.abs(value_table[state] - expected_return))
            value_table[state] = expected_return
        converged = largest_change < threshold
    return value_table

In [6]:
env_id = "CartPole-v1"
# Four copies of the environment are wrapped into one vectorized env.
# A fixed seed is passed so rollouts are repeatable after a kernel restart.
env = make_vec_env(env_id, n_envs=4, seed=0)

In [7]:
# PPO with an MLP policy and discount factor 0.9. The seed is fixed so that
# training is repeatable across runs (presumably not bit-exact on a GPU --
# confirm against the Stable-Baselines3 reproducibility notes).
model = PPO("MlpPolicy", env, gamma=0.9, seed=0, verbose=1)

Using cuda device


In [8]:
# Train for 100k environment steps in total; progress tables are printed
# because the model was created with verbose=1.
model.learn(total_timesteps=100_000)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.9     |
|    ep_rew_mean     | 21.9     |
| time/              |          |
|    fps             | 1601     |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 8192     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 34.5         |
|    ep_rew_mean          | 34.5         |
| time/                   |              |
|    fps                  | 1083         |
|    iterations           | 2            |
|    time_elapsed         | 15           |
|    total_timesteps      | 16384        |
| train/                  |              |
|    approx_kl            | 0.0144494325 |
|    clip_fraction        | 0.203        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.682       |
|    explained_variance   | -0.0024      |
|    learning_r

<stable_baselines3.ppo.ppo.PPO at 0x7ff10b9ffe00>

In [12]:
# Persist the trained agent under the name "ppo_cartpole" in the working directory.
model.save(path="ppo_cartpole")

In [15]:
# Load from the same path the notebook saved to ("ppo_cartpole"). The previous
# "model/ppo_cartpole" path is never written by any cell here, so a fresh
# Restart & Run All could not find the file.
model1 = PPO.load("ppo_cartpole")

In [16]:
# A single, non-vectorized CartPole instance, separate from the training env.
env1 = gym.make(id="CartPole-v1")