# Example of how to use History Wrappers 📚

## Define the Environment and Wrapper 🎁
Create your environment and wrap it using the HistoryWrapper.

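You need an environment wrapper like this whenever your setup breaks the Markov assumption.
The main culprit here is the ***continuity cost***, weighted by `beta`: it penalizes changes between consecutive actions, so the reward depends on past actions and the current observation alone no longer describes the full state.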


In [None]:
from typing import Optional

import gym
import numpy as np
from gym.spaces import Box
from stable_baselines3 import PPO

# Your HistoryWrapper class implementation here
class HistoryWrapper(gym.Wrapper):
    """Expose the last ``steps`` (observation, action) pairs as one flat observation.

    Each history entry is the environment observation concatenated with the
    action that produced it. The wrapper keeps a fixed-length window of those
    entries and returns them flattened, oldest entry first. Optionally it
    subtracts a continuity cost from the reward: ``beta`` times the mean
    squared change between consecutive actions in the window.

    Args:
        env: Environment to wrap. Its observation space must provide 1-D
            ``low``/``high`` arrays; its action space must provide
            ``low``/``high`` (Box-like) or ``n`` (Discrete-like).
        steps: Number of history entries to keep; must be greater than 1.
        use_continuity_cost: Whether to penalize the reward with the
            continuity cost and report it in ``info["continuity_cost"]``.
        beta: Weight of the continuity cost.

    Raises:
        AssertionError: If ``steps`` is not greater than 1.
        TypeError: If the action space exposes neither bounds nor ``n``.
    """

    def __init__(self, env: gym.Env, steps: int, use_continuity_cost: bool, beta: float = 1.0):
        super().__init__(env)
        assert steps > 1, "steps must be > 1"
        self.steps = steps
        self.use_continuity_cost = use_continuity_cost
        self.beta = beta  # weight of continuity cost

        action_low, action_high = self._action_bounds(env.action_space)
        # Number of trailing values in each history entry that hold the action.
        self.action_dim = len(action_low)

        # concat obs with action
        self.step_low = np.concatenate([env.observation_space.low, action_low])
        self.step_high = np.concatenate([env.observation_space.high, action_high])

        # stack for each step
        obs_low = np.tile(self.step_low, (self.steps, 1))
        obs_high = np.tile(self.step_high, (self.steps, 1))

        self.observation_space = Box(low=obs_low.flatten(), high=obs_high.flatten(), dtype=np.float32)

        self.history = self._make_history()

    @staticmethod
    def _action_bounds(space):
        """Return flat ``(low, high)`` float32 bounds for an action space."""
        if hasattr(space, "low") and hasattr(space, "high"):
            low = np.asarray(space.low, dtype=np.float32).reshape(-1)
            high = np.asarray(space.high, dtype=np.float32).reshape(-1)
            return low, high
        if hasattr(space, "n"):
            # Discrete-like space: a single integer in [start, start + n - 1].
            start = int(getattr(space, "start", 0))
            low = np.array([start], dtype=np.float32)
            high = np.array([start + int(space.n) - 1], dtype=np.float32)
            return low, high
        raise TypeError(
            f"HistoryWrapper does not support action space {type(space).__name__}"
        )

    def _make_history(self):
        """Return a fresh window of ``steps`` all-zero entries."""
        return [np.zeros_like(self.step_low) for _ in range(self.steps)]

    def _stacked_history(self):
        """Return the window as a float32 array with one row per entry."""
        return np.array(self.history, dtype=np.float32)

    def _continuity_cost(self, obs):
        """Return ``beta`` times the mean squared change between consecutive actions.

        ``obs`` is the stacked (un-flattened) history; the action occupies the
        last ``action_dim`` values of every row. Zero-filled padding rows left
        over from ``reset`` take part in the differences like any other row.
        """
        actions = np.asarray(obs)[:, -self.action_dim:]
        jumps = np.diff(actions, axis=0)
        return float(self.beta * np.sum(np.square(jumps)) / (self.steps - 1))

    def step(self, action):
        """Step the wrapped env and return the flattened, updated history."""
        obs, reward, terminated, truncated, info = self.env.step(action)
        self.history.pop(0)

        # Scalar (discrete) actions become a length-1 vector so they can be
        # concatenated with the observation.
        flat_action = np.asarray(action).reshape(-1)
        self.history.append(np.concatenate([obs, flat_action]))
        stacked = self._stacked_history()

        if self.use_continuity_cost:
            continuity_cost = self._continuity_cost(stacked)
            reward -= continuity_cost
            info["continuity_cost"] = continuity_cost

        return stacked.flatten(), reward, terminated, truncated, info

    def reset(self, seed: Optional[int] = None, options: Optional[dict] = None):
        """Reset the wrapped env and restart the history window.

        The window is refilled with zeros, and the newest entry is the initial
        observation paired with an all-zero action.
        """
        self.history = self._make_history()
        self.history.pop(0)
        obs, info = self.env.reset(seed=seed, options=options)
        first_entry = np.concatenate([obs, np.zeros(self.action_dim, dtype=np.float32)])
        self.history.append(first_entry)
        return self._stacked_history().flatten(), info

# Build the base CartPole task, then wrap it so every observation carries a
# 4-entry history and the reward includes the continuity cost.
env = gym.make("CartPole-v1")
wrapped_env = HistoryWrapper(env=env, steps=4, use_continuity_cost=True)


## Training with Stable Baselines3 🎥

Use Stable Baselines3 to train a model with the wrapped environment.

In [None]:
from stable_baselines3 import PPO

# Build a PPO agent with an MLP policy on top of the history-wrapped env
model = PPO(policy="MlpPolicy", env=wrapped_env, verbose=1)

# Run the training loop for ten thousand environment steps
model.learn(total_timesteps=10_000)

# Write the trained agent to disk so the evaluation cell can reload it
model.save("ppo_history_cartpole")


## Evaluation 🏃
Evaluate the trained model to see how it performs with the history wrapper.



In [None]:
# Load the model
model = PPO.load("ppo_history_cartpole")

# reset() returns (observation, info); only the observation goes to the policy.
obs, info = wrapped_env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    # step() returns five values: the episode ends on termination OR truncation.
    obs, reward, terminated, truncated, info = wrapped_env.step(action)
    wrapped_env.render()
    done = terminated or truncated
    if done:
        obs, info = wrapped_env.reset()

wrapped_env.close()
