In [5]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from scipy.stats import norm

class OptionHedgingEnv(gym.Env):
    metadata = {"render_modes": []}  # Optional, no rendering used here

    def __init__(self, days=30, S0=50, K=50, sigma=0.2, r=0.01, short_calls=10, hedge_cost_coeff=0.01):
        super().__init__()
        self.days = days
        self.S0 = S0
        self.K = K
        self.sigma = sigma
        self.r = r
        self.short_calls = short_calls
        self.dt = 1 / 365
        self.hedge_cost_coeff = hedge_cost_coeff

        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)  # hedge ratio
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(6,), dtype=np.float32)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.day = 0
        self.S = self.S0
        self.hedge_position = 0
        self.prev_hedge = 0
        self.total_pnl = 0

        self.daily_returns = np.random.normal(loc=0, scale=self.sigma * np.sqrt(self.dt), size=self.days)
        self.state = self._get_state()
        return self.state, {}

    def step(self, action):
        hedge_ratio = float(np.clip(action[0], -1, 1))
        self.prev_hedge = self.hedge_position
        self.hedge_position = hedge_ratio * self.short_calls

        dS = self.S * (np.exp(self.daily_returns[self.day]) - 1)
        self.S += dS
        self.day += 1
        T = (self.days - self.day) / 365

        # Greeks
        delta = -self.short_calls * self._call_delta(self.S, self.K, T, self.r, self.sigma)
        d_portfolio = delta * dS
        d_hedge = self.hedge_position * dS
        hedge_cost = self.hedge_cost_coeff * abs(self.hedge_position - self.prev_hedge)

        pnl = d_portfolio + d_hedge
        reward = -(pnl ** 2) - hedge_cost

        self.total_pnl += pnl
        done = self.day >= self.days
        self.state = self._get_state()
        return self.state, reward, done, False, {}

    def _get_state(self):
        T = (self.days - self.day) / 365
        delta = -self.short_calls * self._call_delta(self.S, self.K, T, self.r, self.sigma)
        gamma = -self.short_calls * self._call_gamma(self.S, self.K, T, self.r, self.sigma)
        vega = -self.short_calls * self._call_vega(self.S, self.K, T, self.r, self.sigma)
        return np.array([self.S, T, self.sigma, delta, gamma, vega], dtype=np.float32)

    def _call_delta(self, S, K, T, r, sigma):
        d1 = (np.log(S / K) + (r + 0.5 * sigma**2) * T) / (sigma * np.sqrt(T))
        return norm.cdf(d1)

    def _call_gamma(self, S, K, T, r, sigma):
        d1 = (np.log(S / K) + (r + 0.5 * sigma**2) * T) / (sigma * np.sqrt(T))
        return norm.pdf(d1) / (S * sigma * np.sqrt(T))

    def _call_vega(self, S, K, T, r, sigma):
        d1 = (np.log(S / K) + (r + 0.5 * sigma**2) * T) / (sigma * np.sqrt(T))
        return S * norm.pdf(d1) * np.sqrt(T) / 100

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env

env = OptionHedgingEnv()
check_env(env)

# Train PPO agent
model = PPO("MlpPolicy", env, verbose=1, learning_rate=0.0003, n_steps=2048, batch_size=64, ent_coef=0.0)
model.learn(total_timesteps=100_000)

# Save model
model.save("ppo_hedging_agent")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  d1 = (np.log(S / K) + (r + 0.5 * sigma**2) * T) / (sigma * np.sqrt(T))
  d1 = (np.log(S / K) + (r + 0.5 * sigma**2) * T) / (sigma * np.sqrt(T))
  return norm.pdf(d1) / (S * sigma * np.sqrt(T))
  d1 = (np.log(S / K) + (r + 0.5 * sigma**2) * T) / (sigma * np.sqrt(T))


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30       |
|    ep_rew_mean     | -804     |
| time/              |          |
|    fps             | 2369     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 30           |
|    ep_rew_mean          | -715         |
| time/                   |              |
|    fps                  | 1889         |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0073957318 |
|    clip_fraction        | 0.0853       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | 0.000108     |
|    learning_r