In [1]:
import numpy as np
import pandas as pd
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

# Define the high-level environment
class HighLevelEnv(gym.Env):
    """High-level environment rewarded for tracking the market VWAP.

    Each observation is the current row of ``data`` as
    ``[volume, remaining_time, price]``. The reward at every step is the
    negative absolute gap between the ``VWAP`` column at the current row and
    the volume-weighted average price of all rows before it.

    Args:
        data: Table with ``price``, ``volume``, ``remaining_time`` and
            ``VWAP`` columns of equal length (a ``pandas.DataFrame`` in this
            file). Rows are always addressed by position.
    """

    def __init__(self, data):
        super().__init__()
        self.data = data
        self.current_step = 0
        self.action_space = gym.spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)
        self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=(3,), dtype=np.float32)

    def reset(self):
        """Rewind to the first row and return its observation."""
        self.current_step = 0
        return self._next_observation()

    def _column(self, name):
        """Return column ``name`` as a float array so indexing is positional.

        Going through NumPy avoids label-based lookups, which would raise
        ``KeyError`` for a DataFrame whose index does not start at 0.
        """
        return np.asarray(self.data[name], dtype=np.float64)

    def _next_observation(self):
        """Build the ``[volume, remaining_time, price]`` observation."""
        row = self.current_step
        # Cast to float32 so the array matches the declared observation_space.
        return np.array(
            [
                self._column('volume')[row],
                self._column('remaining_time')[row],
                self._column('price')[row],
            ],
            dtype=np.float32,
        )

    def _vwap_so_far(self):
        """Return the VWAP of the rows before ``current_step``.

        Falls back to the plain mean price when no volume has traded yet, so
        the reward stays finite instead of becoming NaN from a 0/0 division.
        """
        upto = self.current_step
        prices = self._column('price')[:upto]
        volumes = self._column('volume')[:upto]
        traded = volumes.sum()
        if traded == 0:
            return float(prices.mean())
        return float((prices * volumes).sum() / traded)

    def step(self, action):
        """Advance one row and score the running VWAP against the market VWAP.

        Args:
            action: Value from ``action_space``.

        Returns:
            Tuple ``(observation, reward, done, info)``; ``done`` becomes true
            once the last row has been reached.
        """
        # NOTE(review): `action` is not used when computing the reward or the
        # next observation, so the policy cannot influence the outcome yet.
        self.current_step += 1
        market_vwap = self._column('VWAP')[self.current_step]
        reward = -abs(float(market_vwap) - self._vwap_so_far())
        done = self.current_step >= len(self.data) - 1
        return self._next_observation(), reward, done, {}

# Define the low-level environment
class LowLevelEnv(gym.Env):
    """Low-level environment rewarded for tracking a fixed VWAP sub-goal.

    Each observation is the current row of ``data`` as
    ``[volume, remaining_time, price]``. The reward at every step is the
    negative absolute gap between ``sub_goal`` and the volume-weighted average
    price of all rows before the current one.

    Args:
        data: Table with ``price``, ``volume`` and ``remaining_time`` columns
            of equal length (a ``pandas.DataFrame`` in this file). Rows are
            always addressed by position.
        sub_goal: Target VWAP. A scalar or any single-element array is
            accepted and stored as a Python float.

    Raises:
        ValueError: If ``sub_goal`` does not contain exactly one value.
    """

    def __init__(self, data, sub_goal):
        super().__init__()
        self.data = data
        # A policy's predict() on a vectorised env returns a (n_envs, 1) array;
        # reduce it to a scalar so the reward is a scalar and not an array.
        goal_values = np.asarray(sub_goal, dtype=np.float64).reshape(-1)
        if goal_values.size != 1:
            raise ValueError(
                f"sub_goal must contain exactly one value, got {goal_values.size}"
            )
        self.sub_goal = float(goal_values[0])
        self.current_step = 0
        self.action_space = gym.spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)
        self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=(3,), dtype=np.float32)

    def reset(self):
        """Rewind to the first row and return its observation."""
        self.current_step = 0
        return self._next_observation()

    def _column(self, name):
        """Return column ``name`` as a float array so indexing is positional.

        Going through NumPy avoids label-based lookups, which would raise
        ``KeyError`` for a DataFrame whose index does not start at 0.
        """
        return np.asarray(self.data[name], dtype=np.float64)

    def _next_observation(self):
        """Build the ``[volume, remaining_time, price]`` observation."""
        row = self.current_step
        # Cast to float32 so the array matches the declared observation_space.
        return np.array(
            [
                self._column('volume')[row],
                self._column('remaining_time')[row],
                self._column('price')[row],
            ],
            dtype=np.float32,
        )

    def _vwap_so_far(self):
        """Return the VWAP of the rows before ``current_step``.

        Falls back to the plain mean price when no volume has traded yet, so
        the reward stays finite instead of becoming NaN from a 0/0 division.
        """
        upto = self.current_step
        prices = self._column('price')[:upto]
        volumes = self._column('volume')[:upto]
        traded = volumes.sum()
        if traded == 0:
            return float(prices.mean())
        return float((prices * volumes).sum() / traded)

    def step(self, action):
        """Advance one row and score the running VWAP against the sub-goal.

        Args:
            action: Value from ``action_space``.

        Returns:
            Tuple ``(observation, reward, done, info)``; ``done`` becomes true
            once the last row has been reached.
        """
        # NOTE(review): `action` is not used when computing the reward or the
        # next observation, so the policy cannot influence the outcome yet.
        self.current_step += 1
        reward = -abs(self.sub_goal - self._vwap_so_far())
        done = self.current_step >= len(self.data) - 1
        return self._next_observation(), reward, done, {}

# Create the environment
# Synthetic placeholder data: 100 rows of random prices, volumes and VWAP
# values, with remaining_time counting down linearly from 100 to 0.
data = pd.DataFrame({
    'price': np.random.rand(100),
    'volume': np.random.rand(100),
    'remaining_time': np.linspace(100, 0, 100),
    'VWAP': np.random.rand(100),
})

# The low-level environment is built further down, once its sub-goal is known.
high_level_env = DummyVecEnv([lambda: HighLevelEnv(data)])

# Train the high-level controller
high_level_model = PPO("MlpPolicy", high_level_env, verbose=1)
high_level_model.learn(total_timesteps=10000)

# Extract sub-goal from high-level policy
obs = high_level_env.reset()
# deterministic=True takes the policy's mean action instead of a noisy sample,
# so the sub-goal reflects what was learned rather than exploration noise.
raw_action, _ = high_level_model.predict(obs, deterministic=True)
# predict() on a vectorised env returns an array of shape (n_envs, action_dim);
# the low-level reward needs a plain scalar target.
sub_goal = float(np.asarray(raw_action).reshape(-1)[0])

# Train the low-level controller
low_level_env = DummyVecEnv([lambda: LowLevelEnv(data, sub_goal=sub_goal)])
low_level_model = PPO("MlpPolicy", low_level_env, verbose=1)
low_level_model.learn(total_timesteps=10000)

# Save models
high_level_model.save("high_level_model")
low_level_model.save("low_level_model")



Using cpu device
-----------------------------
| time/              |      |
|    fps             | 4443 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 3299        |
|    iterations           | 2           |
|    time_elapsed         | 1           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.006080671 |
|    clip_fraction        | 0.0558      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.43       |
|    explained_variance   | 0.00154     |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0112     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00205    |
|    std                  | 1.02        |
|    value_loss           | 0.804       |
-----------------