In [1]:
# test_setup.py
import os
import numpy as np
import pandas as pd
from PIL import Image

from environment.trading_environment import TradingEnv
from models.yolo_encoder import YOLOEncoder
from models.policy import YOLOResNetExtractor

import torch
from stable_baselines3 import PPO

# ------------------------
# 1. Create fake image dataset
# ------------------------
# Output directory for the generated images; exist_ok makes re-runs harmless.
os.makedirs("fake_images", exist_ok=True)

num_samples = 10
paths = []
# One timestamp per sample at one-minute spacing. "min" is the supported
# minute alias; the legacy "T" alias is deprecated in pandas 2.2+ and emits
# a FutureWarning.
timestamps = pd.date_range("2023-01-01", periods=num_samples, freq="min")

# Write one 640x640 RGB image of uniform random noise per sample and record
# each file path in generation order.
for i in range(num_samples):
    noise = np.random.rand(640, 640, 3) * 255
    img_path = f"fake_images/img_{i}.png"
    # Cast to uint8 before handing the array to PIL for saving.
    Image.fromarray(noise.astype(np.uint8)).save(img_path)
    paths.append(img_path)

# ------------------------
# 2. Build fake meta_df
# ------------------------
# One row per generated image: timestamp, image file path, and a synthetic
# close price.
close_prices = np.linspace(100, 110, num_samples)  # evenly spaced from 100 up to 110
meta_df = pd.DataFrame(
    {"timestamp": timestamps, "path": paths, "close": close_prices}
)

print("Meta_df sample:")
print(meta_df.head())

# ------------------------
# 3. Build YOLO encoder + env
# ------------------------
# Weights file handed to the encoder; a dummy file can be used when only
# testing hooks.
weights_path = "./models/best.pt"
yolo_encoder = YOLOEncoder(weights_path)

# The environment is constructed from the metadata frame alone.
env = TradingEnv(meta_df)

# ------------------------
# 4. Step through env
# ------------------------
# reset() yields two values; only the observation is inspected here.
obs, _ = env.reset()
print("\nObs keys:", obs.keys())
image = obs["image"]
print("Image shape:", image.shape)
features = obs["features"]
print("Features:", features)

# Take a single randomly sampled action. step() yields five values; the
# fourth is not used by this smoke test.
action = env.action_space.sample()
obs, reward, done, _, info = env.step(action)
print("\nStep result:")
for label, value in (("Reward:", reward), ("Done:", done), ("Info:", info)):
    print(label, value)

# ------------------------
# 5. Test PPO wiring (small run)
# ------------------------
# Arguments forwarded to the custom features extractor.
extractor_kwargs = {"yolo_encoder": yolo_encoder, "features_dim": 512 + 3}
policy_kwargs = {
    "features_extractor_class": YOLOResNetExtractor,
    "features_extractor_kwargs": extractor_kwargs,
}

# Deliberately tiny hyperparameters: this run only checks that the pieces
# connect, not that the agent learns anything.
ppo_settings = {
    "n_steps": 32,  # small rollout buffer
    "batch_size": 8,
    "n_epochs": 1,  # quick update
    "learning_rate": 3e-4,
    "policy_kwargs": policy_kwargs,
    "verbose": 1,
    "device": "cpu",
}
model = PPO("MultiInputPolicy", env, **ppo_settings)

# Very short run; kept as the final expression so a notebook displays the result.
model.learn(total_timesteps=64)


  timestamps = pd.date_range("2023-01-01", periods=num_samples, freq="T")


Meta_df sample:
            timestamp                   path       close
0 2023-01-01 00:00:00  fake_images/img_0.png  100.000000
1 2023-01-01 00:01:00  fake_images/img_1.png  101.111111
2 2023-01-01 00:02:00  fake_images/img_2.png  102.222222
3 2023-01-01 00:03:00  fake_images/img_3.png  103.333333
4 2023-01-01 00:04:00  fake_images/img_4.png  104.444444

Obs keys: dict_keys(['image', 'features'])
Image shape: (3, 640, 640)
Features: [        100       10000           0]

Step result:
Reward: 0.0
Done: False
Info: {'timestamp': Timestamp('2023-01-01 00:00:00'), 'balance': 10000.0, 'position': 0, 'reward_breakdown': {'pnl': 0.0, 'transaction_cost': -0.0, 'overtrade_penalty': 0.0, 'holding_penalty': 0.0}}
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 10       |
|    ep_rew_mean     | -3.67    |
| time/              |          |
|    fps             |

<stable_baselines3.ppo.ppo.PPO at 0x14aa6cec0>