In [1]:
# train.py
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
import pandas as pd
from env import TradingEnv
from extractor import CNNLSTMPolicy
from pathlib import Path

# Absolute path of the dataset index, so later file lookups do not depend on
# the notebook's working directory.
meta_path = Path("../dataset/meta.csv").resolve()

# Index table; the "timestamp" column is parsed into datetimes on load.
meta_df = pd.read_csv(meta_path, parse_dates=["timestamp"])
# Folder that contains meta.csv; handed to TradingEnv as its root_dir.
meta_root = meta_path.parent.resolve()

def make_env():
    """Return a zero-argument factory that builds a ``TradingEnv``.

    The factory reads the module-level ``meta_df`` and ``meta_root`` when it
    is called, not when ``make_env`` is called.
    """
    def _build_env():
        """Construct a fresh TradingEnv over the full metadata table."""
        env = TradingEnv(root_dir=meta_root, meta_df=meta_df)
        return env

    return _build_env

# One environment, vectorised through DummyVecEnv and wrapped in VecMonitor.
vec_env = VecMonitor(DummyVecEnv([make_env()]))

# PPO with the custom CNN-LSTM policy; TensorBoard event files go to ./logs.
model = PPO(
    policy=CNNLSTMPolicy,
    env=vec_env,
    learning_rate=3e-4,
    n_steps=128,
    batch_size=32,
    tensorboard_log="./logs",
    verbose=1,
)

# Short smoke run: total_timesteps equals n_steps, so a single rollout /
# update iteration is performed before the model is written to disk.
model.learn(total_timesteps=128)
model.save("ppo-cnnlstm-trading")


INFO:env:[ENV INIT] Loaded 214 rows with 5 TFs.


Using cpu device
Logging to ./logs/PPO_6
[STEP 32] A=1, R=0.00, EpR=-4.09, Pos=1, Bal=100000.36, Eq=100000.36, RPNL=0.36, UPNL=0.00
[STEP 64] A=2, R=2.51, EpR=-13.39, Pos=1, Bal=99999.55, Eq=100002.06, RPNL=-0.45, UPNL=2.51
[STEP 96] A=1, R=9.51, EpR=-12.67, Pos=-1, Bal=100000.00, Eq=100009.51, RPNL=-0.00, UPNL=9.51
[STEP 128] A=0, R=0.00, EpR=47.44, Pos=0, Bal=100004.20, Eq=100004.20, RPNL=4.20, UPNL=0.00
----------------------------
| time/              |     |
|    fps             | 37  |
|    iterations      | 1   |
|    time_elapsed    | 3   |
|    total_timesteps | 128 |
----------------------------


In [None]:
# --- Imports ---
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from env import TradingEnv
from extractor import CNNLSTMPolicy
from pathlib import Path
import pandas as pd

# --- Config ---
# Absolute path of the dataset index; its parent folder is the data root
# handed to TradingEnv.
meta_path = Path("../dataset/meta.csv").resolve()
meta_root = meta_path.parent.resolve()
meta_df = pd.read_csv(meta_path, parse_dates=["timestamp"])

# --- Train/Test split ---
# Positional split without shuffling: the first `split_ratio` share of rows
# is used for training, the remaining rows for testing. Both parts get a
# fresh 0-based index.
split_ratio = 0.8
split_idx = int(split_ratio * len(meta_df))
train_df, test_df = (
    part.reset_index(drop=True)
    for part in (meta_df.iloc[:split_idx], meta_df.iloc[split_idx:])
)

print(f"[INFO] Training on {len(train_df)} samples, testing on {len(test_df)}")

# --- Env builders ---
def make_train_env():
    """Return a zero-argument factory for a ``TradingEnv`` over ``train_df``.

    ``train_df`` and ``meta_root`` are module-level names looked up when the
    factory is called.
    """
    def _build_train_env():
        """Construct a fresh TradingEnv restricted to the training rows."""
        env = TradingEnv(root_dir=meta_root, meta_df=train_df)
        return env

    return _build_train_env

def make_test_env():
    """Return a zero-argument factory for a ``TradingEnv`` over ``test_df``.

    ``test_df`` and ``meta_root`` are module-level names looked up when the
    factory is called.
    """
    def _build_test_env():
        """Construct a fresh TradingEnv restricted to the held-out rows."""
        env = TradingEnv(root_dir=meta_root, meta_df=test_df)
        return env

    return _build_test_env

# --- Train ---
# One training environment, vectorised through DummyVecEnv and wrapped in
# VecMonitor.
train_env = VecMonitor(DummyVecEnv([make_train_env()]))

# PPO with the custom CNN-LSTM policy; TensorBoard event files go to ./logs.
model = PPO(
    policy=CNNLSTMPolicy,
    env=train_env,
    learning_rate=3e-4,
    n_steps=128,
    batch_size=32,
    tensorboard_log="./logs",
    verbose=1,
)

print("\n[🔁 TRAINING STARTED]")
model.learn(total_timesteps=100_000)
# Persist the trained weights under the name the test section loads from.
model.save("ppo-cnnlstm-trading")
print("[✅ TRAINING DONE — Model Saved as 'ppo-cnnlstm-trading']")

# --- Test ---
# Evaluate the saved model on the held-out rows. The model is reloaded from
# disk so this section exercises exactly what was persisted.
test_env = DummyVecEnv([make_test_env()])
model = PPO.load("ppo-cnnlstm-trading", env=test_env)

obs = test_env.reset()
total_rewards = []   # one entry per finished episode
episode_reward = 0   # running sum for the episode in progress

print("\n[🚀 TESTING STARTED]")
for _step in range(500):  # can change to len(test_df) or full sweep
    # predict() returns (action, recurrent_state); the state is unused here.
    # NOTE(review): actions are sampled stochastically by default; pass
    # deterministic=True if a greedy evaluation is wanted.
    action, _state = model.predict(obs)

    # A Stable-Baselines3 VecEnv.step returns a 4-tuple
    # (obs, rewards, dones, infos) -- not the 5-tuple of a raw Gymnasium env --
    # so unpacking five values here would raise ValueError on the first step.
    obs, reward, done, _infos = test_env.step(action)
    # Single-env VecEnv: index 0 selects the only environment's values.
    episode_reward += reward[0]

    if done[0]:
        total_rewards.append(episode_reward)
        print(f"Episode reward: {episode_reward:.2f}")
        episode_reward = 0
        # No explicit reset: a VecEnv resets a finished sub-environment
        # automatically, and `obs` already holds the first observation of the
        # next episode.

# --- Report ---
# Summarise the per-episode rewards collected during testing; the statistics
# are skipped when no episode finished (avoids dividing by zero).
print("\n[📊 TESTING COMPLETE]")
if not total_rewards:
    print("No episodes completed during testing.")
else:
    print(f"Average Reward: {sum(total_rewards)/len(total_rewards):.2f}")
    print(f"Max Reward: {max(total_rewards):.2f}")
    print(f"Min Reward: {min(total_rewards):.2f}")


INFO:env:[ENV INIT] Loaded 171 rows with 5 TFs.


[INFO] Training on 171 samples, testing on 43
Using cpu device

[🔁 TRAINING STARTED]
Logging to ./logs/PPO_7
[STEP 32] A=0, R=0.00, EpR=-19.09, Pos=0, Bal=99991.56, Eq=99991.56, RPNL=-8.44, UPNL=0.00
[STEP 64] A=0, R=-1.88, EpR=-24.93, Pos=-1, Bal=99985.72, Eq=99983.84, RPNL=-14.28, UPNL=-1.88
[STEP 96] A=0, R=0.00, EpR=-7.82, Pos=0, Bal=99981.59, Eq=99981.59, RPNL=-18.41, UPNL=0.00
[STEP 128] A=0, R=1.17, EpR=35.29, Pos=1, Bal=99993.18, Eq=99994.35, RPNL=-6.82, UPNL=1.17
----------------------------
| time/              |     |
|    fps             | 36  |
|    iterations      | 1   |
|    time_elapsed    | 3   |
|    total_timesteps | 128 |
----------------------------
[STEP 160] A=2, R=-0.18, EpR=65.74, Pos=1, Bal=100008.15, Eq=100007.97, RPNL=8.15, UPNL=-0.18
[STEP 32] A=2, R=0.00, EpR=-11.77, Pos=-1, Bal=99999.19, Eq=99999.19, RPNL=-0.81, UPNL=0.00
[STEP 64] A=0, R=0.00, EpR=-6.44, Pos=0, Bal=99996.68, Eq=99996.68, RPNL=-3.32, UPNL=0.00
------------------------------------------
|