In [1]:
import os

# Record the directory the kernel started in the first time this cell runs.
# The original `os.chdir('../')` was relative to the *current* directory, so
# re-running the cell climbed one directory higher on every execution.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()

# Move to the parent of the notebook directory (presumably the repository
# root, so that the later `src.*` imports resolve -- confirm against layout).
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))
os.getcwd()

'e:\\github_clone\\Trader-RL'

In [2]:
import pandas as pd

# Hourly BTC/USD candles.
# Also available in the github repo : examples/data/BTC_USD-Hourly.csv
url = "https://raw.githubusercontent.com/ClementPerroud/Gym-Trading-Env/main/examples/data/BTC_USD-Hourly.csv"

# Read with the "date" column parsed as the index, then order chronologically,
# discard rows with missing values and discard rows whose column values repeat.
df = (
    pd.read_csv(url, parse_dates=["date"], index_col="date")
    .sort_index()
    .dropna()
    .drop_duplicates()
)

In [3]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0_level_0,unix,symbol,open,high,low,close,volume,Volume USD
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-05-15 06:00:00,1526364000,BTC/USD,8733.86,8796.68,8707.28,8740.99,4906603.14,559.93
2018-05-15 07:00:00,1526367600,BTC/USD,8740.99,8766.0,8721.11,8739.0,2390398.89,273.58
2018-05-15 08:00:00,1526371200,BTC/USD,8739.0,8750.27,8660.53,8728.49,7986062.84,917.79
2018-05-15 09:00:00,1526374800,BTC/USD,8728.49,8754.4,8701.35,8708.32,1593991.98,182.62
2018-05-15 10:00:00,1526378400,BTC/USD,8708.32,8865.0,8695.11,8795.9,11101273.74,1260.69


In [4]:
close = df["close"]
usd_volume = df["Volume USD"]

# feature_close : relative change of the close, ( close[t] - close[t-1] ) / close[t-1]
df["feature_close"] = close.pct_change()

# feature_open / feature_high / feature_low : the candle value divided by close[t]
for col in ("open", "high", "low"):
    df[f"feature_{col}"] = df[col] / close

# feature_volume : "Volume USD"[t] divided by its rolling maximum over the
# last 7*24 rows (the current row plus the 167 rows before it).
df["feature_volume"] = usd_volume / usd_volume.rolling(7*24).max()

# pct_change and the rolling maximum leave NaN in the leading rows; drop them.
df.dropna(inplace=True)

In [5]:
import gymnasium as gym
import numpy as np
import gym_trading_env

# Keyword arguments forwarded to the "TradingEnv" constructor.
env_kwargs = dict(
    name="BTCUSD",
    df=df,                               # dataset carrying the custom feature_* columns
    positions=[-1, 0, 1],                # -1 (=SHORT), 0 (=OUT), +1 (=LONG)
    trading_fees=0.01/100,               # 0.01% per stock buy / sell (Binance fees)
    borrow_interest_rate=0.0003/100,     # 0.0003% per timestep (one timestep = 1h here)
)
env = gym.make("TradingEnv", **env_kwargs)

In [6]:
import torch
from src.agent import RLSeq2Seq, Encoder, Decoder
from src.utility.config import config

# Size the networks from the environment's spaces.
config['input_dim'] = env.observation_space.shape[0]
if hasattr(env.action_space, 'n'):
    # Space exposes `n`: use it as the action dimension.
    config['action_dim'] = env.action_space.n
else:
    # Otherwise fall back to the first dimension of the space's shape.
    config['action_dim'] = env.action_space.shape[0]

# Build the encoder first, then the decoder, and wrap both in the agent.
encoder = Encoder(config)
decoder = Decoder(config)
trader = RLSeq2Seq(encoder=encoder, decoder=decoder, config=config)

In [7]:
# Load the saved seq2seq models from the "models/stock" checkpoint directory.
# The path is built with os.path.join (os is imported in the first cell) rather
# than a hard-coded "models\\stock", which only resolves as a directory path on
# Windows; on Windows the resulting string is unchanged.
trader.load_models(checkpoint=os.path.join("models", "stock"))

[32m13:31:50[0m | [1mINFO    [0m | [36mModels loaded from models\stock[0m


In [8]:
import numpy as np
import torch

# Start a fresh episode.
obs, info = env.reset()
terminated = truncated = False

cum_reward = 0.0
ep_rewards = []

A = env.action_space.n   # action_dim
prev_action = None       # last action index (for one-hot)
prev_obs = None          # last observation (for prev_context)

while not terminated and not truncated:
    # prev_context = [flattened prev_obs, one_hot(prev_action)]; None on the first step.
    prev_ctx = None
    if prev_obs is not None and prev_action is not None:
        flat_prev_obs = np.asarray(prev_obs, dtype=np.float32).reshape(-1)
        prev_action_one_hot = np.eye(A, dtype=np.float32)[int(prev_action)]
        prev_ctx = np.concatenate([flat_prev_obs, prev_action_one_hot])

    # Ask the agent for an action; it may return a bare action or a tuple
    # whose first element is the action.
    result = trader.select_action(state=obs, prev_context=prev_ctx)
    if isinstance(result, tuple):
        action = result[0]
    else:
        action = result

    # Apply the action to the environment.
    next_obs, reward, terminated, truncated, info = env.step(action)

    # Record the step reward as a plain Python float.
    step_reward = float(reward)
    ep_rewards.append(step_reward)
    cum_reward += step_reward

    # Shift the current observation/action into the "previous" slots.
    prev_obs, prev_action = obs, action
    obs = next_obs

print(f"Episode cumulative reward: {cum_reward:.6f}")
# Running cumulative reward at each step.
cum_rewards_per_step = np.cumsum(ep_rewards)


Market Return : 423.10%   |   Portfolio Return : -94.95%   |   
Episode cumulative reward: -2.985389


In [6]:
from src.agent import PPO
from src.utility.config import config

# Size the agent from the environment's spaces.
config['input_dim'] = env.observation_space.shape[0]
if hasattr(env.action_space, 'n'):
    # Space exposes `n`: use it as the action dimension.
    config['action_dim'] = env.action_space.n
else:
    # Otherwise fall back to the first dimension of the space's shape.
    config['action_dim'] = env.action_space.shape[0]

agent = PPO(config)

In [7]:
# Load the saved PPO agent from the "models/stock" checkpoint directory.
# The path is built with os.path.join (os is imported in the first cell) rather
# than a hard-coded "models\\stock", which only resolves as a directory path on
# Windows; on Windows the resulting string is unchanged.
agent.load(checkpoint_path=os.path.join("models", "stock"))

[32m10:29:28[0m | [1mINFO    [0m | [36mPPO loaded from models\stock\ppo_checkpoint.pth[0m


In [8]:
# Start a fresh episode.
obs, info = env.reset()
terminated = truncated = False
cum_reward = 0.0
ep_rewards = []

# Step until the environment reports termination or truncation.
while not terminated and not truncated:
    action = agent.select_action(state=obs)
    step_outcome = env.step(action)
    obs, reward, terminated, truncated, info = step_outcome
    ep_rewards.append(reward)
    cum_reward = cum_reward + reward

print(f"Episode cumulative reward: {cum_reward:.6f}")

# Running cumulative reward at each step.
cum_rewards_per_step = np.cumsum(ep_rewards)


Market Return : 423.10%   |   Portfolio Return : 552.70%   |   
Episode cumulative reward: 1.875943


In [22]:
# Draw one random observation from the observation space to inspect its shape and dtype.
sample_obs = env.observation_space.sample()
sample_obs

array([-0.75081104,  0.9879661 ,  0.52541214,  1.6537322 , -0.22571419,
        1.1118147 , -0.515132  ], dtype=float32)

In [9]:
from gym_trading_env.renderer import Renderer

# Point the renderer at the PPO render logs. The path is built with
# os.path.join (os is imported in the first cell) rather than hard-coded
# backslashes, which only resolve as directory separators on Windows; on
# Windows the resulting string is unchanged.
renderer = Renderer(render_logs_dir=os.path.join("model_logs", "stock", "render_logs_ppo"))

# NOTE: judging by the captured output, run() starts a local Flask server
# (http://127.0.0.1:5000) and keeps the cell busy until it is interrupted.
renderer.run()

 * Serving Flask app 'gym_trading_env.renderer'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [20/Oct/2025 10:34:21] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [20/Oct/2025 10:34:22] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [20/Oct/2025 10:34:23] "GET /update_data/BTCUSD_2025-10-19_14-19-39.pkl HTTP/1.1" 200 -
127.0.0.1 - - [20/Oct/2025 10:34:23] "GET /metrics HTTP/1.1" 200 -
