In [1]:
import pandas as pd
import pickle 
from finrl.meta.preprocessor.preprocessors import FeatureEngineer, data_split
from finrl.meta.env_stock_trading.env_stocktrading import StockTradingEnv

# load the DataFrame from a pickle file
df = pd.read_pickle('dataset/processed.pkl')
TRAIN_START_DATE = '2010-01-01'
TRAIN_END_DATE = '2021-01-01'
TRADE_START_DATE = '2021-01-02'
TRADE_END_DATE = '2023-03-26'

In [2]:
INDICATORS = ['macd','rsi_14', 'rsi_21','rsi_28',
 'boll_ub',
 'boll_lb',
 'rsi_30',
 'cci_30',
 'dx_30',
 'close_30_sma',
 'close_60_sma']

In [3]:
train = data_split(df, TRAIN_START_DATE,TRAIN_END_DATE)
trade = data_split(df, TRADE_START_DATE,TRADE_END_DATE)
print(len(train))
print(len(trade))
stock_dimension = len(train.tic.unique())
state_space = 1 + 2*stock_dimension + len(INDICATORS)*stock_dimension
print(f"Stock Dimension: {stock_dimension}, State Space: {state_space}")

80301
16211
Stock Dimension: 29, State Space: 378


In [4]:
buy_cost_list = sell_cost_list = [0.001] * stock_dimension
num_stock_shares = [0] * stock_dimension

env_kwargs = {
    "hmax": 200,
    "initial_amount": 1000000,
    "num_stock_shares": num_stock_shares,
    "buy_cost_pct": buy_cost_list,
    "sell_cost_pct": sell_cost_list,
    "state_space": state_space,
    "stock_dim": stock_dimension,
    "tech_indicator_list": INDICATORS,
    "action_space": stock_dimension,
    "reward_scaling": 1e-4
}

In [5]:
import psutil
import ray
ray._private.utils.get_system_memory = lambda: psutil.virtual_memory().total
from ray.tune.registry import register_env
from gymnasium.wrappers import EnvCompatibility

In [6]:
def env_creator(env_config):
    # env_config is passed as {} and defaults are set here
    df = env_config.get('df', train)
    hmax = env_config.get('hmax', 200)
    initial_amount = env_config.get('initial_amount', 1000000)
    num_stock_shares = env_config.get('num_stock_shares', [0] * stock_dimension)
    buy_cost_pct = env_config.get('buy_cost_pct', buy_cost_list)
    sell_cost_pct = env_config.get('sell_cost_pct', sell_cost_list)
    state_space = env_config.get('state_space', 1 + 2*stock_dimension + len(INDICATORS)*stock_dimension)
    stock_dim = env_config.get('stock_dim', stock_dimension)
    tech_indicator_list = env_config.get('tech_indicator_list', INDICATORS)
    action_space = env_config.get('action_space', stock_dimension)
    reward_scaling = env_config.get('reward_scaling', 1e-3)

    return EnvCompatibility(StockTradingEnv(
        df=df,
        hmax=hmax,
        initial_amount=initial_amount,
        num_stock_shares=num_stock_shares,
        buy_cost_pct=buy_cost_pct,
        sell_cost_pct=sell_cost_pct,
        state_space=state_space,
        stock_dim=stock_dim,
        tech_indicator_list=tech_indicator_list,
        action_space=action_space,
        reward_scaling=reward_scaling
    ))

In [7]:
use_ddppo = False

In [8]:
from ray.rllib.agents import ppo
ray.shutdown()
print(f"ray is being initialized")
# ray.init(_temp_dir="FinRL/RLLIB/results", num_cpus=1, num_gpus=0)
# ray.init()

ray is being initialized


In [9]:
#if use_ddppo:
#    config = ppo.DDPPOConfig()
#else:
#    config = ppo.PPOConfig()

# Here set envirnonment parameters if different from default
#config = config.environment(
#    env_config={'hmax':200, 'initial_amount':1000000}
#)

# Training
#config = config.training()

# Resources
#config = config.resources(
#    num_gpus=0,
#    num_cpus_per_worker=1,
#    num_gpus_per_worker=0,
#    num_trainer_workers=1,
#    num_gpus_per_trainer_worker = 1,
#    num_cpus_per_trainer_worker = 1,
#)
# Framework
#config = config.framework(
#    framework="torch",
#)
# Rollouts
#config = config.rollouts(
#    num_rollout_workers=10,
#    num_envs_per_worker=1,
#)

In [14]:
config = ppo.PPOConfig()  
config = config.training(gamma=0.9, lr=0.001, kl_coeff=0.3)  
config = config.resources(num_gpus=0)  
config = config.rollouts(num_rollout_workers=8)  

In [15]:
# registering the environment to ray
register_env("finrl", env_creator)
if use_ddppo:
    trainer = ppo.DDPPOTrainer(env='finrl', config=config)
else:
    trainer = ppo.PPOTrainer(env='finrl', config=config)



[2m[1m[36m(autoscaler +4m28s)[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.


[2m[36m(RolloutWorker pid=31964)[0m   logger.warn("Casting input x to numpy array.")
[2m[36m(RolloutWorker pid=31963)[0m   logger.warn("Casting input x to numpy array.")
[2m[36m(RolloutWorker pid=31957)[0m   logger.warn("Casting input x to numpy array.")
[2m[36m(RolloutWorker pid=31959)[0m   logger.warn("Casting input x to numpy array.")
[2m[36m(RolloutWorker pid=31961)[0m   logger.warn("Casting input x to numpy array.")
[2m[36m(RolloutWorker pid=31958)[0m   logger.warn("Casting input x to numpy array.")
[2m[36m(RolloutWorker pid=31962)[0m   logger.warn("Casting input x to numpy array.")
[2m[36m(RolloutWorker pid=31960)[0m   logger.warn("Casting input x to numpy array.")
2023-03-28 13:39:50,864	INFO trainable.py:172 -- Trainable.setup took 35.542 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [17]:
from tqdm.notebook import tqdm
# Train away -------------------------------------------------------------
total_episodes=100
agent_name = 'ppo'
ep = 0
results = []
bar = tqdm(total=total_episodes, desc="Episodes")
while ep <= total_episodes:
    results.append(trainer.train())
    ep += 1
    bar.update(n=1)
    if ep % 10 == 0:
        cwd_checkpoint = "results/checkpoints/" + str(agent_name) + '_' + str(ep)
        trainer.save(cwd_checkpoint)
        print(f"Checkpoint saved in directory {cwd_checkpoint}")
bar.close()

Episodes:   0%|          | 0/100 [00:00<?, ?it/s]

Checkpoint saved in directory results/checkpoints/ppo_10
Checkpoint saved in directory results/checkpoints/ppo_20
Checkpoint saved in directory results/checkpoints/ppo_30
[2m[36m(RolloutWorker pid=31957)[0m day: 2768, episode: 10
[2m[36m(RolloutWorker pid=31957)[0m begin_total_asset: 1000000.00
[2m[36m(RolloutWorker pid=31957)[0m end_total_asset: 2524461.64
[2m[36m(RolloutWorker pid=31957)[0m total_reward: 1524461.64
[2m[36m(RolloutWorker pid=31957)[0m total_cost: 514052.79
[2m[36m(RolloutWorker pid=31957)[0m total_trades: 69578
[2m[36m(RolloutWorker pid=31957)[0m Sharpe: 0.554
[2m[36m(RolloutWorker pid=31959)[0m day: 2768, episode: 10
[2m[36m(RolloutWorker pid=31959)[0m begin_total_asset: 1000000.00
[2m[36m(RolloutWorker pid=31959)[0m end_total_asset: 2205599.30
[2m[36m(RolloutWorker pid=31959)[0m total_reward: 1205599.30
[2m[36m(RolloutWorker pid=31959)[0m total_cost: 492031.77
[2m[36m(RolloutWorker pid=31959)[0m total_trades: 68640
[2m[36m(Ro

In [23]:
## Mean Rewards ### 
#print(results[-1]['episode_reward_mean'])
print(results[-1])


{'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'learner_stats': {'cur_kl_coeff': 5.125781059265137, 'cur_lr': 0.0010000000474974513, 'total_loss': 9.018438, 'policy_loss': -0.0073167793, 'vf_loss': 8.977703, 'vf_explained_var': -3.5250058e-09, 'kl': 0.009374749, 'entropy': 41.85276, 'entropy_coeff': 0.0, 'model': {}}, 'custom_metrics': {}, 'num_agent_steps_trained': 128.0, 'num_grad_updates_lifetime': 98115.5, 'diff_num_grad_updates_vs_sampler_policy': 464.5}}, 'num_env_steps_sampled': 424000, 'num_env_steps_trained': 424000, 'num_agent_steps_sampled': 424000, 'num_agent_steps_trained': 424000}, 'sampler_results': {'episode_reward_max': 4323.120832900457, 'episode_reward_min': 913.9147909888835, 'episode_reward_mean': 1996.1647900378043, 'episode_len_mean': 2769.0, 'episode_media': {}, 'episodes_this_iter': 8, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [1636.1391

In [24]:
# save the trained agent
if use_ddppo:
    agent_name = 'ddppo'
cwd_checkpoint = "results/checkpoints/" + str(agent_name) + '_' + str(ep)
trainer.save(cwd_checkpoint)

'results/checkpoints/ppo_101/checkpoint_000106'