An alternative model - that may be able to score better would be if we would apply a custom policy and value function.
In the approach we take more control on how the model interpretes the environment and comes with a policy or values a certain state.

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import os
import sys
from pathlib import Path
from stable_baselines3 import A2C

# Find the root directory by traversing up the directory tree
def find_project_root(current_path):
    if os.path.exists(os.path.join(current_path, "README.md")):
        return current_path
    parent_path = os.path.dirname(current_path)
    if parent_path == current_path:
        raise ValueError("Project root not found.")
    return find_project_root(parent_path)


current_dir = os.getcwd()
project_root = find_project_root(current_dir)
sys.path.append(str(project_root))
print(f"Project root: {project_root}")

# flake8: noqa  # Disable Flake8 for the following block
import gymnasium as gym
from src.models.bank_env import BankEnv
from src.models.bank_model import Bankmodel
from src.visualization import visualize
import src.models.train as train
from src.tests import test_bankmodel_a2c_train as tests
from src.data.definitions import MODEL_PATH, TENSORBOARD_LOGS

In [1]:
from stable_baselines3.common.torch_layers import BasePolicy, FlattenExtractor, MlpExtractor
import torch.nn as nn

class CustomPolicy(BasePolicy):
    def __init__(self, *args, **kwargs):
        super(CustomPolicy, self).__init__(*args, **kwargs, net_arch=[256, 256])  
        self.policy_net = nn.Sequential(
            FlattenExtractor(self.observation_space),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, self.action_space.n),
            nn.Softmax(dim=-1)
        )
    
    def forward(self, obs, deterministic=False):
        return super().forward(obs, deterministic=deterministic)


ImportError: cannot import name 'BasePolicy' from 'stable_baselines3.common.torch_layers' (f:\OneDrive\Documents\GitHub\rl-alm\rl-alm\lib\site-packages\stable_baselines3\common\torch_layers.py)

In [None]:
env = gym.make(env_id, render_mode="human")

# Define a Lineair Learning Rate Scheduler
def linear_schedule(initial_value: float):
    """Linear learning rate schedule."""

    def func(progress_remaining: float) -> float:
        """Progress will decrease from 1 (beginning) to 0."""
        return progress_remaining * initial_value
    return func

initial_lr = 0.001
n_steps = 60
model_name = 'A2C_Alt'
ent_coef= 0.001
model = A2C(
        "custom_policy",        
        env,
        device="cpu",
        tensorboard_log=TENSORBOARD_LOGS,
        verbose=0,
        n_steps=n_steps,
        learning_rate=linear_schedule(initial_lr),
        ent_coef=ent_coef,
    )

In [None]:
steps = 300_000

model, mean, episode_iqr, episode_cvar, episode_rewards = train.train( model, env, total_timesteps=steps, conf_level=0.95, tb_log_name= TENSORBOARD_LOGS)
modelpath = Path(
    
        MODEL_PATH,
        model_name + "_" + str(steps) + ".zip",
    )
model.save(modelpath)

print(f"Model name: {model_name}, Steps: {steps}, Mean: {mean}, IQR: {episode_iqr}, CVaR: {episode_cvar}")
visualize.plot_rewards(episode_rewards, interpolate_line=False)