# Scratch code for training a PPO agent

In [1]:
import os
import numpy as np
import torch
import pandas as pd

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.utils import get_schedule_fn

import gymnasium as gym
from stable_baselines3.common.utils import set_random_seed

import const

from typing import List, Dict, Callable

In [None]:
# Root directory of the preprocessed data; handed to fold_data_load below.
data_root = "./data/processed"
# Experiment seeds iterated by the training loop: 1000, 2000, ..., 5000.
SEEDS = list(range(1000, 5001, 1000))

In [5]:
def make_env(env_id: str, rank: int, seed: int = 0) -> Callable:
    """
    Build a deferred environment constructor for a multiprocessed vec env.

    ``set_random_seed(seed)`` is called immediately, in the calling process.
    The environment itself is only created when the returned callable is
    invoked.

    :param env_id: (str) the environment ID passed to ``gym.make``
    :param rank: (int) index of the subprocess; added to ``seed``
    :param seed: (int) the base seed for the RNG
    :return: (Callable) zero-argument function that creates the environment
        and resets it once with ``seed + rank``
    """
    set_random_seed(seed)

    def _build_env() -> gym.Env:
        created = gym.make(env_id)
        # Offset by rank so each subprocess gets a distinct seed.
        created.reset(seed=seed + rank)
        return created

    return _build_env


def fold_data_load(root_dir: str) -> List[Dict[str, pd.DataFrame]]:
    """
    Load the processed CSV splits of every fold found under ``root_dir``.

    Each sub-directory of ``root_dir`` is treated as one fold. For every name
    in ``const.SPLIT`` the file ``<fold>/<name>_processed.csv`` is read with
    ``pandas.read_csv``.

    Folds are visited in sorted (lexicographic) name order so the result is
    deterministic; ``os.listdir`` on its own returns entries in arbitrary
    order. Note that lexicographic order puts ``fold_10`` before ``fold_2``;
    zero-pad fold names if numeric order matters. Entries that are not
    directories (stray files next to the fold folders) are skipped.

    :param root_dir: (str) directory containing one sub-directory per fold
    :return: (List[Dict[str, pd.DataFrame]]) one dict per fold, mapping each
        split name from ``const.SPLIT`` to its DataFrame
    :raises FileNotFoundError: if ``root_dir`` or an expected CSV is missing
    """
    dataset_list: List[Dict[str, pd.DataFrame]] = []

    for fold in sorted(os.listdir(root_dir)):
        fold_path = os.path.join(root_dir, fold)
        if not os.path.isdir(fold_path):
            # Not a fold directory (e.g. a stray file); nothing to load.
            continue

        dataset_list.append(
            {
                data_type: pd.read_csv(
                    os.path.join(fold_path, f"{data_type}_processed.csv")
                )
                for data_type in const.SPLIT
            }
        )

    return dataset_list

In [None]:
# Train one PPO model per (window, seed), keep the best checkpoint of each
# window, and use it to warm-start training on the next window.
ppo_hyperparams = const.PPOEnvConst()
data_list = fold_data_load(data_root)

# NOTE(review): train_data, val_env, ValidationCallback, lr_schedule,
# TOTAL_TIMESTEPS, POLICY_KWARGS and the N_STEPS / BATCH_SIZE / N_EPOCHS /
# GAMMA / GAE_LAMBDA / CLIP_RANGE constants are not defined in the visible
# source; they must exist in the session before this cell runs. `each_fold`
# is currently unused - presumably train_data / val_env should be derived
# from it (TODO confirm).

# Best checkpoint path selected for each window, in processing order.
# An entry is None when no best_model.zip was found for that window.
best_model_path_per_window = []
# Checkpoint used to initialize the next window; None for the first window.
seed_policy_path = None

for window_idx, each_fold in enumerate(data_list):
    # Best validation reward / checkpoint across all seeds of this window.
    window_best_mean_reward = float("-inf")
    window_best_model_path = None

    for seed in SEEDS:
        # --- Training envs ---
        # Create the parallel environments; make_env resets env i with
        # seed + i, so every subprocess gets a distinct seed.
        train_envs = [
            make_env(train_data, rank=i, seed=seed)
            for i in range(ppo_hyperparams.N_ENVS)
        ]
        vec_env = SubprocVecEnv(train_envs)

        try:
            # Create the PPO model.
            model = PPO(
                "MlpPolicy",
                vec_env,
                n_steps=N_STEPS,
                batch_size=BATCH_SIZE,
                n_epochs=N_EPOCHS,
                gamma=GAMMA,
                gae_lambda=GAE_LAMBDA,
                clip_range=CLIP_RANGE,
                learning_rate=lr_schedule,
                policy_kwargs=POLICY_KWARGS,
                verbose=1,
            )

            # Initialize from the previous window's best model (from the
            # second window onwards). The path is a saved-model zip (the same
            # kind of file PPO.load reads below), so it is loaded with
            # set_parameters rather than torch.load.
            if seed_policy_path is not None:
                model.set_parameters(seed_policy_path)

            # Set up the validation callback.
            # NOTE(review): log_dir is not passed to the callback; confirm
            # that it writes best_model.zip into this directory.
            log_dir = f"./logs/window_{window_idx}_seed_{seed}"
            os.makedirs(log_dir, exist_ok=True)
            val_callback = ValidationCallback(val_env=val_env, eval_freq=252*5, verbose=1)

            # Run training.
            model.learn(total_timesteps=TOTAL_TIMESTEPS, callback=val_callback)

            # Load the best model found during validation, if one was saved.
            candidate_best_model_path = os.path.join(log_dir, "best_model.zip")
            if os.path.exists(candidate_best_model_path):
                candidate_model = PPO.load(candidate_best_model_path)
                # Re-evaluate the best model on val_env.
                candidate_reward = val_callback.evaluate_policy(candidate_model, val_env, n_eval_episodes=5)
                if candidate_reward > window_best_mean_reward:
                    window_best_mean_reward = candidate_reward
                    window_best_model_path = candidate_best_model_path
        finally:
            # Always shut down the worker subprocesses, even if training fails.
            vec_env.close()

    # Select this window's best model once every seed has been tried.
    best_model_path_per_window.append(window_best_model_path)
    # Used as the initial policy when training the next window.
    seed_policy_path = window_best_model_path