## Import

In [1]:
import pickle
from collections import deque
import matplotlib.pyplot as plt
import warnings

from env.env import *
from state.state import *
from agent.PPOAgent_ms import *
from models.CTTS import *

In [2]:
warnings.filterwarnings("ignore", category=FutureWarning)

## Load

In [3]:
with open('../data/processed/kospi200_ffill_clean_version.pkl', 'rb') as f:
    df = pickle.load(f)

df.head(5)

Unnamed: 0,date,time,open,high,low,close,prevClose,vol
2010-02-16 09:01:00,20100216,901,207.55,207.65,207.5,207.6,207.5,3985.0
2010-02-16 09:02:00,20100216,902,207.6,207.65,207.25,207.55,207.5,5095.0
2010-02-16 09:03:00,20100216,903,207.55,207.8,207.5,207.6,207.5,2175.0
2010-02-16 09:04:00,20100216,904,207.55,207.85,207.55,207.8,207.5,1301.0
2010-02-16 09:05:00,20100216,905,207.8,208.15,207.8,208.05,207.5,3870.0


In [4]:
target_values = ['open', 'high', 'low', 'close', 
                'vol','return_5', 'return_10', 'volume_change', 'ema_5', 
                'ema_20', 'ema_cross', 'cci', 'sar', '%K', 
                # '%D', 'roc', 'rsi', 'obv', 'ad_line', 
                'bb_upper', 'bb_lower', 'bb_width', 'atr', 'gap_size']

state = State(target_values)
scaler = RobustScaler()

In [5]:
def is_day_changed(**kwargs):
    # 날짜를 기준으로 구분 : 날짜가 달라지면 done = True 
    next_timestep = kwargs['next_timestep']
    current_timestep = kwargs['current_timestep']

    return False

In [945]:
state = State(target_values)
env = FuturesEnvironment(full_df=df, 
                         date_range=("2010-02-16", "2010-02-17"), 
                         window_size=30, 
                         state_type=state, 
                         reward_ftn=reward_unrealized_pnl_diff_log, 
                         done_ftn=is_day_changed, 
                         start_budget=20000000,
                         scaler=scaler,
                         position_cap=50)

Robust Scaling Completed.


In [946]:
execution_strength = 3
action_space = list(range(-execution_strength, execution_strength+1))
n_actions = len(action_space)
device = torch.device("cpu")  

In [947]:
network = MultiStatePV(input_dim=len(target_values),              # 입력 feature 수 (D)
                        agent_input_dim=2,        # agent 상태 feature 수
                        embed_dim=32,              # CNN + Transformer 임베딩 차원 (d_model)
                        kernel_size=4,            # CNN 커널 사이즈
                        stride=1,                 # CNN stride
                        action_size=n_actions, 
                        device=device,                 # positional encoding에 필요
                        agent_hidden_dim=32, 
                        agent_out_dim=32,
                        fusion_hidden_dim=64,
                        num_layers=3,           # Transformer 층 수
                        num_heads=4,            # Multi-head attention 헤드 수
                        d_ff=64,                # FFN hidden size
                        dropout=0.1)

In [948]:
it = iter(env.dataset)

In [1267]:
next(it)

StopIteration: 

In [1646]:
env.reset()
print()




In [1647]:
while not env.dataset.reach_end(env.current_timestep):
    print(env.step(1)[-1])
    print(env.current_timestep)

False
2010-02-16 09:50:00
False
2010-02-16 09:51:00
False
2010-02-16 09:52:00
False
2010-02-16 09:53:00
False
2010-02-16 09:54:00
False
2010-02-16 09:55:00
False
2010-02-16 09:56:00
False
2010-02-16 09:57:00
False
2010-02-16 09:58:00
False
2010-02-16 09:59:00
False
2010-02-16 10:00:00
False
2010-02-16 10:01:00
False
2010-02-16 10:02:00
False
2010-02-16 10:03:00
False
2010-02-16 10:04:00
False
2010-02-16 10:05:00
False
2010-02-16 10:06:00
False
2010-02-16 10:07:00
False
2010-02-16 10:08:00
False
2010-02-16 10:09:00
False
2010-02-16 10:10:00
False
2010-02-16 10:11:00
False
2010-02-16 10:12:00
False
2010-02-16 10:13:00
False
2010-02-16 10:14:00
False
2010-02-16 10:15:00
False
2010-02-16 10:16:00
False
2010-02-16 10:17:00
False
2010-02-16 10:18:00
False
2010-02-16 10:19:00
False
2010-02-16 10:20:00
False
2010-02-16 10:21:00
False
2010-02-16 10:22:00
False
2010-02-16 10:23:00
False
2010-02-16 10:24:00
False
2010-02-16 10:25:00
False
2010-02-16 10:26:00
False
2010-02-16 10:27:00
False
2010-0

In [None]:
agent = PPOAgent(
    action_space=action_space,
    n_actions=n_actions,
    model=network,
    value_coeff=0.5,
    entropy_coeff=0.01,
    clip_eps=0.2,
    gamma=0.99,
    lr=1e-3,
    batch_size=32,  # 현재 안 쓰이고 있음  
    epoch=10
)

N_ITERATIONS = 1000
N_STEPS = 60

episode_rewards = []
moving_avg_rewards = deque(maxlen=50)
episode = 0

state = env.reset() 

while not env.dataset.reach_end(env.current_timestep):
    
    memory = []
    done = False

    state = state if env.next_state is None else env.conti()

    ts_state = torch.tensor(state[0], dtype=torch.float32).unsqueeze(0)
    agent_state = torch.tensor(state[1], dtype=torch.float32).unsqueeze(0)

    state = (ts_state, agent_state)
    

    ep_reward = 0
    ep_len = 0

    for _ in range(N_STEPS):
        if done:
            print("is_done")
            break

        action, log_prob = agent.get_action(state)

        next_state, reward, done = env.step(action)

        ts_state = torch.tensor(next_state[0], dtype=torch.float32).unsqueeze(0)
        agent_state = torch.tensor(next_state[1], dtype=torch.float32).unsqueeze(0)
        next_state = (ts_state, agent_state)

        memory.append([
            state,
            torch.tensor([[action]]),
            torch.tensor([reward], dtype=torch.float32),
            next_state,
            torch.tensor([done], dtype=torch.float32),
            torch.tensor([log_prob], dtype=torch.float32)
        ])
        state = next_state
        ep_reward += reward
        ep_len += 1

    episode_rewards.append(ep_reward)
    moving_avg_rewards.append(ep_reward)

    advantage = agent.cal_advantage(memory)
    loss = agent.train(memory, advantage)

    avg_reward = np.mean(moving_avg_rewards)
    print(f"Episode {episode:3d} | Loss: {loss: .4f} | Action: {action} |Reward: {ep_reward:3.0f} | Avg(50): {avg_reward: .2f} | Len: {ep_len}")

    if (episode+1) % 50 == 0:
        print(env)

    episode += 1

# 후에 결과 시각화
plt.plot(episode_rewards, label='Episode Reward')
plt.plot(np.convolve(episode_rewards, np.ones(50)/50, mode='valid'), label='Moving Avg (50)')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.legend()
plt.grid()
plt.title('PPO Training Performance')
plt.show()

2010-02-16 09:50:00
False
2010-02-16 09:51:00
False
2010-02-16 09:52:00
False
2010-02-16 09:53:00
False
2010-02-16 09:54:00
False
2010-02-16 09:55:00
False
2010-02-16 09:56:00
False
2010-02-16 09:57:00
False
2010-02-16 09:58:00
False
2010-02-16 09:59:00
False
2010-02-16 10:00:00
False
2010-02-16 10:01:00
False
2010-02-16 10:02:00
False
2010-02-16 10:03:00
False
2010-02-16 10:04:00
False
2010-02-16 10:05:00
False
2010-02-16 10:06:00
False
2010-02-16 10:07:00
False
2010-02-16 10:08:00
False
2010-02-16 10:09:00
False
2010-02-16 10:10:00
False
2010-02-16 10:11:00
False
2010-02-16 10:12:00
False
2010-02-16 10:13:00
False
2010-02-16 10:14:00
False
2010-02-16 10:15:00
False
2010-02-16 10:16:00
False
2010-02-16 10:17:00
False
2010-02-16 10:18:00
False
2010-02-16 10:19:00
False
2010-02-16 10:20:00
False
2010-02-16 10:21:00
False
2010-02-16 10:22:00
False
2010-02-16 10:23:00
False
2010-02-16 10:24:00
False
2010-02-16 10:25:00
False
2010-02-16 10:26:00
False
2010-02-16 10:27:00
False
2010-02-16 1

StopIteration: 

timestep 오류 찾아보기 ..

- 단기적으로 방향을 맞추는가?  
- 행동 선택의 비율이 어떻게 되는가  
- loss
- 보상 지표 