# Library import & Drive mount

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/Reinforcement_Learning/

/content/drive/MyDrive/Reinforcement_Learning


In [3]:
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from collections import deque
import torch.optim as optim
import importlib
from calculate_tech_ind import calculate_macd, calculate_rsi, calculate_cci, calculate_adx
from model import PreLSTM, PolicyNetwork

In [4]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
set_seed(1)

cuda


# Data download and Preprocessing

In [5]:
symbols = {
    'S&P 500': '^GSPC',
    'Dow Jones': '^DJI',
    'KOSPI': '^KS11'
}
start_date = '2012-12-01'
end_date = '2023-12-31'

market_data = {}
for name, ticker in symbols.items():
    data = yf.download(ticker, start=start_date, end=end_date)
    market_data[name] = data

snp500_data = market_data['S&P 500']
dowjones_data = market_data['Dow Jones']
kospi_data = market_data['KOSPI']
#print(snp500_data)

# 공통 거래일 계산
common_dates = snp500_data.index.intersection(dowjones_data.index).intersection(kospi_data.index)

# 공통 거래일 기준 데이터 정렬
snp500_data_aligned = snp500_data.loc[common_dates]
dowjones_data_aligned = dowjones_data.loc[common_dates]
kospi_data_aligned = kospi_data.loc[common_dates]

snp500_data_aligned = calculate_macd(snp500_data_aligned)
snp500_data_aligned = calculate_rsi(snp500_data_aligned)
snp500_data_aligned = calculate_cci(snp500_data_aligned)

dowjones_data_aligned = calculate_macd(dowjones_data_aligned)
dowjones_data_aligned = calculate_rsi(dowjones_data_aligned)
dowjones_data_aligned = calculate_cci(dowjones_data_aligned)

kospi_data_aligned = calculate_macd(kospi_data_aligned)
kospi_data_aligned = calculate_rsi(kospi_data_aligned)
kospi_data_aligned = calculate_cci(kospi_data_aligned)

# Close, MACD, RSI 병합
merged_close = pd.concat([
    snp500_data_aligned[['Close']].rename(columns={'Close': 'S&P 500 Close'}),
    dowjones_data_aligned[['Close']].rename(columns={'Close': 'Dow Jones Close'}),
    kospi_data_aligned[['Close']].rename(columns={'Close': 'KOSPI Close'})
], axis=1)

merged_macd = pd.concat([
    snp500_data_aligned[['MACD']].rename(columns={'MACD': 'S&P 500 MACD'}),
    dowjones_data_aligned[['MACD']].rename(columns={'MACD': 'Dow Jones MACD'}),
    kospi_data_aligned[['MACD']].rename(columns={'MACD': 'KOSPI MACD'})
], axis=1)

merged_rsi = pd.concat([
    snp500_data_aligned[['RSI']].rename(columns={'RSI': 'S&P 500 RSI'}),
    dowjones_data_aligned[['RSI']].rename(columns={'RSI': 'Dow Jones RSI'}),
    kospi_data_aligned[['RSI']].rename(columns={'RSI': 'KOSPI RSI'})
], axis=1)

merged_cci = pd.concat([
    snp500_data_aligned[['CCI']].rename(columns={'CCI': 'S&P 500 CCI'}),
    dowjones_data_aligned[['CCI']].rename(columns={'CCI': 'Dow Jones CCI'}),
    kospi_data_aligned[['CCI']].rename(columns={'CCI': 'KOSPI CCI'})
], axis=1)

# 전체 병합 데이터 생성
merged_data_all = pd.concat([merged_close, merged_macd, merged_rsi, merged_cci], axis=1)

# 날짜 기준 필터링
merged_data_all = merged_data_all[merged_data_all.index > '2013-02-01']

print(merged_data_all)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Price      S&P 500 Close Dow Jones Close  KOSPI Close S&P 500 MACD  \
Ticker             ^GSPC            ^DJI        ^KS11                
Date                                                                 
2013-02-04   1495.709961    13880.080078  1953.209961    16.833122   
2013-02-05   1511.290039    13979.299805  1938.180054    16.887413   
2013-02-06   1512.119995    13986.519531  1936.189941    16.803706   
2013-02-07   1509.390015    13944.049805  1931.770020    16.328852   
2013-02-08   1517.930054    13992.969727  1950.900024    16.451989   
...                  ...             ...          ...          ...   
2023-12-21   4746.750000    37404.351562  2600.020020    76.724339   
2023-12-22   4754.629883    37385.968750  2599.510010    76.666913   
2023-12-26   4774.750000    37545.328125  2602.590088    77.353248   
2023-12-27   4781.580078    37656.519531  2613.500000    77.554305   
2023-12-28   4783.350098    37710.101562  2655.280029    76.969218   

Price      Dow Jone

# Environment

In [6]:
class env():
  def __init__(self, data:pd.DataFrame, train_split_date = '2021-01-01', valid_split_date = '2022-01-01', test_split_date  = '2023-01-01'):
    self.data = data
    self.t = 0
    self.train_data = torch.tensor(self.data[self.data.index < train_split_date].values).to(torch.float32).to(device)
    self.valid_data = torch.tensor(self.data[(self.data.index >= train_split_date) & (self.data.index < valid_split_date)].values).to(torch.float32).to(device)
    self.test_data = torch.tensor(self.data[self.data.index >= test_split_date].values).to(torch.float32).to(device)

  def step(self, state, action, data): #데이터 앞의 세개는 각 인덱스의 가격으로 설정
      shares_to_buy = torch.round(action).to(torch.float32).to(device)
      num_stock, balance = state[-4:-1], state[-1]
      W = torch.dot(state[:3], num_stock) + balance
      #print(shares_to_buy.device, num_stock.device, balance.device)
      shares_to_buy = torch.where(shares_to_buy + num_stock < 0, -num_stock, shares_to_buy)
      cost = torch.dot(data[self.t,:3], shares_to_buy)

      #print(f"shares_to_buy: {shares_to_buy}, balance: {balance}, cost: {cost}")
      if cost > balance:
          cost = torch.tensor(0).to(device)
          shares_to_buy = torch.zeros(3).to(device)
      n_balance = balance - cost
      self.t += 1

      done = self.t >= (data.shape[0] - 1)

      n_num_stock = num_stock + shares_to_buy
      n_W = torch.dot(data[self.t,:3], n_num_stock) + n_balance

      n_state = torch.hstack((data[self.t, :], n_num_stock, n_balance))
      reward = n_W - W

      #print(f"n_balance: {n_balance}, n_stock: {n_num_stock}, n_W: {n_W}, reward: {reward}")
      return n_state, reward, done

  def train_step(self, state, action):
      return self.step(state, action, self.train_data)

  def test_step(self, state, action):
      return self.step(state, action, self.test_data)

  def valid_step(self, state, action):
      return self.step(state, action, self.valid_data)

  def reset(self, data):
      self.t = 0
      init_S = torch.zeros(3).to(device)
      init_B = torch.tensor(1e8).to(device)
      init_state = torch.hstack((data[0,:], init_S, init_B))
      return init_state

  def train_reset(self):
      return self.reset(self.train_data)

  def test_reset(self):
      return self.reset(self.test_data)

  def valid_reset(self):
      return self.reset(self.valid_data)

# PPO Agent

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque
import model
import importlib
importlib.reload(model)

GAMMA = 0.99
LAM = 0.95  # Lambda for GAE
LEARNING_RATE = 0.0001
CLIP_EPSILON = 0.1
PPO_EPOCHS = 5
BATCH_SIZE = 256
K = 100  # 최대 매수/매도 개수

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class PPOAgent:
    def __init__(self, window_size, num_stocks, feature_len, n_channels, k=K):
        self.num_stocks = num_stocks
        self.feature_extractor = model.PreLSTM(n_channels, hidden_dim=128, output_dim=feature_len).to(device)
        self.policy_network = model.PolicyNetwork(self.feature_extractor, num_stocks).to(device)
        self.value_network = model.ValueNetwork(self.feature_extractor).to(device)

        self.policy_optimizer = optim.Adam(list(self.feature_extractor.parameters()) + list(self.policy_network.parameters()), lr=LEARNING_RATE)
        self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=LEARNING_RATE)

        self.gamma = GAMMA
        self.lam = LAM
        self.clip_epsilon = CLIP_EPSILON
        self.k = k

        # Trajectory buffer
        self.states = []
        self.actions = []
        self.rewards = []
        self.dones = []
        self.log_probs = []
        self.values = []

        self.state_buffer = deque(maxlen=window_size)

    def act(self, state):
        self.state_buffer.append(state)
        if len(self.state_buffer) < self.state_buffer.maxlen:
            return torch.zeros(self.num_stocks).to(device), 0., 0.

        state_tensor = torch.stack(list(self.state_buffer)).to(torch.float32).to(device)
        with torch.no_grad():
            mu, sigma_diag = self.policy_network(state_tensor)
            sigma = torch.diag(torch.clamp(sigma_diag, min=1e-6))
            dist = torch.distributions.MultivariateNormal(mu, covariance_matrix=sigma)
            action = dist.sample()
            log_prob = dist.log_prob(action)
            clipped_action = torch.clamp(action, -1, 1)
            discrete_action = (clipped_action * self.k).long()

            value = self.value_network(state_tensor)
        #print(f"actions: {discrete_action.shape}, log_prob: {log_prob}, value: {value.shape}")
        return discrete_action.cpu(), log_prob.cpu().item(), value.cpu().item()

    def store_transition(self, state, action, reward, done, log_prob, value):
        # Calculate after the trajectory
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.dones.append(done)
        self.log_probs.append(log_prob)
        self.values.append(value)

    def compute_returns_and_advantages(self, last_value=0.0):
        values = self.values + [last_value]
        advantages = []
        gae = 0
        for step in reversed(range(len(self.rewards))):
            delta = self.rewards[step] + self.gamma * values[step+1]*(1 - self.dones[step]) - values[step]
            gae = delta + self.gamma * self.lam * (1 - self.dones[step]) * gae
            advantages.insert(0, gae)
        returns = [adv + val for adv, val in zip(advantages, self.values)]
        advantages = torch.tensor(advantages, dtype=torch.float32).to(device)
        returns = torch.tensor(returns, dtype=torch.float32).to(device)
        # Normalize Advantage
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
        return returns, advantages

    def ppo_update(self, states, actions, log_probs, returns, advantages):
        # PPO update
        dataset_size = states.shape[0]
        for _ in range(PPO_EPOCHS):
            indices = np.arange(dataset_size)
            np.random.shuffle(indices)
            for start in range(0, dataset_size, BATCH_SIZE):
                end = start + BATCH_SIZE
                batch_idx = indices[start:end]

                batch_states = states[batch_idx].to(device)
                batch_actions = actions[batch_idx].to(torch.float32).to(device)
                batch_log_probs_old = log_probs[batch_idx].to(device)
                batch_returns = returns[batch_idx].to(device)
                batch_advantages = advantages[batch_idx].to(device)

                mu, sigma_diag = self.policy_network(batch_states)
                sigma = []
                for s_diag in sigma_diag:
                    sigma.append(torch.diag(torch.clamp(s_diag, min=1e-6)))
                sigma = torch.stack(sigma) # (batch_size, num_stocks, num_stocks)
                dist = torch.distributions.MultivariateNormal(mu, covariance_matrix=sigma)
                new_log_probs = dist.log_prob(batch_actions)

                ratio = torch.exp(new_log_probs - batch_log_probs_old)

                # Clipped objective
                surr1 = ratio * batch_advantages
                surr2 = torch.clamp(ratio, 1.0 - self.clip_epsilon, 1.0 + self.clip_epsilon) * batch_advantages
                policy_loss = -torch.min(surr1, surr2).mean()

                # Value loss
                value = self.value_network(batch_states).squeeze()
                value_loss = nn.MSELoss()(value, batch_returns)

                total_loss = policy_loss + 0.5 * value_loss

                self.policy_optimizer.zero_grad()
                self.value_optimizer.zero_grad()
                total_loss.backward()
                self.policy_optimizer.step()
                self.value_optimizer.step()

    def finish_trajectory_and_update(self, last_state=None, done=False):
        # Update
        if len(self.states) == 0:
            return

        with torch.no_grad():
            if last_state is not None:
                state_tensor = torch.stack(list(self.state_buffer)).to(torch.float32).to(device)
                last_value = self.value_network(state_tensor).item() if not done else 0.0
            else:
                last_value = 0.0

        returns, advantages = self.compute_returns_and_advantages(last_value)

        states_tensor = torch.stack(self.states).to(torch.float32)
        actions_tensor = torch.stack(self.actions).to(torch.float32)
        log_probs_tensor = torch.tensor(self.log_probs, dtype=torch.float32)

        self.ppo_update(states_tensor, actions_tensor, log_probs_tensor, returns, advantages)

        self.states = []
        self.actions = []
        self.rewards = []
        self.dones = []
        self.log_probs = []
        self.values = []


# PPO Train

In [8]:
EPISODE = 500
Env = env(merged_data_all)
Agent =  PPOAgent(window_size = 30, num_stocks = 3, feature_len = 128, n_channels=16, k=20)

for episode in range(EPISODE):
    state = Env.train_reset().to(device)
    total_reward = torch.tensor(0, device = device, dtype = torch.float32)
    for i in range(Env.train_data.shape[0]-2):
        action, log_prob, value = Agent.act(state.to(device))
        next_state, reward, done = Env.train_step(state, action)

        if len(Agent.state_buffer) == Agent.state_buffer.maxlen:
            state_to_be_stored = torch.stack(list(Agent.state_buffer)).to(torch.float32).to(device)
            Agent.store_transition(state_to_be_stored, action, reward, done, log_prob, value)

        state = next_state
        total_reward += reward

        if done:
            break
    Agent.finish_trajectory_and_update(last_state = state, done = done)

    if episode % 1 == 0:
        print(f"Episode: {episode + 1}, Total Reward: {total_reward}")

    if episode % 5 == 0:
        state = Env.valid_reset()
        valid_reward = torch.tensor(0, device = device, dtype = torch.float32)

        with torch.no_grad():
            for i in range(Env.test_data.shape[0]-2):
                action, _, _ = Agent.act(state.to(device))
                next_state, reward, done = Env.test_step(state, action)
                state = next_state
                valid_reward += reward

        print("validation_reward:",valid_reward.item())


  return disable_fn(*args, **kwargs)


Episode: 1, Total Reward: 1236992.0
validation_reward: 274520.0
Episode: 2, Total Reward: 9685024.0
Episode: 3, Total Reward: 70895184.0
Episode: 4, Total Reward: 86440000.0
Episode: 5, Total Reward: 87056784.0
Episode: 6, Total Reward: 87411728.0
validation_reward: 9089744.0
Episode: 7, Total Reward: 87735456.0
Episode: 8, Total Reward: 90496256.0
Episode: 9, Total Reward: 89104928.0
Episode: 10, Total Reward: 89630320.0
Episode: 11, Total Reward: 89100064.0
validation_reward: 10849184.0
Episode: 12, Total Reward: 90457984.0
Episode: 13, Total Reward: 88922288.0
Episode: 14, Total Reward: 87412192.0
Episode: 15, Total Reward: 86548080.0
Episode: 16, Total Reward: 86741904.0
validation_reward: 10976280.0
Episode: 17, Total Reward: 88667040.0
Episode: 18, Total Reward: 87658480.0
Episode: 19, Total Reward: 86345472.0
Episode: 20, Total Reward: 85374688.0
Episode: 21, Total Reward: 85681392.0
validation_reward: 10672272.0


KeyboardInterrupt: 

# PPO Test

테스트는 트레이닝시와 동일하게 1억원의 잔고를 가지고 진행
최종적으로 10.7퍼센트 가량의 수익을 올렸습니다

In [9]:
Env = env(merged_data_all)
state = Env.test_reset()
total_reward = torch.tensor(0, device = device, dtype = torch.float32)

with torch.no_grad():
    for i in range(Env.test_data.shape[0]-2):
        action, _, _ = Agent.act(state.to(device))
        next_state, reward, done = Env.test_step(state, action)
        #print(f"{i}th reward:{reward}")
        state = next_state
        total_reward += reward

print("total_reward:",total_reward.item())


total_reward: 10761976.0
