# Library import & Drive mount

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd /content/drive/MyDrive/Reinforcement_Learning/

/content/drive/MyDrive/Reinforcement_Learning


In [None]:
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from collections import deque
import torch.optim as optim
import importlib
from calculate_tech_ind import calculate_macd, calculate_rsi, calculate_cci, calculate_adx
from model import PreLSTM, PolicyNetwork

In [None]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
set_seed(1)

cpu


# Data download and Preprocessing

In [None]:
symbols = {
    'S&P 500': '^GSPC',
    'Dow Jones': '^DJI',
    'KOSPI': '^KS11'
}
start_date = '2012-12-01'
end_date = '2023-12-31'

market_data = {}
for name, ticker in symbols.items():
    data = yf.download(ticker, start=start_date, end=end_date)
    market_data[name] = data

snp500_data = market_data['S&P 500']
dowjones_data = market_data['Dow Jones']
kospi_data = market_data['KOSPI']
#print(snp500_data)

# 공통 거래일 계산
common_dates = snp500_data.index.intersection(dowjones_data.index).intersection(kospi_data.index)

# 공통 거래일 기준 데이터 정렬
snp500_data_aligned = snp500_data.loc[common_dates]
dowjones_data_aligned = dowjones_data.loc[common_dates]
kospi_data_aligned = kospi_data.loc[common_dates]

snp500_data_aligned = calculate_macd(snp500_data_aligned)
snp500_data_aligned = calculate_rsi(snp500_data_aligned)
snp500_data_aligned = calculate_cci(snp500_data_aligned)

dowjones_data_aligned = calculate_macd(dowjones_data_aligned)
dowjones_data_aligned = calculate_rsi(dowjones_data_aligned)
dowjones_data_aligned = calculate_cci(dowjones_data_aligned)

kospi_data_aligned = calculate_macd(kospi_data_aligned)
kospi_data_aligned = calculate_rsi(kospi_data_aligned)
kospi_data_aligned = calculate_cci(kospi_data_aligned)

# Close, MACD, RSI 병합
merged_close = pd.concat([
    snp500_data_aligned[['Close']].rename(columns={'Close': 'S&P 500 Close'}),
    dowjones_data_aligned[['Close']].rename(columns={'Close': 'Dow Jones Close'}),
    kospi_data_aligned[['Close']].rename(columns={'Close': 'KOSPI Close'})
], axis=1)

merged_macd = pd.concat([
    snp500_data_aligned[['MACD']].rename(columns={'MACD': 'S&P 500 MACD'}),
    dowjones_data_aligned[['MACD']].rename(columns={'MACD': 'Dow Jones MACD'}),
    kospi_data_aligned[['MACD']].rename(columns={'MACD': 'KOSPI MACD'})
], axis=1)

merged_rsi = pd.concat([
    snp500_data_aligned[['RSI']].rename(columns={'RSI': 'S&P 500 RSI'}),
    dowjones_data_aligned[['RSI']].rename(columns={'RSI': 'Dow Jones RSI'}),
    kospi_data_aligned[['RSI']].rename(columns={'RSI': 'KOSPI RSI'})
], axis=1)

merged_cci = pd.concat([
    snp500_data_aligned[['CCI']].rename(columns={'CCI': 'S&P 500 CCI'}),
    dowjones_data_aligned[['CCI']].rename(columns={'CCI': 'Dow Jones CCI'}),
    kospi_data_aligned[['CCI']].rename(columns={'CCI': 'KOSPI CCI'})
], axis=1)

# 전체 병합 데이터 생성
merged_data_all = pd.concat([merged_close, merged_macd, merged_rsi, merged_cci], axis=1)

# 날짜 기준 필터링
merged_data_all = merged_data_all[merged_data_all.index > '2013-02-01']

print(merged_data_all)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Price      S&P 500 Close Dow Jones Close  KOSPI Close S&P 500 MACD  \
Ticker             ^GSPC            ^DJI        ^KS11                
Date                                                                 
2013-02-04   1495.709961    13880.080078  1953.209961    16.833122   
2013-02-05   1511.290039    13979.299805  1938.180054    16.887413   
2013-02-06   1512.119995    13986.519531  1936.189941    16.803706   
2013-02-07   1509.390015    13944.049805  1931.770020    16.328852   
2013-02-08   1517.930054    13992.969727  1950.900024    16.451989   
...                  ...             ...          ...          ...   
2023-12-21   4746.750000    37404.351562  2600.020020    76.724339   
2023-12-22   4754.629883    37385.968750  2599.510010    76.666913   
2023-12-26   4774.750000    37545.328125  2602.590088    77.353248   
2023-12-27   4781.580078    37656.519531  2613.500000    77.554305   
2023-12-28   4783.350098    37710.101562  2655.280029    76.969218   

Price      Dow Jone

# Environment

In [None]:
class env():
  def __init__(self, data:pd.DataFrame, train_split_date = '2021-01-01', valid_split_date = '2022-01-01', test_split_date  = '2023-01-01'):
    self.data = data
    self.t = 0
    self.train_data = torch.tensor(self.data[self.data.index < train_split_date].values).to(torch.float32).to(device)
    self.valid_data = torch.tensor(self.data[(self.data.index >= train_split_date) & (self.data.index < valid_split_date)].values).to(torch.float32).to(device)
    self.test_data = torch.tensor(self.data[self.data.index >= test_split_date].values).to(torch.float32).to(device)
    #self.max_shares = 100

  def step(self, state, action, data): #데이터 앞의 세개는 각 인덱스의 가격으로 설정
      shares_to_buy = torch.round(action).to(torch.float32).to(device)
      num_stock, balance = state[-4:-1], state[-1]
      W = torch.dot(state[:3], num_stock) + balance
      #print(shares_to_buy.device, num_stock.device, balance.device)
      shares_to_buy = torch.where(shares_to_buy + num_stock < 0, -num_stock, shares_to_buy)
      cost = torch.dot(data[self.t,:3], shares_to_buy)

      #print(f"shares_to_buy: {shares_to_buy}, balance: {balance}, cost: {cost}")
      if cost > balance:
          cost = torch.tensor(0).to(device)
          shares_to_buy = torch.zeros(3).to(device)
      n_balance = balance - cost
      self.t += 1

      done = self.t >= (data.shape[0] - 1)

      n_num_stock = num_stock + shares_to_buy
      n_W = torch.dot(data[self.t,:3], n_num_stock) + n_balance

      n_state = torch.hstack((data[self.t, :], n_num_stock, n_balance))
      reward = n_W - W

      #print(f"n_balance: {n_balance}, n_stock: {n_num_stock}, n_W: {n_W}, reward: {reward}")
      return n_state, reward, done

  def train_step(self, state, action):
      return self.step(state, action, self.train_data)

  def test_step(self, state, action):
      return self.step(state, action, self.test_data)

  def valid_step(self, state, action):
      return self.step(state, action, self.valid_data)

  def reset(self, data):
      self.t = 0
      init_S = torch.zeros(3).to(device)
      init_B = torch.tensor(1e8).to(device)
      init_state = torch.hstack((data[0,:], init_S, init_B))
      return init_state

  def train_reset(self):
      return self.reset(self.train_data)

  def test_reset(self):
      return self.reset(self.test_data)

  def valid_reset(self):
      return self.reset(self.valid_data)

# A2C Agent

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import model
import importlib
importlib.reload(model)

GAMMA = 0.99
LEARNING_RATE = 0.0001
REPLAY_MEMORY_SIZE = 5000
MIN_REPLAY_MEMORY_SIZE = 1000
BATCH_SIZE = 128
TAU = 0.1
K = 100

class A2CAgent:
    def __init__(self, window_size, num_stocks, feature_len, n_channels, k):
        self.num_stocks = num_stocks
        self.feature_extractor = model.PreLSTM(n_channels, hidden_dim=128, output_dim=feature_len).to(device)
        self.policy_network = model.PolicyNetwork(self.feature_extractor, num_stocks).to(device)
        self.value_network = model.ValueNetwork(self.feature_extractor).to(device)

        self.policy_optimizer = optim.Adam(list(self.feature_extractor.parameters()) + list(self.policy_network.parameters()), lr=LEARNING_RATE)
        self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=LEARNING_RATE)

        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)
        self.gamma = GAMMA
        self.state_buffer = deque(maxlen=window_size)
        self.k = k

    def act(self, state):
        self.state_buffer.append(state)
        if len(self.state_buffer) < self.state_buffer.maxlen:
            return torch.zeros(self.num_stocks).to(device)

        state = torch.stack(list(self.state_buffer)).to(torch.float32).to(device)

        # action sampling
        with torch.no_grad():
            mu, sigma_diag = self.policy_network(state)  # size: (6,)
            sigma = torch.diag(torch.clamp(sigma_diag, min=1e-6))

            # use Multivariate Normal Distribution
            action_dist = torch.distributions.MultivariateNormal(mu, covariance_matrix=sigma)
            sampled_action = action_dist.sample()
            clipped_action = torch.clamp(sampled_action, -1, 1)  # Clipping [-1, 1]
            discrete_action = (clipped_action * self.k).long()  # from continuous [-1,1]^3 to discrete {-k, .. ,0, .. k}^3

        return discrete_action.cpu()

    def remember(self, state, action, reward, next_state, done):
        self.replay_memory.append((state, action, reward, next_state, done))

    def replay(self):
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return

        minibatch = random.sample(self.replay_memory, BATCH_SIZE)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        states = torch.stack(states).to(torch.float32).to(device)
        actions = torch.stack(actions).to(torch.float32).to(device)
        rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
        next_states = torch.stack(next_states).to(torch.float32).to(device)
        dones = torch.tensor(dones, dtype=torch.float32).to(device)

        # Calculate Advantage
        with torch.no_grad():
            next_values = self.value_network(next_states).squeeze()  # (BATCH_SIZE,)
            targets = rewards + (1 - dones) * self.gamma * next_values
            advantages = targets - self.value_network(states).squeeze()

        mu, sigma_diag = self.policy_network(states)
        #print(mu.shape, sigma_diag.shape)
        sigma = torch.stack([torch.diag(torch.clamp(s, min=1e-6)) for s in sigma_diag])

        action_dist = torch.distributions.MultivariateNormal(mu, covariance_matrix=sigma)
        log_probs = action_dist.log_prob(actions)
        policy_loss = -(log_probs * advantages).mean()

        # Value Loss
        value_loss = nn.MSELoss()(self.value_network(states).squeeze(), targets)

        # Total Loss
        total_loss = policy_loss + 0.5 * value_loss

        self.policy_optimizer.zero_grad()
        self.value_optimizer.zero_grad()
        total_loss.backward()
        self.policy_optimizer.step()
        self.value_optimizer.step()


# A2C Train

In [None]:
EPISODE = 500
Env = env(merged_data_all)
Agent =  A2CAgent(window_size = 30, num_stocks = 3, feature_len = 128, n_channels=16, k=K)

for episode in range(EPISODE):
    state = Env.train_reset()
    total_reward = torch.tensor(0, device = device, dtype = torch.float32)
    for i in range(Env.train_data.shape[0]-2):
        #print(i)
        action = Agent.act(state.to(device))
        next_state, reward, done = Env.train_step(state, action)
        #print(torch.stack(list(Agent.state_buffer)).shape)
        if torch.stack(list(Agent.state_buffer)).shape[0] == 30:
            Agent.remember(torch.stack(list(Agent.state_buffer)), action, reward, next_state, done)
        state = next_state
        total_reward += reward

        if i % 500 == 0:
            Agent.replay()
        if done:
            break


    if episode % 1 == 0:
        print(f"Episode: {episode + 1}, Total Reward: {total_reward}")

    if episode % 5 == 0:
        Env = env(merged_data_all)
        state = Env.valid_reset()
        valid_reward = torch.tensor(0, device = device, dtype = torch.float32)

        with torch.no_grad():
            for i in range(Env.test_data.shape[0]-2):
                action = Agent.act(state.to(device))
                next_state, reward, done = Env.test_step(state, action)
                state = next_state
                valid_reward += reward

        print("validation_reward:",valid_reward.item())


#torch.save(Agent.model.state_dict(), f'DQN_epoch{EPISODE}trained_model_weights.pth')

Episode: 1, Total Reward: 13361544.0
validation_reward: 2024224.0
Episode: 2, Total Reward: 20652144.0
Episode: 3, Total Reward: 22521392.0
Episode: 4, Total Reward: 28536272.0
Episode: 5, Total Reward: 50416104.0
Episode: 6, Total Reward: 24773080.0
validation_reward: 4356240.0
Episode: 7, Total Reward: 27925904.0
Episode: 8, Total Reward: 34905624.0
Episode: 9, Total Reward: 31565440.0
Episode: 10, Total Reward: 33008776.0
Episode: 11, Total Reward: 27200072.0
validation_reward: 1882760.0
Episode: 12, Total Reward: 32660880.0
Episode: 13, Total Reward: 42158608.0
Episode: 14, Total Reward: 21295640.0
Episode: 15, Total Reward: 27897720.0
Episode: 16, Total Reward: 31777408.0
validation_reward: 5701712.0
Episode: 17, Total Reward: 34404560.0
Episode: 18, Total Reward: 33697024.0
Episode: 19, Total Reward: 35055896.0
Episode: 20, Total Reward: 39135984.0
Episode: 21, Total Reward: 32735960.0
validation_reward: 2510464.0
Episode: 22, Total Reward: 28244680.0
Episode: 23, Total Reward: 3

# A2C Test

테스트는 트레이닝시와 동일하게 1억원의 잔고를 가지고 진행
초기값에 큰 영향을 받아서 낮으면 1%에서 높으면 11% 정도의 수익을 올렸습니다

In [None]:
Env = env(merged_data_all)
state = Env.test_reset()
total_reward = torch.tensor(0, device = device, dtype = torch.float32)

with torch.no_grad():
    for i in range(Env.test_data.shape[0]-2):
        action = Agent.act(state.to(device))
        next_state, reward, done = Env.test_step(state, action)
        #print(f"{i}th reward:{reward}")
        state = next_state
        total_reward += reward

print("total_reward:",total_reward)


total_reward: tensor(11387168.)
