**코스피의 10일 가격 데이터를 상태로 사용하고, 두 ETF 상품(인버스, 레버리지)의 비율을 행동으로 하는 강화학습 모델**

## **Import Modules**

In [1]:
import torch
import torch.nn as nn
from torch.distributions import MultivariateNormal, Beta

from sklearn.preprocessing import MinMaxScaler

import pandas as pd
import numpy as np
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
import random, os

## **모델 구현**

### **메모리 버퍼**

In [2]:
class Memory:
    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []
    
    def clear_memory(self):
        del self.actions[:]
        del self.states[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]


### **에이전트**

In [3]:
class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim, action_std, NN_conf, use_gpu = True):
        super(ActorCritic, self).__init__()
        # action mean range 0 to 1
        # action dim = 1 
        if NN_conf == 'sigmoid':
            self.actor =  nn.Sequential(
                    nn.Linear(state_dim, 128),
                    nn.Tanh(),
                    nn.Linear(128, 64),
                    nn.Tanh(),
                    nn.Linear(64, action_dim),
                    nn.Sigmoid()
                    )
            # critic
            self.critic = nn.Sequential(
                    nn.Linear(state_dim, 128),
                    nn.Tanh(),
                    nn.Linear(128, 64),
                    nn.Tanh(),
                    nn.Linear(64, 1)
                    )
        elif NN_conf == 'relu':
            print('ReLU')
            self.actor =  nn.Sequential(
                    nn.Linear(state_dim, 128),
                    nn.ReLU(),
                    nn.Linear(128, 64),
                    nn.ReLU(),
                    nn.Linear(64, action_dim),
                    nn.ReLU()
                    )
            # critic
            self.critic = nn.Sequential(
                    nn.Linear(state_dim, 128),
                    nn.ReLU(),
                    nn.Linear(128, 64),
                    nn.ReLU(),
                    nn.Linear(64, 1)
                    )
            
        self.set_device(use_gpu)

        self.action_var = torch.full((action_dim,), action_std*action_std).to(self.device)

    def set_device(self, use_gpu = False):
        if use_gpu:
            self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        else:
            self.device = "cpu"


    def forward(self):
        raise NotImplementedError
    
    def act(self, state, memory, gready):
        action_mean = self.actor(state)
        if not gready:
            cov_mat = torch.diag(self.action_var).to(self.device)
            dist = MultivariateNormal(action_mean, cov_mat)
            
            action = dist.sample()
            action = torch.clamp(action, 0, 1)  # Clamping the action
            
            action_logprob = dist.log_prob(action)
            
            memory.states.append(state)
            memory.actions.append(action)
            memory.logprobs.append(action_logprob)
                 
            return action.detach()
        
        else:
            return action_mean.detach()

    def evaluate(self, state, action):   
        # action mean, get the 1 actions from NN
        action_mean = self.actor(state)
        #print(action_mean)
        # [0.25, 0.25 , 0.25]. Expand this tesor to same size as other
        action_var = self.action_var.expand_as(action_mean)
        # creates diag matrix 3x3
        #print(self.device)
        cov_mat = torch.diag_embed(action_var).to(self.device)
        
        dist = MultivariateNormal(action_mean, cov_mat)
        action_logprobs = dist.log_prob(action)
        
        dist_entropy = dist.entropy()
        state_value = self.critic(state)
        #print('state', state_value)
        return action_logprobs, torch.squeeze(state_value), dist_entropy

In [4]:
class PPO:
    def __init__(self, state_dim, action_dim, conf_ppo, use_gpu=False):
        self.lr = conf_ppo['lr']
        self.betas = conf_ppo['betas']
        self.gamma = conf_ppo['gamma']
        self.eps_clip = conf_ppo['eps_clip']
        self.K_epochs = conf_ppo['K_epochs']
        action_std = conf_ppo['action_std']
        self.set_device(use_gpu)
        self.policy = ActorCritic(state_dim, action_dim, action_std, NN_conf=conf_ppo['nn_type'], use_gpu=use_gpu).to(self.device)
        self.policy_old = ActorCritic(state_dim, action_dim, action_std, NN_conf=conf_ppo['nn_type'], use_gpu=use_gpu).to(self.device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=self.lr, betas=self.betas)
        self.policy_old.load_state_dict(self.policy.state_dict())
        self.MseLoss = nn.MSELoss()
        self.lam_a = conf_ppo['lam_a']
        self.normalize_rewards = conf_ppo['normalize_rewards']
        self.loss_total = 0.0
        self.loss_a = 0.0
        self.loss_max = 0.0
        self.loss_min = 0.0

    def set_device(self, use_gpu=True, set_policy=False):
        if use_gpu:
            self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        else:
            self.device = "cpu"
        if set_policy:
            self.policy.actor.to(self.device)
            self.policy.critic.to(self.device)
            self.policy.action_var.to(self.device)
            self.policy.set_device(self.device)
            self.policy_old.actor.to(self.device)
            self.policy_old.critic.to(self.device)
            self.policy_old.action_var.to(self.device)
            self.policy_old.set_device(self.device)

    def select_action(self, state, memory, greedy=False):
        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        return self.policy_old.act(state, memory, greedy).cpu().data.numpy().flatten()

    def estimate_action(self, state, action):
        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        action = torch.FloatTensor(action.reshape(1, -1)).to(self.device)
        return self.policy_old.evaluate(state, action)

    def update(self, memory, to_tensor=False, use_gpu=True):
        self.set_device(use_gpu, set_policy=True)
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards), reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)
        if to_tensor:
            memory.states = [torch.FloatTensor(i.reshape(1, -1)).to(self.device) for i in memory.states]
            memory.actions = [torch.FloatTensor(i.reshape(1, -1)).to(self.device) for i in memory.actions]
            memory.logprobs = [torch.FloatTensor(i.reshape(1, -1)).to(self.device) for i in memory.logprobs]
        rewards = torch.tensor(rewards, dtype=torch.float32).to(self.device)
        if self.normalize_rewards:
            rewards = ((rewards - rewards.mean())/(rewards.std() + 1e-7)).to(self.device)
        old_states = torch.squeeze(torch.stack(memory.states).to(self.device), 1).detach()
        old_actions = torch.squeeze(torch.stack(memory.actions).to(self.device), 1).detach()
        old_logprobs = torch.squeeze(torch.stack(memory.logprobs), 1).to(self.device).detach()
        for _ in range(self.K_epochs):
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)
            ratios = torch.exp(logprobs - old_logprobs.detach())
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages
            mseLoss = 0.5 * self.MseLoss(state_values, rewards)
            loss = -torch.min(surr1, surr2) + mseLoss - 0.01 * dist_entropy
            if self.lam_a != 0:
                mu = torch.squeeze(torch.stack(memory.actions[:-1]).to(self.device), 1).detach()
                mu_nxt = torch.squeeze(torch.stack(memory.actions[1:]).to(self.device), 1).detach()
                loss += 0.5 * self.MseLoss(mu_nxt, mu) * self.lam_a
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()
            
        self.loss_total = loss.cpu().data.numpy().flatten()[0]
        self.loss_a = mseLoss.cpu().data.numpy().flatten()[0]
        self.loss_max = advantages.max().cpu().data.numpy().flatten()[0]
        self.loss_min = advantages.min().cpu().data.numpy().flatten()[0]
        self.policy_old.load_state_dict(self.policy.state_dict())


### **환경 설정**

In [5]:
class TradingEnv:
    def __init__(self, config, scaler):
        self.config = config
        self.scaler = scaler
        
        self.kospi_data = pd.read_csv(self.config['kospi_path']).iloc[:, 1:].values
        self.leverage_data = pd.read_csv(self.config['leverage_path'])['Close'].values
        self.inverse_data = pd.read_csv(self.config['inverse_path'])['Close'].values
        
    def reset(self, rand_state=False):
        self.day = 0
        self.reset_reward()
        self.init_cash = self.config['initial_cash']
        
        self.price_a = self.leverage_data[self.day]
        self.price_b = self.inverse_data[self.day]
        if rand_state:
            random_ratio = np.random.rand()
            self.na = int(self.init_cash * random_ratio / self.price_a)
            self.nb = int(self.init_cash * (1-random_ratio) / self.price_b)
        else:
            self.na = int(self.init_cash * 0.5 / self.price_a)
            self.nb = int(self.init_cash * 0.5 / self.price_b)
        self.init_total = self.na * self.price_a + self.nb * self.price_b
        self.total_cash = self.init_total
        # print(f"reset -> day : {self.day}, {self.total_cash} = {self.na} * {self.price_a} + {self.nb} * {self.price_b}")
        self.state = self.get_state()
        # print(f"init state : {self.state}")
        return self.state
    
    def get_state(self):
        data = self.kospi_data[self.day:self.day + self.config['state_window']]
        scaled_data = self.scaler.transform(data)
        return scaled_data.flatten()
    
    def get_reward(self):
        self.price_a = self.leverage_data[self.day]
        self.price_b = self.inverse_data[self.day]
        self.total_cash = self.na * self.price_a + self.nb * self.price_b
        reward = self.total_cash - self.init_total
        # print(f"{self.total_cash} = {self.na} * {self.price_a} + {self.nb} * {self.price_b}")
        return reward
    
    def reset_reward(self):
        self.reward_max_time = 0
        self.reward_trigger = 0
        
    def is_done(self, reward):
        if self.day == self.config['sim_day_max']:
            print("Max Time")
            self.reward_max_time = 1
            return True
        
        if reward / self.init_total < -0.05:
            # print("Stop loss triggered")
            self.reward_trigger = 1
            return True
        
        return False
            
    def step(self, action):
        leverage_ratio = action[0]
        inverse_ratio = 1 - leverage_ratio
        
        self.na = int(self.total_cash * leverage_ratio / self.price_a)
        self.nb = int(self.total_cash * inverse_ratio / self.price_b)
        # print(f"step -> day : {self.day}, {self.na} = int({self.total_cash} * {leverage_ratio} / {self.price_a}), {self.nb} = int({self.total_cash} * {inverse_ratio} / {self.price_b})")
        
        self.day += 1
        reward = self.get_reward()
        done = self.is_done(reward)
        state = self.get_state()
        # print(f"step -> day : {self.day}, state : {state}, reward : {reward}, done : {done}")
        return state, reward, done, None

In [6]:
# kospi_data = pd.read_csv(conf['kospi_path']).iloc[:, 1:].values
# kospi_data

In [7]:
# running_reward = 0.0
# state_dim = conf['state_dim']
# action_dim = conf['action_dim']
# memory = Memory()

# ppo = PPO(state_dim, action_dim, conf)
# env = TradingEnv(conf)
# state = env.reset(rand_state=False)

# done = False
# action = np.zeros(1)
# for i in range(30):
#     act = ppo.select_action(state, memory)
#     action[0] = act[0]
#     print(f"action : {action}")
#     state, reward, done, _ = env.step(action)
#     print(f"---------------------------------------------------")

## **에피소드 진행**

In [8]:
def get_scaler(kospi_data):
    scaler = MinMaxScaler()
    scaler.fit(kospi_data)
    return scaler

In [9]:
def train(config):
    writer = SummaryWriter('runs/'+ config['track'])
    memory = Memory()
    state_dim = config['state_dim']
    action_dim = config['action_dim']
    
    ppo = PPO(state_dim, action_dim, config)
    
    best_reward = -float('inf')
    time_step = 0
    
    kospi_data = pd.read_csv(config['kospi_path']).iloc[:, 1:].values
    scaler = get_scaler(kospi_data)
    env = TradingEnv(config, scaler)
    
    for i_episode in tqdm(range(1, config['max_episodes']+1)):
        
        running_reward = 0.0
        state = env.reset()
        
        done = False
        action = np.zeros(1)
        step_day = 0
        
        while not done:
            act = ppo.select_action(state, memory)
            action[0] = act[0]
            
            state, reward, done, _ = env.step(action)
            
            # Log step reward and action
            writer.add_scalar(f"Episode_{i_episode}/Step_Reward", running_reward, step_day)
            writer.add_scalar(f"Episode_{i_episode}/Step_Action", action[0], step_day)
            
            # store rewards
            memory.rewards.append(reward)
            memory.is_terminals.append(done)
            
            step_day += 1
            time_step += 1
            running_reward += reward
            ## PPO update
            if time_step % config['update_timestep'] == 0:
                ppo.update(memory)
                memory.clear_memory()
                time_step = 0
                
        writer.add_scalar("Reward", running_reward, i_episode)
        writer.add_scalar("Loss", ppo.loss_total, i_episode)
        writer.add_scalar("Loss_MSE", ppo.loss_a, i_episode)
        writer.add_scalar("Loss_adv_max", ppo.loss_max, i_episode)
        writer.add_scalar("Loss_adv_min", ppo.loss_min, i_episode)
        writer.add_scalar("Max_Time", env.reward_max_time, i_episode)
        writer.add_scalar("Trigger", env.reward_trigger, i_episode)
        
        # save best model
        if running_reward > best_reward:
            best_reward = running_reward
            torch.save(ppo.policy.state_dict(), f'ppo_models/best_stock_ppo.pth')
        
    # save model after training
    torch.save(ppo.policy.state_dict(), 'ppo_models/stock_ppo.pth')

In [10]:
conf = {
    'track': 'RL_ppo_training',
    'max_episodes': 1000,
    'update_timestep': 200,
    'action_std': 0.6,
    'K_epochs': 80, # update policy for K epochs
    'eps_clip': 0.2, # clip parameter for PPO
    'gamma': 1.0, # discount factor
    'lr': 1e-5,
    'betas': (0.9, 0.999),
    'random_seed': None,
    'lam_a': 0,
    'normalize_rewards': True,
    'nn_type': 'sigmoid',
    'state_dim': 60, # 10일 X 6개 특징
    'action_dim': 1,
    'sim_day_max': 30,
    'kospi_path': 'data/RL/kospi_rl.csv',
    'leverage_path': 'data/RL/leverage_rl.csv',
    'inverse_path': 'data/RL/inverse_rl.csv',
    'lstm_path': 'lstm_models/best_lstm_stock_1e-3.pth',
    'initial_cash': 1000000,
    'state_window': 10
}

In [11]:
train(conf)

 16%|█▌        | 158/1000 [00:09<00:35, 23.95it/s]

Max Time


 17%|█▋        | 172/1000 [00:10<00:53, 15.57it/s]

Max Time


 20%|█▉        | 195/1000 [00:12<00:49, 16.25it/s]

Max Time


 22%|██▏       | 217/1000 [00:14<01:04, 12.14it/s]

Max Time


 26%|██▋       | 263/1000 [00:19<01:41,  7.25it/s]

Max Time


 32%|███▏      | 317/1000 [00:26<01:51,  6.14it/s]

Max Time


 42%|████▏     | 419/1000 [00:39<01:11,  8.07it/s]

Max Time


 42%|████▏     | 424/1000 [00:41<01:49,  5.25it/s]

Max Time


 45%|████▌     | 451/1000 [00:43<00:54,  9.98it/s]

Max Time


 45%|████▌     | 453/1000 [00:45<02:29,  3.67it/s]

Max Time


 46%|████▌     | 456/1000 [00:45<01:54,  4.74it/s]

Max Time


 46%|████▌     | 460/1000 [00:46<01:20,  6.69it/s]

Max Time


 47%|████▋     | 474/1000 [00:48<00:58,  8.93it/s]

Max Time


 49%|████▊     | 486/1000 [00:50<00:50, 10.09it/s]

Max Time


 50%|████▉     | 496/1000 [00:53<01:30,  5.57it/s]

Max Time


 51%|█████▏    | 513/1000 [00:55<00:56,  8.56it/s]

Max Time


 55%|█████▌    | 553/1000 [01:01<01:18,  5.69it/s]

Max Time


 59%|█████▉    | 588/1000 [01:05<00:36, 11.19it/s]

Max Time


 60%|█████▉    | 595/1000 [01:06<00:52,  7.68it/s]

Max Time


 60%|██████    | 601/1000 [01:06<00:36, 10.89it/s]

Max Time
Max Time


 61%|██████▏   | 614/1000 [01:08<00:31, 12.30it/s]

Max Time


 62%|██████▏   | 621/1000 [01:09<00:44,  8.46it/s]

Max Time
Max Time


 64%|██████▎   | 636/1000 [01:10<00:36, 10.10it/s]

Max Time


 66%|██████▌   | 656/1000 [01:13<00:50,  6.86it/s]

Max Time
Max Time


 69%|██████▉   | 693/1000 [01:16<00:25, 12.00it/s]

Max Time


 70%|██████▉   | 696/1000 [01:16<00:23, 12.97it/s]

Max Time


 70%|██████▉   | 698/1000 [01:17<00:50,  6.00it/s]

Max Time


 72%|███████▏  | 715/1000 [01:19<00:25, 11.40it/s]

Max Time


 72%|███████▏  | 722/1000 [01:20<00:41,  6.76it/s]

Max Time
Max Time


 73%|███████▎  | 728/1000 [01:21<00:24, 11.16it/s]

Max Time


 74%|███████▍  | 741/1000 [01:22<00:24, 10.50it/s]

Max Time


 77%|███████▋  | 772/1000 [01:25<00:18, 12.38it/s]

Max Time


 78%|███████▊  | 777/1000 [01:26<00:30,  7.37it/s]

Max Time


 78%|███████▊  | 781/1000 [01:27<00:23,  9.13it/s]

Max Time
Max Time


 78%|███████▊  | 783/1000 [01:27<00:21, 10.12it/s]

Max Time


 81%|████████  | 811/1000 [01:30<00:14, 12.68it/s]

Max Time


 82%|████████▏ | 820/1000 [01:31<00:17, 10.05it/s]

Max Time


 84%|████████▍ | 838/1000 [01:33<00:12, 12.90it/s]

Max Time


 85%|████████▌ | 852/1000 [01:34<00:12, 12.16it/s]

Max Time


 87%|████████▋ | 868/1000 [01:36<00:10, 12.40it/s]

Max Time


 90%|████████▉ | 895/1000 [01:39<00:07, 14.06it/s]

Max Time
Max Time


 90%|█████████ | 900/1000 [01:40<00:12,  7.74it/s]

Max Time


 91%|█████████ | 908/1000 [01:40<00:06, 14.21it/s]

Max Time


 91%|█████████▏| 914/1000 [01:41<00:10,  8.54it/s]

Max Time


 92%|█████████▏| 921/1000 [01:41<00:06, 12.73it/s]

Max Time


 93%|█████████▎| 928/1000 [01:43<00:08,  8.74it/s]

Max Time


 94%|█████████▍| 943/1000 [01:44<00:06,  9.40it/s]

Max Time


 96%|█████████▌| 956/1000 [01:46<00:04,  9.72it/s]

Max Time
Max Time


 96%|█████████▌| 959/1000 [01:46<00:03, 11.31it/s]

Max Time


 96%|█████████▋| 964/1000 [01:47<00:04,  7.52it/s]

Max Time


 98%|█████████▊| 984/1000 [01:49<00:01, 12.14it/s]

Max Time
Max Time


100%|██████████| 1000/1000 [01:50<00:00,  9.05it/s]
