This code is unfinished: it is a half-completed attempt at the actor-critic structure described here:
https://cloud.tencent.com/developer/article/1635793

## 環境初始化
新增 [gym-anytrading](https://github.com/AminHP/gym-anytrading) 套件，當作本次任務的環境。

引入需要的套件，並設定隨機參數種子。

In [None]:
import pandas as pd
import os
from tqdm.notebook import tqdm
from torch.distributions import Categorical
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import torch
import numpy as np
import matplotlib.pyplot as plt
from gym_anytrading.envs import StocksEnv

%matplotlib inline


# Seed both torch and numpy with the same value
SEED = 1234
for seed_fn in (torch.manual_seed, np.random.seed):
    seed_fn(SEED)


## 使用自訂資料集
本次作業同樣使用台積電 (2330) 2010/1月~2022/10月資料作為使用的內容。

In [None]:
# Load the stock CSV into a DataFrame; the bare name on the last line
# makes the notebook display it.
DATASET_PATH = './CE6020_hw2_resource/2330_stock.csv'
STOCKS_TSMC = pd.read_csv(DATASET_PATH)
STOCKS_TSMC


將台積電資料輸入股票環境，並設定本次環境範圍與輸入天數資料 (欄位內容、天數等)。


In [None]:
def my_process_data(env):
    """Build the price series and the feature matrix for a stocks environment.

    Reads ``env.df``, ``env.frame_bound`` and ``env.window_size``. Rows are
    taken from ``frame_bound[0] - window_size`` (inclusive) up to
    ``frame_bound[1]`` (exclusive).

    Returns:
        A ``(prices, signal_features)`` tuple of numpy arrays: ``prices`` is
        the 'Low' column, ``signal_features`` holds the feature columns.
    """
    lower_bound, upper_bound = env.frame_bound[0], env.frame_bound[1]
    rows = slice(lower_bound - env.window_size, upper_bound)

    # Change this list to choose which columns are used as features
    feature_columns = ['Close', 'Open']

    low_prices = env.df.loc[:, 'Low'].to_numpy()
    feature_matrix = env.df.loc[:, feature_columns].to_numpy()
    return low_prices[rows], feature_matrix[rows]


class MyStocksEnv(StocksEnv):
    # StocksEnv subclass that swaps in my_process_data as its _process_data.
    # Apart from _process_data, no other class function may be overridden.
    _process_data = my_process_data


# window_size: how many days of data are visible as input
# frame_bound: the range of the data to use; it may be changed to learn
# from a different part of the data
# The window_size value used here must not be changed
WINDOW_SIZE = 10
TRAIN_FRAME_BOUND = (1000, 1500)
env = MyStocksEnv(
    df=STOCKS_TSMC,
    window_size=WINDOW_SIZE,
    frame_bound=TRAIN_FRAME_BOUND,
)


檢視環境參數

In [None]:
# Print a short summary of the environment, then reset and render it
env_summary = (
    ("shape", env.shape),
    ("df.shape", env.df.shape),
    ("prices.shape", env.prices.shape),
    ("signal_features.shape", env.signal_features.shape),
    ("max_possible_profit", env.max_possible_profit()),
)
print("env information:")
for label, value in env_summary:
    print(f"> {label}:", value)

env.reset()
env.render()


In [None]:
# Play one episode with randomly sampled actions, then plot the result
observation = env.reset()
done = False
while not done:
    action = env.action_space.sample()
    # Author's notes on the step() outputs:
    #   observation: 10 days of stock data * 2 features
    #   reward: the amount earned
    #   done: bool
    #   info: reward, profit and position
    observation, reward, done, info = env.step(action)
print(info)

plt.cla()
env.render_all()
plt.show()


## Actor-Critic


輸入是 20-dim (10天*2欄位)，輸出則是離散的二個動作之一 (賣=0 或 買=1)

In [None]:
from torch.distributions import MultivariateNormal
class Actor(nn.Module):
    """Policy network: maps a state vector to action probabilities.

    Two linear layers with a tanh in between, followed by a softmax over
    the last dimension.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=16):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        """Return the probability of each action for state ``x``."""
        hidden = self.fc1(x).tanh()
        logits = self.fc2(hidden)
        return logits.softmax(dim=-1)

class Critic(nn.Module):
    """Value network: maps a state vector to a single score.

    Two linear layers with a ReLU in between; the output has size 1 on the
    last dimension.
    """

    def __init__(self, state_dim, hidden_dim=16):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return the score of state ``x``."""
        hidden = self.fc1(x).relu()
        return self.fc2(hidden)

class ACAgent():
    """One-step actor-critic agent for a discrete action space.

    Holds an ``Actor`` (policy) and a ``Critic`` (state value), each with its
    own RMSprop optimizer. ``critic_learn`` produces a signed TD error that
    ``actor_learn`` uses to weight the log-probability of the taken action.
    """

    def __init__(self, state_dim, action_dim, lr=0.001, gamma=0.5):
        """Create the two networks and their optimizers.

        Args:
            state_dim: Size of the flattened state vector.
            action_dim: Number of discrete actions.
            lr: Learning rate shared by both optimizers.
            gamma: Discount factor applied to the next state's score.
        """
        self.actor = Actor(state_dim, action_dim)
        self.critic = Critic(state_dim)
        self.actor_opt = optim.RMSprop(self.actor.parameters(), lr=lr)
        self.critic_opt = optim.RMSprop(self.critic.parameters(), lr=lr)
        self.gamma = gamma

    def sample_action(self, state):
        """Sample an action from the actor's distribution for ``state``.

        Returns:
            ``(action, log_prob)``: the action as a Python int and the
            log-probability tensor of that action (still attached to the
            actor's graph so it can be used by ``actor_learn``).
        """
        action_prob = self.actor(state)
        action_dist = Categorical(action_prob)
        action = action_dist.sample()
        log_prob = action_dist.log_prob(action)
        return action.detach().item(), log_prob

    def evaluate_score(self, state):
        """Return the critic's score for ``state``."""
        return self.critic(state)

    def actor_learn(self, td_error, log_prob):
        """Take one policy-gradient step on the actor.

        Args:
            td_error: TD error returned by ``critic_learn``; treated as a
                constant weight (no gradient flows through it).
            log_prob: Log-probability returned by ``sample_action``.

        Returns:
            The actor loss tensor.
        """
        advantage = torch.as_tensor(td_error, dtype=log_prob.dtype).detach()
        # The optimizer minimises, so negate: a positive TD error must RAISE
        # the probability of the action that was taken.
        loss = -torch.mean(log_prob * advantage)
        self.actor_opt.zero_grad()
        loss.backward()
        self.actor_opt.step()
        return loss

    def critic_learn(self, reward, state, next_state, done=False):
        """Take one TD(0) step on the critic.

        Args:
            reward: Reward received for the transition.
            state: State before the action.
            next_state: State after the action.
            done: Scalar flag, True when the transition ended the episode.
                Terminal transitions do not bootstrap from ``next_state``.

        Returns:
            The signed TD error ``reward + gamma * V(next_state) - V(state)``
            (without the bootstrap term when ``done``), detached from the
            graph.
        """
        # Bootstrap only while the episode is still running.
        not_done = 0.0 if done else 1.0
        # The target is held fixed (semi-gradient TD): only V(state) is
        # differentiated.
        with torch.no_grad():
            target = reward + not_done * self.gamma * self.evaluate_score(next_state)
        td_error = target - self.evaluate_score(state)
        loss = torch.square(td_error).mean()
        self.critic_opt.zero_grad()
        loss.backward()
        self.critic_opt.step()
        # Return the signed error, not the squared loss, so the actor can
        # tell better-than-expected from worse-than-expected outcomes.
        return td_error.detach()

    def load_ckpt(self, ckpt_path):
        """Restore both networks and optimizers from ``ckpt_path``.

        If the file does not exist a message is printed and the current
        parameters are left untouched.
        """
        if os.path.exists(ckpt_path):
            checkpoint = torch.load(ckpt_path)
            self.actor.load_state_dict(checkpoint['actor_state_dict'])
            self.critic.load_state_dict(checkpoint['critic_state_dict'])
            self.actor_opt.load_state_dict(checkpoint['actor_optimizer_state_dict'])
            self.critic_opt.load_state_dict(checkpoint['critic_optimizer_state_dict'])
        else:
            print("Checkpoint file not found, use default settings")

    def save_ckpt(self, ckpt_path):
        """Write both networks' and optimizers' state dicts to ``ckpt_path``."""
        torch.save({
            'actor_state_dict': self.actor.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'actor_optimizer_state_dict': self.actor_opt.state_dict(),
            'critic_optimizer_state_dict': self.critic_opt.state_dict(),
        }, ckpt_path)


再來，搭建一個簡單的 agent (`ACAgent`)，並搭配上方的 actor 與 critic network 來採取行動。
這個 agent 能做到以下幾件事：
- `critic_learn()`：利用 reward、目前的 state 與下一個 state 來更新 critic network，並回傳 TD error。
- `actor_learn()`：利用 TD error 及 log probability 來更新 actor network。
- `sample_action()`：從 environment 得到 observation 之後，利用 actor network 得出應該採取的行動。
而此函式除了回傳抽樣出來的 action，也會回傳此次抽樣的 log probability。

最後，建立一個 network 和 agent，就可以開始進行訓練了。

In [None]:
# The networks take the flattened observation, so the state size is the
# product of the two observation dimensions; the agent gets 2 actions.
state_dim = env.shape[0] * env.shape[1]
agent = ACAgent(state_dim, 2)


## 訓練 Agent

現在我們開始訓練 agent。
透過讓 agent 和 environment 互動，我們記住每一組對應的 log probabilities 及 reward，並在資料日期區間結束後，回放這些「記憶」來訓練 policy network。

In [None]:
EPISODE_PER_BATCH = 5  # episodes played per progress-bar batch
NUM_BATCH = 10     # number of batches (EPISODE_PER_BATCH * NUM_BATCH episodes in total)
CHECKPOINT_PATH = './model.ckpt'  # where the agent checkpoint is saved

avg_total_rewards = []

# Make sure both networks are in training mode before training
agent.actor.train()
agent.critic.train()

prg_bar = tqdm(range(NUM_BATCH), miniters=1)
for batch in prg_bar:
    total_rewards = []
    for episode in range(EPISODE_PER_BATCH):
        # The networks expect a flat float tensor
        state = torch.FloatTensor(env.reset()).flatten()
        done = False
        while not done:
            action, log_prob = agent.sample_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = torch.FloatTensor(next_state).flatten()

            # Online update on every step: the critic is updated first and
            # hands back the TD error, which then drives the actor update.
            # `done` must be passed so the critic knows when the episode ended.
            td_error = agent.critic_learn(reward, state, next_state, done)
            agent.actor_learn(td_error, log_prob)

            state = next_state
        total_rewards.append(info['total_reward'])

    # Record training progress for this batch
    avg_total_reward = sum(total_rewards) / len(total_rewards)
    avg_total_rewards.append(avg_total_reward)
    prg_bar.set_description(f"Average Reward: {avg_total_reward: 04.2f}, Final Reward: {info['total_reward']: 04.2f}, Final Profit: {info['total_profit']: 04.2f}")

# Save the agent's parameters
agent.save_ckpt(CHECKPOINT_PATH)


### 訓練結果

訓練過程中，我們持續記下了 `avg_total_reward`，這個數值代表的是：每次更新 policy network 前，我們讓 agent 玩數個回合（episodes），而這些回合的平均 total rewards 為何。
理論上，若是 agent 一直在進步，則所得到的 `avg_total_reward` 也會持續上升。
若將其畫出來則結果如下：

In [None]:
# Plot the avg_total_rewards values collected during training
plt.plot(avg_total_rewards)
plt.title("Total Rewards")
plt.show()

In [None]:
# Clear the current axes, then draw the training environment via render_all()
plt.cla()
env.render_all()
plt.show()

## 測試
在這邊我們替換環境使用的資料日期區間，並使用讀取紀錄點的方式來執行測試。

In [None]:
# Evaluate on a different slice of the data than the one used for training
testenv = MyStocksEnv(df=STOCKS_TSMC, window_size=10, frame_bound=(2000, 2500))

# Build an agent with the same layout as the trained one so that the
# checkpoint's state dicts fit
test_agent = ACAgent(testenv.shape[0] * testenv.shape[1], 2)

checkpoint_path = './model.ckpt'

test_agent.load_ckpt(checkpoint_path)
# Switch both networks to evaluation mode before testing
test_agent.actor.eval()
test_agent.critic.eval()

observation = testenv.reset()
done = False
# No parameters are updated here, so gradients are not needed
with torch.no_grad():
    while not done:
        # The actor expects a flat float tensor, as in training
        state = torch.FloatTensor(observation).flatten()
        action, _ = test_agent.sample_action(state)
        observation, reward, done, info = testenv.step(action)

plt.cla()
testenv.render_all()
plt.show()
# green buy, red sell
