In [2]:
from teste import (
    TICKERS, NUM_ATIVOS, ret, r_media, v_media, dd,
    regimes, regime_ids, cluster_ids,
    discretizar_estado_financeiro,
    calcular_recompensa_portfolio,
    aplicar_acao_portfolio
)

import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np



class PolicyMLP(nn.Module):
    """Feed-forward policy network mapping a state vector to action probabilities.

    Architecture: Linear(state_dim, 128) -> ReLU -> Linear(128, 128) -> ReLU
    -> Linear(128, num_actions), followed by a softmax over the action axis.

    Args:
        state_dim: Number of input features per state.
        num_actions: Number of discrete actions (size of the output distribution).
    """

    def __init__(self, state_dim, num_actions):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, num_actions)
        )

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: Float tensor whose last dimension has size ``state_dim``; either a
                single state ``(state_dim,)`` or a batch ``(batch, state_dim)``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last dimension
            of size ``num_actions``; values along the last dimension sum to 1.
        """
        logits = self.net(x)
        # Normalise over the last (action) axis instead of a hard-coded dim=1:
        # identical for 2-D batched input, and also valid for an unbatched 1-D state
        # (where dim=1 would raise "Dimension out of range").
        return torch.softmax(logits, dim=-1)




In [3]:
# --- Hyper-parameters and model construction ---------------------------------
# Number of discrete actions available per asset; what each action does is
# decided by aplicar_acao_portfolio (imported from `teste`).
AÇOES_POR_ATIVO = 2
# Input width of the policy network. It has to equal the feature count returned
# by discretizar_estado_financeiro.
# NOTE(review): presumably NUM_ATIVOS weights + 3 scalar features + NUM_ATIVOS
# cluster ids -- confirm against discretizar_estado_financeiro.
state_dim = NUM_ATIVOS + 3 + NUM_ATIVOS
num_actions = NUM_ATIVOS * AÇOES_POR_ATIVO

taxa_aprendizado = 1e-4

# Seed the RNGs *before* the network is built so that weight initialisation and
# the action sampling done later are reproducible under Restart & Run All.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

policy_net_final = PolicyMLP(state_dim, num_actions)
optimizer_final = optim.Adam(policy_net_final.parameters(), lr=taxa_aprendizado)


# --- Shape smoke test ---------------------------------------------------------
# Build the state for the first time step from an equal-weight portfolio and run
# it once through the untrained policy.
pesos = np.ones(NUM_ATIVOS) / NUM_ATIVOS
t = 0
state = discretizar_estado_financeiro(
    pesos, regime_ids[t], cluster_ids, r_media.iloc[t], v_media.iloc[t], dd.iloc[t]
)
print("state shape:", state.shape)
# Fail early with a readable message instead of a matmul shape error inside the net.
assert state.shape[-1] == state_dim, (
    f"state has {state.shape[-1]} features, expected state_dim={state_dim}"
)

# Inference only: no autograd graph is needed for a shape check.
with torch.no_grad():
    probs = policy_net_final(state)
print("action_probs shape:", probs.shape)
print("sum probs:", probs.sum().item())
assert probs.shape[-1] == num_actions, (
    f"policy returned {probs.shape[-1]} actions, expected num_actions={num_actions}"
)

state shape: torch.Size([1, 37])
action_probs shape: torch.Size([1, 34])
sum probs: 1.0


In [4]:
# Online policy-gradient training: one sampled action and one optimizer step per
# visited row of `ret`, starting from an equal-weight portfolio. Each "episode"
# here is a single time step.
# The loop visits rows 0 .. len(ret) - 2, so the last row of `ret` is never used.
# NOTE(review): the state features and the reward are both read from the same
# row t; if r_media / v_media / dd at row t already include ret at row t this is
# a look-ahead -- confirm how those series are aligned in `teste`.
episodios = len(ret) - 1
pesos = np.full(NUM_ATIVOS, 1.0 / NUM_ATIVOS)

# Fixed reward coefficients passed to calcular_recompensa_portfolio on every step.
coeficientes_recompensa = {"lambda_dd": 0.2, "lambda_tc": 0.001}

for ep in range(episodios):
    t = ep

    # Inputs for step t.
    regime_id = regime_ids[t]
    r_dia_media, v_dia_media = r_media.iloc[t], v_media.iloc[t]
    drawdown_dia = dd.iloc[t]
    retornos_dia = ret.iloc[t].values

    # Clear gradients left over from the previous step before building a new graph.
    optimizer_final.zero_grad()

    state = discretizar_estado_financeiro(
        pesos, regime_id, cluster_ids, r_dia_media, v_dia_media, drawdown_dia
    )

    # Sample one action from the current policy and keep its log-probability.
    action_probs = policy_net_final(state)
    distribuicao = Categorical(probs=action_probs)
    amostra = distribuicao.sample()
    action = amostra.item()
    log_prob = distribuicao.log_prob(amostra)

    novos_pesos = aplicar_acao_portfolio(pesos, action)

    recompensa = calcular_recompensa_portfolio(
        pesos_antigos=pesos,
        pesos_novos=novos_pesos,
        retornos_dia=retornos_dia,
        drawdown_dia=drawdown_dia,
        **coeficientes_recompensa,
    )

    # Single-step policy-gradient loss: minimise -(log pi(a|s) * reward).
    loss = -(log_prob * recompensa)
    loss.backward()
    optimizer_final.step()

    # The rebalanced weights become the starting portfolio of the next step.
    pesos = novos_pesos

    if not ep % 200:
        print(f"Ep {ep} | reward {recompensa:.5f} | regime {regimes[t]}")


Ep 0 | reward -0.00004 | regime neutro
Ep 200 | reward -0.00888 | regime neutro
Ep 400 | reward -0.01709 | regime alta_vol
Ep 600 | reward 0.00637 | regime bull
Ep 800 | reward 0.00219 | regime bull
Ep 1000 | reward 0.00022 | regime bull
Ep 1200 | reward -0.00145 | regime alta_vol
Ep 1400 | reward 0.00597 | regime alta_vol
Ep 1600 | reward -0.00643 | regime alta_vol
Ep 1800 | reward 0.00216 | regime neutro
Ep 2000 | reward 0.00373 | regime alta_vol
Ep 2200 | reward 0.01910 | regime bull
Ep 2400 | reward 0.00657 | regime alta_vol
Ep 2600 | reward -0.00671 | regime alta_vol
Ep 2800 | reward -0.01313 | regime bear
Ep 3000 | reward -0.02348 | regime bear
Ep 3200 | reward 0.00250 | regime bear
Ep 3400 | reward 0.00162 | regime alta_vol
Ep 3600 | reward 0.00824 | regime bull
Ep 3800 | reward -0.00233 | regime alta_vol
