In [1]:
import torch
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [2]:
import torch
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())

2.5.1+cu121
12.1
True



# torch.cuda.is_available()如果是False
# 那就是cuda版本與安裝的pytorch不符合
# pip install torch只會安裝CPU版本的pytorch
# 要在anaconda prompt輸入以下的指令，才能成功安裝正確版本且是GPU的pytorch
# pip install torch --index-url https://download.pytorch.org/whl/cu121

In [3]:
class MarketGuidedGating(nn.Module):
    def __init__(self, market_dim, feature_dim, beta=5):
        super().__init__()
        self.fc = nn.Linear(market_dim, feature_dim)
        self.beta = beta
        self.feature_dim = feature_dim

    def forward(self, x, m):
        alpha = self.feature_dim * F.softmax(self.fc(m) / self.beta, dim=-1)
        return x * alpha  # Hadamard product

"""
class IntraStockEncoder(nn.Module):
    def __init__(self, feature_dim, embed_dim, nhead):
        super().__init__()
        self.input_proj = nn.Linear(feature_dim, embed_dim)
        self.pos_encoder = nn.Parameter(torch.randn(1, 1, embed_dim))  # Learnable positional encoding
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=nhead)
        self.transformer = nn.TransformerEncoder(self.encoder_layer, num_layers=1)

    def forward(self, x):  # x: (batch, time, feature)
        x = self.input_proj(x) + self.pos_encoder
        x = self.transformer(x.transpose(0, 1)).transpose(0, 1)
        return x  # (batch, time, embed_dim)
"""

class IntraStockEncoder(nn.Module):  # MAX_LEN屬於 time step 系列
    def __init__(self, feature_dim, embed_dim=256, nhead=4, max_len=60): #檢查 embed_dim=256, nhead=4, max_len=60是否正確
        super().__init__()
        self.input_proj = nn.Linear(feature_dim, embed_dim)
        self.register_buffer('pos_encoder', self._get_sinusoid_encoding_table(max_len, embed_dim))  # fixed encoding
        self.layer_norm = nn.LayerNorm(embed_dim)  # LN(f(x) + p)
        #self.encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=nhead)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=nhead, batch_first=True)
        self.transformer = nn.TransformerEncoder(self.encoder_layer, num_layers=1)

    def forward(self, x):  # x: (batch, time, extract_proj_feature) extract_proj_feature = feature_dim (for example : Alpha 158)
        batch, time, _ = x.shape
        x = self.input_proj(x)                         # → (batch, time, embed_dim)
        x = x + self.pos_encoder[:time, :].unsqueeze(0)  # broadcast: [1, time, embed_dim]
        x = self.layer_norm(x)
        #x = self.transformer(x.transpose(0, 1)).transpose(0, 1)
        x = self.transformer(x) # 這裡的輸出是(batch, time, embed_dim) 同一個batch 以及同一支股票 就是h_{u,t} dim=256
        return x  # (batch, time, embed_dim)

    def _get_sinusoid_encoding_table(self, seq_len, d_model): # seq_len = MAX_LEN, d_model = embed_dim
        position = torch.arange(seq_len, dtype=torch.float).unsqueeze(1)  # [seq_len, 1]
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))  # [d_model//2]
        pe = torch.zeros(seq_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term) #要小心對應到的奇偶數不同會報錯，但不會報錯的原因是embed_dim = 256，是偶數
        return pe  # [seq_len, d_model] that is to say [seq_len, embed_dim] or more precisely []


"""
class InterStockAggregator(nn.Module):
    def __init__(self, embed_dim, nhead):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=nhead, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, embed_dim),
        )

    def forward(self, x):  # x: (batch, stocks, time, embed_dim)
        batch, stocks, time, embed_dim = x.shape
        out = []
        for t in range(time):
            xt = x[:, :, t, :]  # (batch, stocks, embed_dim)
            attn_out, _ = self.attn(xt, xt, xt)
            out.append(self.ffn(attn_out))
        return torch.stack(out, dim=2)  # (batch, stocks, time, embed_dim)
"""


class InterStockAggregator(nn.Module):
    def __init__(self, embed_dim=256, nhead=2): #數字要再確認一下
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=nhead, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, 4*embed_dim),
            nn.ReLU(),
            nn.Linear(4*embed_dim, embed_dim),
        )
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):  # x: (batch, stocks, time, embed_dim)
        batch, stocks, time, embed_dim = x.shape

        # 變形為 (batch * time, stocks, embed_dim)，將時間軸攤平成 batch 維
        # x.permute(0, 2, 1, 3) 變成 (batch, time, stocks, embed_dim)
        x_reshaped = x.permute(0, 2, 1, 3).reshape(batch * time, stocks, embed_dim)

        # MultiheadAttention: 每個時間點的所有股票做 MHA
        # Self attention with residual + LN
        attn_out, _ = self.attn(x_reshaped, x_reshaped, x_reshaped)  # (batch * time, stocks, embed_dim)
        x_attn = self.norm1(attn_out + x_reshaped)

        # Feed Forward with residual + LN
        ffn_out = self.ffn(x_attn)
        out = self.norm2(ffn_out + x_attn)
        

        # 還原形狀為 (batch, stocks, time, embed_dim)
        out = out.view(batch, time, stocks, embed_dim).permute(0, 2, 1, 3)

        return out  # (batch, stocks, time, embed_dim)



class TemporalAggregator(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.w_lambda = nn.Parameter(torch.randn(embed_dim, embed_dim))

    def forward(self, x):  # x: (batch, stocks, time, embed_dim)
        query  = x[:, :, -1, :]  # (batch, stocks, embed_dim)
        scores = torch.einsum('bstf,fd,bsd->bst', x, self.w_lambda, query)
        weights = F.softmax(scores, dim=2)  # (batch, stocks, time)
        output = torch.einsum('bst,bstf->bsf', weights, x)
        return output  # (batch, stocks, embed_dim)

"""
class MASTER(nn.Module):
    def __init__(self, market_dim, feature_dim, embed_dim=256, nhead1=4, nhead2=2, beta=5):
        super().__init__()
        self.gating = MarketGuidedGating(market_dim, feature_dim, beta)
        self.intra_encoder = IntraStockEncoder(feature_dim, embed_dim, nhead1)
        self.inter_agg = InterStockAggregator(embed_dim, nhead2)
        self.temporal_agg = TemporalAggregator(embed_dim)
        self.predictor = nn.Linear(embed_dim, 1)

    def forward(self, x, market):
        # x: (batch, stocks, time, features), market: (batch, market_features)
        batch, stocks, time, features = x.shape
        market_scaled = self.gating(x.view(-1, features), market.repeat_interleave(stocks * time, dim=0))
        x_rescaled = market_scaled.view(batch, stocks, time, features)

        local_embed = []
        for i in range(stocks):
            local_embed.append(self.intra_encoder(x_rescaled[:, i, :, :]))
        local_embed = torch.stack(local_embed, dim=1)  # (batch, stocks, time, embed_dim)

        inter_embed = self.inter_agg(local_embed)  # (batch, stocks, time, embed_dim)
        temporal_embed = self.temporal_agg(inter_embed)  # (batch, stocks, embed_dim)

        out = self.predictor(temporal_embed).squeeze(-1)  # (batch, stocks)
        return out
"""

class MASTER(nn.Module):
    def __init__(self, market_dim, feature_dim, embed_dim=256, nhead1=4, nhead2=2, beta=5):
        super().__init__()
        self.gating = MarketGuidedGating(market_dim, feature_dim, beta)
        self.intra_encoder = IntraStockEncoder(feature_dim, embed_dim, nhead1)
        self.inter_agg = InterStockAggregator(embed_dim, nhead2)
        self.temporal_agg = TemporalAggregator(embed_dim)
        self.predictor = nn.Linear(embed_dim, 1)

    def forward(self, x, market):
        # x: (batch, stocks, time, features), market: (batch, market_features)
        batch, stocks, time, features = x.shape

        # ====== Gating ======
        #market_expanded = market.unsqueeze(1).unsqueeze(2).expand(-1, stocks, time, -1)  # (batch, stocks, time, market_features)
        market_expanded = market[ : , None, None, : ].expand(-1, stocks, time, -1)
        #print(f"market expanded shape = {market_expanded.shape}")
        #print(f"x = {x.shape}")
        market_scaled = self.gating(x, market_expanded)  # broadcasting-wise multiplication

        # ====== Intra-Stock Encoder ======
        print(f"market_scaled shape = {market_scaled.shape}")
        x_flat = market_scaled.view(batch * stocks, time, features)  # (batch*stocks, time, features)
        print(f"x_flat shape = {x_flat.shape}")
        local_embed = self.intra_encoder(x_flat)  # → (batch*stocks, time, embed_dim)
        local_embed = local_embed.view(batch, stocks, time, -1)  # → (batch, stocks, time, embed_dim)

        # ====== Inter-Stock Aggregation ======
        inter_embed = self.inter_agg(local_embed)  # (batch, stocks, time, embed_dim)

        # ====== Temporal Aggregation ======
        temporal_embed = self.temporal_agg(inter_embed)  # (batch, stocks, embed_dim)
        print(f"temporal shape = {temporal_embed.shape}")

        # ====== Prediction ======
        out = self.predictor(temporal_embed).squeeze(-1)  # (batch, stocks)
        print(self.predictor(temporal_embed).shape)
        return out


In [6]:
batch_size = 8
stocks = 1822
time = 8
feature_dim = 185
market_dim = 21
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
x = torch.randn(batch_size, stocks, time, feature_dim)
m = torch.randn(batch_size, market_dim)
model = MASTER(market_dim = market_dim, feature_dim = feature_dim)
x = x.to(device)
m = m.to(device)
model = model.to(device)
model(x, m).shape

market_scaled shape = torch.Size([8, 1822, 8, 185])
x_flat shape = torch.Size([14576, 8, 185])
temporal shape = torch.Size([8, 1822, 256])
torch.Size([8, 1822, 1])


torch.Size([8, 1822])

In [6]:
x.dtype

torch.float32

In [51]:
# 不直接使用nn.TransformerEncoder 但是MHA是現成的
import torch
import torch.nn as nn
import torch.nn.functional as F

class IntraStockEncoderManual(nn.Module):
    def __init__(self, feature_dim, embed_dim=256, nhead=4, max_len=60):
        super().__init__()
        self.input_proj = nn.Linear(feature_dim, embed_dim)
        self.register_buffer('pos_encoder', self._get_sinusoid_encoding_table(max_len, embed_dim))
        self.layer_norm1 = nn.LayerNorm(embed_dim)
        self.layer_norm2 = nn.LayerNorm(embed_dim)

        self.self_attn = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=nhead, batch_first=True)

        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, embed_dim)
        )

    def forward(self, x):  # x: (batch, time, feature_dim)
        batch, time, _ = x.shape

        x = self.input_proj(x)  # (batch, time, embed_dim)
        x = x + self.pos_encoder[:time, : ].unsqueeze(0)  # 加上位置編碼

        # === Self Attention over time ===
        residual = x
        attn_out, _ = self.self_attn(x, x, x)  # (batch, time, embed_dim)
        x = self.layer_norm1(attn_out + residual)  # 殘差 + LN

        # === Feed Forward Network ===
        residual = x
        x = self.ffn(x)
        x = self.layer_norm2(x + residual)  # 殘差 + LN

        return x  # (batch, time, embed_dim)

    def _get_sinusoid_encoding_table(self, seq_len, d_model):
        position = torch.arange(seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe = torch.zeros(seq_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        return pe  # (seq_len, d_model)

ISEM = IntraStockEncoderManual(158)
x = torch.randn(16, 60, 158)
ISEM(x).shape

torch.Size([16, 60, 256])

In [None]:
# 連 Multihead attention都是用手刻的(這是單頭的)

import torch
import torch.nn as nn
import torch.nn.functional as F

class ManualIntraStockEncoder(nn.Module):
    def __init__(self, feature_dim, embed_dim, nhead):
        super().__init__()
        self.input_proj = nn.Linear(feature_dim, embed_dim)

        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)

        self.attn_out_proj = nn.Linear(embed_dim, embed_dim)
        self.layernorm1 = nn.LayerNorm(embed_dim)

        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, embed_dim * 4),
            nn.ReLU(),
            nn.Linear(embed_dim * 4, embed_dim)
        )
        self.layernorm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):  # x: (batch, time, feature_dim)
        x = self.input_proj(x)  # → (batch, time, embed_dim)

        # Q, K, V (batch, time, embed_dim)
        Q = self.q_proj(x)
        K = self.k_proj(x)
        V = self.v_proj(x)

        # 做 scaled dot-product attention（手動多頭的簡化版，單頭）
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (Q.size(-1) ** 0.5)  # (batch, time, time)
        attn_weights = F.softmax(attn_scores, dim=-1)
        attn_out = torch.matmul(attn_weights, V)  # (batch, time, embed_dim)

        # 殘差 + LayerNorm
        x = self.layernorm1(x + self.attn_out_proj(attn_out))

        # FFN + 殘差 + LayerNorm
        x = self.layernorm2(x + self.ffn(x))

        return x  # shape: (batch, time, embed_dim)


In [None]:
# 這是自己手刻的多頭
#這種是分成不同Q K V的，也可以直接使用一個大矩陣，再用chunk分成3等份

class ManualMultiHeadIntraStockEncoder(nn.Module):
    def __init__(self, feature_dim, embed_dim, nhead):
        super().__init__()
        assert embed_dim % nhead == 0, "embed_dim 必須可以被 nhead 整除"

        self.embed_dim = embed_dim
        self.nhead = nhead
        self.head_dim = embed_dim // nhead

        self.input_proj = nn.Linear(feature_dim, embed_dim)

        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)

        self.attn_out_proj = nn.Linear(embed_dim, embed_dim)
        self.layernorm1 = nn.LayerNorm(embed_dim)

        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, embed_dim * 4),
            nn.ReLU(),
            nn.Linear(embed_dim * 4, embed_dim)
        )
        self.layernorm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):  # x: (batch, time, feature_dim)
        B, T, _ = x.shape
        x = self.input_proj(x)  # → (B, T, embed_dim)

        # Q, K, V linear projection   (batch, time, embed_dim) -> (batch, time, nhead, head_dim) -> (batch, nhead, time, head_dim)
        Q = self.q_proj(x).view(B, T, self.nhead, self.head_dim).transpose(1, 2)  # (B, nhead, T, head_dim) dim1 和 dim2交換
        K = self.k_proj(x).view(B, T, self.nhead, self.head_dim).transpose(1, 2)
        V = self.v_proj(x).view(B, T, self.nhead, self.head_dim).transpose(1, 2)

        # Scaled Dot-Product Attention
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)  # (B, nhead, T, T)
        attn_weights = F.softmax(attn_scores, dim=-1)
        attn_out = torch.matmul(attn_weights, V)  # (B, nhead, T, head_dim)

        # 合併 heads
        attn_out = attn_out.transpose(1, 2).contiguous().view(B, T, self.embed_dim)  # → (B, T, embed_dim)

        # 殘差 + LayerNorm
        x = self.layernorm1(x + self.attn_out_proj(attn_out))

        # FFN + 殘差 + LayerNorm
        x = self.layernorm2(x + self.ffn(x))

        return x  # (batch, time, embed_dim)
