In [36]:
import ast
import logging

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import RobustScaler, PowerTransformer

In [37]:
# Load the merged training data from CSV (the path is relative to the notebook's working directory).
train_data = pd.read_csv('trainning_data_merged.csv')


In [38]:
# Keep only the first five rows as a small sample for trying out the preprocessing steps below.
training_data_mock = train_data.head()
training_data_mock

Unnamed: 0,contract_calendar_days,expected_trading_days,actual_trading_days,start_date,start_price,price_series,volatility,risk_free_rate,ticker,country,asset_underlying
0,30,20,22,2015-01-14,6139.29,"[6139.29, 6168.66, 6252.08, 6109.29, 6336.63, ...",0.212546,0.04799,852,China,CSI1000
1,30,20,21,2015-01-15,6168.66,"[6168.66, 6252.08, 6109.29, 6336.63, 6465.73, ...",0.212604,0.04797,852,China,CSI1000
2,30,20,20,2015-01-16,6252.08,"[6252.08, 6109.29, 6336.63, 6465.73, 6568.64, ...",0.210737,0.04793,852,China,CSI1000
3,30,20,21,2015-01-19,6109.29,"[6109.29, 6336.63, 6465.73, 6568.64, 6502.07, ...",0.210879,0.0479,852,China,CSI1000
4,30,20,20,2015-01-20,6336.63,"[6336.63, 6465.73, 6568.64, 6502.07, 6642.61, ...",0.212102,0.048465,852,China,CSI1000


In [52]:
import numpy as np

# Length of every stored price path. The column holds list literals as text, so it is
# parsed with ast.literal_eval instead of eval: literal_eval only accepts Python
# literals and therefore cannot execute code that might be embedded in the CSV.
lengths = train_data["price_series"].apply(lambda x: len(ast.literal_eval(x)))
p50 = np.percentile(lengths, 50)  # median
p80 = np.percentile(lengths, 80)
p90 = np.percentile(lengths, 90)

print("中位长度:", p50, " 80% 样本长度 <=", p80, " 90% 样本长度 <=", p90)

# Choose a length that covers 90% of the samples ...
L = int(p90)
# ... then round it UP to the next multiple of 8 to get a tidier number.
L = int(np.ceil(L / 8.0) * 8)
print("建议的序列长度 L =", L)

中位长度: 64.0  80% 样本长度 <= 243.0  90% 样本长度 <= 252.0
建议的序列长度 L = 256


In [None]:
# Inspect the raw `price_series` value of the first sample row.
# NOTE(review): this cell has no execution count and the saved output below ("205") does not
# look like a price-series string, so the output is probably stale — re-run to confirm.
training_data_mock['price_series'][0]

205

In [53]:

# Preprocessing settings shared by the helper functions below.
config = dict(
    # Divisor applied to the log returns when building the target sequences.
    volatility_scale=0.1,
    # Fixed length of every per-sample sequence (start marker + returns + zero padding).
    input_sequence_length=256,
    # Trading days per year; used to normalise the start price and the trading-day count.
    base_trading_days=252,
)

In [54]:


# ===================== 2. 简单的工具函数 =====================

def encode_to_ids(series):
    """
    Encode a column of strings/categories as integer ids 0, 1, 2, ...

    Values are compared by their string form, and ids are handed out in order of
    first appearance. Returns an int32 NumPy array with one id per element.
    """
    as_text = series.astype(str)
    ids = pd.factorize(as_text)[0]
    return ids.astype(np.int32)

# ===== 标准化相关：用 dict 存参数，而不是 class =====

def fit_scaler(X, outlier_threshold=3.0):
    """
    Fit the three-stage feature scaler and return the transformed training features.

    Stages, applied column-wise to a 2-D feature matrix:
      1. RobustScaler over all columns.
      2. Clip each column to mean +/- outlier_threshold * std (computed on the
         robust-scaled training data).
      3. One PowerTransformer (yeo-johnson, standardize=True) per column.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training features; converted to float32.
    outlier_threshold : float, default 3.0
        Half-width of the clipping interval in standard deviations.

    Returns
    -------
    X_out : ndarray
        The transformed features, same shape as X.
    scaler : dict
        Fitted state with keys "robust", "transformers", "n_features",
        "outlier_threshold", plus "clip_lower" / "clip_upper": the per-column
        clipping bounds learned here, so that transform_with_scaler can apply
        exactly the same clipping to new data.
    """
    X = np.asarray(X, dtype=np.float32)
    n_features = X.shape[1]

    # 1) RobustScaler first (robust to extreme values).
    robust = RobustScaler()
    X_r = robust.fit_transform(X)

    # 2) Clip extremes with the k-sigma rule and remember the bounds that were used.
    clip_lower = np.empty(n_features, dtype=X_r.dtype)
    clip_upper = np.empty(n_features, dtype=X_r.dtype)
    X_c = X_r.copy()
    for i in range(n_features):
        col = X_r[:, i]
        mean = col.mean()
        std = col.std()
        clip_lower[i] = mean - outlier_threshold * std
        clip_upper[i] = mean + outlier_threshold * std
        X_c[:, i] = np.clip(col, clip_lower[i], clip_upper[i])

    # 3) Fit an independent PowerTransformer on every column.
    transformers = []
    X_out = np.zeros_like(X_c)
    for i in range(n_features):
        pt = PowerTransformer(method="yeo-johnson", standardize=True)
        col = X_c[:, i].reshape(-1, 1)
        X_out[:, i] = pt.fit_transform(col).ravel()
        transformers.append(pt)

    scaler = {
        "robust": robust,
        "transformers": transformers,
        "n_features": n_features,
        "outlier_threshold": outlier_threshold,
        "clip_lower": clip_lower,
        "clip_upper": clip_upper,
    }
    return X_out, scaler


def transform_with_scaler(X, scaler):
    """
    Transform new features with a scaler dict produced by fit_scaler.

    The clipping step reuses the bounds learned at fit time ("clip_lower" /
    "clip_upper"), so a row is transformed the same way no matter which other
    rows are in the batch. Scaler dicts that lack those keys (created before the
    bounds were stored) fall back to the previous behaviour of computing the
    bounds from the batch being transformed.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Features to transform; converted to float32.
    scaler : dict
        Fitted state returned by fit_scaler.

    Returns
    -------
    ndarray
        The transformed features, same shape as X.
    """
    X = np.asarray(X, dtype=np.float32)
    robust = scaler["robust"]
    transformers = scaler["transformers"]
    n_features = scaler["n_features"]
    outlier_threshold = scaler["outlier_threshold"]
    clip_lower = scaler.get("clip_lower")
    clip_upper = scaler.get("clip_upper")
    has_fitted_bounds = clip_lower is not None and clip_upper is not None

    # 1) RobustScaler
    X_r = robust.transform(X)

    # 2) Clip extremes
    X_c = X_r.copy()
    for i in range(n_features):
        col = X_r[:, i]
        if has_fitted_bounds:
            lower, upper = clip_lower[i], clip_upper[i]
        else:
            # Legacy scaler without stored bounds: derive them from this batch.
            mean = col.mean()
            std = col.std()
            lower = mean - outlier_threshold * std
            upper = mean + outlier_threshold * std
        X_c[:, i] = np.clip(col, lower, upper)

    # 3) Per-column PowerTransformer
    X_out = np.zeros_like(X_c)
    for i in range(n_features):
        col = X_c[:, i].reshape(-1, 1)
        X_out[:, i] = transformers[i].transform(col).ravel()

    return X_out


def inverse_transform(X, scaler):
    """
    Map features from the standardised space back towards the original feature space.

    Undoes the per-column power transforms and then the robust scaling. The
    clipping step of the forward transform is not reversible and is not undone.
    (Used when turning a prediction back into a price path.)
    """
    X = np.asarray(X, dtype=np.float32)
    robust_step = scaler["robust"]
    power_steps = scaler["transformers"]
    n_cols = scaler["n_features"]

    # Invert the per-column power transforms into a buffer shaped like X.
    unpowered = np.zeros_like(X)
    for j in range(n_cols):
        column = X[:, j].reshape(-1, 1)
        restored = power_steps[j].inverse_transform(column)
        unpowered[:, j] = restored.ravel()

    # Then invert the robust scaling over the whole matrix.
    return robust_step.inverse_transform(unpowered)


# ===================== 3. 处理价格数据 =====================

def preprocess_price_data(df):
    """
    Parse the raw price columns into numeric form.

    Input: the raw DataFrame, which must contain at least
      - 'start_price'   the starting price
      - 'price_series'  the price path, e.g. the string "[100, 101, 102]"

    Output: a new DataFrame (the input is not modified) in which
      - 'start_price' has been converted to numeric (unparseable values become
        NaN) and renamed to 'S_0'
      - 'price_series' holds one float32 numpy array per row

    Raises ValueError / SyntaxError if a 'price_series' string is not a valid
    Python literal.
    """
    df = df.copy()

    # Convert the start price to a number and rename it to S_0.
    df["start_price"] = pd.to_numeric(df["start_price"], errors="coerce")
    df = df.rename(columns={"start_price": "S_0"})

    def parse_series(x):
        # Strings such as "[100, 101, 102]" are parsed with ast.literal_eval rather
        # than eval: the text comes from a data file, and literal_eval only accepts
        # Python literals, so it cannot execute code embedded in that file.
        if isinstance(x, str):
            arr = ast.literal_eval(x)
            return np.array(arr, dtype=np.float32)
        # Already a list/array: just convert.
        elif isinstance(x, (list, np.ndarray)):
            return np.array(x, dtype=np.float32)
        # Anything else (e.g. a missing value) becomes an empty path.
        else:
            return np.array([], dtype=np.float32)

    df["price_series"] = df["price_series"].apply(parse_series)

    return df


# ===================== 4. 把价格路径 → 收益率序列 + mask =====================

def build_sequences_and_masks(df, config):
    """
    Turn each row's price path into a fixed-length scaled log-return sequence.

    Adds two columns to a copy of df, both float32 arrays of length
    config["input_sequence_length"]:
      - transformed_sequence: slot 0 is a start marker (1.0), followed by the log
        returns divided by config["volatility_scale"], zero-padded or truncated
        to fit.
      - validity_mask: 1 where transformed_sequence holds real data (including
        the start marker), 0 where it is padding.
    """
    df = df.copy()
    seq_len = config["input_sequence_length"]
    vol_scale = float(config["volatility_scale"])

    def process_one(price_series):
        prices = np.asarray(price_series, dtype=np.float32)
        target = np.zeros(seq_len, dtype=np.float32)
        mask = np.zeros(seq_len, dtype=np.float32)

        # Slot 0 always carries the start marker, even when there are no returns.
        target[0] = 1.0
        mask[0] = 1.0

        if len(prices) >= 2:
            # Scaled log returns between consecutive prices.
            scaled = np.diff(np.log(prices)) / vol_scale
            n_keep = min(len(scaled), seq_len - 1)
            stop = 1 + n_keep
            target[1:stop] = scaled[:n_keep]
            mask[1:stop] = 1.0

        return target, mask

    pairs = df["price_series"].apply(process_one)
    df["transformed_sequence"] = pairs.apply(lambda pair: pair[0])
    df["validity_mask"] = pairs.apply(lambda pair: pair[1])

    return df


# ===================== 5. 构造条件特征矩阵 =====================

def build_condition_array(df, config, need_fit_scaler=True, scaler=None):
    """
    Build the (N, 7) matrix of conditioning features and standardise it.

    Columns, in order:
        0. S_0 / base_trading_days
        1. volatility
        2. risk_free_rate
        3. contract_calendar_days / 365
        4. (actual_trading_days / base_trading_days) / (contract_calendar_days / 365 + 1e-6)
        5. country id   ('country_id' column if present, else 'country' encoded, else 0)
        6. index id     ('index_id' column if present, else 'asset_underlying' encoded, else 0)

    Parameters
    ----------
    df : DataFrame with the columns referenced above.
    config : dict providing "base_trading_days".
    need_fit_scaler : bool, default True
        If True a new scaler is fitted on these features; otherwise `scaler` is
        applied as is.
    scaler : dict or None
        A fitted scaler from fit_scaler; required when need_fit_scaler is False.

    Returns
    -------
    cond_scaled : the standardised feature matrix.
    scaler : the scaler that was used (newly fitted, or the one passed in).

    Raises
    ------
    ValueError
        If need_fit_scaler is False but no scaler was supplied.
    """
    if not need_fit_scaler and scaler is None:
        raise ValueError("scaler must be provided when need_fit_scaler=False")

    df = df.copy()
    base_days = float(config["base_trading_days"])

    prices = df["S_0"].astype(float).values / base_days
    contract_days = df["contract_calendar_days"].astype(float).values / 365.0
    trading_days = df["actual_trading_days"].astype(float).values / base_days
    vol = df["volatility"].astype(float).values
    rf = df["risk_free_rate"].astype(float).values

    # NOTE(review): when ids are derived via encode_to_ids they depend on the order
    # in which values first appear in THIS frame, so they are only comparable
    # between two frames if both list the categories in the same order. Supply
    # explicit 'country_id' / 'index_id' columns to get a stable mapping.

    # Country id
    if "country_id" in df.columns:
        country_id = df["country_id"].astype(int).values
    elif "country" in df.columns:
        country_id = encode_to_ids(df["country"])
    else:
        country_id = np.zeros(len(df), dtype=np.int32)

    # Index id
    if "index_id" in df.columns:
        index_id = df["index_id"].astype(int).values
    elif "asset_underlying" in df.columns:
        index_id = encode_to_ids(df["asset_underlying"])
    else:
        index_id = np.zeros(len(df), dtype=np.int32)

    cond = np.column_stack([
        prices,
        vol,
        rf,
        contract_days,
        trading_days / (contract_days + 1e-6),  # epsilon guards against division by zero
        country_id,
        index_id,
    ]).astype(np.float32)

    if need_fit_scaler:
        cond_scaled, scaler = fit_scaler(cond, outlier_threshold=3.0)
    else:
        cond_scaled = transform_with_scaler(cond, scaler)

    return cond_scaled, scaler


# ===================== 6. 从预测结果还原价格路径（可选） =====================

def recover_price_path(x_sample, y_pred, scaler, config, mask=None):
    """
    Rebuild a price path from one standardised condition row and one return sequence.

    Parameters
    ----------
    x_sample : one row of the standardised condition matrix (cond_scaled).
    y_pred : one sequence, shape (seq_len,) or (1, seq_len). Slot 0 is the start
        marker; slots 1.. are log returns divided by config["volatility_scale"].
    scaler : the fitted scaler dict used to produce x_sample.
    config : dict providing "base_trading_days" and "volatility_scale".
    mask : optional validity mask with the same number of elements as y_pred.
        When given, exactly the return slots whose mask value is > 0 are used.
        When omitted, only TRAILING zeros are treated as padding and removed;
        zeros inside the path are kept as genuine flat steps (dropping them
        would shorten and misalign the recovered path). A real zero return at
        the very end cannot be told apart from padding without a mask.

    Returns
    -------
    ndarray (float32): the start price followed by one price per retained return.

    Raises
    ------
    ValueError
        If `mask` is given but its size differs from y_pred.

    Note: the start price is taken from inverse_transform, which cannot undo the
    clipping applied during scaling, so it is approximate for clipped rows.
    """
    x_sample = np.asarray(x_sample, dtype=np.float32).reshape(1, -1)

    # Undo the standardisation to get back to the pre-scaling feature space.
    x_unscaled = inverse_transform(x_sample, scaler)

    # Column 0 holds S_0 / base_trading_days.
    norm_price = x_unscaled[0, 0]
    start_price = norm_price * float(config["base_trading_days"])

    # Slot 0 of y_pred is the start marker; returns start at slot 1.
    y_pred = np.asarray(y_pred, dtype=np.float32).ravel()
    returns_scaled = y_pred[1:]
    if mask is not None:
        step_mask = np.asarray(mask).ravel()[1:]
        if step_mask.shape != returns_scaled.shape:
            raise ValueError("mask must have the same number of elements as y_pred")
        valid_returns = returns_scaled[step_mask > 0]
    else:
        valid_returns = np.trim_zeros(returns_scaled, trim="b")
    log_returns = valid_returns * float(config["volatility_scale"])

    # Cumulative sum of log returns on top of the starting log price.
    steps = np.concatenate(([0.0], np.cumsum(log_returns, dtype=np.float64)))
    log_prices = (np.log(start_price) + steps).astype(np.float32)
    prices = np.exp(log_prices)
    return prices


# ===================== 7. 一路算到 PyTorch 张量 =====================

# Step 1: parse the raw price columns (run here on the small head() sample).
data_after_price = preprocess_price_data(training_data_mock)

# Step 2: build the fixed-length return sequences and their validity masks.
data_with_seq = build_sequences_and_masks(data_after_price, config)

# Step 3: build the condition features and fit a fresh scaler on them
# (this data plays the role of the training set).
cond_np, global_scaler = build_condition_array(
    data_with_seq, config, need_fit_scaler=True, scaler=None
)

# Steps 4-5: stack the per-row arrays into float32 matrices, then insert a
# singleton channel axis so both become (N, 1, seq_len).
n_samples = len(data_with_seq)
y_np = np.array(
    data_with_seq["transformed_sequence"].tolist(), dtype=np.float32
).reshape(n_samples, 1, -1)
mask_np = np.array(
    data_with_seq["validity_mask"].tolist(), dtype=np.float32
).reshape(n_samples, 1, -1)

# Step 6: convert everything to PyTorch tensors.
X_t = torch.tensor(cond_np, dtype=torch.float32)     # (N, D)
y_t = torch.tensor(y_np, dtype=torch.float32)        # (N, 1, seq_len)
mask_t = torch.tensor(mask_np, dtype=torch.float32)  # (N, 1, seq_len)

print("数据准备完成：")
print("X_t 形状:", X_t.shape)
print("y_t 形状:", y_t.shape)
print("mask_t 形状:", mask_t.shape)

数据准备完成：
X_t 形状: torch.Size([5, 7])
y_t 形状: torch.Size([5, 1, 256])
mask_t 形状: torch.Size([5, 1, 256])


In [56]:
# Show the fixed-length sequence built for the first sample: start marker, scaled log returns, zero padding.
data_with_seq["transformed_sequence"][0]

array([ 1.        ,  0.04772186,  0.13432503, -0.23103714,  0.3653717 ,
        0.20168304,  0.1579094 , -0.10186195,  0.21384239,  0.07678986,
       -0.04124641, -0.06207466, -0.1190567 , -0.01635551,  0.17990112,
       -0.00181198,  0.00110626, -0.1928997 , -0.08019447,  0.12433052,
        0.13587952,  0.08735657,  0.16573906,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

In [58]:
# Show the validity mask of the first sample (1 = real data, 0 = padding).
mask_t[0]

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.,