In [1]:
import os
from glob import glob
import pandas as pd
import numpy as np
from pathlib import Path
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader

In [3]:
def load_stock_data(folder_path):
    """加载股票数据并预处理"""
    csv_files = glob(os.path.join(folder_path, "*.csv"))
    stock_data = pd.concat(
        (pd.read_csv(csv, parse_dates=['Date']) for csv in csv_files),
        ignore_index=True
    )
    stock_data = stock_data.sort_values(['Name', 'Date'])
    return stock_data

def preprocess_data(df):
    """数据预处理：填充缺失值"""
    df = df.copy()
    for col in ['Open', 'High', 'Low', 'Close', 'Volume']:
        df[col] = df[col].fillna((df[col].ffill() + df[col].bfill()) / 2)
    return df

In [5]:
folder_path = 'time-series-data/'
stock_data = load_stock_data(folder_path)
display(stock_data)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Name
15099,2006-01-03,39.69,41.22,38.79,40.91,24232729,AABA
15100,2006-01-04,41.22,41.90,40.77,40.97,20553479,AABA
15101,2006-01-05,40.93,41.73,40.85,41.53,12829610,AABA
15102,2006-01-06,42.88,43.57,42.80,43.21,29422828,AABA
15103,2006-01-09,43.10,43.66,42.82,43.42,16268338,AABA
...,...,...,...,...,...,...,...
36231,2017-12-22,83.88,84.02,83.60,83.97,10161447,XOM
36232,2017-12-26,83.96,84.36,83.90,83.98,4777216,XOM
36233,2017-12-27,83.99,84.10,83.74,83.90,7000612,XOM
36234,2017-12-28,83.98,84.05,83.80,84.02,7495254,XOM


In [9]:
class StockRNNDataset(Dataset):
    """构建 PyTorch Dataset，用于时间序列窗口"""
    def __init__(self, df, time_step=60, horizon=1, features=None):
        self.time_step = time_step
        self.horizon = horizon
        self.features = features or ['Open','High','Low','Close','Volume']
        self.scaler = StandardScaler()
        self.data = []

        for name, grp in df.groupby('Name'):
            grp = grp.sort_values('Date').reset_index(drop=True)
            X = grp[self.features].values
            y = grp['Return'].shift(-horizon).values
            if len(grp) < time_step + horizon:
                continue
            X_scaled = self.scaler.fit_transform(X)
            for i in range(len(grp) - time_step - horizon + 1):
                X_seq = X_scaled[i:i+time_step]
                label = 1 if y[i+time_step-1] > 0 else 0
                self.data.append((X_seq, label))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        X_seq, y = self.data[idx]
        return torch.FloatTensor(X_seq), torch.FloatTensor([y])


class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout=0.3):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers,
                          batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        h0 = torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size, device=x.device)
        out, _ = self.rnn(x, h0)
        out = out[:, -1, :]
        return self.sigmoid(self.fc(out))


def train_model(model, train_loader, val_loader, device, epochs=10):
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    model.to(device)
    for epoch in range(epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            pred = model(X_batch)
            loss = criterion(pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # TODO: 在此处添加验证集评估
    return model


def generate_factor_scores(model, df, time_step, device):
    model.eval()
    records = []
    scaler_cache = {}
    with torch.no_grad():
        for name, grp in df.groupby('Name'):
            grp = grp.sort_values('Date').reset_index(drop=True)
            feats = grp[['Open','High','Low','Close','Volume']].values
            if len(grp) < time_step:
                continue
            # 缓存各股票的 scaler
            scaler = StandardScaler().fit(feats)
            feats_scaled = scaler.transform(feats)
            for i in range(time_step, len(grp)):
                X_seq = torch.FloatTensor(feats_scaled[i-time_step:i]).unsqueeze(0).to(device)
                score = model(X_seq).item()
                records.append({'Name': name,
                                'Date': grp.loc[i, 'Date'],
                                'rnn_score': score})
    return pd.DataFrame(records)


def backtest_and_ic(factor_df, price_df):
    price_df = price_df.copy()
    price_df['Return'] = price_df.groupby('Name')['Close'].pct_change().shift(-1)
    df = factor_df.merge(price_df[['Name','Date','Return']], on=['Name','Date'])
    ic = df.groupby('Date').apply(
        lambda x: x['rnn_score'].corr(x['Return'], method='spearman')
    )
    mean_ic = ic.mean()
    def quantile_return(x):
        x = x.copy()
        x['group'] = pd.qcut(x['rnn_score'], 5, labels=False)
        return x.groupby('group')['Return'].mean()
    group_returns = df.groupby('Date').apply(quantile_return)
    avg_group_returns = group_returns.mean()
    return mean_ic, avg_group_returns

In [11]:
stock_data['Return'] = stock_data.groupby('Name')['Close'].pct_change()
train_df = stock_data[(stock_data['Date'] >= '2006-01-01') & (stock_data['Date'] <= '2014-12-31')]
test_df  = stock_data[(stock_data['Date'] >= '2015-01-01') & (stock_data['Date'] <= '2017-12-31')]
time_step = 60
train_ds = StockRNNDataset(train_df, time_step=time_step, horizon=1)
test_ds  = StockRNNDataset(test_df,  time_step=time_step, horizon=1)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader  = DataLoader(test_ds,  batch_size=32)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNNModel(input_size=5, hidden_size=30, num_layers=3)
model = train_model(model, train_loader, test_loader, device)
train_scores = generate_factor_scores(model, train_df, time_step, device)
test_scores  = generate_factor_scores(model, test_df,  time_step, device)
mean_ic_train, grp_ret_train = backtest_and_ic(train_scores, train_df)
mean_ic_test,  grp_ret_test  = backtest_and_ic(test_scores,  test_df)
print("Train IC:", mean_ic_train)
print("Train Group Returns:\n", grp_ret_train)
print("Test IC:", mean_ic_test)
print("Test Group Returns:\n", grp_ret_test)

RuntimeError: all elements of input should be between 0 and 1

In [13]:
import os
from glob import glob
import pandas as pd
import numpy as np
from pathlib import Path
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader

# 1. 数据加载与预处理

def load_stock_data(folder_path):
    """加载并预处理股票数据"""
    csv_files = glob(os.path.join(folder_path, "*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"在路径 {folder_path} 未发现 CSV 文件")
    stock_data = pd.concat(
        (pd.read_csv(csv, parse_dates=['Date']) for csv in csv_files),
        ignore_index=True
    )
    stock_data = stock_data.sort_values(['Name', 'Date']).reset_index(drop=True)
    return stock_data


def preprocess_data(df):
    """前向后向填充缺失值"""
    df = df.copy()
    for col in ['Open', 'High', 'Low', 'Close', 'Volume']:
        df[col] = df[col].fillna((df[col].ffill() + df[col].bfill()) / 2)
    return df

# 2. Dataset 定义

class StockRNNDataset(Dataset):
    """根据时间窗口构造训练样本"""
    def __init__(self, df, time_step=60, horizon=1, features=None):
        self.time_step = time_step
        self.horizon = horizon
        self.features = features or ['Open','High','Low','Close','Volume']
        self.data = []

        for name, grp in df.groupby('Name'):
            grp = grp.sort_values('Date').reset_index(drop=True)
            grp['Return'] = grp['Close'].pct_change().shift(-horizon)
            X = grp[self.features].values
            y = grp['Return'].values
            if len(grp) < time_step + horizon:
                continue
            # 标准化
            scaler = StandardScaler().fit(X)
            X_scaled = scaler.transform(X)
            for i in range(len(grp) - time_step - horizon + 1):
                seq = X_scaled[i:i+time_step]
                label = 1 if y[i+time_step-1] > 0 else 0
                self.data.append((seq, label))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        seq, label = self.data[idx]
        return torch.FloatTensor(seq), torch.FloatTensor([label])

# 3. 模型定义（移除 Sigmoid，使用 BCEWithLogitsLoss）

class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout=0.3):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers,
                          batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        h0 = torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size, device=x.device)
        out, _ = self.rnn(x, h0)
        out = out[:, -1, :]
        # 返回 logits
        return self.fc(out)

# 4. 训练函数（使用 BCEWithLogitsLoss）

def train_model(model, train_loader, val_loader, device, epochs=10):
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    model.to(device)
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            logits = model(X_batch)
            loss = criterion(logits, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # 可选：在此打印 epoch loss 或加入验证评估
    return model

# 5. 因子得分生成（对 logits 取 sigmoid）

def generate_factor_scores(model, df, time_step, device):
    model.eval()
    records = []
    with torch.no_grad():
        for name, grp in df.groupby('Name'):
            grp = grp.sort_values('Date').reset_index(drop=True)
            feats = grp[['Open','High','Low','Close','Volume']].values
            if len(grp) < time_step:
                continue
            scaler = StandardScaler().fit(feats)
            feats_scaled = scaler.transform(feats)
            for i in range(time_step, len(grp)):
                seq = torch.FloatTensor(feats_scaled[i-time_step:i]).unsqueeze(0).to(device)
                logit = model(seq)
                score = torch.sigmoid(logit).item()
                records.append({'Name': name,
                                'Date': grp.loc[i,'Date'],
                                'rnn_score': score})
    return pd.DataFrame(records)

# 6. 回测及 IC 检验

def backtest_and_ic(factor_df, price_df):
    df = factor_df.merge(
        price_df[['Name','Date','Close']]
            .assign(Return=price_df.groupby('Name')['Close'].pct_change().shift(-1)),
        on=['Name','Date']
    )
    ic_series = df.groupby('Date').apply(
        lambda x: x['rnn_score'].corr(x['Return'], method='spearman')
    )
    mean_ic = ic_series.mean()
    def quantile_return(x):
        x = x.copy()
        x['group'] = pd.qcut(x['rnn_score'], 5, labels=False)
        return x.groupby('group')['Return'].mean()
    grp_ret = df.groupby('Date').apply(quantile_return).mean()
    return mean_ic, grp_ret

# 7. 主流程示例

def main():
    data_folder = 'time-series-data'
    stock_data = load_stock_data(data_folder)
    stock_data = preprocess_data(stock_data)
    stock_data['Date'] = pd.to_datetime(stock_data['Date'])
    # 划分训练测试
    train_df = stock_data[(stock_data['Date']<'2015-01-01')]
    test_df  = stock_data[(stock_data['Date']>='2015-01-01')]
    # Dataset 和 Dataloader
    time_step = 60
    train_ds = StockRNNDataset(train_df, time_step=time_step)
    test_ds  = StockRNNDataset(test_df,  time_step=time_step)
    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
    test_loader  = DataLoader(test_ds,  batch_size=32)
    # 模型训练与评估
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = RNNModel(input_size=5, hidden_size=30, num_layers=3)
    model = train_model(model, train_loader, test_loader, device)
    # 因子得分 & 回测
    train_scores = generate_factor_scores(model, train_df, time_step, device)
    test_scores  = generate_factor_scores(model, test_df,  time_step, device)
    ic_train, ret_train = backtest_and_ic(train_scores, train_df)
    ic_test,  ret_test  = backtest_and_ic(test_scores,  test_df)
    print("Train IC:", ic_train)
    print("Train Group Returns:\n", ret_train)
    print("Test IC:", ic_test)
    print("Test Group Returns:\n", ret_test)

if __name__ == '__main__':
    main()

  ic_series = df.groupby('Date').apply(
  grp_ret = df.groupby('Date').apply(quantile_return).mean()
  ic_series = df.groupby('Date').apply(


Train IC: 0.013360763567649012
Train Group Returns:
 group
0   -0.000035
1    0.000306
2    0.000647
3    0.000751
4    0.000920
dtype: float64
Test IC: 0.017110917628327685
Test Group Returns:
 group
0    0.000066
1    0.000198
2    0.000853
3    0.000814
4    0.000790
dtype: float64


  grp_ret = df.groupby('Date').apply(quantile_return).mean()


In [18]:
import matplotlib.pyplot as plt

def plot_cum_nav(factor_df, price_df, title):
    # 1. 合并收益
    df = factor_df.merge(
        price_df[['Name','Date','Close']]
                .assign(Return=price_df.groupby('Name')['Close']
                                     .pct_change().shift(-1)),
        on=['Name','Date']
    )
    # 2. 分层
    df['group'] = pd.qcut(df['score'], 5, labels=False)
    # 3. 日度分层平均收益
    daily = (df
             .groupby(['Date','group'])['Return']
             .mean()
             .unstack()      # shape = (n_days, 5)
             .sort_index())
    # 4. 累积净值
    cum_nav = (1 + daily).cumprod()
    # 5. 绘图
    plt.figure(figsize=(10,6))
    for grp in cum_nav.columns:
        plt.plot(cum_nav.index, cum_nav[grp], label=f'组{grp+1}')
    plt.title(f'{title}：RNN 因子分层组合累积净值')
    plt.xlabel('Date')
    plt.ylabel('Cumulative NAV')
    plt.legend(title='分层')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# --- 如果你在 main() 里已经得到了这几项，就直接调用： ---
plot_cum_nav(train_scores, train_df, '训练集')
plot_cum_nav(test_scores,  test_df,  '测试集')

NameError: name 'train_scores' is not defined