In [2]:
import pandas as pd
import polars as pl
import numpy as np
import gc 
#import kaggle  # enable when running locally or on a server

In [None]:
#主办方提供的匿名数据集包括responder0~8,feature_0~78,其中responder6为待预测值。
#responder为每个日期结束后才能获取的滞后数据
class CONFIG:
    target_col = "responder_6"
    cols_original = ["date_id",'time_id',"symbol_id"] + [f"responder_{idx}" for idx in range(9)] 
    responder_cols = [f"responder_{idx}" for idx in range(9)]
    feature_cols = [f"feature_{idx:02d}" for idx in range(79)]
    start_dt = 900

In [None]:
#简单的特征工程
def create_agg_list(columns):
    agg_mean_list = [pl.col(c).mean().name.suffix(f"_mean_lag") for c in columns]
    agg_std_list = [pl.col(c).std().name.suffix(f"_std_lag") for c in columns]
    agg_max_list = [pl.col(c).max().name.suffix(f"_max_lag") for c in columns]
    agg_last_list = [pl.col(c).last().name.suffix(f"_last_lag") for c in columns]
    agg_chg_list = [(pl.col(c).last()/pl.col(c).first() - 1).name.suffix(f"_chg_lag") for c in columns]
    agg_list = agg_mean_list + agg_std_list + agg_max_list + agg_last_list + agg_chg_list
    return agg_list

In [None]:
# 只使用900天后的数据进行训练和测试
# /kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet
train = pl.scan_parquet(
    f"/root/autodl-tmp/jane-street-real-time-market-data-forecasting/train.parquet"
).filter(
    pl.col("date_id").gt(CONFIG.start_dt)
)

In [None]:
#求responder的每天的mean,std等agg_list
agg_list = create_agg_list(CONFIG.responder_cols)

#由于responder为滞后数据，今日“开盘”时只能获取到昨日的“收盘”数据
lag_data = train.select(CONFIG.cols_original) 
lag_data = lag_data.with_columns(
    date_id = pl.col('date_id') + 1,
    ) 
lag_data = lag_data.group_by(["date_id","symbol_id"], maintain_order=True).agg(agg_list)

In [7]:
train = train.join(lag_data, on=["date_id", "symbol_id"],  how="left")

In [None]:
# ‘train.parquet'
# kaggle只支持92列以内polars dataframe的collect
# 训练集、测试集放在一个文件里
train.collect().write_parquet("autodl-tmp/train.parquet", partition_by = "date_id")