In [2]:
import pandas as pd

# Load data
train_df = pd.read_parquet("drw-crypto-market-prediction/train.parquet")
test_df = pd.read_parquet("drw-crypto-market-prediction/test.parquet")

# Quick overview
# train_df
print(train_df.head())
print(train_df.info())

                     bid_qty  ask_qty  buy_qty  sell_qty   volume        X1  \
timestamp                                                                     
2023-03-01 00:00:00   15.283    8.425  176.405    44.984  221.389  0.121263   
2023-03-01 00:01:00   38.590    2.336  525.846   321.950  847.796  0.302841   
2023-03-01 00:02:00    0.442   60.250  159.227   136.369  295.596  0.167462   
2023-03-01 00:03:00    4.865   21.016  335.742   124.963  460.705  0.072944   
2023-03-01 00:04:00   27.158    3.451   98.411    44.407  142.818  0.173820   

                           X2        X3        X4        X5  ...      X882  \
timestamp                                                    ...             
2023-03-01 00:00:00 -0.417690  0.005399  0.125948  0.058359  ...  1.925423   
2023-03-01 00:01:00 -0.049576  0.356667  0.481087  0.237954  ...  1.928569   
2023-03-01 00:02:00 -0.291212  0.083138  0.206881  0.101727  ...  1.928047   
2023-03-01 00:03:00 -0.436590 -0.102483  0.017551  0.007

In [7]:
import pandas as pd
import matplotlib.pyplot as plt

# （假设 train_df, test_df 已经通过 pd.read_parquet 载入）

# —— 1. 查看数据规模 —— 
print(f"train 维度：{train_df.shape}")
print(f"test  维度：{test_df.shape}\n")

# —— 2. 打印前几行 & info —— 
print("=== train_df.head() ===")
print(train_df.head(), "\n")
print("=== train_df.info() ===")
print(train_df.info(), "\n")

# —— 3. 缺失值检查 —— 
print("=== train_df 缺失值统计 ===")
print(train_df.isnull().sum(), "\n")

# —— 4. 基本统计描述 —— 
print("=== train_df.describe() ===")
print(train_df.describe(), "\n")

# —— 5. 确保使用 DatetimeIndex —— 
if not isinstance(train_df.index, pd.DatetimeIndex):
    # 在列名里找含 time/date 字样的候选项
    time_cands = [c for c in train_df.columns if "time" in c.lower() or "date" in c.lower()]
    if time_cands:
        tcol = time_cands[0]
        print(f"⚙️ 检测到时间列 `{tcol}`，正在转换并设为索引...")
        # 转 datetime（errors='coerce' 碰到解析失败会变 NaT）
        train_df[tcol] = pd.to_datetime(train_df[tcol], errors="coerce")
        test_df [tcol] = pd.to_datetime(test_df [tcol], errors="coerce")
        # 设为索引
        train_df.set_index(tcol, inplace=True)
        test_df .set_index(tcol, inplace=True)
    else:
        raise RuntimeError("❌ 未找到可用的时间列，请检查数据！")
print(f"✔️ 当前 train_df.index 类型：{type(train_df.index)}\n")

# —— 6. 从 DatetimeIndex 抽取时间特征 —— 
for df in (train_df, test_df):
    df["hour"]       = df.index.hour
    df["weekday"]    = df.index.dayofweek    # Monday=0, Sunday=6
    df["is_weekend"] = (df["weekday"] >= 5).astype(int)

print("已添加时间特征：hour, weekday, is_weekend")
print(train_df[["hour","weekday","is_weekend"]].head(), "\n")

# —— 7. 目标分布可视化 —— 
plt.figure(figsize=(6,3))
train_df["label"].hist(bins=100)
plt.title("label 分布")
plt.xlabel("label")
plt.ylabel("频次")
plt.tight_layout()
plt.show()

train 维度：(525887, 899)
test  维度：(538150, 896)

=== train_df.head() ===
                     bid_qty  ask_qty  buy_qty  sell_qty   volume        X1  \
timestamp                                                                     
2023-03-01 00:00:00   15.283    8.425  176.405    44.984  221.389  0.121263   
2023-03-01 00:01:00   38.590    2.336  525.846   321.950  847.796  0.302841   
2023-03-01 00:02:00    0.442   60.250  159.227   136.369  295.596  0.167462   
2023-03-01 00:03:00    4.865   21.016  335.742   124.963  460.705  0.072944   
2023-03-01 00:04:00   27.158    3.451   98.411    44.407  142.818  0.173820   

                           X2        X3        X4        X5  ...      X885  \
timestamp                                                    ...             
2023-03-01 00:00:00 -0.417690  0.005399  0.125948  0.058359  ...  0.190791   
2023-03-01 00:01:00 -0.049576  0.356667  0.481087  0.237954  ...  0.184660   
2023-03-01 00:02:00 -0.291212  0.083138  0.206881  0.101727  ..

  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)


             bid_qty        ask_qty        buy_qty       sell_qty  \
count  525887.000000  525887.000000  525887.000000  525887.000000   
mean        9.968003      10.174169     131.726678     132.673944   
std        15.645741      15.889582     307.267251     309.803040   
min         0.001000       0.001000       0.000000       0.000000   
25%         2.634000       2.678000      26.407000      27.021000   
50%         6.415000       6.538000      57.015000      58.047000   
75%        13.085000      13.330000     127.639000     129.110000   
max      1114.932000    1352.965000   17614.400000   17686.234000   

              volume             X1             X2             X3  \
count  525887.000000  525887.000000  525887.000000  525887.000000   
mean      264.400622      -0.006026      -0.000243      -0.000353   
std       588.618746       0.538340       0.613746       0.771744   
min         0.000000      -2.787539      -5.861926      -6.125439   
25%        60.688500      -0.3706

AttributeError: 'RangeIndex' object has no attribute 'hour'