In [1]:
import os
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta, time
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report

import torch
from torch.utils.data import Dataset, DataLoader, ConcatDataset
import torch.nn.functional as F
from torch.utils import data
import torch.nn as nn
import torch.optim as optim
from multiprocessing import Pool

In [2]:
path = './data/cu.parquet'

data = pd.read_parquet(path)
data.replace(-121, np.nan, inplace=True)
# print(set(data['TradingDay'].values))
data = data[['TradingDay', 'LastPrice', 'PreSettlementPrice', 'PreClosePrice',
             'PreOpenInterest', 'OpenPrice', 'HighestPrice', 'LowestPrice', 'OpenInterest',
             'BidPrice1', 'BidVolume1', 'AskPrice1', 'AskVolume1',
             'BidPrice2', 'BidVolume2', 'AskPrice2', 'AskVolume2',
             'BidPrice3', 'BidVolume3', 'AskPrice3', 'AskVolume3',
             'BidPrice4', 'BidVolume4', 'AskPrice4', 'AskVolume4',
             'BidPrice5', 'BidVolume5', 'AskPrice5', 'AskVolume5',
             'delta_Volume', 'delta_Turnover']]
data.reset_index(names='DateTime', inplace=True)
data.sort_values(by='DateTime', inplace=True)
data.drop_duplicates(subset='DateTime', keep='first', inplace=True)
data

Unnamed: 0,DateTime,TradingDay,LastPrice,PreSettlementPrice,PreClosePrice,PreOpenInterest,OpenPrice,HighestPrice,LowestPrice,OpenInterest,...,BidPrice4,BidVolume4,AskPrice4,AskVolume4,BidPrice5,BidVolume5,AskPrice5,AskVolume5,delta_Volume,delta_Turnover
0,2023-01-03 08:59:00.500,20230103,66160,66120,66260,99641,66160,66160,66160,99658,...,66130,200,66260,110,66120,13,66270,30,224.0,74099200.0
1,2023-01-03 09:00:00.500,20230103,66240,66120,66260,99641,66160,66240,66160,99669,...,66160,160,66270,2,66150,353,66280,1,21.0,6952600.0
2,2023-01-03 09:00:01.000,20230103,66210,66120,66260,99641,66160,66240,66160,99682,...,66160,161,66260,102,66150,353,66270,2,56.0,18541250.0
3,2023-01-03 09:00:01.500,20230103,66230,66120,66260,99641,66160,66240,66160,99681,...,66200,27,66270,3,66170,64,66280,1,14.0,4636150.0
4,2023-01-03 09:00:02.000,20230103,66230,66120,66260,99641,66160,66250,66160,99672,...,66200,28,66300,56,66170,63,66310,1,29.0,9603500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8162959,2023-12-29 23:59:56.000,20231229,69040,69500,69340,155933,68970,69150,68920,150584,...,69000,85,69080,29,68990,30,69090,25,1.0,345200.0
8162960,2023-12-29 23:59:56.500,20231229,69040,69500,69340,155933,68970,69150,68920,150584,...,69000,85,69080,28,68990,30,69090,26,0.0,0.0
8162961,2023-12-29 23:59:57.000,20231229,69040,69500,69340,155933,68970,69150,68920,150584,...,69000,85,69080,28,68990,30,69090,26,0.0,0.0
8162962,2023-12-29 23:59:57.500,20231229,69040,69500,69340,155933,68970,69150,68920,150584,...,69000,85,69080,28,68990,30,69090,26,0.0,0.0


In [3]:
full_time_index = pd.date_range(start=data['DateTime'].min(), end=data['DateTime'].max(), freq='500ms')

# 定义交易时段的函数
def is_in_trading_hours(dt):
    if time(9, 0) <= dt.time() <= time(11, 30):
        return True
    elif time(13, 30) <= dt.time() <= time(15, 0):
        return True
    elif (time(21, 0) <= dt.time()) or (dt.time() <= time(1, 0)):
        return True
    return False
    
# 筛选交易时段内的时间点
full_time_index = [dt for dt in full_time_index if is_in_trading_hours(dt)]
len(full_time_index)

20787482

In [4]:
data = data.set_index('DateTime').reindex(full_time_index)
data[['delta_Volume', 'delta_Turnover']] = data[['delta_Volume', 'delta_Turnover']].fillna(0)

data = data.ffill()  # 填充前值
data = data.reset_index()
data.columns = ['DateTime'] + list(data.columns[1:])

In [5]:
data

Unnamed: 0,DateTime,TradingDay,LastPrice,PreSettlementPrice,PreClosePrice,PreOpenInterest,OpenPrice,HighestPrice,LowestPrice,OpenInterest,...,BidPrice4,BidVolume4,AskPrice4,AskVolume4,BidPrice5,BidVolume5,AskPrice5,AskVolume5,delta_Volume,delta_Turnover
0,2023-01-03 09:00:00.000,,,,,,,,,,...,,,,,,,,,0.0,0.0
1,2023-01-03 09:00:00.500,20230103.0,66240.0,66120.0,66260.0,99641.0,66160.0,66240.0,66160.0,99669.0,...,66160.0,160.0,66270.0,2.0,66150.0,353.0,66280.0,1.0,21.0,6952600.0
2,2023-01-03 09:00:01.000,20230103.0,66210.0,66120.0,66260.0,99641.0,66160.0,66240.0,66160.0,99682.0,...,66160.0,161.0,66260.0,102.0,66150.0,353.0,66270.0,2.0,56.0,18541250.0
3,2023-01-03 09:00:01.500,20230103.0,66230.0,66120.0,66260.0,99641.0,66160.0,66240.0,66160.0,99681.0,...,66200.0,27.0,66270.0,3.0,66170.0,64.0,66280.0,1.0,14.0,4636150.0
4,2023-01-03 09:00:02.000,20230103.0,66230.0,66120.0,66260.0,99641.0,66160.0,66250.0,66160.0,99672.0,...,66200.0,28.0,66300.0,56.0,66170.0,63.0,66310.0,1.0,29.0,9603500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20787477,2023-12-29 23:59:57.500,20231229.0,69040.0,69500.0,69340.0,155933.0,68970.0,69150.0,68920.0,150584.0,...,69000.0,85.0,69080.0,28.0,68990.0,30.0,69090.0,26.0,0.0,0.0
20787478,2023-12-29 23:59:58.000,20231229.0,69040.0,69500.0,69340.0,155933.0,68970.0,69150.0,68920.0,150584.0,...,69000.0,85.0,69080.0,28.0,68990.0,30.0,69090.0,26.0,0.0,0.0
20787479,2023-12-29 23:59:58.500,20231229.0,69040.0,69500.0,69340.0,155933.0,68970.0,69150.0,68920.0,150584.0,...,69000.0,85.0,69080.0,28.0,68990.0,30.0,69090.0,26.0,0.0,0.0
20787480,2023-12-29 23:59:59.000,20231229.0,69040.0,69500.0,69340.0,155933.0,68970.0,69150.0,68920.0,150584.0,...,69000.0,85.0,69080.0,28.0,68990.0,30.0,69090.0,26.0,0.0,0.0


In [6]:
data.to_parquet('filled_cu.parquet')

In [7]:
data.columns

Index(['DateTime', 'TradingDay', 'LastPrice', 'PreSettlementPrice',
       'PreClosePrice', 'PreOpenInterest', 'OpenPrice', 'HighestPrice',
       'LowestPrice', 'OpenInterest', 'BidPrice1', 'BidVolume1', 'AskPrice1',
       'AskVolume1', 'BidPrice2', 'BidVolume2', 'AskPrice2', 'AskVolume2',
       'BidPrice3', 'BidVolume3', 'AskPrice3', 'AskVolume3', 'BidPrice4',
       'BidVolume4', 'AskPrice4', 'AskVolume4', 'BidPrice5', 'BidVolume5',
       'AskPrice5', 'AskVolume5', 'delta_Volume', 'delta_Turnover'],
      dtype='object')