In [16]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from chinese_calendar import is_workday

In [17]:
# Load the minute-bar file; read_csv merges the "Date" and "Time" columns
# into a single parsed "date" column.
# NOTE(review): dict-style parse_dates (column combining) is reported as
# deprecated in recent pandas releases - confirm before upgrading pandas.
minute_bars_path = "data/TXF1-KWAY64V2-TAIFEX-Futures-Minute-Trade.txt"
datetime_parts = {"date": ["Date", "Time"]}
df = pd.read_csv(minute_bars_path, parse_dates=datetime_parts)
print(df.shape)
df.head(3)

(2069610, 7)


Unnamed: 0,date,Symbol,Open,High,Low,Close,TotalVolume
0,2010-01-04 08:46:00,TXF1,8203.0,8208.0,8184.0,8189.0,1125
1,2010-01-04 08:47:00,TXF1,8188.0,8195.0,8186.0,8193.0,411
2,2010-01-04 08:48:00,TXF1,8194.0,8201.0,8193.0,8201.0,422


In [18]:
# First timestamp of the study period, plus an earlier cut-off exactly one
# week before it.
origin_start = datetime.datetime(year=2022, month=1, day=1)
new_start = origin_start - datetime.timedelta(days=7)

## Data range

In [19]:
# Drop every bar dated before the cut-off and renumber the rows from zero.
on_or_after_cutoff = df["date"] >= new_start
df = df.loc[on_or_after_cutoff].reset_index(drop=True)
print(df.shape)

(281423, 7)


## Day-session data (日盤資料)

In [20]:
# Keep only bars whose time of day falls in the half-open window
# [08:45:00, 13:45:00). The time is rendered as an "HHMMSS" string so the
# window test is a plain lexicographic comparison.
# NOTE(review): the raw file's first bar is stamped 08:46, so bars may be
# labelled by their closing minute; if so this window drops the 13:45 bar and
# never matches an 08:45 bar - confirm the intended window.
hhmmss = df["date"].dt.time.astype(str).str.replace(":", "")
in_day_session = (hhmmss >= "084500") & (hhmmss < "134500")

# Discard the contract symbol and switch to lowercase OHLCV column names
# (assigned positionally, so the column order coming from the CSV matters).
df = df.loc[in_day_session].reset_index(drop=True).drop(columns=["Symbol"])
df.columns = ["date", "open", "high", "low", "close", "volume"]

print(df.shape)
df.head(3)

(73983, 6)


Unnamed: 0,date,open,high,low,close,volume
0,2021-12-27 08:46:00,17982.0,17985.0,17969.0,17971.0,1330
1,2021-12-27 08:47:00,17971.0,17979.0,17967.0,17979.0,337
2,2021-12-27 08:48:00,17978.0,17995.0,17977.0,17990.0,648


## Missing value

In [21]:
# Build the complete minute grid between the first and last bar, restricted to
# the 08:45 <= t < 13:45 window on days that `is_workday` accepts, then
# reindex onto it so that minutes without a bar show up as NaN rows.
# NOTE(review): `chinese_calendar` is assumed to describe the right holiday
# calendar for this market - confirm against the exchange's trading days.
# "min" is used instead of the "T" alias, which newer pandas deprecates.
minute_grid = pd.date_range(df["date"].iloc[0], df["date"].iloc[-1], freq="min")
in_session = minute_grid.indexer_between_time(
    datetime.time(8, 45), datetime.time(13, 45), include_end=False
)
minute_grid = minute_grid[in_session]

# Ask the holiday calendar once per calendar day instead of once per minute.
grid_days = minute_grid.normalize()
workdays = [day for day in grid_days.unique() if is_workday(day)]

# Keep `times` a plain list of timestamps: reindexing onto an unnamed Index
# would discard the "date" index name that reset_index() turns into a column.
times = minute_grid[grid_days.isin(workdays)].tolist()
df = (
    df
    .set_index("date")
    .reindex(index = times)
    .reset_index()
    )

print(df.shape)
df.head(3)

(75599, 6)


Unnamed: 0,date,open,high,low,close,volume
0,2021-12-27 08:46:00,17982.0,17985.0,17969.0,17971.0,1330.0
1,2021-12-27 08:47:00,17971.0,17979.0,17967.0,17979.0,337.0
2,2021-12-27 08:48:00,17978.0,17995.0,17977.0,17990.0,648.0


In [22]:
# Fill the NaN rows created by the reindex: the close carries the previous
# close forward, volume becomes 0, and open/high/low take the (filled) close,
# which makes each gap minute a flat bar.
# Series.ffill() replaces fillna(method="ffill"), which newer pandas deprecates.
# NOTE(review): the forward fill is not reset at day boundaries, so a gap at
# the start of a day inherits the previous day's last close - confirm intended.
df["close"] = df["close"].ffill()
df["volume"] = df["volume"].fillna(0)
for col in ["open", "high", "low"]:
    df[col] = df[col].fillna(df["close"])

## Cyclical features encoding

In [23]:
# Encode each calendar component as a point on the unit circle so that the
# largest and smallest values of a cycle end up next to each other.
# Each entry: (column prefix, component values, length of the cycle).
stamp = df["date"].dt
cycles = [
    ("month", stamp.month, 12),
    ("day", stamp.day, 31),
    ("hour", stamp.hour, 24),
    ("min", stamp.minute, 60),
]
for prefix, component, period in cycles:
    angle = 2 * np.pi * component/period
    df[f"{prefix}_sin"] = np.sin(angle)
    df[f"{prefix}_cos"] = np.cos(angle)

df.head(3)

Unnamed: 0,date,open,high,low,close,volume,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,min_sin,min_cos
0,2021-12-27 08:46:00,17982.0,17985.0,17969.0,17971.0,1330.0,-2.449294e-16,1.0,-0.724793,0.688967,0.866025,-0.5,-0.994522,0.104528
1,2021-12-27 08:47:00,17971.0,17979.0,17967.0,17979.0,337.0,-2.449294e-16,1.0,-0.724793,0.688967,0.866025,-0.5,-0.978148,0.207912
2,2021-12-27 08:48:00,17978.0,17995.0,17977.0,17990.0,648.0,-2.449294e-16,1.0,-0.724793,0.688967,0.866025,-0.5,-0.951057,0.309017


## Discretization

In [24]:
# Bucket per-minute volume into four ordinal bins (right-closed intervals):
#   0: (-1, 500]   1: (500, 1000]   2: (1000, 1500]   3: (1500, inf]
# The last edge is open-ended instead of the observed maximum, so the edges
# stay strictly increasing even when no bar's volume exceeds 1500 (pd.cut
# rejects non-increasing or duplicate edges). Rows are binned the same way as
# before whenever the maximum is above 1500.
volume_edges = [-1, 500, 1000, 1500, np.inf]
df["vol_bin"] = pd.cut(df["volume"], bins=volume_edges, labels=[0, 1, 2, 3])
# The categorical labels are converted to plain integers.
df["vol_bin"] = df["vol_bin"].astype(int)
df.head(3)

Unnamed: 0,date,open,high,low,close,volume,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,min_sin,min_cos,vol_bin
0,2021-12-27 08:46:00,17982.0,17985.0,17969.0,17971.0,1330.0,-2.449294e-16,1.0,-0.724793,0.688967,0.866025,-0.5,-0.994522,0.104528,2
1,2021-12-27 08:47:00,17971.0,17979.0,17967.0,17979.0,337.0,-2.449294e-16,1.0,-0.724793,0.688967,0.866025,-0.5,-0.978148,0.207912,0
2,2021-12-27 08:48:00,17978.0,17995.0,17977.0,17990.0,648.0,-2.449294e-16,1.0,-0.724793,0.688967,0.866025,-0.5,-0.951057,0.309017,1


## Generate target

In [25]:
# Next-minute price change (later binarised) and the one-step return.
df["target"] = df["close"].shift(-1) - df["close"]
df["return"] = df["close"].pct_change()

# The first row has no return and the last row has no next close; drop them.
# The explicit copy makes `df` an independent frame, so the assignment below
# cannot trigger pandas' SettingWithCopyWarning.
df = df.dropna().copy()

# Binary label: 1 when the next close is strictly higher, else 0.
# NOTE(review): shift/pct_change run over the whole frame, so the last bar of
# a day is compared with the first bar of the next day - confirm intended.
df["target"] = (df["target"] > 0).astype(int)
print(df.shape)
df.head(3)

(75597, 17)


Unnamed: 0,date,open,high,low,close,volume,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,min_sin,min_cos,vol_bin,target,return
1,2021-12-27 08:47:00,17971.0,17979.0,17967.0,17979.0,337.0,-2.449294e-16,1.0,-0.724793,0.688967,0.866025,-0.5,-0.978148,0.207912,0,1,0.000445
2,2021-12-27 08:48:00,17978.0,17995.0,17977.0,17990.0,648.0,-2.449294e-16,1.0,-0.724793,0.688967,0.866025,-0.5,-0.951057,0.309017,1,1,0.000612
3,2021-12-27 08:49:00,17990.0,17997.0,17990.0,17992.0,274.0,-2.449294e-16,1.0,-0.724793,0.688967,0.866025,-0.5,-0.913545,0.406737,0,1,0.000111


### Features

In [26]:
# Column groups: numeric inputs, categorical inputs, and the label column.
num_feats = [
    'open',
    'high',
    'low',
    'close',
    'volume',
    'month_sin',
    'month_cos',
    'day_sin',
    'day_cos',
    'hour_sin',
    'hour_cos',
    'min_sin',
    'min_cos',
    'return',
]
cat_feats = ['vol_bin']
target = "target"

In [None]:
# Spearman rank correlation between all non-timestamp columns, drawn as an
# annotated heatmap. The "date" column is dropped explicitly: older pandas
# skipped non-numeric columns silently, while pandas 2.x no longer does, so
# corr() on the full frame is not portable across versions.
plt.figure(figsize=(20, 10))
corr = df.drop(columns="date").corr(method = "spearman").round(4)
sns.heatmap(data = corr, annot=True, cmap = 'RdBu')

In [27]:
# Drop the extra week that was loaded ahead of `origin_start`. The bound comes
# from the variable rather than a repeated date literal, so changing
# `origin_start` keeps this filter in sync.
df = df.query("date >= @origin_start").reset_index(drop = True)
print(df.shape)
# Number of rows available for each time of day.
df["date"].dt.time.value_counts()

(74099, 17)


08:45:00    247
12:11:00    247
12:09:00    247
12:08:00    247
12:07:00    247
           ... 
10:24:00    247
10:23:00    247
10:22:00    247
10:21:00    247
13:44:00    246
Name: date, Length: 300, dtype: int64

In [None]:
# Row count expressed in units of 300 rows (the number of distinct times of
# day counted above).
len(df) / 300

In [None]:
# Show the prepared frame as this cell's output.
df

In [None]:
from sklearn.model_selection import TimeSeriesSplit

# Walk-forward split with fixed 300-row test windows.
# TimeSeriesSplit needs at least one training row before the first test
# window, i.e. n_splits * test_size < len(df). Using (len(df) - 1) keeps that
# true when len(df) is an exact multiple of 300, where len(df) // 300 would
# make sklearn raise "Too many splits"; for every other length the fold count
# is unchanged.
test_size = 300
n_splits = (len(df) - 1) // test_size
tscv = TimeSeriesSplit(n_splits = n_splits, test_size = test_size)
for i, (train_index, test_index) in enumerate(tscv.split(df)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")