# Feature Engineering


In [1]:
import pandas as pd
import numpy as np
import datetime
import time
from sklearn.preprocessing import MinMaxScaler

In [2]:
df = pd.read_csv('../dataset/processed_data.csv',encoding='utf-8')
df.columns

Index(['Unnamed: 0', 'kdcode', 'dt', 'close', 'open', 'high', 'low',
       'prev_close', 'adjfactor', 'turnover', 'volume', 'code', 'time_pd'],
      dtype='object')

## Price Feature Engineering


### Creating 5, 10, and 20-Day Historical Moving Average Features (Excluding the Current Day)


In [3]:
df['5_day_avg'] = df.groupby('kdcode')['close'].transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
df['10_day_avg'] = df.groupby('kdcode')['close'].transform(lambda x: x.shift(1).rolling(window=10, min_periods=1).mean())
df['20_day_avg'] = df.groupby('kdcode')['close'].transform(lambda x: x.shift(1).rolling(window=20, min_periods=1).mean())

df['5_day_avg'] = df.groupby('kdcode')['5_day_avg'].fillna(method='ffill').fillna(method='bfill')
df['10_day_avg'] = df.groupby('kdcode')['10_day_avg'].fillna(method='ffill').fillna(method='bfill')
df['20_day_avg'] = df.groupby('kdcode')['20_day_avg'].fillna(method='ffill').fillna(method='bfill')

df.head()

  df['5_day_avg'] = df.groupby('kdcode')['5_day_avg'].fillna(method='ffill').fillna(method='bfill')
  df['5_day_avg'] = df.groupby('kdcode')['5_day_avg'].fillna(method='ffill').fillna(method='bfill')
  df['10_day_avg'] = df.groupby('kdcode')['10_day_avg'].fillna(method='ffill').fillna(method='bfill')
  df['10_day_avg'] = df.groupby('kdcode')['10_day_avg'].fillna(method='ffill').fillna(method='bfill')
  df['20_day_avg'] = df.groupby('kdcode')['20_day_avg'].fillna(method='ffill').fillna(method='bfill')
  df['20_day_avg'] = df.groupby('kdcode')['20_day_avg'].fillna(method='ffill').fillna(method='bfill')


Unnamed: 0.1,Unnamed: 0,kdcode,dt,close,open,high,low,prev_close,adjfactor,turnover,volume,code,time_pd,5_day_avg,10_day_avg,20_day_avg
0,0.0,000001.SH,2021-01-04,3502.9584,3474.6793,3511.6554,3457.2061,3473.0693,1.0,523367700000.0,38079080000.0,1,2021-01-04,3502.9584,3502.9584,3502.9584
1,1.0,000001.SH,2021-01-05,3528.6767,3492.1912,3528.6767,3484.7151,3502.9584,1.0,568019500000.0,40799590000.0,1,2021-01-05,3502.9584,3502.9584,3502.9584
2,2.0,000001.SH,2021-01-06,3550.8767,3530.9072,3556.8022,3513.1262,3528.6767,1.0,521799500000.0,37023090000.0,1,2021-01-06,3515.81755,3515.81755,3515.81755
3,3.0,000001.SH,2021-01-07,3576.2046,3552.9087,3576.2046,3526.6174,3550.8767,1.0,545709600000.0,40534820000.0,1,2021-01-07,3527.503933,3527.503933,3527.503933
4,4.0,000001.SH,2021-01-08,3570.1082,3577.6923,3588.0625,3544.8912,3576.2046,1.0,502170800000.0,34555790000.0,1,2021-01-08,3539.6791,3539.6791,3539.6791


### Creating Daily, Weekly, and Monthly Lagged Return Features


In [4]:
df['daily_return'] = df.groupby('kdcode')['close'].apply(lambda x: x.shift(1).pct_change()).reset_index(level=0, drop=True)

df['weekly_return'] = df.groupby('kdcode')['close'].apply(lambda x: x.shift(1).pct_change(periods=5)).reset_index(level=0, drop=True)

df['monthly_return'] = df.groupby('kdcode')['close'].apply(lambda x: x.shift(1).pct_change(periods=20)).reset_index(level=0, drop=True)

df['daily_return'] = df.groupby('kdcode')['daily_return'].fillna(method='ffill').fillna(method='bfill')
df['weekly_return'] = df.groupby('kdcode')['weekly_return'].fillna(method='ffill').fillna(method='bfill')
df['monthly_return'] = df.groupby('kdcode')['monthly_return'].fillna(method='ffill').fillna(method='bfill')
df.head()


  df['daily_return'] = df.groupby('kdcode')['daily_return'].fillna(method='ffill').fillna(method='bfill')
  df['daily_return'] = df.groupby('kdcode')['daily_return'].fillna(method='ffill').fillna(method='bfill')
  df['weekly_return'] = df.groupby('kdcode')['weekly_return'].fillna(method='ffill').fillna(method='bfill')
  df['weekly_return'] = df.groupby('kdcode')['weekly_return'].fillna(method='ffill').fillna(method='bfill')
  df['monthly_return'] = df.groupby('kdcode')['monthly_return'].fillna(method='ffill').fillna(method='bfill')
  df['monthly_return'] = df.groupby('kdcode')['monthly_return'].fillna(method='ffill').fillna(method='bfill')


Unnamed: 0.1,Unnamed: 0,kdcode,dt,close,open,high,low,prev_close,adjfactor,turnover,volume,code,time_pd,5_day_avg,10_day_avg,20_day_avg,daily_return,weekly_return,monthly_return
0,0.0,000001.SH,2021-01-04,3502.9584,3474.6793,3511.6554,3457.2061,3473.0693,1.0,523367700000.0,38079080000.0,1,2021-01-04,3502.9584,3502.9584,3502.9584,0.007342,0.008147,0.000664
1,1.0,000001.SH,2021-01-05,3528.6767,3492.1912,3528.6767,3484.7151,3502.9584,1.0,568019500000.0,40799590000.0,1,2021-01-05,3502.9584,3502.9584,3502.9584,0.007342,0.008147,0.000664
2,2.0,000001.SH,2021-01-06,3550.8767,3530.9072,3556.8022,3513.1262,3528.6767,1.0,521799500000.0,37023090000.0,1,2021-01-06,3515.81755,3515.81755,3515.81755,0.007342,0.008147,0.000664
3,3.0,000001.SH,2021-01-07,3576.2046,3552.9087,3576.2046,3526.6174,3550.8767,1.0,545709600000.0,40534820000.0,1,2021-01-07,3527.503933,3527.503933,3527.503933,0.006291,0.008147,0.000664
4,4.0,000001.SH,2021-01-08,3570.1082,3577.6923,3588.0625,3544.8912,3576.2046,1.0,502170800000.0,34555790000.0,1,2021-01-08,3539.6791,3539.6791,3539.6791,0.007133,0.008147,0.000664


## Bi-Classification Dataset


In [5]:
df['label'] = df.apply(lambda row: 1 if row['close'] >= row['prev_close'] else -1, axis=1)
nan_count = df.isna().sum().sum() 
nan_count

np.int64(0)

In [6]:
column_select = ['kdcode','dt','prev_close','open','high','low','adjfactor','turnover','volume','5_day_avg',
                 '10_day_avg','20_day_avg','daily_return','weekly_return','monthly_return','close','label']

number_col = ['prev_close','open','high','low','adjfactor','turnover','volume','5_day_avg',
                 '10_day_avg','20_day_avg','daily_return','weekly_return','monthly_return','close']

In [7]:
scaler = MinMaxScaler()

df[number_col] = scaler.fit_transform(df[number_col])
df

Unnamed: 0.1,Unnamed: 0,kdcode,dt,close,open,high,low,prev_close,adjfactor,turnover,volume,code,time_pd,5_day_avg,10_day_avg,20_day_avg,daily_return,weekly_return,monthly_return,label
0,0.0,000001.SH,2021-01-04,0.899541,0.890727,0.897650,0.893217,0.891863,0.0,0.346483,0.289914,1,2021-01-04,0.905072,0.912371,0.919368,0.145722,0.142487,0.139787,1
1,1.0,000001.SH,2021-01-05,0.906148,0.895217,0.902002,0.900326,0.899541,0.0,0.376044,0.310627,1,2021-01-05,0.905072,0.912371,0.919368,0.145722,0.142487,0.139787,1
2,2.0,000001.SH,2021-01-06,0.911850,0.905145,0.909194,0.907669,0.906148,0.0,0.345445,0.281874,1,2021-01-06,0.908396,0.915721,0.922744,0.145722,0.142487,0.139787,1
3,3.0,000001.SH,2021-01-07,0.918357,0.910787,0.914155,0.911156,0.911850,0.0,0.361274,0.308611,1,2021-01-07,0.911416,0.918766,0.925812,0.145428,0.142487,0.139787,1
4,4.0,000001.SH,2021-01-08,0.916790,0.917142,0.917187,0.915879,0.918357,0.0,0.332450,0.263090,1,2021-01-08,0.914563,0.921938,0.929009,0.145663,0.142487,0.139787,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292633,4697564.0,688981.SH,2024-12-25,0.024881,0.024451,0.025225,0.024620,0.024591,0.0,0.006408,0.000750,688981,2024-12-25,0.023232,0.022519,0.022748,0.148606,0.187941,0.169740,1
292634,4697565.0,688981.SH,2024-12-26,0.024557,0.024840,0.024990,0.024560,0.024881,0.0,0.004578,0.000541,688981,2024-12-26,0.023986,0.022811,0.022879,0.146932,0.186994,0.164368,-1
292635,4697566.0,688981.SH,2024-12-27,0.024758,0.024528,0.025882,0.024648,0.024557,0.0,0.007533,0.000872,688981,2024-12-27,0.024570,0.023060,0.023006,0.140071,0.175573,0.163779,1
292636,4697567.0,688981.SH,2024-12-30,0.025215,0.024481,0.025412,0.024521,0.024758,0.0,0.005925,0.000690,688981,2024-12-30,0.024737,0.023438,0.023103,0.145924,0.149454,0.157555,1


In [8]:
df[column_select].to_csv('../dataset/HS.csv', index=False)