# [Data Preparing](https://github.com/CMACH508/DeepTrader?tab=readme-ov-file#data-preparing)

|                    File_name                     |                  shape                   |                  description                   |
| :----------------------------------------------: | :--------------------------------------: | :--------------------------------------------: |
|                 stocks_data.npy                  | [num_stocks, num_days, num_ASU_features] |       the inputs for asset scoring unit        |
|                 market_data.npy                  |       [num_days, num_MSU_features]       |     the inputs for marketing scoring unit      |
|                     ror.npy                      |          [num_stocks, num_days]          | rate of return file for calculating the return |
| relation_file (e.g. industry_classification.npy) |         [num_stocks, num_stocks]         |     the relation matrix used in GCN layer      |



In [1]:
import pandas as pd
TWII = "^DJI"
toptw = pd.read_excel(r'0050.xlsx')
TWII_STOCKS = [str(symbol) + '.TW' for symbol in toptw['Symbol']]
print(TWII_STOCKS)
print(f"Total stocks to process: {len(TWII_STOCKS)}")


# Define a small epsilon to avoid 0 values after normalization
EPSILON = 1e-6

['2330.TW', '2454.TW', '2317.TW', '2382.TW', '2308.TW', '2303.TW', '2891.TW', '3711.TW', '2881.TW', '2412.TW', '2886.TW', '2882.TW', '2884.TW', '1216.TW', '2885.TW', '3231.TW', '3034.TW', '2357.TW', '2002.TW', '2892.TW', '1303.TW', '5880.TW', '2379.TW', '1301.TW', '2890.TW', '3008.TW', '3037.TW', '2345.TW', '5871.TW', '3661.TW', '2880.TW', '2327.TW', '2883.TW', '2301.TW', '1101.TW', '2887.TW', '2207.TW', '4938.TW', '6669.TW', '1326.TW', '3045.TW', '2395.TW', '5876.TW', '2603.TW', '1590.TW', '2912.TW', '4904.TW', '2801.TW', '6505.TW', '2408.TW']
Total stocks to process: 50


In [2]:
import yfinance as yf

# Get the earliest available date for the TWII index
djia_ticker = yf.Ticker(TWII)
djia_history = djia_ticker.history(period="max")
if not djia_history.empty:
    djia_earliest_date = djia_history.index.min()
    print("TWII earliest date:", djia_earliest_date)
else:
    print("No historical data found for TWII.")

# Get the earliest available date for each stock in TWII_STOCKS\
# Create a Tickers object for multiple stocks
djia_tickers = yf.Tickers(" ".join(TWII_STOCKS))

# Dictionary to hold each stock's earliest date
stocks_earliest_dates = {}

for stock in TWII_STOCKS:
    ticker = djia_tickers.tickers[stock]
    stock_history = ticker.history(period="max")
    if not stock_history.empty:
        earliest_date = stock_history.index.min()
        stocks_earliest_dates[stock] = earliest_date
    else:
        stocks_earliest_dates[stock] = None

print("TWII Stocks earliest dates:")
for stock, date in stocks_earliest_dates.items():
    print(f"{stock}: {date}")

TWII earliest date: 1992-01-02 00:00:00-05:00
TWII Stocks earliest dates:
2330.TW: 2000-01-04 00:00:00+08:00
2454.TW: 2001-07-23 00:00:00+08:00
2317.TW: 1993-01-05 00:00:00+08:00
2382.TW: 2000-01-04 00:00:00+08:00
2308.TW: 2000-01-04 00:00:00+08:00
2303.TW: 2000-01-04 00:00:00+08:00
2891.TW: 2002-05-17 00:00:00+08:00
3711.TW: 2000-01-04 00:00:00+08:00
2881.TW: 2001-12-20 00:00:00+08:00
2412.TW: 2000-11-15 00:00:00+08:00
2886.TW: 2000-01-04 00:00:00+08:00
2882.TW: 2000-01-04 00:00:00+08:00
2884.TW: 2002-01-29 00:00:00+08:00
1216.TW: 2000-01-04 00:00:00+08:00
2885.TW: 2000-01-04 00:00:00+08:00
3231.TW: 2003-08-19 00:00:00+08:00
3034.TW: 2002-08-27 00:00:00+08:00
2357.TW: 2000-01-04 00:00:00+08:00
2002.TW: 2000-01-04 00:00:00+08:00
2892.TW: 2003-01-03 00:00:00+08:00
1303.TW: 2000-01-04 00:00:00+08:00
5880.TW: 2000-01-04 00:00:00+08:00
2379.TW: 2000-01-04 00:00:00+08:00
1301.TW: 2000-01-04 00:00:00+08:00
2890.TW: 2000-01-04 00:00:00+08:00
3008.TW: 2002-03-11 00:00:00+08:00
3037.TW: 2000-01

In [7]:
import datetime

stocks_to_remove = []
target_date = datetime.date(2000, 1, 4)
for stock, dt in stocks_earliest_dates.items():
	if dt is None or dt.replace(tzinfo=None).date() > target_date:
		stocks_to_remove.append(stock)
for stock in stocks_to_remove:
	if stock in TWII_STOCKS:
		TWII_STOCKS.remove(stock)
print("Stocks to remove:", stocks_to_remove)            
print(len(stocks_to_remove))

Stocks to remove: ['2454.TW', '2891.TW', '2881.TW', '2412.TW', '2884.TW', '3231.TW', '3034.TW', '2892.TW', '3008.TW', '5871.TW', '3661.TW', '2880.TW', '2883.TW', '2887.TW', '4938.TW', '6669.TW', '3045.TW', '5876.TW', '1590.TW', '4904.TW', '6505.TW', '2408.TW']
22


In [16]:
TARGET_DIR = "."
# Valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max
# PERIOD = "10y"
START_DATE = "2000-01-01"
END_DATE = "2023-12-31"
# Open, High, Low, Close, Volume, Dividends, Stock Splits
ASSET_FEATURES = ["Open", "High", "Low", "Close", "Volume"]
MARKET_FEATURES = ["Open", "High", "Low", "Close"]

In [17]:
import pandas as pd

# Generate business days for the fixed date range
business_days = pd.bdate_range(start=START_DATE, end=END_DATE)
print("Total business days:", len(business_days))
print()

intervals = {
    "Training": ("2000/01/01", "2008/01/31"),
    "Validation": ("2008/02/01", "2015/10/31"),
    "Test": ("2015/11/01", "2023/12/31"),
}
for interval_name, (start_date, end_date) in intervals.items():
    interval_days = pd.bdate_range(start=start_date, end=end_date)
    start_idx = business_days.get_loc(interval_days[0])  # find the start index
    end_idx = business_days.get_loc(interval_days[-1])   # find the end index
    total_days = len(interval_days)  # calculate the total number of business days
    
    print(f"{interval_name}:")
    print(f"  Start Index = {start_idx}")
    print(f"  End Index = {end_idx}")
    print(f"  Total Business Days = {total_days}\n")


Total business days: 6260

Training:
  Start Index = 0
  End Index = 2108
  Total Business Days = 2109

Validation:
  Start Index = 2109
  End Index = 4129
  Total Business Days = 2021

Test:
  Start Index = 4130
  End Index = 6259
  Total Business Days = 2130

