# [Data Preparing](https://github.com/CMACH508/DeepTrader?tab=readme-ov-file#data-preparing)

|                    File_name                     |                  shape                   |                  description                   |
| :----------------------------------------------: | :--------------------------------------: | :--------------------------------------------: |
|                 stocks_data.npy                  | [num_stocks, num_days, num_ASU_features] |       the inputs for asset scoring unit        |
|                 market_data.npy                  |       [num_days, num_MSU_features]       |       the inputs for market scoring unit       |
|                     ror.npy                      |          [num_stocks, num_days]          | rate of return file for calculating the return |
| relation_file (e.g. industry_classification.npy) |         [num_stocks, num_stocks]         |     the relation matrix used in GCN layer      |



In [23]:
# Directory that receives every generated artifact (.npy / .txt files).
TARGET_DIR = "."

# Fixed download window. yfinance alternatively accepts a relative period
# (valid values: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max); explicit start/end
# dates are used here instead.
START_DATE = "2010-01-01"
END_DATE = "2025-01-31"

# Columns kept from the yfinance history frame, whose full column set is
# Open, High, Low, Close, Volume, Dividends, Stock Splits.
ASSET_FEATURES = ["Open", "High", "Low", "Close", "Volume"]
MARKET_FEATURES = ["Open", "High", "Low", "Close"]

# Dow Jones Industrial Average (DJIA) index symbol.
DJIA = "^DJI"

# Asset universe: the DJIA constituent list with DOW swapped for GOOGL.
# The original notebook annotated DOW with the shape (1316, 5), i.e. it
# presumably has far fewer rows than the other tickers in this date window.
# The list was previously assigned twice (first with DOW, then with GOOGL);
# only the effective definition is kept.
DJIA_STOCKS = [
    'AAPL', 'AMGN', 'AMZN', 'AXP', 'BA', 'CAT',
    'CRM', 'CSCO', 'CVX', 'DIS', 'GOOGL', 'GS',
    'HD', 'HON', 'IBM', 'INTC', 'JNJ', 'JPM',
    'KO', 'MCD', 'MMM', 'MRK', 'MSFT', 'NKE',
    'PG', 'TRV', 'UNH', 'V', 'VZ', 'WMT',
]

# Small offset added after min-max scaling so normalized values never reach 0.
EPSILON = 1e-6

In [5]:
import yfinance as yf

# Report how far back the available history reaches for the index itself.
djia_ticker = yf.Ticker(DJIA)
djia_history = djia_ticker.history(period="max")
if djia_history.empty:
    print("No historical data found for DJIA.")
else:
    djia_earliest_date = djia_history.index.min()
    print("DJIA earliest date:", djia_earliest_date)

# One Tickers object gives access to every constituent by symbol.
djia_tickers = yf.Tickers(" ".join(DJIA_STOCKS))

# symbol -> first date with data (None when no history came back)
stocks_earliest_dates = {}
for symbol in DJIA_STOCKS:
    full_history = djia_tickers.tickers[symbol].history(period="max")
    stocks_earliest_dates[symbol] = (
        None if full_history.empty else full_history.index.min()
    )

print("DJIA Stocks earliest dates:")
for symbol, first_day in stocks_earliest_dates.items():
    print(f"{symbol}: {first_day}")

DJIA earliest date: 1992-01-02 00:00:00-05:00
DJIA Stocks earliest dates:
AAPL: 1980-12-12 00:00:00-05:00
AMGN: 1983-06-17 00:00:00-04:00
AMZN: 1997-05-15 00:00:00-04:00
AXP: 1972-06-01 00:00:00-04:00
BA: 1962-01-02 00:00:00-05:00
CAT: 1962-01-02 00:00:00-05:00
CRM: 2004-06-23 00:00:00-04:00
CSCO: 1990-02-16 00:00:00-05:00
CVX: 1962-01-02 00:00:00-05:00
DIS: 1962-01-02 00:00:00-05:00
GOOGL: 2004-08-19 00:00:00-04:00
GS: 1999-05-04 00:00:00-04:00
HD: 1981-09-22 00:00:00-04:00
HON: 1962-01-02 00:00:00-05:00
IBM: 1962-01-02 00:00:00-05:00
INTC: 1980-03-17 00:00:00-05:00
JNJ: 1962-01-02 00:00:00-05:00
JPM: 1980-03-17 00:00:00-05:00
KO: 1962-01-02 00:00:00-05:00
MCD: 1966-07-05 00:00:00-04:00
MMM: 1962-01-02 00:00:00-05:00
MRK: 1962-01-02 00:00:00-05:00
MSFT: 1986-03-13 00:00:00-05:00
NKE: 1980-12-02 00:00:00-05:00
PG: 1962-01-02 00:00:00-05:00
TRV: 1975-11-17 00:00:00-05:00
UNH: 1984-10-17 00:00:00-04:00
V: 2008-03-19 00:00:00-04:00
VZ: 1983-11-21 00:00:00-05:00
WMT: 1972-08-25 00:00:00-04:00

## stocks_data.npy
| File_name | Shape | Description |
| ---- | ---- | ---- |
|stocks_data.npy |	[num_stocks, num_days, num_ASU_features] |	the inputs for asset scoring unit |

In [24]:
import yfinance as yf
import numpy as np
import pandas as pd


# Fixed Monday-Friday calendar shared by every asset so that all per-stock
# frames end up with the same number of rows.
# NOTE(review): this calendar contains weekdays on which no data is returned
# (e.g. exchange holidays), and it includes END_DATE itself while yfinance
# documents `end` as exclusive. Those rows come back as NaN after reindexing
# and are filled from neighbouring sessions below.
date_range = pd.bdate_range(start=START_DATE, end=END_DATE)
print("Total business days:", len(date_range))

stocks_data = []

for stock in DJIA_STOCKS:
    ticker = yf.Ticker(stock)
    # Download the selected feature columns for the fixed window.
    df = ticker.history(start=START_DATE, end=END_DATE)[ASSET_FEATURES]

    # Drop the timezone so the index is comparable with the naive calendar,
    # then align to the calendar (absent dates become NaN rows).
    df.index = df.index.tz_localize(None)
    df = df.reindex(date_range)

    # Warn when more than 10% of all cells (rows x columns) are missing.
    total_cells = df.shape[0] * df.shape[1]
    nan_count = df.isna().sum().sum()
    nan_ratio = nan_count / total_cells
    if nan_ratio > 0.1:
        print(f"Warning: {stock} has {nan_ratio:.2%} missing values.")

    # Zeros are treated as missing too; gaps are filled forward first and
    # backward only for anything left at the start of the window.
    df = df.replace(0, np.nan).ffill().bfill()

    # Per-column min-max scaling into [EPSILON, 1].
    col_min = df.min()
    col_span = df.max() - col_min
    # A constant column has a zero span; dividing by it would write NaN into
    # the saved array. Using a span of 1 maps such a column to EPSILON.
    col_span = col_span.mask(col_span == 0, 1.0)
    df = (df - col_min) / col_span * (1 - EPSILON) + EPSILON

    stocks_data.append(df)

# Stack into shape (number_of_stocks, number_of_days, number_of_features).
stocks_data_np = np.stack([df.values for df in stocks_data])
print(stocks_data_np.shape)

np.save(f"{TARGET_DIR}/stocks_data.npy", stocks_data_np)

Total business days: 3936
(30, 3936, 5)


## market_data.npy
| File_name | Shape | Description |
| ---- | ---- | ---- |
| market_data.npy | [num_days, num_MSU_features] | the inputs for market scoring unit |

In [25]:
# Imports and the calendar are repeated here so this cell does not depend on
# state left behind by another data cell (the sibling cells do the same).
import yfinance as yf
import numpy as np
import pandas as pd

# Fixed Monday-Friday calendar for the configured window.
date_range = pd.bdate_range(start=START_DATE, end=END_DATE)

# Download the index history and keep only the market feature columns.
ticker = yf.Ticker(DJIA)
market_data = ticker.history(start=START_DATE, end=END_DATE)[MARKET_FEATURES]

# Drop the timezone so the index is comparable with the naive calendar,
# then align to the calendar (absent dates become NaN rows).
market_data.index = market_data.index.tz_localize(None)
market_data = market_data.reindex(date_range)

# Warn when more than 10% of all cells (rows x columns) are missing.
total_cells = market_data.shape[0] * market_data.shape[1]
nan_count = market_data.isna().sum().sum()
nan_ratio = nan_count / total_cells
if nan_ratio > 0.1:
    print(f"Warning: DJIA has {nan_ratio:.2%} missing values.")

# Zeros are treated as missing too; gaps are filled forward first and
# backward only for anything left at the start of the window.
market_data = market_data.replace(0, np.nan).ffill().bfill()

# Per-column min-max scaling into [EPSILON, 1].
col_min = market_data.min()
col_span = market_data.max() - col_min
# A constant column has a zero span; dividing by it would write NaN into the
# saved array. Using a span of 1 maps such a column to EPSILON.
col_span = col_span.mask(col_span == 0, 1.0)
market_data = (market_data - col_min) / col_span * (1 - EPSILON) + EPSILON

# Persist as a plain array of shape (num_days, num_features).
market_data_np = market_data.to_numpy()
print(market_data_np.shape)
np.save(f"{TARGET_DIR}/market_data.npy", market_data_np)

(3936, 4)


## ror.npy
| File_name | Shape | Description |
| ---- | ---- | ---- |
| ror.npy | [num_stocks, num_days] | rate of return file for calculating the return|

In [16]:
import yfinance as yf
import pandas as pd
import numpy as np

# Fixed Monday-Friday calendar so every stock yields the same number of days.
date_range = pd.bdate_range(start=START_DATE, end=END_DATE)
print("Total business days:", len(date_range))

# symbol -> rate-of-return series indexed by date_range
ror_data = {}

for stock in DJIA_STOCKS:
    ticker = yf.Ticker(stock)
    data = ticker.history(start=START_DATE, end=END_DATE)

    # Drop the timezone so the index is comparable with the naive calendar,
    # then align to the calendar (absent dates become NaN rows).
    data.index = data.index.tz_localize(None)
    data = data.reindex(date_range)

    # Warn when more than 10% of all cells (rows x columns) are missing.
    total_cells = data.shape[0] * data.shape[1]
    nan_count = data.isna().sum().sum()
    nan_ratio = nan_count / total_cells
    if nan_ratio > 0.1:
        print(f"Warning: {stock} has {nan_ratio:.2%} missing values in raw data.")

    # Daily rate of return: (Close / Open) - 1.0, computed on the rows that
    # actually carry prices. A zero Open is treated as missing so it cannot
    # produce an infinite return.
    # NOTE(review): this is an open-to-close return and therefore ignores
    # overnight gaps - confirm that this is what the consumer of ror.npy expects.
    open_price = data["Open"].replace(0, np.nan)
    ror_series = data["Close"] / open_price - 1.0

    # Calendar days without a price get a return of 0. Forward-filling prices
    # before dividing (as was done previously) would instead repeat the
    # previous session's return on every such day.
    ror_data[stock] = ror_series.fillna(0.0)

# Rows = stocks, columns = dates.
ror_df = pd.DataFrame(ror_data).transpose()
print(ror_df.shape)  # Should be (num_stocks, num_days)

# Save the numpy array (shape: [num_stocks, num_days])
np.save(f"{TARGET_DIR}/ror.npy", ror_df.to_numpy())


Total business days: 3936
(30, 3936)


## relation_file
| File_name | Shape | Description |
| ---- | ---- | ---- |
| relation_file (e.g. industry_classification.npy) | [num_stocks, num_stocks] | the relation matrix used in GCN layer|

In [17]:
import yfinance as yf
import numpy as np
from pprint import pprint
from collections import defaultdict

# One Tickers object gives access to every constituent by symbol.
djia_tickers = yf.Tickers(" ".join(DJIA_STOCKS))

# Look up each stock's sector exactly once; `ticker.info` is reused for both
# the grouping and the matrix below instead of being queried twice per stock.
# A missing 'sector' entry is recorded as None.
sector_of = {
    stock: djia_tickers.tickers[stock].info.get('sector')
    for stock in djia_tickers.symbols
}

# sector name -> set of stocks in that sector (stocks without a sector are
# left out of this mapping)
industry = defaultdict(set)
for stock, sector in sector_of.items():
    if sector is not None:
        industry[sector].add(stock)

# Print the industry classification dictionary for reference
pprint(dict(industry))

# Relation matrix: row i holds 1/len(group) for every stock in the same group
# as stock i and 0.0 elsewhere, so each row sums to 1.
industry_classification = []
for stock in djia_tickers.symbols:
    sector = sector_of[stock]
    # A stock without sector information forms a group of its own, which
    # gives it a self-weight of 1.0 rather than an all-zero row. It also
    # avoids inserting a None key into `industry` as a lookup side effect.
    peers = industry[sector] if sector is not None else {stock}
    weight = 1.0 / len(peers)
    relation = [weight if other_stock in peers else 0.0 for other_stock in djia_tickers.symbols]
    industry_classification.append(relation)

# Save a human-readable copy (two decimal places per value) for inspection.
with open(f"{TARGET_DIR}/industry_classification.txt", "w") as f:
    for row in industry_classification:
        f.write(' '.join([f'{num:.2f}' for num in row]) + '\n')

# Save the matrix itself with shape [num_stocks, num_stocks] for the GCN layer.
industry_classification = np.array(industry_classification)
np.save(f"{TARGET_DIR}/industry_classification.npy", industry_classification)


{'Communication Services': {'DIS', 'GOOGL', 'VZ'},
 'Consumer Cyclical': {'AMZN', 'NKE', 'HD', 'MCD'},
 'Consumer Defensive': {'PG', 'WMT', 'KO'},
 'Energy': {'CVX'},
 'Financial Services': {'TRV', 'GS', 'JPM', 'AXP', 'V'},
 'Healthcare': {'JNJ', 'MRK', 'UNH', 'AMGN'},
 'Industrials': {'BA', 'MMM', 'HON', 'CAT'},
 'Technology': {'CRM', 'IBM', 'AAPL', 'CSCO', 'MSFT', 'INTC'}}


## arguments

| Argument          | Description                                                | Default                     | Type  |
| ----------------- | ---------------------------------------------------------- | --------------------------- | ----- |
| `--config`        | Default configuration file                                 | hyper.json                  | str   |
| `--window_len`    | Input window size                                          | 13 (weeks)                  | int   |
| `--market`        | Stock market                                               | DJIA                        | str   |
| `--G`             | The number of stocks participating in long/short each time | 4 (for DJIA)                | int   |
| `--batch_size`    | Batch size number                                          | 37                          | int   |
| `--lr`            | learning rate                                              | 1e-6                        | float |
| `--gamma`         | Coefficient for adjusting lr between ASU and MSU           | 0.05                        | float |
| `--no_spatial`    | Whether to use spatial attention and GCN layer in ASU      | True                        | bool  |
| `--no_msu`        | Whether to use market scoring unit                         | True                        | bool  |
| `--relation_file` | File name for relation matrix used in GCN layer            | Industry_classification.npy | str   |
| `--addaptiveadj`  | Whether to use adaptive matrix in GCN (Eq. 2)              | True                        | bool  |

