In [1]:
import datetime
import itertools
from pathlib import Path

import gymnasium as gym
import numpy as np
import pandas as pd
import yfinance as yf

from finrl import config_tickers
from finrl.config import INDICATORS
from finrl.meta.preprocessor.preprocessors import FeatureEngineer, data_split
from finrl.meta.preprocessor.yahoodownloader import YahooDownloader

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


In [2]:
# Experiment calendar (ISO date strings). The trading window starts on the
# same date the training window ends, so the boundary is defined only once.
# NOTE(review): assumes data_split treats the end date as exclusive, so the
# boundary day is not counted in both partitions — confirm against FinRL.
TRAIN_START_DATE = '2009-01-01'
TRAIN_END_DATE = '2022-07-01'
TRADE_START_DATE = TRAIN_END_DATE
TRADE_END_DATE = '2024-05-01'

In [3]:
# Ticker universe for this notebook: the Dow 30 list shipped with FinRL.
# The name is bound here and reused by the download cell below.
from finrl.config_tickers import DOW_30_TICKER

# Print the symbols so the notebook records exactly which tickers were requested.
print(DOW_30_TICKER)

['AXP', 'AMGN', 'AAPL', 'BA', 'CAT', 'CSCO', 'CVX', 'GS', 'HD', 'HON', 'IBM', 'INTC', 'JNJ', 'KO', 'JPM', 'MCD', 'MMM', 'MRK', 'MSFT', 'NKE', 'PG', 'TRV', 'UNH', 'CRM', 'VZ', 'V', 'WBA', 'WMT', 'DIS', 'DOW']


In [4]:
# One download spanning the start of training through the end of trading,
# for every ticker in the Dow 30 list; the train/trade split happens later.
yahoo_downloader = YahooDownloader(
    start_date=TRAIN_START_DATE,
    end_date=TRADE_END_DATE,
    ticker_list=DOW_30_TICKER,
)
df_raw = yahoo_downloader.fetch_data()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Shape of DataFrame:  (113141, 8)


In [5]:
# Peek at the first rows of the raw download to check the column layout
# (date, prices, volume, ticker, day).
df_raw.head()

Price,date,close,high,low,open,volume,tic,day
0,2009-01-02,2.724327,2.733033,2.556515,2.578129,746015200,AAPL,4
1,2009-01-02,40.463203,40.524938,39.612645,40.188829,6547900,AMGN,4
2,2009-01-02,14.854059,15.000064,14.139404,14.27004,10955700,AXP,4
3,2009-01-02,33.941093,34.173619,32.088396,32.103398,7010200,BA,4
4,2009-01-02,30.233912,30.279027,28.815991,28.944894,7117200,CAT,4


In [6]:
# Rows per ticker: a quick check for symbols whose history is shorter than
# the requested date range.
df_raw['tic'].value_counts()

tic
AAPL    3857
AMGN    3857
AXP     3857
BA      3857
CAT     3857
CRM     3857
CSCO    3857
CVX     3857
DIS     3857
GS      3857
HD      3857
HON     3857
IBM     3857
INTC    3857
JNJ     3857
JPM     3857
KO      3857
MCD     3857
MMM     3857
MRK     3857
MSFT    3857
NKE     3857
PG      3857
TRV     3857
UNH     3857
V       3857
VZ      3857
WBA     3857
WMT     3857
DOW     1288
Name: count, dtype: int64

In [None]:
# Feature engineering on the raw download: technical indicators from
# INDICATORS, plus the VIX and turbulence columns; no user-defined features.
fe = FeatureEngineer(
    use_technical_indicator=True,
    tech_indicator_list=INDICATORS,
    use_vix=True,
    use_turbulence=True,
    user_defined_feature=False,
)

df = fe.preprocess_data(df_raw)

# Neutralise values the downstream code cannot consume: missing entries and
# non-finite entries are both set to 0. Both signs of infinity are replaced;
# replacing only np.inf would leave -inf in the features.
df = df.fillna(0).replace([np.inf, -np.inf], 0)


[*********************100%***********************]  1 of 1 completed

Successfully added technical indicators
Shape of DataFrame:  (3856, 8)
Successfully added vix





Successfully added turbulence index


In [None]:
# Rolling covariance features.
# For every date from position `lookback` onward, take the closes of that date
# and the `lookback` dates before it (lookback + 1 price rows), turn them into
# simple returns, and store both the return table and its covariance matrix
# on every row of that date.
lookback = 252  # number of return observations per window

# Make the cell safe to re-run: without this, a second execution would merge
# the feature columns again and produce *_x / *_y suffixed duplicates.
df = df.drop(columns=['cov_list', 'return_list'], errors='ignore')
df = df.sort_values(['date', 'tic'], ignore_index=True)

# Pivot once into a date x ticker close matrix. The pivot does not depend on
# the loop variable, so rebuilding it for every window is wasted work.
price_wide = df.pivot_table(index='date', columns='tic', values='close')
unique_dates = df.date.unique()
# Positional slicing below relies on one pivot row per date, in sorted order.
assert len(price_wide) == len(unique_dates), "pivot rows must match unique dates"

cov_list = []
return_list = []

for i in range(lookback, len(unique_dates)):
  # Inclusive window [i - lookback, i]; pct_change leaves the first row NaN,
  # which dropna removes.
  price_lookback = price_wide.iloc[i - lookback:i + 1]
  return_lookback = price_lookback.pct_change().dropna()
  return_list.append(return_lookback)
  cov_list.append(return_lookback.cov().values)

df_cov = pd.DataFrame({
  'date': unique_dates[lookback:],
  'cov_list': cov_list,
  'return_list': return_list,
})

# Default inner merge: dates before position `lookback` have no window and
# are dropped from df here.
df = df.merge(df_cov, on='date')
df = df.sort_values(['date', 'tic']).reset_index(drop=True)


In [None]:
# (rows, columns) of the feature frame after the covariance merge.
# NOTE(review): the stored output's row count equals len(train) + len(trade)
# as printed below; it may predate the covariance cell, which drops the first
# `lookback` dates — re-run the notebook top to bottom to confirm.
df.shape

(111824, 18)

In [None]:
# Carve the feature frame into the training and trading partitions using the
# date constants defined at the top of the notebook.
train = data_split(df, TRAIN_START_DATE, TRAIN_END_DATE)
trade = data_split(df, TRADE_START_DATE, TRADE_END_DATE)

# Report the row count of each partition, one per line (train first).
for split_frame in (train, trade):
  print(len(split_frame))


98513
13311


In [13]:
# Persist both partitions as UTF-8 CSV files with a BOM ('utf-8-sig').
train_path = 'data/train.csv'
trade_path = 'data/trade.csv'

# NOTE(review): if train/trade carry the cov_list / return_list columns built
# earlier, those cells hold arrays and DataFrames; CSV stores only their text
# representation, so they will not round-trip — confirm downstream needs.
for frame, csv_path in ((train, train_path), (trade, trade_path)):
  # Create the output directory on a fresh checkout instead of failing.
  Path(csv_path).parent.mkdir(parents=True, exist_ok=True)
  # Hand pandas the path rather than a text-mode handle: a handle opened
  # without newline='' gets its line endings translated twice on Windows.
  frame.to_csv(csv_path, encoding='utf-8-sig')