In [1]:
import datetime
import itertools
import os

import gymnasium as gym
import numpy as np
import pandas as pd
import yfinance as yf

from finrl import config_tickers
from finrl.config import INDICATORS
from finrl.meta.preprocessor.preprocessors import FeatureEngineer, data_split
from finrl.meta.preprocessor.yahoodownloader import YahooDownloader

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


In [2]:
# Date boundaries (ISO 'YYYY-MM-DD' strings) for the two data windows.
# The training window ends on the same date the trading window starts.
# NOTE(review): whether the end date is inclusive depends on data_split — confirm.
TRAIN_START_DATE, TRAIN_END_DATE = '2009-01-01', '2022-07-01'
TRADE_START_DATE, TRADE_END_DATE = '2022-07-01', '2024-05-01'

In [8]:
from finrl.config_tickers import SP_500_TICKER

In [10]:
# Load FinRL's bundled S&P 500 ticker symbols and print the whole list so the
# universe passed to the downloader can be inspected.
from finrl.config_tickers import SP_500_TICKER

print(SP_500_TICKER)

['A', 'AAL', 'AAP', 'AAPL', 'ABBV', 'ABC', 'ABMD', 'ABT', 'ACN', 'ADBE', 'ADI', 'ADM', 'ADP', 'ADS', 'ADSK', 'AEE', 'AEP', 'AES', 'AFL', 'AGN', 'AIG', 'AIV', 'AIZ', 'AJG', 'AKAM', 'ALB', 'ALGN', 'ALK', 'ALL', 'ALLE', 'ALXN', 'AMAT', 'AMCR', 'AMD', 'AME', 'AMG', 'AMGN', 'AMP', 'AMT', 'AMZN', 'ANET', 'ANSS', 'ANTM', 'AON', 'AOS', 'APA', 'APD', 'APH', 'APTV', 'ARE', 'ARNC', 'ATO', 'ATVI', 'AVB', 'AVGO', 'AVY', 'AWK', 'AXP', 'AZO', 'BA', 'BAC', 'BAX', 'BBT', 'BBY', 'BDX', 'BEN', 'BF.B', 'BHGE', 'BIIB', 'BK', 'BKNG', 'BLK', 'BLL', 'BMY', 'BR', 'BRK.B', 'BSX', 'BWA', 'BXP', 'C', 'CAG', 'CAH', 'CAT', 'CB', 'CBOE', 'CBRE', 'CBS', 'CCI', 'CCL', 'CDNS', 'CE', 'CELG', 'CERN', 'CF', 'CFG', 'CHD', 'CHRW', 'CHTR', 'CI', 'CINF', 'CL', 'CLX', 'CMA', 'CMCSA', 'CME', 'CMG', 'CMI', 'CMS', 'CNC', 'CNP', 'COF', 'COG', 'COO', 'COP', 'COST', 'COTY', 'CPB', 'CPRI', 'CPRT', 'CRM', 'CSCO', 'CSX', 'CTAS', 'CTL', 'CTSH', 'CTVA', 'CTXS', 'CVS', 'CVX', 'CXO', 'D', 'DAL', 'DD', 'DE', 'DFS', 'DG', 'DGX', 'DHI', 'DHR'

In [11]:
# Fetch price history for every ticker in SP_500_TICKER over the full span
# (start of the training window through the end of the trading window).
# Tickers Yahoo cannot resolve are reported as failed downloads in the output.
downloader = YahooDownloader(
    start_date=TRAIN_START_DATE,
    end_date=TRADE_END_DATE,
    ticker_list=SP_500_TICKER,
)
df_raw = downloader.fetch_data()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['ABC']: YFTzMissingError('possibly delisted; no timezone found')
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['ABMD']: YFTzMissingError('possibly delisted; no timezone found')
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Shape of DataFrame:  (1635596, 8)


In [12]:
# Preview the first rows of the downloaded frame to check its columns.
df_raw.head()

Price,date,close,high,low,open,volume,tic,day
0,2009-01-02,10.320795,10.358927,9.856868,9.914064,4236220,A,4
1,2009-01-02,7.909602,7.994448,7.230828,7.287392,5167000,AAL,4
2,2009-01-02,29.373983,29.511647,28.453358,29.133073,795900,AAP,4
3,2009-01-02,2.724326,2.733032,2.556514,2.578128,746015200,AAPL,4
4,2009-01-02,17.518656,17.613511,17.129426,17.508844,13163193,ABT,4


In [13]:
# Number of rows per ticker; unequal counts mean some tickers cover fewer dates.
df_raw.tic.value_counts()

tic
A       3857
AAL     3857
AAP     3857
AAPL    3857
ABT     3857
        ... 
IR      1753
FOXA    1294
DOW     1288
CTVA    1242
STI      502
Name: count, Length: 437, dtype: int64

In [14]:
# Configure the FinRL feature pipeline: technical indicators from INDICATORS,
# plus the VIX and turbulence features; no user-defined feature.
fe = FeatureEngineer(
    use_technical_indicator=True,
    tech_indicator_list=INDICATORS,
    use_vix=True,
    use_turbulence=True,
    user_defined_feature=False,
)

# Run the pipeline on the raw download, then sanitize the result:
#   * missing values become 0
#   * BOTH +inf and -inf become 0 (replacing only np.inf would leave -inf in
#     the frame and let it flow into later computations)
# Each step returns a new frame, so no explicit copy is needed.
df = (
    fe.preprocess_data(df_raw)
    .fillna(0)
    .replace([np.inf, -np.inf], 0)
)


Successfully added technical indicators


[*********************100%***********************]  1 of 1 completed


Shape of DataFrame:  (3856, 8)
Successfully added vix
Successfully added turbulence index


In [15]:
# Order rows by (date, tic) and re-index them by the ordinal position of their
# date, so every row belonging to the k-th distinct date carries index label k.
df = df.sort_values(['date', 'tic'], ignore_index=True)
df.index = df.date.factorize()[0]

# Window length in distinct dates.
# NOTE(review): 252 is presumably one trading year of daily bars — confirm.
lookback = 252
n_dates = len(df.index.unique())

return_list = []
cov_list = []

for end_pos in range(lookback, n_dates):
    # .loc label slicing is inclusive on both ends, so the window spans the
    # dates at positions end_pos - lookback .. end_pos.
    window = df.loc[end_pos - lookback:end_pos, :]

    # date x ticker matrix of closing prices for this window
    close_by_tic = window.pivot_table(index='date', columns='tic', values='close')

    # period-over-period returns; rows containing any NaN (including the first
    # row produced by pct_change) are dropped
    window_returns = close_by_tic.pct_change().dropna()
    return_list.append(window_returns)

    # covariance matrix of the window's returns, stored as a plain ndarray
    cov_list.append(window_returns.cov().values)

# One row per date from position `lookback` onward, carrying that date's
# covariance matrix and return frame as object-valued columns.
df_cov = pd.DataFrame({
    'date': df.date.unique()[lookback:],
    'cov_list': cov_list,
    'return_list': return_list,
})

# Default (inner) merge: dates without a full lookback window drop out of df.
df = (
    df.merge(df_cov, on='date')
    .sort_values(['date', 'tic'])
    .reset_index(drop=True)
)


In [None]:
# (rows, columns) of df after the cov_list / return_list columns were merged in.
df.shape

(111824, 18)

In [17]:
# Cut the prepared frame into the training and trading windows defined by the
# date constants, then report how many rows landed in each.
# NOTE(review): boundary inclusivity is decided inside data_split — confirm the
# shared 2022-07-01 boundary date is not present in both splits.
train, trade = (
    data_split(df, window_start, window_end)
    for window_start, window_end in (
        (TRAIN_START_DATE, TRAIN_END_DATE),
        (TRADE_START_DATE, TRADE_END_DATE),
    )
)
print(len(train), len(trade), sep='\n')


1239130
180846


In [13]:
# Persist both splits as CSV (UTF-8 with BOM).
train_path = 'data/train.csv'
trade_path = 'data/trade.csv'

# The path is handed straight to to_csv instead of a handle from open(..., 'w'):
# pandas then opens the file itself with the correct newline handling, whereas
# a text-mode handle opened without newline='' produces doubled line endings on
# Windows. The output folder is created first so a fresh checkout does not fail
# with FileNotFoundError.
# NOTE(review): if the splits carry the cov_list / return_list object columns,
# CSV stores only their string representation — the pickle export is the
# loadable copy.
for frame, csv_path in ((train, train_path), (trade, trade_path)):
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    frame.to_csv(csv_path, encoding='utf-8-sig')

In [18]:
import pickle

# Persist both splits as pickles, which keep the Python objects stored in the
# frames intact.
train_pickle_path = 'data/train.pickle'
trade_pickle_path = 'data/trade.pickle'

# Create the output folder first so the cell also works on a fresh checkout,
# then write each split with the same open/dump sequence.
for frame, pickle_path in ((train, train_pickle_path), (trade, trade_pickle_path)):
    os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
    with open(pickle_path, 'wb') as f:
        pickle.dump(frame, f)
