In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# matplotlib.use('Agg')
import datetime

import warnings
# Silence every warning category for the rest of the session. This runs before
# the third-party imports below, so warnings raised while importing them are
# hidden as well.
# NOTE(review): a blanket "ignore" also hides deprecation and numerical
# warnings that could matter for the analysis — consider narrowing the filter.
warnings.filterwarnings("ignore")

%matplotlib inline
from finrl.meta.preprocessor.yahoodownloader import YahooDownloader
from finrl.meta.preprocessor.preprocessors import FeatureEngineer, data_split
from finrl.meta.env_stock_trading.env_stocktrading import StockTradingEnv

from finrl.meta.data_processor import DataProcessor

from finrl.plot import backtest_stats, backtest_plot, get_daily_return, get_baseline
from pprint import pprint

import sys
# Add a sibling FinRL checkout to the end of the module search path.
# NOTE(review): this runs after the `finrl` imports earlier in this cell have
# already executed, so it cannot influence where those modules were loaded
# from — confirm whether it is still needed or should come before them.
sys.path.append("../FinRL")

import itertools
import random
import json

from finrl import config
from finrl import config_tickers
import os
from finrl.main import check_and_make_directories
from finrl.config import (
    DATA_SAVE_DIR,
    TRAINED_MODEL_DIR,
    TENSORBOARD_LOG_DIR,
    RESULTS_DIR,
)
# Report the configured data directory, then hand the four FinRL working
# directories (data, trained models, TensorBoard logs, results) to FinRL's
# helper — presumably it creates any that are missing; see finrl.main.
print(f"DATA_SAVE_DIR: {DATA_SAVE_DIR}")
check_and_make_directories(
    [
        DATA_SAVE_DIR,
        TRAINED_MODEL_DIR,
        TENSORBOARD_LOG_DIR,
        RESULTS_DIR,
    ]
)

DATA_SAVE_DIR: datasets


In [2]:
# Date boundaries, as "YYYY-MM-DD" strings, for the training window and the
# trading (backtest) window. The data download spans TRAIN_START_DATE through
# TRADE_END_DATE.
TRAIN_START_DATE, TRAIN_END_DATE = "2010-01-01", "2020-01-01"
TRADE_START_DATE, TRADE_END_DATE = "2020-01-02", "2023-03-26"

In [3]:
# Load the raw price table for the DOW_30_TICKER symbols into `df`.
#
# A local CSV acts as a cache: when it exists it is read back instead of
# hitting the network; otherwise the data is downloaded through FinRL's
# YahooDownloader and written to the cache for the next run.
#
# NOTE(review): the cache is not keyed on the date range or ticker list, so
# after changing TRAIN_START_DATE / TRADE_END_DATE the stale CSV must be
# deleted by hand to force a fresh download.
YAHOO_CACHE_CSV = "dataset/yahoo_data.csv"

if os.path.exists(YAHOO_CACHE_CSV):
    # The cache was written with the frame's index as its first column, so
    # restore that column as the index rather than as a data column.
    df = pd.read_csv(YAHOO_CACHE_CSV, index_col=0)
else:
    df = YahooDownloader(
        start_date=TRAIN_START_DATE,
        end_date=TRADE_END_DATE,
        ticker_list=config_tickers.DOW_30_TICKER,
    ).fetch_data()
    # exist_ok=True makes directory creation idempotent and avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(os.path.dirname(YAHOO_CACHE_CSV), exist_ok=True)
    df.to_csv(YAHOO_CACHE_CSV)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [4]:
# Show the ticker symbols in FinRL's DOW_30_TICKER list — the same list that
# is passed as `ticker_list` to the downloader.
print(config_tickers.DOW_30_TICKER)

['AXP', 'AMGN', 'AAPL', 'BA', 'CAT', 'CSCO', 'CVX', 'GS', 'HD', 'HON', 'IBM', 'INTC', 'JNJ', 'KO', 'JPM', 'MCD', 'MMM', 'MRK', 'MSFT', 'NKE', 'PG', 'TRV', 'UNH', 'CRM', 'VZ', 'V', 'WBA', 'WMT', 'DIS', 'DOW']


In [5]:
# Preview the most recent rows of the raw data, ordered by date and then
# ticker, with a fresh 0..n-1 index (the original `df` is left unchanged).
(
    df
    .sort_values(by=["date", "tic"], ignore_index=True)
    .tail()
)

Unnamed: 0,date,open,high,low,close,volume,tic,day
97548,2023-03-24,468.980011,476.880005,467.589996,475.98999,2533200,UNH,4
97549,2023-03-24,220.669998,221.330002,218.25,221.039993,4791500,V,4
97550,2023-03-24,37.529999,37.700001,37.25,37.66,14957700,VZ,4
97551,2023-03-24,32.380001,32.790001,31.860001,32.700001,8105300,WBA,4
97552,2023-03-24,141.009995,142.470001,140.600006,141.800003,7584100,WMT,4


In [6]:
# Names of the technical indicators to compute; this list is handed to
# FeatureEngineer as `tech_indicator_list`, and each name appears as a column
# in the processed frame.
INDICATORS = [
    "macd",
    "rsi_14",
    "rsi_21",
    "rsi_28",
    "boll_ub",
    "boll_lb",
    "rsi_30",
    "cci_30",
    "dx_30",
    "close_30_sma",
    "close_60_sma",
]


In [7]:
# Configure FinRL's FeatureEngineer to add the INDICATORS columns plus the
# VIX and turbulence-index columns, with no user-defined features, then run
# it over the raw price table to produce `processed`.
fe = FeatureEngineer(
    tech_indicator_list=INDICATORS,
    use_technical_indicator=True,
    use_turbulence=True,
    use_vix=True,
    user_defined_feature=False,
)
processed = fe.preprocess_data(df)

Successfully added technical indicators
[*********************100%***********************]  1 of 1 completed
Shape of DataFrame:  (3328, 8)
Successfully added vix
Successfully added turbulence index


In [8]:
# Build `processed_full`: one row for every (date, ticker) pair.
#
# 1. Enumerate every calendar day between the first and last date in
#    `processed`, crossed with every ticker present in it.
# 2. Left-join the engineered features onto that grid, so pairs with no data
#    become rows of NaN.
# 3. Keep only dates that actually occur in `processed` (drops the calendar
#    days introduced in step 1 that have no rows at all).
# 4. Order by date then ticker and replace any remaining NaN with 0.
list_ticker = processed["tic"].unique().tolist()
list_date = list(
    pd.date_range(
        processed["date"].min(),
        processed["date"].max(),
    ).astype(str)
)
combination = list(itertools.product(list_date, list_ticker))

processed_full = (
    pd.DataFrame(combination, columns=["date", "tic"])
    .merge(processed, on=["date", "tic"], how="left")
    .loc[lambda grid: grid["date"].isin(processed["date"])]
    .sort_values(["date", "tic"])
    .fillna(0)
)

In [9]:
# Preview the last 20 rows of the processed table, ordered by date and then
# ticker, with a fresh 0..n-1 index (`processed_full` itself is not modified).
(
    processed_full
    .sort_values(by=["date", "tic"], ignore_index=True)
    .tail(20)
)

Unnamed: 0,date,tic,open,high,low,close,volume,day,macd,rsi_14,...,rsi_28,boll_ub,boll_lb,rsi_30,cci_30,dx_30,close_30_sma,close_60_sma,vix,turbulence
96492,2023-03-23,GS,315.209991,321.359985,312.640015,314.850006,2585900.0,3.0,-12.399119,36.911846,...,39.801473,373.680885,293.494579,40.251338,-99.00961,22.509771,344.307542,348.727129,22.610001,13.533917
96493,2023-03-23,HD,285.26001,286.109985,280.220001,283.910004,6005800.0,3.0,-5.803666,36.387216,...,40.84208,297.322331,281.637673,41.365357,-97.183636,28.993482,296.40575,307.710951,22.610001,13.533917
96494,2023-03-23,HON,186.860001,189.419998,185.550003,186.919998,2360100.0,3.0,-3.145101,37.784571,...,40.690171,198.656886,185.108115,41.12656,-128.632709,36.135221,194.066415,200.810197,22.610001,13.533917
96495,2023-03-23,IBM,123.809998,124.93,122.599998,123.370003,4651900.0,3.0,-2.46589,33.47702,...,36.688263,131.850417,121.854585,37.263026,-110.989631,31.672023,129.321668,133.897389,22.610001,13.533917
96496,2023-03-23,INTC,28.379999,29.16,28.219999,29.030001,43785500.0,3.0,0.468551,58.109139,...,54.168281,30.466031,23.753969,53.822388,80.95239,22.616393,27.224,27.786414,22.610001,13.533917
96497,2023-03-23,JNJ,151.179993,151.690002,150.110001,151.130005,9051500.0,3.0,-2.152461,35.407257,...,36.496259,156.277143,150.42886,36.809532,-104.872194,31.282995,155.333736,162.609591,22.610001,13.533917
96498,2023-03-23,JPM,127.900002,129.529999,126.019997,126.839996,16677600.0,3.0,-3.399305,38.465405,...,42.424294,147.962102,121.972897,42.923722,-121.604856,36.766842,137.076667,137.501366,22.610001,13.533917
96499,2023-03-23,KO,60.09,60.389999,59.720001,59.919998,15479000.0,3.0,0.113568,51.457503,...,49.719309,60.703675,58.434541,49.648368,59.864391,3.041277,59.501945,60.248045,22.610001,13.533917
96500,2023-03-23,MCD,267.98999,271.480011,267.98999,269.619995,2263800.0,3.0,1.003924,55.170691,...,53.117071,272.718056,259.980571,53.003185,106.305887,15.213954,265.798108,265.860198,22.610001,13.533917
96501,2023-03-23,MMM,101.660004,102.93,100.760002,101.0,3177300.0,3.0,-2.764887,31.180849,...,36.692393,112.026798,99.610201,37.210703,-127.413197,28.635478,107.736245,113.36256,22.610001,13.533917


In [10]:
# Persist the processed feature table to disk with DataFrame.to_pickle.
# NOTE(review): `pickle` is imported here but not referenced in this cell —
# the write goes through the DataFrame method; confirm whether a later cell
# needs the import, and if so consider moving it to the top import cell.
# NOTE(review): pickle files should only be loaded back from trusted sources.
import pickle
processed_full.to_pickle('dataset/processed.pkl')