# Install and Import Packages

In [None]:
## install required packages
!pip install swig
!pip install wrds
!pip install pyportfolioopt
## install finrl library
!pip install git+https://github.com/AI4Finance-Foundation/FinRL.git

# Importing the necessary libraries

In [1]:
import datetime
import itertools
import os

import numpy as np
import pandas as pd
import yfinance as yf

from finrl import config_tickers
from finrl.config import INDICATORS
from finrl.meta.preprocessor.preprocessors import FeatureEngineer, data_split
from finrl.meta.preprocessor.yahoodownloader import YahooDownloader

In [2]:
# Date windows, as 'YYYY-MM-DD' strings, for the training and trading periods.
# The trading window starts on the same date the training window ends.
TRAIN_START_DATE, TRAIN_END_DATE = '2009-01-01', '2020-07-01'
TRADE_START_DATE, TRADE_END_DATE = '2020-07-01', '2023-05-01'

In [9]:
# Ticker symbols (lowercase) whose price history will be downloaded.
symbols = [
    'aapl', 'msft', 'meta', 'ibm',
    'hd', 'cat', 'amzn', 'intc',
    't', 'v', 'gs',
]

# Display the list as the cell output.
symbols

['aapl', 'msft', 'meta', 'ibm', 'hd', 'cat', 'amzn', 'intc', 't', 'v', 'gs']

In [10]:
# Download price/volume history for every symbol, from the start of the
# training window to the end of the trading window.
downloader = YahooDownloader(
    start_date=TRAIN_START_DATE,
    end_date=TRADE_END_DATE,
    ticker_list=symbols,
)
df_raw = downloader.fetch_data()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Shape of DataFrame:  (38804, 8)


In [11]:
# Preview the first rows of the raw download to check its columns.
df_raw.head()

Price,date,close,high,low,open,volume,tic,day
0,2009-01-02,2.730993,3.251429,3.041429,3.067143,746015200,aapl,4
1,2009-01-02,2.718,2.7265,2.5535,2.5675,145928000,amzn,4
2,2009-01-02,30.490799,46.98,44.709999,44.91,7117200,cat,4
3,2009-01-02,66.006058,87.620003,82.190002,84.019997,14088500,gs,4
4,2009-01-02,16.251101,24.190001,22.959999,23.07,14902500,hd,4


In [12]:
# Feature engineering: add the technical indicators listed in INDICATORS,
# plus the VIX and a turbulence index; no user-defined features.
fe = FeatureEngineer(use_technical_indicator=True,
                     tech_indicator_list = INDICATORS,
                     use_vix=True,
                     use_turbulence=True,
                     user_defined_feature = False)

processed = fe.preprocess_data(df_raw)

# Preprocessing can return fewer tickers than were downloaded.
# NOTE(review): presumably tickers without a complete price history over the
# requested window are removed — confirm against the FinRL implementation.
# Report any such tickers instead of letting them vanish silently.
dropped_tickers = sorted(set(df_raw["tic"]) - set(processed["tic"]))
if dropped_tickers:
    print(f"Warning: tickers missing after preprocessing: {dropped_tickers}")

Successfully added technical indicators


[*********************100%***********************]  1 of 1 completed


Shape of DataFrame:  (3604, 8)
Successfully added vix
Successfully added turbulence index


In [13]:
# Preview the engineered features (indicator, vix and turbulence columns).
processed.head()

Unnamed: 0,date,close,high,low,open,volume,tic,day,macd,boll_ub,boll_lb,rsi_30,cci_30,dx_30,close_30_sma,close_60_sma,vix,turbulence
0,2009-01-02,2.730993,3.251429,3.041429,3.067143,746015200,aapl,4,0.0,2.951624,2.625622,100.0,66.666667,100.0,2.730993,2.730993,39.189999,0.0
1,2009-01-02,2.718,2.7265,2.5535,2.5675,145928000,amzn,4,0.0,2.951624,2.625622,100.0,66.666667,100.0,2.718,2.718,39.189999,0.0
2,2009-01-02,30.490799,46.98,44.709999,44.91,7117200,cat,4,0.0,2.951624,2.625622,100.0,66.666667,100.0,30.490799,30.490799,39.189999,0.0
3,2009-01-02,66.006058,87.620003,82.190002,84.019997,14088500,gs,4,0.0,2.951624,2.625622,100.0,66.666667,100.0,66.006058,66.006058,39.189999,0.0
4,2009-01-02,16.251101,24.190001,22.959999,23.07,14902500,hd,4,0.0,2.951624,2.625622,100.0,66.666667,100.0,16.251101,16.251101,39.189999,0.0


In [14]:
# Expand `processed` onto the full (date x ticker) grid so that every ticker
# has a row for every date that occurs in the data.
list_ticker = processed["tic"].unique().tolist()
first_day, last_day = processed["date"].min(), processed["date"].max()
list_date = pd.date_range(first_day, last_day).astype(str).tolist()
combination = list(itertools.product(list_date, list_ticker))

grid = pd.DataFrame(combination, columns=["date", "tic"])
processed_full = (
    grid.merge(processed, on=["date", "tic"], how="left")
    # Keep only calendar days that actually appear in `processed`.
    .loc[lambda frame: frame["date"].isin(processed["date"])]
    .sort_values(["date", "tic"])
    # NOTE(review): any (date, ticker) pair without data becomes all zeros,
    # including its prices — confirm no such gaps are expected downstream.
    .fillna(0)
)

In [15]:
# Preview the gap-filled frame, now sorted by date and then ticker.
processed_full.head()

Unnamed: 0,date,tic,close,high,low,open,volume,day,macd,boll_ub,boll_lb,rsi_30,cci_30,dx_30,close_30_sma,close_60_sma,vix,turbulence
0,2009-01-02,aapl,2.730993,3.251429,3.041429,3.067143,746015200.0,4.0,0.0,2.951624,2.625622,100.0,66.666667,100.0,2.730993,2.730993,39.189999,0.0
1,2009-01-02,amzn,2.718,2.7265,2.5535,2.5675,145928000.0,4.0,0.0,2.951624,2.625622,100.0,66.666667,100.0,2.718,2.718,39.189999,0.0
2,2009-01-02,cat,30.490799,46.98,44.709999,44.91,7117200.0,4.0,0.0,2.951624,2.625622,100.0,66.666667,100.0,30.490799,30.490799,39.189999,0.0
3,2009-01-02,gs,66.006058,87.620003,82.190002,84.019997,14088500.0,4.0,0.0,2.951624,2.625622,100.0,66.666667,100.0,66.006058,66.006058,39.189999,0.0
4,2009-01-02,hd,16.251101,24.190001,22.959999,23.07,14902500.0,4.0,0.0,2.951624,2.625622,100.0,66.666667,100.0,16.251101,16.251101,39.189999,0.0


# Split and Save the Data

In [16]:
# Cut the processed frame into the training and trading date windows.
train, trade = (
    data_split(processed_full, TRAIN_START_DATE, TRAIN_END_DATE),
    data_split(processed_full, TRADE_START_DATE, TRADE_END_DATE),
)

# Row count of each split, one per line.
print(len(train), len(trade), sep="\n")

28930
7110


In [19]:
# Write the train/trade splits to CSV files.
train_path = 'Data/train_data.csv'
trade_path = 'Data/trade_data.csv'

# Create the output directory of each path if it is missing. `exist_ok=True`
# replaces the separate existence check and keeps the cell safe to re-run.
for csv_path in (train_path, trade_path):
    out_dir = os.path.dirname(csv_path)
    if out_dir:  # a bare file name has no directory component to create
        os.makedirs(out_dir, exist_ok=True)

train.to_csv(train_path)
trade.to_csv(trade_path)

# Plain `{path}` interpolation: the `{path=}` debug form would print
# "train_path='...'" inside the quotes instead of just the path.
print(f'Data saved to "{train_path}" and "{trade_path}"')

Data saved to "train_path='Data/train_data.csv'" and "trade_path='Data/trade_data.csv'"
