In [2]:
import pandas as pd
%matplotlib inline
from finrl.config_tickers import SP_500_TICKER
from finrl.meta.preprocessor.yahoodownloader import YahooDownloader
from finrl.meta.preprocessor.preprocessors import FeatureEngineer, data_split
from finrl.agents.stablebaselines3.models import DRLAgent, DRLEnsembleAgent
from finrl.plot import backtest_stats, backtest_plot, get_daily_return, get_baseline

import pandas_datareader as web
import gdown

import os
from finrl.main import check_and_make_directories
from finrl.config import (
    DATA_SAVE_DIR,
    TRAINED_MODEL_DIR,
    TENSORBOARD_LOG_DIR,
    RESULTS_DIR,
    INDICATORS,
    TRAIN_START_DATE,
    TRAIN_END_DATE,
    TEST_END_DATE,
    TRADE_START_DATE,
    TRADE_END_DATE
)

check_and_make_directories([DATA_SAVE_DIR, TRAINED_MODEL_DIR, TENSORBOARD_LOG_DIR, RESULTS_DIR])

In [3]:
TRAIN_START_DATE = '2010-01-01'
TRAIN_END_DATE = '2017-12-31'

In [4]:
TEST_START_DATE = '2018-01-01'
TEST_END_DATE = '2020-12-30'

In [5]:
# The link for the data folder in Google drive. 
data_url = "https://drive.google.com/drive/folders/1zQGlgh5kHTXSq7eoyXf6i_uAWYFJ5xzx?usp=share_link"

current_path = os.getcwd() # Get the current working directory.

data_folder_path = os.path.join(os.getcwd(),"genie_data") # Generate data folder path. 

# Check if the folder already exists locally

if not os.path.exists(data_folder_path):
    
    os.makedirs(data_folder_path)
    print(f"Downloading data from Google Drive to: {data_folder_path}")
    gdown.download_folder(data_url,output=data_folder_path, quiet=True, use_cookies=False) # Download the data from Google Drive.
    
else:
    print(f"Directory already exists: {data_folder_path}")

Directory already exists: c:\Users\drebi\dev\AIAgentPortfolio\test-env\genie_data


In [6]:
yf_df = pd.read_csv('./genie_data/yf-data.csv', index_col='Unnamed: 0')
yf_df['date'] = pd.to_datetime(yf_df['date'])
yf_df

Unnamed: 0,date,open,high,low,close,volume,tic,day
0,2010-01-04,39.099998,39.419998,38.840000,22.922108,2175500,D,0
1,2010-01-04,11.220000,11.430000,10.950000,9.966131,14482500,DAL,0
2,2010-01-04,5.660000,5.970000,5.650000,4.164575,14901600,KEY,0
3,2010-01-04,25.320000,25.910000,24.930000,19.833185,3811400,LNC,0
4,2010-01-04,15.245000,15.350000,15.120000,9.583809,1332800,LNT,0
...,...,...,...,...,...,...,...,...
28549,2020-12-30,50.520000,51.000000,50.360001,47.424179,563800,LNT,2
28550,2020-12-30,147.470001,147.990005,147.009995,138.462997,2224900,PEP,2
28551,2020-12-30,138.600006,138.919998,137.550003,130.316620,3261400,PG,2
28552,2020-12-30,372.339996,373.100006,371.570007,359.862762,49455300,SPY,2


In [7]:
sentiments_df = pd.read_csv('./genie_data/sentiments_df.csv', index_col='Unnamed: 0')
sentiments_df['date'] = pd.to_datetime(sentiments_df['date'])
sentiments_df

Unnamed: 0,date,SentimentScore,tic
0,2005-01-01,0.0,ENPH
0,2005-01-01,0.0,LNC
0,2005-01-01,0.0,ETSY
0,2005-01-01,0.0,XEL
0,2005-01-01,0.0,PG
...,...,...,...
7427,2020-12-31,0.0,KEY
6218,2020-12-31,0.0,LNT
6010,2020-12-31,0.0,ENPH
6213,2020-12-31,0.0,XEL


In [8]:
gdp = web.DataReader('GDP', 'fred', 2010, 2020)
gdp = gdp.reset_index()

In [9]:
date = pd.date_range(start='2010-01-01', end='2020-12-30')
date_df = pd.DataFrame()
date_df['date'] = date

In [10]:
gdp['date']=pd.to_datetime(gdp['DATE'])
gdp.rename(columns = {'GDP':'gdp'}, inplace = True)
gdp = gdp.drop(['DATE'], axis=1)
gdp_df=gdp.merge(date_df, on='date', how='right')
gdp_df = gdp_df.fillna(method='ffill')
gdp_df = gdp_df.merge(pd.DataFrame({"tic":yf_df.tic.unique()}),how="cross")

  gdp_df = gdp_df.fillna(method='ffill')


In [11]:
df = pd.merge(pd.merge(yf_df, sentiments_df, how='left', left_on=['date','tic'], right_on = ['date','tic']),
              gdp_df, how='left', left_on=['date','tic'], right_on = ['date','tic'])

In [12]:
df

Unnamed: 0,date,open,high,low,close,volume,tic,day,SentimentScore,gdp
0,2010-01-04,39.099998,39.419998,38.840000,22.922108,2175500,D,0,0.0,14764.610
1,2010-01-04,11.220000,11.430000,10.950000,9.966131,14482500,DAL,0,0.0,14764.610
2,2010-01-04,5.660000,5.970000,5.650000,4.164575,14901600,KEY,0,0.0,14764.610
3,2010-01-04,25.320000,25.910000,24.930000,19.833185,3811400,LNC,0,0.0,14764.610
4,2010-01-04,15.245000,15.350000,15.120000,9.583809,1332800,LNT,0,0.0,14764.610
...,...,...,...,...,...,...,...,...,...,...
41102,2020-12-30,50.520000,51.000000,50.360001,47.424179,563800,LNT,2,0.0,21751.238
41103,2020-12-30,147.470001,147.990005,147.009995,138.462997,2224900,PEP,2,0.0,21751.238
41104,2020-12-30,138.600006,138.919998,137.550003,130.316620,3261400,PG,2,0.0,21751.238
41105,2020-12-30,372.339996,373.100006,371.570007,359.862762,49455300,SPY,2,,21751.238


In [13]:
df['SentimentScore'] = df['SentimentScore'].fillna(0)

In [14]:
df.head(20)

Unnamed: 0,date,open,high,low,close,volume,tic,day,SentimentScore,gdp
0,2010-01-04,39.099998,39.419998,38.84,22.922108,2175500,D,0,0.0,14764.61
1,2010-01-04,11.22,11.43,10.95,9.966131,14482500,DAL,0,0.0,14764.61
2,2010-01-04,5.66,5.97,5.65,4.164575,14901600,KEY,0,0.0,14764.61
3,2010-01-04,25.32,25.91,24.93,19.833185,3811400,LNC,0,0.0,14764.61
4,2010-01-04,15.245,15.35,15.12,9.583809,1332800,LNT,0,0.0,14764.61
5,2010-01-04,61.189999,61.52,60.639999,41.632233,6585900,PEP,0,0.0,14764.61
6,2010-01-04,61.110001,61.310001,60.630001,41.181229,9190800,PG,0,0.0,14764.61
7,2010-01-04,112.370003,113.389999,111.510002,88.117897,118944600,SPY,0,0.0,14764.61
8,2010-01-04,21.379999,21.379999,21.040001,13.396955,2670400,XEL,0,0.0,14764.61
9,2010-01-05,38.860001,39.02,38.080002,22.639708,2802200,D,1,0.0,14764.61


In [15]:
df.drop_duplicates(subset=['date', 'tic'], inplace=True)
df.shape

(28554, 10)

In [16]:
most_volatile_stocks = ['ENPH', 'KEY', 'DAL', 'LNC', 'ETSY']

# Add index to both the stocks
most_volatile_stocks.append('SPY')

df_mv = df[df['tic'].isin(most_volatile_stocks)]

df_mv

Unnamed: 0,date,open,high,low,close,volume,tic,day,SentimentScore,gdp
1,2010-01-04,11.220000,11.430000,10.950000,9.966131,14482500,DAL,0,0.0,14764.610
2,2010-01-04,5.660000,5.970000,5.650000,4.164575,14901600,KEY,0,0.0,14764.610
3,2010-01-04,25.320000,25.910000,24.930000,19.833185,3811400,LNC,0,0.0,14764.610
7,2010-01-04,112.370003,113.389999,111.510002,88.117897,118944600,SPY,0,0.0,14764.610
10,2010-01-05,11.320000,12.340000,11.290000,10.747787,25066000,DAL,1,0.0,14764.610
...,...,...,...,...,...,...,...,...,...,...
41098,2020-12-30,171.679993,177.550003,171.679993,172.929993,2474100,ENPH,2,0.0,21751.238
41099,2020-12-30,178.300003,183.410004,176.119995,183.179993,2125600,ETSY,2,0.0,21751.238
41100,2020-12-30,16.010000,16.320000,15.980000,14.926527,5757800,KEY,2,0.0,21751.238
41101,2020-12-30,49.500000,50.000000,49.310001,46.046585,687300,LNC,2,0.0,21751.238


In [17]:
least_volatile_stocks = ['XEL', 'PG', 'LNT', 'PEP', 'D']

# Add index to both the stocks
least_volatile_stocks.append('SPY')

df_lv = df[df['tic'].isin(least_volatile_stocks)]

df_lv

Unnamed: 0,date,open,high,low,close,volume,tic,day,SentimentScore,gdp
0,2010-01-04,39.099998,39.419998,38.840000,22.922108,2175500,D,0,0.0,14764.610
4,2010-01-04,15.245000,15.350000,15.120000,9.583809,1332800,LNT,0,0.0,14764.610
5,2010-01-04,61.189999,61.520000,60.639999,41.632233,6585900,PEP,0,0.0,14764.610
6,2010-01-04,61.110001,61.310001,60.630001,41.181229,9190800,PG,0,0.0,14764.610
7,2010-01-04,112.370003,113.389999,111.510002,88.117897,118944600,SPY,0,0.0,14764.610
...,...,...,...,...,...,...,...,...,...,...
41102,2020-12-30,50.520000,51.000000,50.360001,47.424179,563800,LNT,2,0.0,21751.238
41103,2020-12-30,147.470001,147.990005,147.009995,138.462997,2224900,PEP,2,0.0,21751.238
41104,2020-12-30,138.600006,138.919998,137.550003,130.316620,3261400,PG,2,0.0,21751.238
41105,2020-12-30,372.339996,373.100006,371.570007,359.862762,49455300,SPY,2,0.0,21751.238


In [18]:
technical_indicators = ['macd', 'rsi_30', 'cci_30', 'dx_30']

fe_pipeline = FeatureEngineer(use_technical_indicator=True,
                             tech_indicator_list=technical_indicators,
                             use_turbulence=True,
                             user_defined_feature=True
)

df_processed = fe_pipeline.preprocess_data(df)

Successfully added technical indicators
Successfully added turbulence index
Successfully added user defined features


In [19]:
df_processed.head()

Unnamed: 0,date,open,high,low,close,volume,tic,day,SentimentScore,gdp,macd,rsi_30,cci_30,dx_30,turbulence,daily_return
0,2010-01-04,39.099998,39.419998,38.84,22.922108,2175500,D,0,0.0,14764.61,0.0,0.0,-66.666667,100.0,0.0,-0.565218
1,2010-01-04,11.22,11.43,10.95,9.966131,14482500,DAL,0,0.0,14764.61,0.0,0.0,-66.666667,100.0,0.0,-0.565218
2,2010-01-04,5.66,5.97,5.65,4.164575,14901600,KEY,0,0.0,14764.61,0.0,0.0,-66.666667,100.0,0.0,-0.582127
3,2010-01-04,25.32,25.91,24.93,19.833185,3811400,LNC,0,0.0,14764.61,0.0,0.0,-66.666667,100.0,0.0,3.762355
4,2010-01-04,15.245,15.35,15.12,9.583809,1332800,LNT,0,0.0,14764.61,0.0,0.0,-66.666667,100.0,0.0,-0.516779


In [20]:
udfs = 2    # Sentiment Scores, GDP

stock_dimension = len(df_processed.tic.unique())

state_space = 1 + 2 * stock_dimension + len(technical_indicators) * stock_dimension + udfs * stock_dimension

print(f'Stock Dimension: {stock_dimension}, state space: {state_space}')

Stock Dimension: 9, state space: 73


In [None]:
env_variables = {
    "hmax": 100,
    "buy_cost_pct": 0.001,
}