# Asset Portfolio Management using Deep Reinforcement Learning
---

## 6.0 Data Split
---

We will split both the close prices and the whole dataset into train and test (trade) data.

We will use 80% of the data for training and then test on the remaining 20%.

We will use the `data_split` function from the FinRL library to split our data into train and test sets.

### 6.1 Import Relevant Libraries

In [44]:
import pandas as pd
import numpy as np
import ta
from ta import add_all_ta_features
from ta.utils import dropna
from finrl.preprocessing.data import data_split
from finrl.preprocessing.preprocessors import FeatureEngineer

### 6.2 Load the data

In [55]:
# Restore variables previously saved with IPython's `%store`
# (presumably by an earlier notebook in this series — confirm).
# data_df: long-format frame, one row per (date, tic), with price columns and features
%store -r data_df
# filtered_stocks: iterable of ticker symbols; used below to pick the close-price columns
%store -r filtered_stocks
# df_close_full_stocks: wide frame with a `date` column and one close-price column per ticker
%store -r df_close_full_stocks

In [56]:
# Preview the first rows of the restored long-format dataset
data_df.head()

Unnamed: 0,date,tic,close,high,low,open,volume,cov_list,f01,f02,f03,f04
0,2009-03-20,AXP,10.072534,13.19,12.12,13.19,31088200.0,"[[0.002610715086827884, 0.0012647352623545009,...",1.764112,0.488414,5.262903,5.777148
1,2009-03-20,DIS,15.026185,17.98,17.08,17.799999,17766600.0,"[[0.002610715086827884, 0.0012647352623545009,...",1.764112,0.488414,5.262903,5.777148
2,2009-03-20,HD,16.65284,22.73,21.76,22.59,22361800.0,"[[0.002610715086827884, 0.0012647352623545009,...",1.764112,0.488414,5.262903,5.777148
3,2009-03-20,IBM,64.557983,95.0,92.18,93.160004,12193900.0,"[[0.002610715086827884, 0.0012647352623545009,...",1.764112,0.488414,5.262903,5.777148
4,2009-03-20,INTC,10.250909,15.4,14.35,15.19,84639100.0,"[[0.002610715086827884, 0.0012647352623545009,...",1.764112,0.488414,5.262903,5.777148


In [57]:
# Preview the first rows of the close prices for all stocks
df_close_full_stocks.head()

Unnamed: 0,date,WBA,TRV,MMM,RTX,DD,KO,V,XOM,CAT,...,JNJ,CSCO,PFE,IBM,MRK,INTC,MCD,BA,NKE,GS
0,2008-03-19,27.162476,33.676067,55.994167,31.819752,34.389133,20.135204,12.92796,54.472538,51.048386,...,44.239494,18.299444,11.740806,80.073196,26.932304,14.305408,36.734047,54.094521,13.067656,138.832825
1,2008-03-20,27.461124,34.677429,54.944336,31.935442,34.45525,20.497879,14.724145,54.840305,51.110714,...,44.580441,18.523792,11.729413,81.024979,27.164101,14.753081,37.25433,55.088783,14.21739,149.790054
2,2008-03-24,28.827457,34.604851,55.409344,32.550919,35.569756,20.544893,13.667027,55.45322,52.640846,...,44.246326,19.17441,11.740806,81.524841,27.721666,15.010838,38.178493,56.186138,14.59993,149.164658
3,2008-03-25,28.379486,34.655636,55.648911,32.467625,35.758652,20.622128,14.472454,54.969315,53.063187,...,44.0145,19.256668,11.83765,80.77845,27.984785,15.105799,38.472866,55.898911,14.449876,149.790054
4,2008-03-26,28.342155,34.503265,55.423466,32.213104,35.711433,20.534811,14.634912,55.65321,53.561695,...,44.116768,18.531269,11.786383,80.052643,28.00359,14.827696,38.062122,56.193504,14.006041,146.329437


In [58]:
# Close Prices data frame

# Re-key the long-format data by (tic, date) so one ticker's rows can be selected by label
df_prices = (
    data_df
    .reset_index()
    .set_index(['tic', 'date'])
    .sort_index()
)

# Build the wide frame: one close-price column per selected ticker
df_close = pd.DataFrame()

for symbol in filtered_stocks:
    # xs(symbol) drops the `tic` level, leaving that ticker's rows keyed by date
    df_close[symbol] = df_prices.xs(symbol)['close']

In [59]:
# Preview the wide close-price frame (dates on the index, one column per selected ticker)
df_close.head()

Unnamed: 0_level_0,JNJ,PG,MMM,KO,IBM,VZ,MCD,PFE,RTX,WMT,MRK,V,DIS,MSFT,XOM,HD,TRV,INTC,AXP,NKE
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2009-03-20,36.306187,31.699135,33.427578,14.794217,64.557983,15.606479,37.552322,8.335665,19.231215,37.454937,17.631168,12.084035,15.026185,13.071695,43.514919,16.65284,28.982973,10.250909,10.072534,9.686008
2009-03-23,37.402328,33.19405,35.894787,15.314644,68.884636,16.375994,38.935833,8.574175,20.630972,38.882446,18.46258,12.776457,16.291998,14.044801,46.438305,17.471949,30.551222,10.859665,11.962158,10.245396
2009-03-24,37.029926,32.728191,35.152435,15.273013,68.598534,16.080833,37.806438,8.513019,20.502861,38.580322,18.310812,12.33018,15.749509,13.738316,45.681114,17.24651,29.281677,10.495812,11.419921,9.810794
2009-03-25,37.142349,33.416534,35.676453,15.526292,68.354279,15.891086,38.829945,8.720952,20.73061,39.033493,18.145855,12.134646,15.956174,13.700004,46.194687,17.494492,30.035936,10.453825,11.592449,9.976461
2009-03-26,37.170456,33.903259,37.139317,15.560982,68.933502,16.096651,39.57111,8.794338,21.575214,39.849213,18.198645,12.767261,16.412554,14.427908,46.899189,18.140776,29.916456,11.069582,12.430463,10.241093


In [60]:
# Move `date` from the index into a regular column so the frame can be
# filtered with df_close['date'] when splitting.
# Guarded so the cell is idempotent: calling reset_index() a second time
# (e.g. when the cell is re-run) would add a stray `index` column.
if 'date' not in df_close.columns:
    df_close = df_close.reset_index()

### 6.3 Split the Data

In [61]:
# Define the start and end dates for the train and test data

train_pct = 0.8 # percentage of train data

# unique() returns dates in order of first appearance, so sort explicitly:
# the first/last element and the positional cut below assume chronological order.
date_list = sorted(data_df.date.unique()) # Sorted list of dates in the data

date_list_len = len(date_list) # len of the date list
train_data_len = int(train_pct * date_list_len) # position of the last train date

# The test window starts one position after the train boundary, so that
# position must exist; fail with a clear message instead of an IndexError.
assert train_data_len + 1 < date_list_len, (
    'Not enough dates to leave a test window; lower train_pct or provide more data'
)

# Train window: first date up to and including the date at the cut position
train_start_date = date_list[0]
train_end_date = date_list[train_data_len]

# Test window: the date right after the train window up to the last date
test_start_date = date_list[train_data_len+1]
test_end_date = date_list[-1]

In [66]:
# Report the first and last date of the training window
train_window_report = ('Training Data: ', 'from ', train_start_date, ' to ', train_end_date)
print(*train_window_report)

Training Data:  from  2009-03-20  to  2018-08-23


In [67]:
# Report the first and last date of the testing window
test_window_report = ('Testing Data: ', 'from ', test_start_date, ' to ', test_end_date)
print(*test_window_report)

Testing Data:  from  2018-08-24  to  2020-12-31


In [62]:
# Split the whole dataset
#
# FinRL's data_split keeps rows with start <= date < end, i.e. the end date is
# EXCLUDED (per FinRL's published implementation — confirm against the
# installed version). Passing train_end_date / test_end_date as `end` would
# drop the last training day and the last test day, and the result would not
# cover the same days as the inclusive price splits below.

# Train: use the first test date as the exclusive bound so train_end_date is kept.
train_data = data_split(data_df, train_start_date, test_start_date)

# Test: must include test_end_date, which data_split's exclusive end cannot
# express, so filter with an inclusive mask. The sort by (date, tic) and the
# day-number index are meant to mirror what data_split produces — TODO confirm.
in_test_window = (data_df['date'] >= test_start_date) & (data_df['date'] <= test_end_date)
test_data = data_df[in_test_window].sort_values(['date', 'tic'], ignore_index=True)
test_data.index = test_data['date'].factorize()[0]

# Split the Close Prices dataset (train_end_date and test_start_date are both inclusive)
prices_train_data = df_close[df_close['date']<=train_end_date]
prices_test_data = df_close[df_close['date']>=test_start_date]

# Split the Close Prices of all stocks
# NOTE(review): no lower bound is applied here, so prices_full_train keeps rows
# dated before train_start_date (the head() preview of df_close_full_stocks
# starts in 2008) — confirm this is intended.
prices_full_train = df_close_full_stocks[df_close_full_stocks['date']<=train_end_date]
prices_full_test = df_close_full_stocks[df_close_full_stocks['date']>=test_start_date]

### 6.4 Store the Dataframes

In [63]:
# Take explicit copies of every split under the names that the %store cell persists

# Whole dataset
train_df, test_df = train_data.copy(), test_data.copy()

# Close prices of the selected stocks
prices_train, prices_test = prices_train_data.copy(), prices_test_data.copy()

# Close prices of all stocks
prices_full_train_df, prices_full_test_df = prices_full_train.copy(), prices_full_test.copy()

In [64]:
# Persist the split frames with IPython's `%store`; they can be restored
# in another kernel/notebook with `%store -r <name>`.

# Close prices of the selected stocks
%store prices_train
%store prices_test

# Whole dataset (train / test)
%store train_df
%store test_df

# Close prices of all stocks
%store prices_full_train_df
%store prices_full_test_df

Stored 'prices_train' (DataFrame)
Stored 'prices_test' (DataFrame)
Stored 'train_df' (DataFrame)
Stored 'test_df' (DataFrame)
Stored 'prices_full_train_df' (DataFrame)
Stored 'prices_full_test_df' (DataFrame)
