# Asset Portfolio Management using Deep Reinforcement Learning
---

## 4.0 Feature Engineering and Data Preprocessing
---
We perform feature engineering and data preprocessing by:
* Adding Technical Indicators to the data. The technical inicators are used as inputs in the training of our Reinforcement Learning Model
* Adding Coveriance Matrices which is also used as input for training the Models
* Splitting the data into the training set and the testing (trading) set

### 4.1 Import Relevant Libraries

In [37]:
import pandas as pd
import numpy as np
import ta
from ta import add_all_ta_features
from ta.utils import dropna
from finrl.preprocessing.data import data_split
from finrl.preprocessing.preprocessors import FeatureEngineer

### 4.2 Load the Data from the csv Files

In [55]:
# Load the whole data set
data = pd.read_csv('./datasets/data.csv')

# Load the close prices dataset
prices_data = pd.read_csv('./datasets/close_prices.csv')

In [56]:
%store -r filtered_stocks

In [57]:
list_of_stocks = filtered_stocks
print(list_of_stocks)

Index(['JNJ', 'PG', 'MMM', 'KO', 'IBM', 'VZ', 'MCD', 'PFE', 'RTX', 'WMT',
       'MRK', 'V', 'DIS', 'MSFT', 'XOM', 'HD', 'TRV', 'INTC', 'AXP', 'NKE'],
      dtype='object', name='stock_name')


In [58]:
data.head()

Unnamed: 0,date,tic,close,high,low,open,volume
0,2008-03-19,AAPL,3.994995,4.796071,4.631071,4.754286,1010537000.0
1,2008-03-19,MSFT,21.46834,29.59,28.620001,29.379999,61442100.0
2,2008-03-19,JPM,30.591537,44.889999,42.439999,43.259998,70593300.0
3,2008-03-19,V,12.92796,17.25,13.75,14.875,708486000.0
4,2008-03-19,RTX,31.819752,44.361233,43.272499,43.813721,9691947.0


In [59]:
data = data[data['tic'].isin(list_of_stocks)]

In [60]:
data.tic.unique()

array(['MSFT', 'V', 'RTX', 'PG', 'NKE', 'DIS', 'AXP', 'HD', 'INTC', 'WMT',
       'IBM', 'MRK', 'KO', 'TRV', 'JNJ', 'MCD', 'VZ', 'XOM', 'MMM', 'PFE'],
      dtype=object)

### 4.3 Add Technical Indicators
---
We define a function to add technical indicators to the dataset by making use of the ta library

The folloing indicators are considered:
* Volatility Average True Range (ATR)
* Volatility Bollinger Band Width (BBW)
* Volume On-balance Volume (OBV
* Volume Chaikin Money Flow (CMF)
* Trend Moving Average Convergence Divergence (MACD)
* Trend Average Directional Index (ADX)
* Trend Fast Simple Moving Average (SMA)
* Trend Fast Exponential Moving Average (EMA)
* Trend Commodity Channel Index (CCI)
* Momentum Relative Strength Index (RSI)

In [61]:
# Define a Function for adding technical indicators

def add_features(data, feature_list, short_names):
    """
    Function to add technical indicators for features
    -Takes in a dataset with Open, High, Low, Close and Volume
    -Also takes in a list of the technical indicators to be added 
     as well as a list of the shortened indicator names
    """
    
    # list of column names to filter the features
    data_col_names = list(data.columns)
    filter_names = data_col_names + feature_list
    col_rename = data_col_names +  short_names
    
    # Add technical indicators using the ta Library
    data = add_all_ta_features(data, open="open", high="high", 
    low="low", close="close", volume="volume") 
    
    # Filter the Indicators with the required features
    data = data[filter_names]
    data.columns = col_rename # rename the columns to use shortened indicator names
    data = data.dropna()
    
    return data

In [62]:
# List of Features to add
feature_list= ['volatility_atr','volatility_bbw','volume_obv','volume_cmf',
               'trend_macd', 'trend_adx', 'trend_sma_fast', 
               'trend_ema_fast', 'trend_cci', 'momentum_rsi']

# Short names of the features
short_names = ['atr', 'bbw','obv','cmf','macd', 'adx', 'sma', 'ema', 'cci', 'rsi']

#feature_list= ['volatility_atr','volatility_bbw','volume_obv','volume_cmf','trend_macd']

# Short names of the features
#short_names = ['atr', 'bbw','obv','cmf','macd']

In [63]:
# Add Indicators to our dataset
data_with_features = data.copy()

data_with_features = add_features(data_with_features, feature_list, short_names)

  self._nvi.iloc[i] = self._nvi.iloc[i - 1] * (1.0 + price_change.iloc[i])
  dip[i] = 100 * (self._dip[i] / self._trs[i])
  din[i] = 100 * (self._din[i] / self._trs[i])


In [64]:
data_with_features.head()

Unnamed: 0,date,tic,close,high,low,open,volume,atr,bbw,obv,cmf,macd,adx,sma,ema,cci,rsi
38,2008-03-20,DIS,27.063406,31.98,31.25,31.280001,13802300.0,26.127485,212.267559,-873707227.0,-15.908724,0.01635,0.0,31.457803,29.281244,-42.79771,49.486309
39,2008-03-20,AXP,36.957962,46.130001,42.209999,42.209999,17558800.0,25.421396,211.577285,-856148427.0,-15.766593,0.627527,0.0,30.851009,30.462278,0.466515,51.78952
40,2008-03-20,HD,20.26516,28.17,26.959999,26.969999,22243000.0,23.879052,210.890244,-878391427.0,-15.72992,-0.232402,5.426657,29.478602,28.89349,-61.996629,47.827473
41,2008-03-20,INTC,14.753081,21.76,21.09,21.469999,67373400.0,21.640631,210.455216,-945764827.0,-15.512119,-1.343197,5.274361,29.253385,26.718043,-84.305925,46.560787
42,2008-03-20,WMT,39.480015,53.490002,50.799999,50.849998,44533300.0,23.35026,210.268469,-901231527.0,-14.413917,-0.225652,5.335948,28.004008,28.681423,23.121229,52.622596


In [65]:
feature_list = list(data_with_features.columns)[7:]

In [66]:
print(feature_list)

['atr', 'bbw', 'obv', 'cmf', 'macd', 'adx', 'sma', 'ema', 'cci', 'rsi']


### 4.4 Add Covariance Matrix
---
We define a function that will add Covarance Matrices to our dataset

In [67]:
def add_cov_matrix(df):
    """
    Function to add Coveriance Matrices as part of the defined states
    """
    # Sort the data and index by date and tic
    df=df.sort_values(['date','tic'],ignore_index=True) 
    df.index = df.date.factorize()[0]
    
    cov_list = [] # create empty list for storing coveriance matrices at each time step
    
    # look back for constructing the coveriance matrix is one year
    lookback=252
    for i in range(lookback,len(df.index.unique())):
        data_lookback = df.loc[i-lookback:i,:]
        price_lookback=data_lookback.pivot_table(index = 'date',columns = 'tic', values = 'close')
        return_lookback = price_lookback.pct_change().dropna()
        covs = return_lookback.cov().values 
        covs = covs#/covs.max()
        cov_list.append(covs)
        
    df_cov = pd.DataFrame({'date':df.date.unique()[lookback:],'cov_list':cov_list})
    df = df.merge(df_cov, on='date')
    df = df.sort_values(['date','tic']).reset_index(drop=True)
    
    return df

In [68]:
# Add Covariance Matrices to our dataset
data_with_features_covs = data_with_features.copy()
data_with_features_covs = add_cov_matrix(data_with_features_covs)

In [69]:
data_with_features_covs.head()

Unnamed: 0,date,tic,close,high,low,open,volume,atr,bbw,obv,cmf,macd,adx,sma,ema,cci,rsi,cov_list
0,2009-03-20,AXP,10.072534,13.19,12.12,13.19,31088200.0,16.678314,243.395109,-14450760000.0,-14.875466,-2.758666,5.959579,21.112451,18.906621,-76.944259,46.527407,"[[0.002610715086827884, 0.0012647352623545009,..."
1,2009-03-20,DIS,15.026185,17.98,17.08,17.799999,17766600.0,18.20855,242.540988,-14419680000.0,-14.772689,-2.079888,5.944065,23.204316,20.512819,-57.177326,47.834794,"[[0.002610715086827884, 0.0012647352623545009,..."
2,2009-03-20,HD,16.65284,22.73,21.76,22.59,22361800.0,16.276229,243.588752,-14428400000.0,-14.63266,-2.734109,5.68613,19.282005,18.559885,-41.654768,48.539462,"[[0.002610715086827884, 0.0012647352623545009,..."
3,2009-03-20,IBM,64.557983,95.0,92.18,93.160004,12193900.0,21.418925,243.773182,-14474540000.0,-13.786223,2.275133,7.138456,20.680496,27.180991,218.356625,60.801785,"[[0.002610715086827884, 0.0012647352623545009,..."
4,2009-03-20,INTC,10.250909,15.4,14.35,15.19,84639100.0,14.87889,244.03741,-14513040000.0,-13.949048,-3.194407,5.647357,18.807598,17.281581,-70.446011,46.698434,"[[0.002610715086827884, 0.0012647352623545009,..."


### 4.6 Store the Dataframe

In [70]:
df = data_with_features_covs

In [71]:
%store df

Stored 'df' (DataFrame)
