# AMP-SOA portfolio optimization using adaptive meta-policy based on second-order agents with deep reinforcement learning
---

## 4.0 Feature Engineering and Data Preprocessing
---
We perform feature engineering and data preprocessing by:
* Adding Technical Indicators to the data. The technical indicators are used as inputs in the training of our Reinforcement Learning Model
* Adding Covariance Matrices, which are also used as inputs for training the Models
* Splitting the data into the training set and the testing (trading) set

### 4.1 Import Relevant Libraries

In [1]:
import pandas as pdp
import pandas as pd
import numpy as np
import ta
from ta import add_all_ta_features
from ta.utils import dropna

from finrl.preprocessing.data import data_split
from finrl.preprocessing.preprocessors import FeatureEngineer
from pickleshare import PickleShareDB


### 4.2 Load the Data from the csv Files

In [2]:
# Full dataset: one row per (date, ticker) with close/high/low/open/volume
data = pd.read_csv('./datasets/data.csv')

# Dataset holding the close prices only
prices_data = pd.read_csv('./datasets/close_prices.csv')



In [3]:
# Read the pre-selected tickers from disk
filtered_stocks = pd.read_csv('filtered_stocks.csv')
# Drop the 'Unnamed: 0' column (presumably an index that was saved with the CSV - confirm)
filtered_stocks = filtered_stocks.drop(columns=['Unnamed: 0'])
# Keep just the ticker symbols as a plain Python list
filtered_stocks = filtered_stocks['stock_name'].tolist()
# IPython magic: persist the list in IPython's storage (retrievable with %store -r filtered_stocks)
%store filtered_stocks

Stored 'filtered_stocks' (list)


In [4]:
# Second name for the filtered ticker list (plain assignment: same list object, not a copy)
list_of_stocks = filtered_stocks
print(list_of_stocks)

['PG', 'JNJ', 'VZ', 'KO', 'PFE', 'MCD', 'MMM', 'IBM', 'WMT', 'RTX', 'HD', 'MRK', 'CSCO', 'CAT', 'V', 'AAPL', 'AXP', 'MSFT', 'XOM', 'TRV']


In [5]:
data.head()

Unnamed: 0,date,tic,close,high,low,open,volume
0,2008-03-19,AAPL,3.915352,4.796071,4.631071,4.754286,1010537000.0
1,2008-03-19,AXP,32.275002,44.48,41.919998,44.200001,14098300.0
2,2008-03-19,BA,54.094528,77.0,73.449997,76.980003,9195600.0
3,2008-03-19,CAT,47.48143,77.0,73.730003,76.620003,7377400.0
4,2008-03-19,CSCO,16.453362,25.58,24.459999,25.469999,63988600.0


In [6]:
# Keep only the rows whose ticker appears in the selected stock list
data = data[data['tic'].isin(list_of_stocks)]

In [7]:
data.tic.unique()

array(['AAPL', 'AXP', 'CAT', 'CSCO', 'HD', 'IBM', 'JNJ', 'KO', 'MCD',
       'MMM', 'MRK', 'MSFT', 'PFE', 'PG', 'RTX', 'TRV', 'V', 'VZ', 'WMT',
       'XOM'], dtype=object)

### 4.3 Add Technical Indicators
---
We define a function to add technical indicators to the dataset by making use of the ta library

The following indicators are considered:
* Volatility Average True Range (ATR)
* Volatility Bollinger Band Width (BBW)
* Volume On-balance Volume (OBV)
* Volume Chaikin Money Flow (CMF)
* Trend Moving Average Convergence Divergence (MACD)
* Trend Average Directional Index (ADX)
* Trend Fast Simple Moving Average (SMA)
* Trend Fast Exponential Moving Average (EMA)
* Trend Commodity Channel Index (CCI)
* Momentum Relative Strength Index (RSI)

In [8]:
# Define a Function for adding technical indicators

def add_features(data, feature_list, short_names, group_col=None):
    """
    Add a selected set of technical indicators to an OHLCV dataset.

    Every indicator is first generated with ``add_all_ta_features`` from the
    ta library; the result is then reduced to the original columns plus the
    requested indicators, which are renamed to their short names.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset with the columns ``open``, ``high``, ``low``, ``close`` and
        ``volume``. It is left untouched; the indicators are computed on a copy.
    feature_list : list of str
        Names of the ta indicator columns to keep (e.g. ``'volatility_atr'``).
    short_names : list of str
        Replacement column names, one per entry of ``feature_list`` and in
        the same order.
    group_col : str, optional
        Column that identifies independent series (e.g. ``'tic'``). When
        given, the indicators are computed separately for each value of that
        column; the per-group results are concatenated and sorted by index
        label, which restores the input row order when the input index is
        increasing. The default ``None`` computes the indicators over all
        rows in their current order.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by the renamed indicator columns, with
        every row that contains a missing value removed.

    Raises
    ------
    ValueError
        If ``feature_list`` and ``short_names`` differ in length.
    """
    # Fail early with a clear message instead of a column-count mismatch later
    if len(feature_list) != len(short_names):
        raise ValueError(
            "feature_list and short_names must have the same length "
            f"(got {len(feature_list)} and {len(short_names)})"
        )

    # Column selections: originals + requested indicators, and their final names
    data_col_names = list(data.columns)
    filter_names = data_col_names + list(feature_list)
    col_rename = data_col_names + list(short_names)

    def _with_indicators(frame):
        # Compute on a copy so indicator columns are never added to the caller's frame
        return add_all_ta_features(frame.copy(), open="open", high="high",
                                   low="low", close="close", volume="volume")

    if group_col is None:
        # NOTE(review): on a long-format frame holding several tickers this
        # computes the indicators across consecutive rows regardless of
        # ticker; pass group_col='tic' to compute them per ticker.
        data = _with_indicators(data)
    else:
        parts = [_with_indicators(part)
                 for _, part in data.groupby(group_col, sort=False)]
        # Stable sort keeps the relative order of rows sharing an index label
        data = pd.concat(parts).sort_index(kind="mergesort")

    # Keep only the requested indicators and switch to the short names
    data = data[filter_names]
    data.columns = col_rename
    data = data.dropna()

    return data

In [9]:
# Indicators to add, keyed by their ta column name and mapped to the short
# column name used in the rest of the notebook.
# (A reduced alternative keeps only the first five: atr, bbw, obv, cmf, macd.)
indicator_short_names = {
    'volatility_atr': 'atr',
    'volatility_bbw': 'bbw',
    'volume_obv': 'obv',
    'volume_cmf': 'cmf',
    'trend_macd': 'macd',
    'trend_adx': 'adx',
    'trend_sma_fast': 'sma',
    'trend_ema_fast': 'ema',
    'trend_cci': 'cci',
    'momentum_rsi': 'rsi',
}

# List of Features to add
feature_list = list(indicator_short_names.keys())

# Short names of the features (same order as feature_list)
short_names = list(indicator_short_names.values())

In [10]:
# Add Indicators to our dataset; a copy is passed in so that `data` itself
# keeps only its original columns
data_with_features = add_features(data.copy(), feature_list, short_names)

  self._nvi.iloc[i] = self._nvi.iloc[i - 1] * (1.0 + price_change.iloc[i])


In [11]:
data_with_features.head()

Unnamed: 0,date,tic,close,high,low,open,volume,atr,bbw,obv,cmf,macd,adx,sma,ema,cci,rsi
40,2008-03-20,IBM,65.684784,113.2696,111.520073,111.940727,11943123.0,34.447397,218.648382,-789386254.0,-11.243337,5.195966,0.0,27.82283,31.272448,206.878734,58.752809
42,2008-03-20,JNJ,40.486256,65.5,64.889999,64.970001,16276300.0,31.082136,218.70276,-805662554.0,-11.486798,5.546084,0.0,28.762572,32.689956,62.821851,52.906277
44,2008-03-20,KO,18.418718,30.57,30.02,30.08,31028600.0,29.020548,218.42731,-836691154.0,-11.719739,3.996817,7.302938,27.695762,30.494381,-47.899937,48.367042
45,2008-03-20,MCD,34.413494,54.759998,53.700001,53.950001,13075600.0,29.752621,218.374354,-823615554.0,-11.837907,4.013392,7.171068,29.513118,31.097322,29.079237,51.607892
46,2008-03-20,MMM,39.348988,66.337791,64.765884,65.635452,8297011.0,29.969789,218.263812,-815318543.0,-11.900783,4.374356,7.20891,31.609394,32.366809,62.763966,52.596624


In [12]:
# From here on feature_list holds the short indicator names: every column
# after the first seven (assumed to be date, tic and the OHLCV fields - verify)
feature_list = data_with_features.columns[7:].tolist()

In [13]:
print(feature_list)

['atr', 'bbw', 'obv', 'cmf', 'macd', 'adx', 'sma', 'ema', 'cci', 'rsi']


### 4.4 Add Covariance Matrix
---
We define a function that will add Covariance Matrices to our dataset

In [14]:
def add_cov_matrix(df, lookback=252):
    """
    Add rolling covariance matrices of returns as part of the defined states.

    For every date that has at least ``lookback`` earlier dates, the close
    prices of that date and of the ``lookback`` dates before it are pivoted
    to a date x ticker table, converted to percentage returns, and the
    covariance matrix of those returns is attached to every row of that date
    in a new ``cov_list`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data with at least the columns ``date``, ``tic`` and
        ``close``. It is not modified.
    lookback : int, optional
        Number of earlier dates in each window. Defaults to 252 (one year,
        as in the original notebook). Must be at least 2 so that every
        window yields two or more returns.

    Returns
    -------
    pandas.DataFrame
        The input rows sorted by ``date`` and ``tic`` with a fresh 0..n-1
        index and an extra ``cov_list`` column. Rows belonging to the first
        ``lookback`` dates are dropped because they have no full window.

    Raises
    ------
    ValueError
        If ``lookback`` is smaller than 2.
    """
    if lookback < 2:
        raise ValueError(f"lookback must be at least 2, got {lookback}")

    # Sort by date and tic, then index every row by the ordinal of its date
    # so that a label slice on the index selects whole days
    df = df.sort_values(['date', 'tic'], ignore_index=True)
    df.index = df.date.factorize()[0]

    unique_dates = df.date.unique()
    cov_list = []  # one covariance matrix per date that has a full window

    for i in range(lookback, len(unique_dates)):
        # .loc slicing includes both end labels: lookback + 1 dates of
        # prices, i.e. up to `lookback` returns after pct_change
        data_lookback = df.loc[i - lookback:i, :]
        price_lookback = data_lookback.pivot_table(index='date', columns='tic', values='close')
        return_lookback = price_lookback.pct_change().dropna()
        cov_list.append(return_lookback.cov().values)

    # Attach each matrix to all rows of its date; the inner merge drops the
    # dates that have no matrix
    df_cov = pd.DataFrame({'date': unique_dates[lookback:], 'cov_list': cov_list})
    df = df.merge(df_cov, on='date')
    df = df.sort_values(['date', 'tic']).reset_index(drop=True)

    return df

In [15]:
# Add Covariance Matrices to our dataset, working from a copy of the
# indicator-enriched data
data_with_features_covs = add_cov_matrix(data_with_features.copy())

In [16]:
data_with_features_covs.head()

Unnamed: 0,date,tic,close,high,low,open,volume,atr,bbw,obv,cmf,macd,adx,sma,ema,cci,rsi,cov_list
0,2009-03-20,AAPL,3.067483,3.6825,3.591786,3.646071,695587200.0,21.624245,240.771026,-287842300000.0,-15.604885,-1.092894,4.784446,17.715581,18.197433,-112.331634,45.730861,"[[0.0013155151011021062, 0.0011575384750591776..."
1,2009-03-20,AXP,9.629786,13.19,12.12,13.19,31088200.0,20.474072,241.683988,-287811200000.0,-15.658606,-1.695398,4.652402,16.456213,16.879334,-74.319369,47.538704,"[[0.0013155151011021062, 0.0011575384750591776..."
2,2009-03-20,CAT,17.987209,28.9,26.73,28.629999,16531300.0,20.353686,242.338441,-287794700000.0,-15.545789,-1.481436,4.441961,16.727739,17.049776,-15.127649,49.830865,"[[0.0013155151011021062, 0.0011575384750591776..."
3,2009-03-20,CSCO,10.697708,16.57,15.75,16.370001,66078200.0,18.542039,242.632129,-287860800000.0,-15.336297,-1.878417,4.241299,16.557303,16.072535,-61.675433,47.866388,"[[0.0013155151011021062, 0.0011575384750591776..."
4,2009-03-20,HD,15.282681,22.73,21.76,22.59,22361800.0,17.891064,242.822755,-287838400000.0,-15.196946,-1.802284,3.962916,17.226666,15.951019,-36.079148,49.222342,"[[0.0013155151011021062, 0.0011575384750591776..."


### 4.6 Store the Dataframe

In [17]:
# Short name for the final dataset (plain assignment: no copy is made)
df = data_with_features_covs

In [18]:
# Write the final dataset to disk without the row index
# NOTE(review): the cov_list arrays are presumably written as their text
# representation in the CSV - confirm before relying on df.csv to reload them
df.to_csv('df.csv', index=False)
# IPython magic: persist df in IPython's storage (retrievable with %store -r df)
%store df

Stored 'df' (DataFrame)
