# Data Preparation 

## Imports
#### Base Imports


In [1]:
import warnings
warnings.filterwarnings('ignore')

import sys
import os
sys.path.append("../FinRL/")
sys.path.append("../")

import pandas as pd 
import numpy as np
import itertools
import datetime
import random
import pickle
import yfinance as yf

#### PCA Imports

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay

#### FinRL Imports

In [3]:
from finrl.meta.preprocessor.yahoodownloader import YahooDownloader
from finrl.meta.preprocessor.preprocessors import FeatureEngineer, data_split

#### Configs

In [4]:
from Config import local_config
from Config.local_config import (
    TRAIN_START_DATE,
    TRAIN_END_DATE,

    TRADE_START_DATE,
    TRADE_END_DATE,

    DOW_30_TICKERS
)

from finrl import config

## Feature Engineering

Feature engineering is handled by a small set of classes that retrieve price data and derive features from it. Together, these classes can:
1. Retrieve OHLCV data for any set of tickers
2. Compute the values for a list of indicators on the OHLCV data
3. Run PCA on the indicator data.

In [5]:
class run_PCA():
    """Reduce a set of indicator columns to principal components.

    Attributes:
        data: DataFrame holding the price columns, the indicator columns,
            and the ``vix`` and ``turbulence`` columns read by ``run_PCA``.
        tickers: Ticker symbols supplied by the caller (stored only; this
            class does not read them).
        indicators: Names of the columns in ``data`` that are fed to PCA.
        number_of_components: Number of components produced by the most
            recent ``run_PCA`` call, or -1 before the first call.
    """

    def __init__(self, indicators, data, tickers) -> None:
        self.data = data
        self.tickers = tickers
        self.indicators = indicators
        self.number_of_components = -1

    def run_PCA(self, n_components = .99):
        """Standardize the indicator columns, fit PCA and return the reduced frame.

        The fitted PCA object is pickled to
        ``trained_models/PCA_Model_{n_components}.pickle``, relative to the
        current working directory. ``open`` does not create directories, so
        ``trained_models/`` must already exist.

        Args:
            n_components: Passed unchanged to ``PCA(n_components=...)``.

        Returns:
            A new DataFrame with the columns date, tic, open, high, low,
            close, volume, day, one column per principal component
            (labelled 0, 1, ...), then vix and turbulence. Missing values
            are replaced by 0.
        """
        x = self.data.loc[:, self.indicators].values
        x = StandardScaler().fit_transform(x)
        pca = PCA(n_components=n_components)

        principalComponents = pca.fit_transform(x)
        principalDf = pd.DataFrame(data = principalComponents)

        # Use a context manager so the file handle is closed (and flushed)
        # even if pickling raises.
        with open(f"trained_models/PCA_Model_{str(n_components)}.pickle", 'wb') as model_file:
            pickle.dump(pca, model_file)

        # Explicit copy: the columns added below must never touch self.data.
        ret = self.data[['date','tic','open','high','low','close','volume', 'day']].copy()

        # .values drops principalDf's index so rows are matched by position.
        for count in range(principalDf.shape[1]):
            ret[count] = principalDf[count].values

        ret['vix'] = self.data['vix']
        ret['turbulence'] = self.data['turbulence']

        self.number_of_components = principalDf.shape[1]

        return ret.fillna(0)

In [6]:
class data_preparation:
    """Download price data for a set of tickers and enrich it with features.

    The constructor downloads the data through ``YahooDownloader``. The other
    methods replace ``self.data`` with an enriched frame or write it to disk.

    Attributes:
        start_date: Start of the download window, as passed by the caller.
        end_date: End of the download window, as passed by the caller.
        tickers: Ticker symbols to download.
        indicators: Technical indicator names handed to ``FeatureEngineer``
            and, in ``add_pca``, to ``run_PCA``.
        data: The current working DataFrame.
        pca: The ``run_PCA`` helper; only set after ``add_pca`` has run.
    """

    def __init__(self, start_date, end_date, tickers, indicators) -> None:
        self.start_date = start_date
        self.end_date = end_date
        self.tickers = tickers
        self.indicators = indicators

        self.data = YahooDownloader(start_date = self.start_date,
                                    end_date = self.end_date,
                                    ticker_list = tickers).fetch_data()
        # sort_values returns a new frame (it is not in-place by default),
        # so the result has to be assigned back for the sort to take effect.
        self.data = self.data.sort_values(['date', 'tic'], ignore_index = True)

    def add_indicators(self, vix:bool, turbulence:bool):
        """Replace ``self.data`` with the feature-engineered frame.

        Args:
            vix: Forwarded to ``FeatureEngineer(use_vix=...)``.
            turbulence: Forwarded to ``FeatureEngineer(use_turbulence=...)``.
        """
        fe = FeatureEngineer(
            use_technical_indicator = True,
            tech_indicator_list = self.indicators,
            use_vix = vix, # a real time market index representing the markets expectations for volatility over the next 30 days
            use_turbulence = turbulence, # accounts for unexpected rising and falling of the stock market
            user_defined_feature = False)

        processed = fe.preprocess_data(self.data)

        # Build every (date, ticker) pair so that a ticker missing on some
        # date still gets a row; those rows are zero-filled below.
        list_ticker = processed["tic"].unique().tolist()
        list_date = list(pd.date_range(processed['date'].min(),processed['date'].max()).astype(str))
        combination = list(itertools.product(list_date,list_ticker))

        processed_full = pd.DataFrame(combination,columns=["date","tic"]).merge(processed,on=["date","tic"],how="left")
        # date_range yields every calendar day; keep only dates that actually
        # occur in the processed data.
        processed_full = processed_full[processed_full['date'].isin(processed['date'])]
        processed_full = processed_full.sort_values(['date','tic'])

        self.data = processed_full.fillna(0)

    def add_pca(self, vix, turbulence, n_components):
        """Add indicators, then replace them with their principal components.

        ``run_PCA.run_PCA`` reads the ``vix`` and ``turbulence`` columns, so
        both flags need to be true for this method to succeed.

        Args:
            vix: Forwarded to ``add_indicators``.
            turbulence: Forwarded to ``add_indicators``.
            n_components: Forwarded to ``run_PCA.run_PCA``.
        """
        self.add_indicators(vix, turbulence)
        self.pca = run_PCA(self.indicators, self.data, self.tickers)
        self.data = self.pca.run_PCA(n_components)

    def save_data(self, name):
        """Write ``self.data`` to ``../Datasets/{name}.csv``."""
        self.data.to_csv(f"../Datasets/{name}.csv")

## Data Retrieval and Saving 

#### Indicator Data for DOW Tickers

In [None]:
# Download the Dow 30 history over the full train + trade window, attach the
# configured indicators plus VIX and turbulence, and write the result to
# ../Datasets/DowIndicatorData.csv.
indicatorData = data_preparation(
    start_date=TRAIN_START_DATE,
    end_date=TRADE_END_DATE,
    tickers=DOW_30_TICKERS,
    indicators=local_config.PCA_INDICATORS,
)
indicatorData.add_indicators(vix=True, turbulence=True)
indicatorData.save_data(name="DowIndicatorData")

#### PCA Data for DOW Tickers

In [None]:
# PCA dataset over the full train + trade window; n_components is handed
# through to run_PCA and also appears in the output file name.
n_components = .70
pcaData = data_preparation(
    start_date=TRAIN_START_DATE,
    end_date=TRADE_END_DATE,
    tickers=DOW_30_TICKERS,
    indicators=local_config.PCA_INDICATORS,
)
pcaData.add_pca(vix=True, turbulence=True, n_components=n_components)
pcaData.save_data(name=f"/PCA/Dow_Pca_{str(n_components)}")

#### PCA Data up to date

In [7]:
# PCA dataset from the training start date up to the fixed end date
# 2023-05-08, saved with a "_current" suffix.
n_components = .99
pcaData = data_preparation(
    start_date=TRAIN_START_DATE,
    end_date="2023-05-08",
    tickers=DOW_30_TICKERS,
    indicators=local_config.PCA_INDICATORS,
)
pcaData.add_pca(vix=True, turbulence=True, n_components=n_components)
pcaData.save_data(name=f"/PCA/Dow_Pca_{str(n_components)}_current")

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [8]:
# Same "_current" PCA export as the previous cell, with n_components = 0.85.
n_components = .85
pcaData = data_preparation(
    start_date=TRAIN_START_DATE,
    end_date="2023-05-08",
    tickers=DOW_30_TICKERS,
    indicators=local_config.PCA_INDICATORS,
)
pcaData.add_pca(vix=True, turbulence=True, n_components=n_components)
pcaData.save_data(name=f"/PCA/Dow_Pca_{str(n_components)}_current")

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

## Dataset Analysis 

#### Indicator Data for Dow Tickers

In [10]:
# Load the indicator dataset written by save_data(name = "DowIndicatorData").
# The file name must match that spelling exactly, otherwise the read fails on
# case-sensitive filesystems.
indicatorDataset = pd.read_csv("../Datasets/DowIndicatorData.csv", index_col=0)

In [11]:
# Report the dataset size and print one randomly chosen row.
print(f"Num of rows: {len(indicatorDataset)}")
# randrange excludes the upper bound, so the position is always valid for
# iloc; randint(0, len(...)) could return len(...) and raise IndexError.
rand = random.randrange(len(indicatorDataset))
print(f"Sample row ({rand}):\n{indicatorDataset.iloc[rand]}")

Num of rows: 80301
Sample row (2268):
date              2010-04-26
tic                     CSCO
open                   27.58
high                    27.7
low                    27.48
close                19.5783
volume           3.56634e+07
day                        0
macd                0.306866
boll_ub              19.7016
boll_lb              18.2563
rsi_30               67.1523
cci_30               190.184
dx_30                25.4766
close_30_sma          18.884
close_60_sma         18.0729
close_60_smma        18.1605
atr_30               7.75297
supertrend_ub        44.0752
supertrend_lb        4.01207
supertrend           44.0752
pdi                  1.70643
mdi                  1.03068
dx                   24.6884
adx                  23.1065
adxr                  25.206
vix                    17.47
turbulence                 0
Name: 3370, dtype: object


#### PCA Data for DOW Tickers

In [12]:
# Load the PCA dataset; the first CSV column is used as the index.
pcaDataset = pd.read_csv(
    filepath_or_buffer="../Datasets/DowPcaData.csv",
    index_col=0,
)

In [13]:
# Report the dataset size and print one randomly chosen row.
print(f"Num of rows: {len(pcaDataset)}")
# randrange excludes the upper bound, so the position is always valid for
# iloc; randint(0, len(...)) could return len(...) and raise IndexError.
rand = random.randrange(len(pcaDataset))
print(f"Sample row ({rand}):\n{pcaDataset.iloc[rand]}")

Num of rows: 80301
Sample row (36678):
date          2015-01-09
tic                   PG
open               91.18
high               91.18
low                90.12
close            71.6012
volume        4.8728e+06
day                    4
0               -0.30608
1              -0.747747
2             -0.0259247
3               -1.02958
4              0.0249799
5               -0.18917
6               0.626527
7             -0.0369904
8              0.0133445
9              0.0480704
10             -0.441919
vix                17.55
turbulence       24.4064
Name: 53237, dtype: object
