# Feature Engineering

In [12]:
# Standard library
import os
import sys

# Third-party
import mysql.connector
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from ta import add_all_ta_features
from ta.utils import dropna

# Environment variables
# Load the MySQL connection settings from mysql.env into os.environ.
# override=True lets the file win over variables that already exist in the
# process environment; without it a pre-set variable with the same name
# (USER is normally set by the OS on Unix-like systems) would be kept and the
# value from mysql.env silently ignored.
load_dotenv("mysql.env", override=True)

# List of Stocks and ETFs
The stock and ETF symbol lists were provided by Thomas Choi.

In [2]:
# Locations of the symbol lists, relative to the notebook's working directory.
STOCK_LIST_CSV = "stocks_and_etfs/stock_list.csv"
ETF_LIST_CSV = "stocks_and_etfs/etf_list.csv"

# Read both lists into DataFrames.
stock_list = pd.read_csv(STOCK_LIST_CSV)
etf_list = pd.read_csv(ETF_LIST_CSV)

In [3]:
# Positional row of the stock list to analyse; the symbol is taken from the
# first column of that row.
SYMBOL_ROW = 20
stock_symbol = stock_list.iat[SYMBOL_ROW, 0]
stock_symbol

'VIAC'

# MySQL connection

In [4]:
# Connection settings come from the process environment.
# NOTE(review): USER is normally pre-set by the OS on Unix-like systems, and
# load_dotenv only replaces existing variables when called with override=True
# -- confirm the MySQL user from mysql.env is the one actually picked up here.
HOST=os.environ.get("HOST")
PORT=os.environ.get("PORT")
USER=os.environ.get("USER")
PASSWORD=os.environ.get("PASSWORD")

# The symbol is bound as a query parameter (%s is the mysql.connector
# placeholder) instead of being formatted into the SQL text, so a symbol
# containing quotes cannot break or alter the statement.
# ORDER BY Date makes the row order explicit: without it the database may
# return rows in any order, and the indicators computed later in this notebook
# are calculated over consecutive rows.
query = (
    "SELECT Date, Close, Open, High, Low, Volume "
    "FROM histdailyprice3 WHERE Symbol = %s ORDER BY Date;"
)

conn = None  # stays None if connect() fails, so the cleanup below is safe
try:
    conn = mysql.connector.connect(
        host=HOST,
        port=PORT,
        user=USER,
        password=PASSWORD,
        database="GlobalMarketData"
    )
    # str() guards against a non-builtin string scalar coming out of pandas.
    histdailyprice3 = pd.read_sql(query, conn, params=(str(stock_symbol),))
except Exception as e:
    print(str(e))
    # Re-raise so the notebook stops here; otherwise later cells would fail on
    # a missing `histdailyprice3` or silently reuse one left in the kernel.
    raise
finally:
    # Close the connection exactly once, on success and on failure alike.
    if conn is not None:
        conn.close()

# Load Data

In [19]:
# Work on an independent copy of the query result, indexed by the Date column
# (the column is moved into the index rather than kept as a regular column).
df = histdailyprice3.copy().set_index("Date", drop=True)

# Stock Dataset

In [20]:
# Clean NaN values
df = dropna(df)

# Add ta features filling NaN values
df = add_all_ta_features(
    df, open="Open", high="High", low="Low", close="Close", volume="Volume", fillna=True)

In [22]:
# Column names after feature generation: the price/volume columns followed by
# the indicator columns added by ta.
df.columns

Index(['Close', 'Open', 'High', 'Low', 'Volume', 'volume_adi', 'volume_obv',
       'volume_cmf', 'volume_fi', 'volume_mfi', 'volume_em', 'volume_sma_em',
       'volume_vpt', 'volume_nvi', 'volume_vwap', 'volatility_atr',
       'volatility_bbm', 'volatility_bbh', 'volatility_bbl', 'volatility_bbw',
       'volatility_bbp', 'volatility_bbhi', 'volatility_bbli',
       'volatility_kcc', 'volatility_kch', 'volatility_kcl', 'volatility_kcw',
       'volatility_kcp', 'volatility_kchi', 'volatility_kcli',
       'volatility_dcl', 'volatility_dch', 'volatility_dcm', 'volatility_dcw',
       'volatility_dcp', 'volatility_ui', 'trend_macd', 'trend_macd_signal',
       'trend_macd_diff', 'trend_sma_fast', 'trend_sma_slow', 'trend_ema_fast',
       'trend_ema_slow', 'trend_adx', 'trend_adx_pos', 'trend_adx_neg',
       'trend_vortex_ind_pos', 'trend_vortex_ind_neg', 'trend_vortex_ind_diff',
       'trend_trix', 'trend_mass_index', 'trend_cci', 'trend_dpo', 'trend_kst',
       'trend_kst_sig', '

In [23]:
# Index range plus per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1124 entries, 2005-01-21 to 2021-09-03
Data columns (total 88 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Close                      1124 non-null   float64
 1   Open                       1124 non-null   float64
 2   High                       1124 non-null   float64
 3   Low                        1124 non-null   float64
 4   Volume                     1124 non-null   float64
 5   volume_adi                 1124 non-null   float64
 6   volume_obv                 1124 non-null   float64
 7   volume_cmf                 1124 non-null   float64
 8   volume_fi                  1124 non-null   float64
 9   volume_mfi                 1124 non-null   float64
 10  volume_em                  1124 non-null   float64
 11  volume_sma_em              1124 non-null   float64
 12  volume_vpt                 1124 non-null   float64
 13  volume_nvi                 1124 non-nu

In [24]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column
df.describe()

Unnamed: 0,Close,Open,High,Low,Volume,volume_adi,volume_obv,volume_cmf,volume_fi,volume_mfi,...,momentum_wr,momentum_ao,momentum_kama,momentum_roc,momentum_ppo,momentum_ppo_signal,momentum_ppo_hist,others_dr,others_dlr,others_cr
count,1124.0,1124.0,1124.0,1124.0,1124.0,1124.0,1124.0,1124.0,1124.0,1124.0,...,1124.0,1124.0,1124.0,1124.0,1124.0,1124.0,1124.0,1124.0,1124.0,1124.0
mean,16.634327,16.998821,16.216437,16.598891,6368148.0,-319449400.0,-64338740.0,-5.834508,-4398003.0,51.656557,...,-32.427277,0.391482,16.577363,5.77959,-3.516326,-3.483439,-0.032887,0.451871,0.118239,51.221153
std,16.669061,17.101092,16.250944,16.691785,14515360.0,19920820.0,196506100.0,5.880422,45777120.0,20.076531,...,69.400494,5.989693,16.480865,49.238463,19.606289,17.004885,8.725381,14.694226,6.855726,151.536922
min,3.68,3.72,3.56,3.66,16800.0,-322699600.0,-696020500.0,-46.962864,-740331800.0,0.0,...,-414.965986,-25.884737,3.94508,-55.763994,-69.852645,-63.5818,-23.464178,-33.871685,-33.592486,-66.545455
25%,5.3475,5.49,5.235,5.33,137429.5,-322686400.0,-56408460.0,-6.638736,-15066.66,36.173431,...,-77.7884,-0.498126,5.402529,-7.786027,-12.947378,-10.782003,-4.640794,-1.993854,-2.009279,-51.386364
50%,7.22,7.245,7.13,7.2,286882.0,-322686400.0,2954816.0,-4.513295,1472.039,50.718567,...,-29.62963,-0.023059,7.218502,0.0,-3.208875,-2.576071,-0.989282,0.0,0.0,-34.363636
75%,27.5,28.115,27.10125,27.685,9300975.0,-322686400.0,8144266.0,-2.445197,102439.6,65.255961,...,3.373756,0.850085,26.827731,7.569035,6.828985,5.809818,3.285488,1.781983,1.766292,150.0
max,99.15,101.97,96.21,100.34,216574100.0,-54518870.0,486998100.0,0.624604,82223770.0,100.0,...,1666.666667,30.060453,90.612185,493.655172,85.15502,42.651885,71.869954,465.517241,173.25706,801.363636
