# Feature Engineering

In [None]:
# Environment variables
from dotenv import load_dotenv
load_dotenv("mysql.env")

import os
import sys
import mysql.connector

# Data Manipulation
import pandas as pd
import numpy as np

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Feature Engineering - Technical Analysis
from ta import add_all_ta_features
from ta.utils import dropna

# Feature Selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection, preprocessing
import xgboost as xgb

# Suppress all warnings (the filter below is global, not limited to pandas)
import warnings
warnings.filterwarnings('ignore')

# List of Stocks and ETFs
Provided by Thomas Choi.

In [None]:
# Exchange codes; the commented-out query in the MySQL cell filters on these.
exchange_list = ['HKEX', 'NYSE', 'NASDAQ', 'AMEX']

# Load the stock and ETF listings from their CSV files.
stock_list, etf_list = (
    pd.read_csv(csv_path)
    for csv_path in ("stocks_and_etfs/stock_list.csv", "stocks_and_etfs/etf_list.csv")
)

In [None]:
# Pick the symbol to analyse: row position 20, first column of stock_list
# (iloc is zero-based). It is used below as the Symbol filter of the SQL query.
stock_symbol = stock_list.iloc[20,0]
# Bare expression so the notebook displays the chosen symbol.
stock_symbol

# MySQL connection

In [None]:
# Connection settings are read from the environment, which load_dotenv
# populated from "mysql.env" at the top of the notebook.
# NOTE(review): USER is usually pre-set by the OS on Unix and load_dotenv does
# not override existing variables by default -- confirm the intended MySQL
# account is the one actually picked up here.
HOST = os.environ.get("HOST")
PORT = os.environ.get("PORT")
USER = os.environ.get("USER")
PASSWORD = os.environ.get("PASSWORD")

# The symbol is bound as a query parameter (%s) instead of being interpolated
# into the SQL text, so quoting/escaping is handled by the driver.
query = "SELECT Date, Close, Open, High, Low, Volume from histdailyprice3 WHERE Symbol=%s;"
#query = f"SELECT Symbol, Date, Close, Open, High, Low, Volume from histdailyprice3 WHERE Exchange='{exchange_list[1]}';"

# conn starts as None so the cleanup below never touches an undefined (or
# stale, from a previous run of this cell) connection when connect() fails.
conn = None
try:
    conn = mysql.connector.connect(
        host=HOST,
        port=PORT,
        user=USER,
        password=PASSWORD,
        database="GlobalMarketData"
    )
    # str() mirrors the string formatting the symbol received previously.
    histdailyprice3 = pd.read_sql(query, conn, params=(str(stock_symbol),))
except Exception as e:
    # Best-effort cell: report the problem; histdailyprice3 is left unset.
    print(str(e))
finally:
    # Close the connection on both the success and the failure path.
    if conn is not None:
        conn.close()

# Load Data

In [None]:
# Work on a copy so the frame fetched from MySQL stays untouched, and clean
# NaN values with ta's dropna helper in the same step.
# Date is kept as a regular column (later cells drop it by name):
#df.set_index("Date", drop=True, inplace=True)
df = dropna(histdailyprice3.copy())

# Stock Dataset with Technical Analysis

In [None]:
# Append the ta library's technical-analysis indicators, computed from the
# OHLCV columns; fillna=True asks ta to fill missing indicator values.
df = add_all_ta_features(
    df,
    open="Open",
    high="High",
    low="Low",
    close="Close",
    volume="Volume",
    fillna=True,
)

In [None]:
# Preview the first rows of the frame (DataFrame.head shows 5 rows by default).
df.head()

In [None]:
# List every column name now present in the frame.
df.columns

In [None]:
# Print the frame summary: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column
df.describe()

# Feature Engineering

In [None]:
# Replace every object-typed column with integer codes from a LabelEncoder;
# columns of any other dtype are left as they are.
for f in df.columns:
    if df[f].dtype != 'object':
        continue
    lbl = preprocessing.LabelEncoder()
    df[f] = lbl.fit_transform(list(df[f].values))

# Tally how many columns there are of each dtype after encoding. The column
# holding the feature names is labelled "Count" because the group-by count
# below turns it into the number of columns per dtype.
dtype_df = df.dtypes.reset_index()
dtype_df.columns = ["Count", "Column Type"]
dtype_df.groupby("Column Type").count().reset_index()

In [None]:
# Absolute Pearson correlation between all columns except Date. Column pairs
# with fewer than 1000 shared observations get NaN (min_periods).
corrmat = (
    df.drop(columns=["Date"])
    .corr(method='pearson', min_periods=1000)
    .abs()
)

# Draw the matrix as an unlabelled heatmap, colour scale capped at 0.8.
sns.set(context="paper", font="monospace")
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(
    corrmat,
    vmax=.8,
    square=True,
    xticklabels=False,
    yticklabels=False,
)
plt.show()

In [None]:
# Maximum number of features to keep in the ranking.
remain_num = 50

# Absolute correlation of each column with Close.
# NOTE(review): [:-2] leaves the last two columns out of the ranking, as the
# original selection did -- confirm that exclusion is intended.
corr_target = corrmat['Close'].reset_index()[:-2]
corr_target.columns = ['feature', 'abs_corr']

# Close correlates perfectly with itself, so the target is left out of its own
# ranking; features at or below 0.01 (and NaN correlations) are discarded.
corr_target = corr_target.loc[
    (corr_target['feature'] != 'Close') & (corr_target['abs_corr'] > 0.01)
]

# Keep the remain_num STRONGEST correlations. The previous ascending sort
# followed by [:remain_num] kept the weakest ones instead. Sorting ascending
# and taking the tail leaves the rows ordered weakest-to-strongest, so the
# strongest feature is drawn at the top of the horizontal bar chart.
corr_target = corr_target.sort_values(by='abs_corr', ascending=True).tail(remain_num)

# One horizontal bar per retained feature.
ind = np.arange(corr_target.shape[0])
fig, ax = plt.subplots(figsize=(10, 18))
ax.barh(ind, corr_target.abs_corr.values, color='r')
ax.set_yticks(ind)
ax.set_yticklabels(corr_target.feature.values, rotation='horizontal')
ax.set_xlabel("absolute corr", fontsize=14)
ax.set_title("Correlations between features and Close price", fontsize=18)
plt.show()

In [None]:
# Names of the features retained by the correlation ranking, as a plain list.
corr_target_f = corr_target['feature'].tolist()

# Correlation between features

In [None]:
# Build a new list instead of aliasing corr_target_f: appending in place would
# also add 'Close' to corr_target_f and add it again on every re-run of this
# cell. Filtering first guarantees 'Close' appears exactly once, at the end.
corr_target_f2 = [name for name in corr_target_f if name != 'Close'] + ['Close']

# Pairwise absolute Pearson correlation among the selected features and Close,
# scaled to 0-100 so the annotations read as whole percentages.
high_corr = df[corr_target_f2].corr(method='pearson', min_periods=1000)
high_corr = np.abs(high_corr) * 100

f, ax = plt.subplots(figsize=(11, 11))
sns.heatmap(high_corr, cbar=False, annot=True, square=True, fmt='.0f',
            annot_kws={'size': 8})
plt.title('High-correlation Features')
plt.show()

In [None]:
# Target: the Close column. Predictors: every other column except Date.
df_y = df.Close.values
df_X = df.drop(["Date", "Close"], axis=1)

# Booster settings. 'reg:squarederror' is the current name of the deprecated
# 'reg:linear' objective, and 'verbosity' replaces the deprecated 'silent'
# flag (verbosity 0 = silent).
xgb_params = {
    'eta': 0.05,
    'max_depth': 10,
    'subsample': 1.0,
    'colsample_bytree': 0.7,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'verbosity': 0,
}

# Feature names are passed as a plain list of the predictor column names.
dtrain = xgb.DMatrix(df_X, df_y, feature_names=list(df_X.columns))

# Train with warnings enabled for this run (verbosity=1), as the original
# silent=0 override did, without changing xgb_params itself.
model = xgb.train(dict(xgb_params, verbosity=1), dtrain, num_boost_round=100)

# Upper bound on how many features the importance cells below plot and keep.
remain_num = 99

In [None]:
# Plot the trained booster's feature importances on a tall figure, showing at
# most remain_num features.
fig, ax = plt.subplots(figsize=(10, 18))
xgb.plot_importance(
    model,
    ax=ax,
    height=0.8,
    max_num_features=remain_num,
)
plt.show()

In [None]:
# 'weight' importance scores from the booster, keyed by feature name.
importance = model.get_score(importance_type='weight')

# (feature, score) pairs ordered from highest to lowest score, cut to the
# first remain_num entries.
tuples = sorted(importance.items(), key=lambda pair: pair[1], reverse=True)[:remain_num]

# Feature names only, in the same order.
xgb_imp_f = [feature for feature, _ in tuples]