# Principal Component Analysis (PCA)

## Setup

In [5]:
%load_ext autoreload
%autoreload 2

from notebook_config import setup_notebook
import matplotlib.pyplot as plt
import numpy as np

setup_notebook()

In [6]:
from datasets.stocks_data_wrapper import StocksDataWrapper
from helpers.data_helper import *
from helpers.plots_helper import *

### Prepare data

In [7]:
DATA_PATH = '../data/'
FILE_SUFFIX='.txt'

quotation = 'IBM'
price_column = 'Close'

predict_n = 5

In [8]:
data_wrapper = StocksDataWrapper.read_from(file_path=f"{DATA_PATH}{quotation}{FILE_SUFFIX}", thresh_diff=0.005,
                                           compute_features=True, predict_n=predict_n, normalize=True)

data_wrapper.head()

Unnamed: 0,Date,Open,High,Low,Close,Adjusted,Volume,LowLen,RSI(14),GAP,...,SMA(10),SMA(20),EMA(14),EMA_Diff,SMA(20) - SMA(10),Difference,PercentageDiff,Tendency,NextPrice,Next
0,1962-03-21,0.015428,0.014523,0.015787,0.015429,0.00502,0.00378,0.002525,0.595025,0.0,...,0.014503,0.013555,0.013679,0.614097,0.394439,0.553595,0.54467,stay,0.014705,higher
1,1962-03-22,0.015334,0.014334,0.015533,0.015051,0.004897,0.003456,0.0,0.456905,0.001641,...,0.014474,0.013587,0.013637,0.61356,0.394997,0.552214,0.528665,lower,0.01461,stay
2,1962-03-23,0.015113,0.014271,0.015565,0.015177,0.004938,0.0027,0.0,0.505305,0.00041,...,0.014436,0.013619,0.013618,0.613056,0.395653,0.552076,0.527143,stay,0.01428,lower
3,1962-03-26,0.015207,0.014334,0.015597,0.015162,0.004933,0.001944,0.001263,0.499957,0.0,...,0.014399,0.013656,0.0136,0.612481,0.396346,0.551593,0.52163,stay,0.013823,lower
4,1962-03-27,0.015113,0.014114,0.014963,0.014516,0.004723,0.005076,0.000842,0.332912,0.001026,...,0.0143,0.013658,0.013496,0.61145,0.397285,0.54973,0.499895,lower,0.013005,lower


In [13]:
df = data_wrapper.df
feature_names = data_wrapper.feature_names
print(feature_names)
df.isna().any()

['LowLen', 'RSI(14)', 'GAP', 'RSI_diff', 'Volume_diff', 'MACD', 'MACD_diff', 'MACD_signal', 'BodyLen', 'BG_L_Band', 'BG_H_Band', 'BG_L_Band_Indicator', 'BG_H_Band_Indicator', 'ROC', 'StochOsc', 'SMA(10)', 'SMA(20)', 'EMA(14)', 'EMA_Diff', 'SMA(20) - SMA(10)', 'Difference', 'PercentageDiff', 'Tendency', 'NextPrice', 'Next']


Date                   False
Open                   False
High                   False
Low                    False
Close                  False
Adjusted               False
Volume                 False
LowLen                 False
RSI(14)                False
GAP                    False
RSI_diff               False
Volume_diff            False
MACD                   False
MACD_diff              False
MACD_signal            False
BodyLen                False
BG_L_Band              False
BG_H_Band              False
BG_L_Band_Indicator    False
BG_H_Band_Indicator    False
ROC                    False
StochOsc               False
SMA(10)                False
SMA(20)                False
EMA(14)                False
EMA_Diff               False
SMA(20) - SMA(10)      False
Difference             False
PercentageDiff         False
Tendency               False
NextPrice              False
Next                   False
dtype: bool

In [17]:
y_column = 'Next'
data_columns = ['Close', 'Volume'] + feature_names
data_columns.remove('NextPrice')
data_columns.remove('Tendency')

X_train, X_test, y_train, y_test = data_wrapper.get_datasets(n_splits=1, val_size=0.3, 
                                                             y_column=y_column, 
                                                             features_list=data_columns)

In [18]:
from sklearn.decomposition import PCA

pca = PCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [19]:
explained_variance = pca.explained_variance_ratio_

for explained_var, col in zip(explained_variance, X.columns.values):
    print(f"{col} -> {explained_var}")

Close -> 0.39280583941416947
Volume -> 0.2712347043663882
LowLen -> 0.13733587233647115
RSI(14) -> 0.09730154557191174
GAP -> 0.03688124596288666
RSI_diff -> 0.018448645365126397
Volume_diff -> 0.011035430987693374
MACD -> 0.010339886910344213
MACD_diff -> 0.0073564020950081495
MACD_signal -> 0.00510722658333324
BodyLen -> 0.0028779125047979667
BG_L_Band -> 0.002687802224306727
BG_H_Band -> 0.0020763163361475346
BG_L_Band_Indicator -> 0.0019602577143124603
BG_H_Band_Indicator -> 0.0013535016146364396
ROC -> 0.0005492270983866083
StochOsc -> 0.000409919986170451
SMA(10) -> 0.0001905461565261064
SMA(20) -> 4.287494875536624e-05
EMA(14) -> 4.647428525580513e-06
EMA_Diff -> 1.9439410218377953e-07
SMA(20) - SMA(10) -> 1.3750107693626623e-30
Difference -> 1.0873479518901958e-32
PercentageDiff -> 2.881686169659581e-33
