In [5]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

import sys
sys.path.append("..") # Adds higher directory to python modules path.


%matplotlib inline
%load_ext autoreload
%autoreload 2

plt.rcParams["figure.figsize"] = (20,8)

sns.set_theme()

font = {'family' : 'DejaVu Sans', 'size'   : 25}

matplotlib.rc('font', **font)

In [6]:
from StocksDataWrapper import *
from DataHelper import *
from Plots import *
from DL_utils import *

In [7]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')
print(device)

cuda


### Prepare data

In [105]:
DATA_PATH = '../data/'
QUOTATIONS = ['AMZN', 'GOOG', 'AAPL', 'GM', 'TSLA', 'JNJ', 'XOM', 'AAL', 'KO', 'WMT']
QUOTATIONS = ['AAPL']
quotation = QUOTATIONS[0]
FILE_SUFFIX = '.txt'
price_column = 'Close'
project_label='PCA'
normalize = True
seq_len = 10
predict_n = 1

In [106]:
data_wrapper = StocksDataWrapper.read_from(file_path=f"{DATA_PATH}{quotation}{FILE_SUFFIX}", 
                                           compute_features=True, predict_n=predict_n, normalize=normalize)

data_wrapper.head()

0         2.992857
1         3.059286
2         3.037500
3         3.052500
4         3.306072
           ...    
3520    131.970001
3521    136.690002
3522    134.869995
3523    133.720001
3524    132.690002
Name: Close, Length: 3525, dtype: float64


Unnamed: 0,Date,Open,High,Low,Close,Volume,Adjusted,LowLen,RSI(14),GAP,...,SMA(10),SMA(20),EMA(14),EMA_Diff,SMA(20) - SMA(10),Difference,PercentageDiff,Tendency,NextPrice,Next
0,2007-02-21,0.001741,0.001969,0.002107,0.002934,0.337769,0.002524,7.8e-05,0.597417,0.000317,...,2.3e-05,0.0,0.0,0.445527,0.531663,0.516652,0.683788,higher,0.003017,higher
1,2007-02-22,0.003014,0.002316,0.002805,0.003017,0.243081,0.002595,0.003813,0.608719,0.006346,...,0.000117,4e-05,0.000135,0.445232,0.531187,0.511465,0.573995,higher,0.002899,lower
2,2007-02-23,0.002581,0.002192,0.002891,0.002899,0.147424,0.002494,0.000856,0.582323,0.001388,...,0.000197,8.1e-05,0.000236,0.443849,0.530828,0.510164,0.547628,lower,0.00275,lower
3,2007-02-26,0.00276,0.002103,0.002555,0.00275,0.176675,0.002365,0.003501,0.548723,0.003054,...,0.000343,0.000126,0.000302,0.44247,0.529935,0.509956,0.543318,lower,0.001528,lower
4,2007-02-27,0.001825,0.001335,0.001414,0.001528,0.334932,0.001315,0.002023,0.340556,0.008765,...,0.000317,9.7e-05,0.000187,0.435232,0.529921,0.502983,0.400476,lower,0.00171,higher


In [107]:
df = data_wrapper.df
feature_names = data_wrapper.feature_names
df.isna().any()

Date                   False
Open                   False
High                   False
Low                    False
Close                  False
Volume                 False
Adjusted               False
LowLen                 False
RSI(14)                False
GAP                    False
RSI_diff               False
Volume_diff            False
MACD                   False
MACD_diff              False
MACD_signal            False
BodyLen                False
BG_L_Band              False
BG_H_Band              False
BG_L_Band_Indicator    False
BG_H_Band_Indicator    False
ROC                    False
StochOsc               False
SMA(10)                False
SMA(20)                False
EMA(14)                False
EMA_Diff               False
SMA(20) - SMA(10)      False
Difference             False
PercentageDiff         False
Tendency               False
NextPrice              False
Next                   False
dtype: bool

In [108]:
from torch.utils.data import Dataset
from TorchDatasets import StocksSeqDataset

y_column = 'Next'
data_columns = ['Close', 'Volume'] + feature_names
data_columns.remove('Tendency')
data_columns += [y_column]


dataset = df.copy()
dataset = dataset.loc[:, data_columns]

    
X = dataset.loc[:, dataset.columns != y_column]
y = dataset[y_column]

X.head(), y.head()

(      Close    Volume    LowLen   RSI(14)       GAP  RSI_diff  Volume_diff  \
 0  0.002934  0.337769  0.000078  0.597417  0.000317  0.732067     0.648025   
 1  0.003017  0.243081  0.003813  0.608719  0.006346  0.537803     0.357237   
 2  0.002899  0.147424  0.000856  0.582323  0.001388  0.483108     0.356134   
 3  0.002750  0.176675  0.003501  0.548723  0.003054  0.472655     0.498444   
 4  0.001528  0.334932  0.002023  0.340556  0.008765  0.219384     0.645423   
 
        MACD  MACD_diff  MACD_signal  ...       ROC  StochOsc   SMA(10)  \
 0  0.382633   0.622316     0.353871  ...  0.647213  0.956260  0.000023   
 1  0.383511   0.623797     0.354309  ...  0.668568  0.836480  0.000117   
 2  0.384091   0.624267     0.354790  ...  0.656403  0.781135  0.000197   
 3  0.384403   0.624000     0.355247  ...  0.608493  0.710694  0.000343   
 4  0.383561   0.621014     0.355420  ...  0.526000  0.094736  0.000317   
 
     SMA(20)   EMA(14)  EMA_Diff  SMA(20) - SMA(10)  Difference  \
 0  0

In [109]:
from sklearn.model_selection import train_test_split
TEST_SIZE = 0.3

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
#X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.3, shuffle=True)
    
print(f"Train size : {len(X_train)}, test_size : {len(X_test)}")

Train size : 2443, test_size : 1048


$Seq_X = [[x_i*N_{features}], [x_{i+1} * N_{features}], ... , x_{seq_len} * N_{features}]$

$Seq_Y = [y_{seq\_len + 1}]$

In [110]:
from sklearn.decomposition import PCA

pca = PCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [111]:
explained_variance = pca.explained_variance_ratio_

for explained_var, col in zip(explained_variance, X.columns.values):
    print(f"{col} -> {explained_var}")

Close -> 0.4648976640067948
Volume -> 0.26698572238711904
LowLen -> 0.1112103306594884
RSI(14) -> 0.05771812144411184
GAP -> 0.027713176869531543
RSI_diff -> 0.016425102338813623
Volume_diff -> 0.014564482192591773
MACD -> 0.010991336609936921
MACD_diff -> 0.0061332971034368305
MACD_signal -> 0.0059224804672117996
BodyLen -> 0.004840356755621142
BG_L_Band -> 0.003938969880282683
BG_H_Band -> 0.0030120047671704122
BG_L_Band_Indicator -> 0.002055475697341416
BG_H_Band_Indicator -> 0.0013709146481393612
ROC -> 0.0009940473525385958
StochOsc -> 0.0008144349122899613
SMA(10) -> 0.0002531290462143294
SMA(20) -> 0.00011328220466770469
EMA(14) -> 4.5057878052785434e-05
EMA_Diff -> 6.12778645006893e-07
SMA(20) - SMA(10) -> 1.619234848730719e-32
Difference -> 1.7661657089592286e-33
PercentageDiff -> 1.0781015124910983e-33
NextPrice -> 5.42260755874504e-34
