In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

import sys
sys.path.append("..") # Adds higher directory to python modules path.


%matplotlib inline
%load_ext autoreload
%autoreload 2

plt.rcParams["figure.figsize"] = (20,8)

sns.set_theme()

font = {'family' : 'DejaVu Sans', 'size'   : 25}

matplotlib.rc('font', **font)

In [2]:
from StocksDataWrapper import *
from DataHelper import *
from Plots import *
from DL_utils import *

In [3]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')
print(device)

cuda


### Prepare data

In [4]:
DATA_PATH = '../data/'
QUOTATIONS = ['AMZN', 'GOOG', 'AAPL', 'GM', 'TSLA', 'JNJ', 'XOM', 'AAL', 'KO', 'WMT']
QUOTATIONS = ['IBM']
quotation = QUOTATIONS[0]
FILE_SUFFIX = '.txt'
price_column = 'Close'
project_label='PCA'
normalize = True
seq_len = 10
predict_n = 1

In [5]:
data_wrapper = StocksDataWrapper.read_from(file_path=f"{DATA_PATH}{quotation}{FILE_SUFFIX}", 
                                           compute_features=True, predict_n=predict_n, normalize=normalize)

data_wrapper.head()

0          7.393333
1          7.440000
2          7.346667
3          7.353333
4          7.360000
            ...    
14826    124.690002
14827    124.820000
14828    123.800003
14829    124.339996
14830    125.879997
Name: Close, Length: 14831, dtype: float64


Unnamed: 0,Date,Open,High,Low,Close,Adjusted,Volume,LowLen,RSI(14),GAP,...,SMA(10),SMA(20),EMA(14),EMA_Diff,SMA(20) - SMA(10),Difference,PercentageDiff,Tendency,NextPrice,Next
0,1962-03-21,0.015428,0.014523,0.015787,0.015429,0.00502,0.00378,0.002525,0.595025,0.0,...,0.014503,0.013555,0.013679,0.613174,0.394439,0.575048,0.643619,higher,0.015051,lower
1,1962-03-22,0.015334,0.014334,0.015533,0.015051,0.004897,0.003456,0.0,0.456905,0.001641,...,0.014474,0.013587,0.013637,0.611388,0.394997,0.572212,0.611458,lower,0.015177,higher
2,1962-03-23,0.015113,0.014271,0.015565,0.015177,0.004938,0.0027,0.0,0.505305,0.00041,...,0.014436,0.013619,0.013618,0.612154,0.395653,0.575703,0.651147,higher,0.015162,lower
3,1962-03-26,0.015207,0.014334,0.015597,0.015162,0.004933,0.001944,0.001263,0.499957,0.0,...,0.014399,0.013656,0.0136,0.612167,0.396346,0.574721,0.639898,lower,0.014516,lower
4,1962-03-27,0.015113,0.014114,0.014963,0.014516,0.004723,0.005076,0.000842,0.332912,0.001026,...,0.0143,0.013658,0.013496,0.609286,0.397285,0.570357,0.590036,lower,0.014705,higher


In [6]:
df = data_wrapper.df
feature_names = data_wrapper.feature_names
df.isna().any()

Date                   False
Open                   False
High                   False
Low                    False
Close                  False
Adjusted               False
Volume                 False
LowLen                 False
RSI(14)                False
GAP                    False
RSI_diff               False
Volume_diff            False
MACD                   False
MACD_diff              False
MACD_signal            False
BodyLen                False
BG_L_Band              False
BG_H_Band              False
BG_L_Band_Indicator    False
BG_H_Band_Indicator    False
ROC                    False
StochOsc               False
SMA(10)                False
SMA(20)                False
EMA(14)                False
EMA_Diff               False
SMA(20) - SMA(10)      False
Difference             False
PercentageDiff         False
Tendency               False
NextPrice              False
Next                   False
dtype: bool

In [7]:
from torch.utils.data import Dataset
from TorchDatasets import StocksSeqDataset

y_column = 'Next'
data_columns = ['Close', 'Volume'] + feature_names
data_columns.remove('Tendency')
data_columns += [y_column]


dataset = df.copy()
dataset = dataset.loc[:, data_columns]

    
X = dataset.loc[:, dataset.columns != y_column]
y = dataset[y_column]

X.head(), y.head()

(      Close    Volume    LowLen   RSI(14)       GAP  RSI_diff  Volume_diff  \
 0  0.015429  0.003780  0.002525  0.595025  0.000000  0.483363     0.425147   
 1  0.015051  0.003456  0.000000  0.456905  0.001641  0.269859     0.423957   
 2  0.015177  0.002700  0.000000  0.505305  0.000410  0.535921     0.423659   
 3  0.015162  0.001944  0.001263  0.499957  0.000000  0.459252     0.423659   
 4  0.014516  0.005076  0.000842  0.332912  0.001026  0.228598     0.426337   
 
        MACD  MACD_diff  MACD_signal  ...       ROC  StochOsc   SMA(10)  \
 0  0.631696   0.546478     0.620411  ...  0.497564  0.583338  0.014503   
 1  0.631341   0.545495     0.620407  ...  0.480218  0.083338  0.014474   
 2  0.631170   0.545121     0.620363  ...  0.480896  0.181814  0.014436   
 3  0.631018   0.544865     0.620293  ...  0.476925  0.159089  0.014399   
 4  0.630319   0.543437     0.620074  ...  0.440735  0.024998  0.014300   
 
     SMA(20)   EMA(14)  EMA_Diff  SMA(20) - SMA(10)  Difference  \
 0  0

In [8]:
from sklearn.model_selection import train_test_split
TEST_SIZE = 0.3

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
#X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.3, shuffle=True)
    
print(f"Train size : {len(X_train)}, test_size : {len(X_test)}")

Train size : 10357, test_size : 4440


$Seq_X = [[x_i*N_{features}], [x_{i+1} * N_{features}], ... , x_{seq_len} * N_{features}]$

$Seq_Y = [y_{seq\_len + 1}]$

In [9]:
from sklearn.decomposition import PCA

pca = PCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [10]:
explained_variance = pca.explained_variance_ratio_

for explained_var, col in zip(explained_variance, X.columns.values):
    print(f"{col} -> {explained_var}")

Close -> 0.6511525967757171
Volume -> 0.17864217402636998
LowLen -> 0.0658113164352582
RSI(14) -> 0.04688202284820852
GAP -> 0.019473414017687374
RSI_diff -> 0.008619976279957134
Volume_diff -> 0.0067443501218334674
MACD -> 0.006145583125363443
MACD_diff -> 0.003925148603912042
MACD_signal -> 0.0036176929597126106
BodyLen -> 0.0024432043760219476
BG_L_Band -> 0.0017211393597401392
BG_H_Band -> 0.0015626640418320023
BG_L_Band_Indicator -> 0.001184804659857424
BG_H_Band_Indicator -> 0.0007930803180363271
ROC -> 0.0006762950843801509
StochOsc -> 0.00026481521846067235
SMA(10) -> 0.00022506327841985614
SMA(20) -> 7.757032492017195e-05
EMA(14) -> 3.657409162919554e-05
EMA_Diff -> 5.140526823214698e-07
SMA(20) - SMA(10) -> 2.4288737166148497e-32
Difference -> 7.784552168499159e-33
PercentageDiff -> 3.717881031235247e-33
NextPrice -> 1.1030996397210439e-33
