In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from DataHelper import *

In [2]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
def compute_MA(df, price_column, n=10):
    return df[price_column].rolling(n).mean()

def pipeline_preprocessing(path, price_column, predict_n=1, thresh_diff=0.5, verbose=False):
    df = get_data(path)
    keep_columns = ['Date', price_column, 'Volume', 'Open']
    df = df.loc[:, keep_columns]
    
    df['Difference'] = compute_column_difference(df, column=price_column, periods_offset=predict_n)
    df['PercentageDiff'] = compute_percentage_diff(df)
    df['Tendency'] = compute_tendency_percentage(df, diff_column='Difference', labels=['lower','higher'])
    
    if verbose:
        value_counts = df.Tendency.value_counts().to_dict()
        for value, count in value_counts.items():
            print(f"[{value}] : {count} ({count * 100.0 / len(df['Tendency']):.1f}%)")
            
    df['MA'] = compute_MA(df, price_column)
    df['MA_diff'] = compute_MA(df, price_column, n=20) - compute_MA(df, price_column, n=10)
    df['RSI'] = compute_RSI(df, n=10, price_column=price_column, diff_column='Difference')
    df['GAP'] = compute_GAP(df)
    df['Volume_diff'] = compute_column_difference(df, column='Volume')
    df['Next'] = shift_values(df, column='Tendency', periods=-predict_n)
    df = df.dropna()
    return df

In [4]:
DATA_PATH = './data'
QUOTATIONS = ['AMZN', 'GOOG', 'AAPL', 'GM', 'TSLA', 'JNJ', 'XOM', 'AAL', 'KO', 'WMT']
FILE_SUFFIX = '.txt'
price_column = 'Close'

df = None
predict_n = 5

for quot in QUOTATIONS:
    temp_df = pipeline_preprocessing(f"{DATA_PATH}/{quot}{FILE_SUFFIX}", predict_n=predict_n, price_column=price_column)
    if df is None:
        df = temp_df
    else:
        df = df.append(temp_df)

print(len(df))
df.head()

33154


Unnamed: 0,Date,Close,Volume,Open,Difference,PercentageDiff,Tendency,MA,MA_diff,RSI,GAP,Volume_diff,Next
20,2007-01-31,37.669998,7277500.0,36.950001,0.41,1.100376,higher,37.072,0.491001,25.038845,0.099998,2464000.0,higher
21,2007-02-01,38.700001,26123100.0,37.950001,1.619999,4.368929,higher,37.244,0.319,40.989804,-0.280003,18845600.0,higher
22,2007-02-02,37.389999,25850700.0,37.23,0.540001,1.465403,higher,37.281,0.2065,46.890518,1.470001,-272400.0,higher
23,2007-02-05,37.16,6110900.0,37.25,-0.27,-0.721347,lower,37.302,0.125001,53.399396,0.139999,-19739800.0,higher
24,2007-02-06,38.27,8612700.0,37.200001,1.220001,3.29285,higher,37.486,-0.020499,82.479274,-0.040001,2501800.0,higher


In [5]:
from torch.utils.data import Dataset

class StocksDataset(Dataset):
    def __init__(self, data, target):
        self.data = torch.Tensor(data.values)
        self.target = torch.Tensor(target.values)
        
    def __getitem__(self, index):
        datapoint = self.data[index]
        target = self.target[index]
        return datapoint, target
        
    def __len__(self):
        return len(self.data)
    
data_columns = ['MA', 'Close']
y_column = 'Next'
data_columns.append(y_column)

dataset = df.copy()
dataset = dataset.loc[:, data_columns]
for col in dataset.columns:
    dataset[col] = dataset[col].replace({'higher':1, 'stay':0, 'lower':0})
    

X = dataset.loc[:, dataset.columns != y_column]
Y = dataset[y_column]

print(dataset.head())

dataset = StocksDataset(X,Y)

        MA      Close  Next
20  37.072  37.669998     1
21  37.244  38.700001     1
22  37.281  37.389999     1
23  37.302  37.160000     1
24  37.486  38.270000     1


In [6]:
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0)

In [7]:
import torch.nn as nn

class LinearModel(nn.Module):
    
    def __init__(self, input_dim):
        super(LinearModel, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(True),
            nn.Linear(64, 128),
            nn.ReLU(True),
            nn.Linear(128, 128),
            nn.ReLU(True),
            nn.Linear(128, 64),
            nn.ReLU(True),
            nn.Linear(64, 1),
            nn.Softmax(1)
        )
        #self.classifier = nn.Linear(input_dim, 1)
        
        
    def forward(self, x):
        out = self.classifier(x)
        #print(out)
        return out
    


In [10]:
def create_model(lr, input_dim):
    model = LinearModel(input_dim=input_dim)
    model = model.to(device)

    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    
    return model, optimizer, loss_fn

lr = 0.00001

model, optimizer, loss_fn = create_model(lr,input_dim = len(X.columns))
model = model.to(device)

In [11]:
import DL_utils
import importlib
importlib.reload(DL_utils)
from DL_utils import train
train(dataloader, model, n_epochs=50, optimizer=optimizer, loss_fn=loss_fn, device=device)

Epoch 1, loss: 0.3132617473602295, accuracy : 54.34
Epoch 2, loss: 0.3132617473602295, accuracy : 54.33
Epoch 3, loss: 0.3132617473602295, accuracy : 54.33
Epoch 4, loss: 0.3132617473602295, accuracy : 54.35
Epoch 5, loss: 0.8132617473602295, accuracy : 54.30
Epoch 6, loss: 0.8132617473602295, accuracy : 54.28
Epoch 7, loss: 0.3132617473602295, accuracy : 54.33
Epoch 8, loss: 0.3132617473602295, accuracy : 54.32


KeyboardInterrupt: 