In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from DataHelper import *

In [2]:
import torch
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [3]:
def compute_MA(df, price_column, n=10):
    """Return the simple moving average of `df[price_column]` over `n` rows.

    Entries whose rolling window holds fewer than `n` rows come back as NaN
    (pandas' default for an integer window).
    """
    prices = df[price_column]
    window = prices.rolling(window=n)
    return window.mean()

def pipeline_preprocessing(path, price_column, thresh_diff=0.5, verbose=False):
    """Load one quotation file and derive the feature and label columns.

    Parameters
    ----------
    path : location handed to `get_data`.
    price_column : name of the price column the indicators are computed on.
    thresh_diff : accepted for interface compatibility; it is not read
        anywhere in this function.
    verbose : when True, print the count and share of each `Tendency` label.

    Returns
    -------
    A DataFrame holding 'Date', the price column and 'Volume' plus the derived
    columns 'Difference', 'PercentageDiff', 'Tendency', 'MA', 'MA_diff', 'RSI'
    and 'Next', with every row that contains a missing value removed.
    """
    frame = get_data(path)
    frame = frame.loc[:, ['Date', price_column, 'Volume']]

    # Price change columns and the label derived from them.
    frame['Difference'] = compute_column_difference(frame, column=price_column)
    frame['PercentageDiff'] = compute_percentage_diff(frame)
    frame['Tendency'] = compute_tendency_percentage(
        frame, diff_column='Difference', labels=['lower', 'higher']
    )

    if verbose:
        total = len(frame['Tendency'])
        label_counts = frame.Tendency.value_counts().to_dict()
        for label, occurrences in label_counts.items():
            print(f"[{label}] : {occurrences} ({occurrences * 100.0 / total:.1f}%)")

    # 'MA' is the 10-row average; 'MA_diff' is the 20-row average minus it.
    frame['MA'] = compute_MA(frame, price_column)
    frame['MA_diff'] = compute_MA(frame, price_column, n=20) - frame['MA']
    frame['RSI'] = compute_RSI(frame, n=10, price_column=price_column, diff_column='Difference')

    # 'Next' is 'Tendency' shifted by -1 period: the label to predict.
    frame['Next'] = shift_values(frame, column='Tendency', periods=-1)

    # Rolling windows and the shift leave incomplete rows; drop them.
    return frame.dropna()

In [4]:
DATA_PATH = './data'
QUOTATIONS = ['AMZN', 'GOOG', 'AAPL', 'GM']
FILE_SUFFIX = '.txt'
price_column = 'Close'

# Preprocess each quotation file, then stack the results with one concat.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending inside a loop re-copies the accumulated rows on every iteration.
frames = []
for quot in QUOTATIONS:
    frames.append(
        pipeline_preprocessing(f"{DATA_PATH}/{quot}{FILE_SUFFIX}", price_column=price_column)
    )

# Each frame keeps its own row index (no ignore_index), as append did.
df = pd.concat(frames)

print(len(df))
df.head()

13042


Unnamed: 0,Date,Close,Volume,Difference,PercentageDiff,Tendency,MA,MA_diff,RSI,Next
20,2007-01-31,37.669998,7277500.0,0.619999,1.673412,higher,37.072,0.491001,47.586173,higher
21,2007-02-01,38.700001,26123100.0,1.030003,2.734279,higher,37.244,0.319,69.196427,lower
22,2007-02-02,37.389999,25850700.0,-1.310002,-3.385018,lower,37.281,0.2065,53.21738,lower
23,2007-02-05,37.16,6110900.0,-0.229999,-0.615135,lower,37.302,0.125001,51.77664,higher
24,2007-02-06,38.27,8612700.0,1.11,2.987083,higher,37.486,-0.020499,64.153837,higher


In [5]:
from torch.utils.data import Dataset

class StocksDataset(Dataset):
    """Map-style dataset pairing feature rows with their labels.

    Parameters
    ----------
    data : pandas DataFrame of numeric features, one sample per row.
    target : pandas Series of labels aligned row-for-row with `data`.
        Numeric labels are used unchanged. Labels containing strings
        (e.g. 'lower' / 'higher') are encoded as integer class indices,
        because a torch tensor cannot hold strings.

    Attributes
    ----------
    data : float tensor built from `data.values`.
    target : the raw label values, exactly as passed in.
    classes : sorted array of the distinct labels when encoding took place
        (the class index is the position in this array), otherwise None.
    """

    def __init__(self, data, target):
        self.data = torch.Tensor(data.values)
        self.target = target.values

        raw_labels = np.asarray(self.target)
        kind = raw_labels.dtype.kind
        has_strings = kind in 'US' or (
            kind == 'O' and any(isinstance(value, str) for value in raw_labels)
        )
        if has_strings:
            # np.unique sorts the labels, so the index assignment is deterministic.
            self.classes, self._labels = np.unique(raw_labels, return_inverse=True)
        else:
            self.classes = None
            self._labels = raw_labels

    def __getitem__(self, index):
        """Return `(features, label)` for `index` as two tensors."""
        # self.data is already a tensor: copy it with clone().detach() rather
        # than torch.tensor(), which emits a UserWarning when given a tensor.
        datapoint = self.data[index].clone().detach()
        label = torch.tensor(self._labels[index])
        return datapoint, label

    def __len__(self):
        """Number of samples (rows of `data`)."""
        return len(self.data)
    
data_columns = ['Close', 'PercentageDiff', 'MA_diff', 'RSI', 'Next', 'Volume']
y_column = 'Next'

# Keep only the modelling columns. The former `dataset = df.copy()` was a dead
# store (overwritten on the very next line) and only cost a full-frame copy.
model_frame = df.loc[:, data_columns]

# Features are every selected column except the label; the label is y_column.
X = model_frame.loc[:, model_frame.columns != y_column]
Y = model_frame[y_column]

dataset = StocksDataset(X, Y)

In [6]:
from torch.utils.data import DataLoader
# num_workers = n: number of worker subprocesses used to load batches in parallel
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)

In [7]:
import torch.nn as nn

class LinearModel(nn.Module):
    """A single fully connected layer mapping `input_dim` features to one output."""

    def __init__(self, input_dim):
        super().__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Apply the linear layer to `x` and return the result."""
        return self.fc(x)
    


In [12]:
def create_model(lr, input_dim):
    """Build the linear model together with its optimizer and loss.

    Parameters
    ----------
    lr : learning rate passed to the SGD optimizer.
    input_dim : number of input features of the linear layer.

    Returns
    -------
    (model, optimizer, loss_fn) : the model moved to `device`, an SGD
    optimizer over the model's parameters, and an `nn.CrossEntropyLoss`.
    """
    # NOTE(review): LinearModel emits a single value per sample, while
    # CrossEntropyLoss expects one score per class — confirm this pairing.
    model = LinearModel(input_dim=input_dim).to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    return model, optimizer, loss_fn

lr = 0.01

# data_columns still contains the label column ('Next'), which is removed
# from X before the dataset is built; size the input layer from X itself so
# it matches the feature tensors the model will actually receive.
model, optimizer, loss_fn = create_model(lr, input_dim=X.shape[1])