In [10]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pda
import tensorflow as tf

from index import *
import yfinance as yf

Init Plugin
Init Graph Optimizer
Init Kernel


In [None]:
from torch.utils.data import Dataset
import torch
class PTDataset(Dataset):
    '''
    데이터 셋에서는 Adj Close만 가져오는 걸로
    stock_name
    target -> position or price
    window size
    scaling -> each features and target
    indicators(technical and economic)

    '''
    def __init__(self, 
                    ticker, 
                    window= 20, 
                    target = 'Adj Close', 
                    scaling = False, 
                    target_gen = None, 
                    drop_feature = None, 
                    mode = 'train', 
                    split = 0.7) :
                    
        super(PTDataset).__init__()
        self.ticker = ticker
        df = yf.Ticker(self.ticker).history(period="max",auto_adjust = False)[['Open','High','Low','Adj Close','Volume']]
        self.target = target
        self.df = read_all(df)
        if drop_feature != None:
            self.df.drop(drop_feature, axis = 1)
        self.target_generator(target_gen)
        if scaling == True:
            self.scaler()
        self.df = self.df.dropna(axis = 0)
        self.columns = self.df.columns
        self.period = (self.df.index.max()-self.df.index.min()).days
        if mode == 'train':
            self.df = self.df.iloc[:int(split*len(self.df))]
        else:
            self.df = self.df.iloc[:int(split*len(self.df))]
        self.X, self.y = self.my_window_data(window)

        self.X = torch.tensor([self.X], dtype = torch.float32)
        self.y = torch.tensor([self.y], dtype = torch.float32)
    def my_window_data(self, window_size):
        X_list = [self.df.iloc[i:i+window_size] for i in range(len(self.df) - window_size-1)]
        y_list = [self.df.iloc[i+window_size][self.target] for i in range(len(self.df) - window_size-1)]
        return  np.array(X_list), np.array(y_list).reshape(-1)

    def scaler(self):
        scaler = MinMaxScaler()
        self.df = pd.DataFrame(columns = self.df.columns, data = scaler.fit_transform(self.df))

    def target_generator(self, target_gen):
        if target_gen == None:
            pass
        elif target_gen == 'trend':
            self.df[self.target] = [1 if self.df[self.target].diff().iloc[i]>0 else 0 for i in range(len(self.df))]
        else:
            self.df[self.target] = [  1 if self.df[self.target].diff(1).iloc[i]>0 and self.df[self.target].diff(-1).iloc[i]>0
                                else -1 if self.df[self.target].diff(1).iloc[i]<0 and self.df[self.target].diff(-1).iloc[i]<0
                                else 0 for i in range(len(self.df))]

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, i): 
        return self.X[i], self.y[i]
    

In [198]:
class TFDataset(tf.keras.utils.Sequence):
    """
    Custom data generator class for Digits dataset
    """
    def __init__(self, 
                 ticker,
                 window = 20,
                 period = 1,
                 target = 'Adj Close',
                 scaling = False, 
                 target_gen = None, 
                 drop_feature = None, 
                 batch_size: int=64,
                 mode = 'train', 
                 split = 0.7):
        self.ticker = ticker
        df = yf.Ticker(self.ticker).history(period="max",auto_adjust = False)[['Open','High','Low','Adj Close','Volume']]
        self.target = target
        self.df = read_all(df)
        if drop_feature != None:
            self.df.drop(drop_feature, axis = 1)
        self.df = self.df.dropna(axis = 0)
        if mode == 'train':
            self.df = self.df.iloc[:int(split*len(self.df))]
        else:
            self.df = self.df.iloc[:int(split*len(self.df))]

        self.X, self.y = self.df, self.df[[self.target]]

        self.target_generator(target_gen)
        if scaling == True:
            self.scaler()
        self.columns = self.df.columns
        self.period = (self.df.index.max()-self.df.index.min()).days
        self.batch_size = batch_size
        self.X, self.y = self.my_window_data(window_size = window, for_period=period)

    def my_window_data(self, window_size, for_period):
        X_list = [self.X.iloc[i:i+window_size] for i in range(len(self.X) - window_size-for_period)]
        y_list = [self.y[self.target].iloc[i+window_size:i+window_size+for_period].values for i in range(len(self.y) - window_size-for_period)]
        return  np.array(X_list), np.array(y_list)

    def scaler(self):
        scaler = MinMaxScaler()
        self.X = pd.DataFrame(columns = self.X.columns, data = scaler.fit_transform(self.X))
        self.y = pd.DataFrame(columns = self.X.columns, daat = np.log1p(self.y))

    def target_generator(self, target_gen):

        if target_gen == None:
            pass
        elif target_gen == 'trend':
            self.y[self.target] = ['up' if self.y[self.target].diff().iloc[i]>0 else 'down' for i in range(len(self.y[self.target]))]
            self.y = pd.get_dummies(self.y[self.target])
            # self.y = tf.keras.utils.to_categorical(self.y[self.target],  num_classes=2)

        else:
            self.y[self.target] = [  'buy' if self.y[self.target].diff(1).iloc[i]>0 and self.y[self.target].diff(-1).iloc[i]>0
                                else 'sell' if self.y[self.target].diff(1).iloc[i]<0 and self.y[self.target].diff(-1).iloc[i]<0
                                else 'hold' for i in range(len(self.y[self.target]))]
            self.y = pd.get_dummies(self.y[self.target])
            
    def __len__(self):
        return np.math.ceil(len(self.X) / self.batch_size)
    
    def __getitem__(self, index):
        """
        Returns a batch of data
        """
        batch_X = self.X[index * self.batch_size : (index + 1) * self.batch_size]
        batch_y = self.y[index * self.batch_size : (index + 1) * self.batch_size]

        return batch_X, batch_y
    def only_data(self):
        return self.X, self.y

In [None]:
train_dataset = PTDataset(ticker = 'AAPL', drop_feature=['Open', 'Close', 'High'], mode = 'train')
# train_dataloader = data.DataLoader(train_dataset, batch_size=20)
test_dataset = PTDataset(ticker = 'AAPL', drop_feature=['Open', 'Close', 'High'], mode = 'test')
# test_dataloader = data.DataLoader(test_dataset, batch_size=20)


In [199]:
td = TFDataset(ticker = 'AAPL', mode = 'train', period=5)
X, y = td.only_data()

In [203]:
X

array([[[3.14731985e-01, 3.34820986e-01, 3.12500000e-01, ...,
         1.61849867e+10, 2.52797294e-01, 2.53152910e-01],
        [3.39286000e-01, 3.39286000e-01, 3.34820986e-01, ...,
         1.60858947e+10, 2.55641657e-01, 2.55108374e-01],
        [3.41518015e-01, 3.45981985e-01, 3.32589000e-01, ...,
         1.60039667e+10, 2.59197274e-01, 2.56975055e-01],
        ...,
        [3.05804014e-01, 3.10268015e-01, 3.03570986e-01, ...,
         1.60136127e+10, 2.40886450e-01, 2.40797643e-01],
        [3.03570986e-01, 3.03570986e-01, 2.87945986e-01, ...,
         1.60353872e+10, 2.38753119e-01, 2.39553215e-01],
        [2.94643015e-01, 2.99106985e-01, 2.86830008e-01, ...,
         1.60555377e+10, 2.38753119e-01, 2.38842079e-01]],

       [[3.39286000e-01, 3.39286000e-01, 3.34820986e-01, ...,
         1.60858947e+10, 2.55641657e-01, 2.55108374e-01],
        [3.41518015e-01, 3.45981985e-01, 3.32589000e-01, ...,
         1.60039667e+10, 2.59197274e-01, 2.56975055e-01],
        [3.37054014e-01, 

In [202]:
y

array([[ 0.24177493,  0.24177493,  0.23910849,  0.24355341,  0.24888638],
       [ 0.24177493,  0.23910849,  0.24355341,  0.24888638,  0.24710876],
       [ 0.23910849,  0.24355341,  0.24888638,  0.24710876,  0.23644206],
       ...,
       [15.95062447, 15.97642231, 15.75620461, 15.81947327, 15.65146828],
       [15.97642231, 15.75620461, 15.81947327, 15.65146828, 16.34498978],
       [15.75620461, 15.81947327, 15.65146828, 16.34498978, 16.86282349]])

In [183]:
for idx,( X, y )in enumerate(train_loader):
    print(idx, X.shape, y.shape)
    if idx ==10:
        break

0 (64, 20, 25) (64, 5)
1 (64, 20, 25) (64, 5)
2 (64, 20, 25) (64, 5)
3 (64, 20, 25) (64, 5)
4 (64, 20, 25) (64, 5)
5 (64, 20, 25) (64, 5)
6 (64, 20, 25) (64, 5)
7 (64, 20, 25) (64, 5)
8 (64, 20, 25) (64, 5)
9 (64, 20, 25) (64, 5)
10 (64, 20, 25) (64, 5)


In [None]:
@tf.funciton
def train(model, optimizer, loss_fn, en_epoch):
    train_set = TFDataset( ticker = 'AAPL', period=5)
    for X_batch, y_batch in train_set:
        with tf.GradientTape() as tape:
            y_pred = model(X_batch)
            main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
            loss = tf.add_n([main_loss]+model.losses)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))