In [1]:
import yfinance as yf

In [2]:
from src.index import *

In [3]:
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils import data

from sklearn.preprocessing import MinMaxScaler

In [42]:
from torch.utils.data import Dataset
class CustomDataset(Dataset):
    '''
    데이터 셋에서는 Adj Close만 가져오는 걸로
    stock_name
    target -> position or price
    window size
    scaling -> each features and target
    indicators(technical and economic)

    '''
    def __init__(self, 
                    ticker, 
                    window= 20, 
                    target = 'Adj Close', 
                    scaling = False, 
                    target_gen = None, 
                    drop_feature = None, 
                    mode = 'train', 
                    split = 0.7) :
                    
        super(CustomDataset).__init__()
        self.ticker = ticker
        df = yf.Ticker(self.ticker).history(period="max",auto_adjust = False)
        self.target = target
        self.df = read_all(df)
        if drop_feature != None:
            self.df.drop(drop_feature, axis = 1)
        self.target_generator(target_gen)
        if scaling == True:
            self.scaler()
        self.df = self.df.dropna(axis = 0)
        self.columns = self.df.columns
        self.period = (self.df.index.max()-self.df.index.min()).days
        if mode == 'train':
            self.df = self.df.iloc[:int(split*len(self.df))]
        else:
            self.df = self.df.iloc[:int(split*len(self.df))]
        self.X, self.y = self.my_window_data(window)

        self.X = torch.tensor([self.X], dtype = torch.float32)
        self.y = torch.tensor([self.y], dtype = torch.float32)
    def my_window_data(self, window_size):
        X_list = [self.df.iloc[i:i+window_size] for i in range(len(self.df) - window_size-1)]
        y_list = [self.df.iloc[i+window_size][self.target] for i in range(len(self.df) - window_size-1)]
        return  np.array(X_list), np.array(y_list).reshape(-1)

    def scaler(self):
        scaler = MinMaxScaler()
        self.df = pd.DataFrame(columns = self.df.columns, data = scaler.fit_transform(self.df))

    def target_generator(self, target_gen):
        if target_gen == None:
            pass
        elif target_gen == 'trend':
            self.df[self.target] = [1 if self.df[self.target].diff().iloc[i]>0 else 0 for i in range(len(self.df))]
        else:
            self.df[self.target] = [  1 if self.df[self.target].diff(1).iloc[i]>0 and self.df[self.target].diff(-1).iloc[i]>0
                                else -1 if self.df[self.target].diff(1).iloc[i]<0 and self.df[self.target].diff(-1).iloc[i]<0
                                else 0 for i in range(len(self.df))]

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, i): 
        return self.X[i], self.y[i]
    

In [43]:
train_dataset = CustomDataset('AAPL', drop_feature=['Open', 'Close', 'High'], mode = 'train')
# train_dataloader = data.DataLoader(train_dataset, batch_size=20)
test_dataset = CustomDataset('AAPL', drop_feature=['Open', 'Close', 'High'], mode = 'test')
# test_dataloader = data.DataLoader(test_dataset, batch_size=20)


In [36]:
from torch import nn

class StockPredictor(nn.Module):

    def __init__(self, in_channel=3, out_channel=1):
        super(StockPredictor, self).__init__()
        self.conv1d_1 = nn.Conv1d(in_channels=in_channel,
                                out_channels=16,
                                kernel_size=3,
                                stride=1,
                                padding=1)
        self.conv1d_2 = nn.Conv1d(in_channels=16,
                                out_channels=32,
                                kernel_size=3,
                                stride=1,
                                padding=1)
        
        self.lstm = nn.LSTM(input_size=32,
                            hidden_size=50,
                            num_layers=1,
                            bias=True,
                            bidirectional=False,
                            batch_first=True)
        
        self.dropout = nn.Dropout(0.5)

        self.dense1 = nn.Linear(50, 32)
        self.dense2 = nn.Linear(32, out_channel)

    def forward(self, x):
	# Raw x shape : (B, S, F) => (B, 10, 3)
        
        # Shape : (B, F, S) => (B, 3, 10)
        x = x.transpose(1, 2)
        # Shape : (B, F, S) == (B, C, S) // C = channel => (B, 16, 10)
        x = self.conv1d_1(x)
        # Shape : (B, C, S) => (B, 32, 10)
        x = self.conv1d_2(x)
        # Shape : (B, S, C) == (B, S, F) => (B, 10, 32)
        x = x.transpose(1, 2)
        
        self.lstm.flatten_parameters()
        # Shape : (B, S, H) // H = hidden_size => (B, 10, 50)
        _, (hidden, _) = self.lstm(x)
        # Shape : (B, H) // -1 means the last sequence => (B, 50)
        x = hidden[-1]
        
        # Shape : (B, H) => (B, 50)
        x = self.dropout(x)
        
        # Shape : (B, 32)
        x = self.fc_layer1(x)
        # Shape : (B, O) // O = output => (B, 1)
        x = self.fc_layer2(x)

        return x

In [29]:
model = StockPredictor(
    n_features=len(train_dataset.columns),
    n_hidden=4,
    seq_len=20,
    n_layers=1
)

In [32]:
loss_fn = torch.nn.L1Loss() #
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
device = 'cpu'

In [33]:
def training(epoch, model, train_loader ):
    running_loss = 0

    model.train()
    for idx, (X, y) in enumerate(train_loader):
        X, y = X.to(device), y.to(device)
        y_pred = model(X)
        loss = loss_fn(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            running_loss += loss.item()

    epoch_loss = running_loss / len(train_loader. dataset)
    return epoch_loss
    
def testing(epoch, model,valid_loader):
    valid_running_loss = 0
    model.eval()
    with torch.no_grad():
        for idx, (X, y) in enumerate(valid_loader):
             X, y = X.to(device), y.to(device)
             y_pred = model(X)
             loss = loss_fn(y_pred, y)
             valid_running_loss += loss.item()
    epoch_valid_loss = valid_running_loss / len(valid_loader.dataset)
    # print(f'epoch:{epoch},\nloss:{round(epoch_loss, 3)},valid_loss:{round(epoch_valid_loss, 3)}')
    return epoch_valid_loss

In [34]:
from tqdm import tqdm

In [35]:
epochs = 10
pbar = tqdm(range(epochs))
total_loss = []
total_val_loss = []
for epoch in pbar:
        loss_list = []
        val_loss_list = []
        loss = training(epoch, model, train_dataloader)
        valid_loss = testing(epoch, model, test_dataloader)
        loss_list.append(loss)
        val_loss_list.append(valid_loss)
        print(f'Epoch{str(epoch).zfill(3)}\t\t loss : {round((np.mean(loss_list)), 3)}, valid loss : {round(np.mean(val_loss_list), 3)}')

  0%|          | 0/10 [00:00<?, ?it/s]


RuntimeError: shape '[1, 19, -1]' is invalid for input of size 3233999

In [41]:
train_dataset

<__main__.CustomDataset at 0x29359c940>

In [8]:
# 검증할 때 렉 현상에 관해서
def generate_time_lags(df, n_lags):
    df_n = df.copy()
    for n in range(1, n_lags + 1):
        df_n[f"lag{n}"] = df_n["value"].shift(n)
    df_n = df_n.iloc[n_lags:]
    return df_n

In [9]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler

def get_scaler(scaler):
    scalers = {
        "minmax": MinMaxScaler,
        "standard": StandardScaler,
        "maxabs": MaxAbsScaler,
        "robust": RobustScaler,
    }
    return scalers.get(scaler.lower())()
    
scaler = get_scaler('robust')