# Preprocessing of the dataset (train/val/test split, normalization, sequence extraction)

In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.preprocessing import StandardScaler
import torch as th

In [None]:
# Download five years of daily AAPL bars (auto_adjust=True), remove the second
# level of the column index, and discard the fields listed below so that only
# the remaining column(s) are carried forward.
data = (
    yf.download(tickers=["AAPL"], period="5y", interval="1d", auto_adjust=True)
    .droplevel(1, axis=1)
    .drop(columns=["High", "Low", "Open", "Volume"])
)

In [None]:
# Contiguous split by row position: the first 70% of rows go to training, the
# next 20% to validation, and whatever is left over becomes the test set.
n_rows = len(data)
train_size = int(n_rows * 0.7)
val_size = int(n_rows * 0.2)
val_end = train_size + val_size  # first row that belongs to the test set

X_train = data[:train_size]
X_val = data[train_size:val_end]
X_test = data[val_end:]

In [None]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to every split, so validation/test rows never influence the
# normalization statistics.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Extract sequences
from typing import Tuple

def sliding_window(X: np.ndarray, lookback: int = 60) -> Tuple[th.Tensor, th.Tensor]:
    """Cut a 2-D series into overlapping input windows and next-step targets.

    Window ``i`` is ``X[i:i + lookback]`` and its target is column 0 of the
    row immediately after the window, ``X[i + lookback, 0]``.

    Args:
        X: Array-like of shape ``(n_rows, n_features)``.
        lookback: Number of consecutive rows in each window; must be >= 0.

    Returns:
        A ``(X_seq, y_seq)`` pair of float32 tensors with shapes
        ``(n_windows, lookback, n_features)`` and ``(n_windows,)``, where
        ``n_windows = max(n_rows - lookback, 0)``. When there are too few
        rows for a single window, both tensors are empty but ``X_seq`` still
        keeps its 3-D shape.

    Raises:
        ValueError: If ``lookback`` is negative.
    """
    if lookback < 0:
        raise ValueError(f"lookback must be non-negative, got {lookback}")

    X = np.asarray(X)
    n_windows = max(len(X) - lookback, 0)

    # rows[i, j] == i + j, so X[rows][i] is exactly X[i:i + lookback]. Building
    # the windows with one gather keeps the (n_windows, lookback, n_features)
    # shape even when n_windows == 0.
    rows = np.arange(n_windows)[:, None] + np.arange(lookback)[None, :]
    windows = X[rows]

    # The target for window i is column 0 of row i + lookback.
    targets = X[lookback:lookback + n_windows, 0]

    X_seq = th.tensor(windows, dtype=th.float32)
    y_seq = th.tensor(targets, dtype=th.float32)

    return X_seq, y_seq

# Window each scaled split independently (default lookback), in the order
# train, validation, test, so no window straddles a split boundary.
(
    (X_train_seq, y_train_seq),
    (X_val_seq, y_val_seq),
    (X_test_seq, y_test_seq),
) = (
    sliding_window(split)
    for split in (X_train_scaled, X_val_scaled, X_test_scaled)
)