In [1]:
import yfinance as yf
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
%matplotlib inline

In [2]:
### Data download and feature engineering
def _close_series(prices, ticker):
    """Return the 'Close' prices for `ticker` as a Series.

    Handles both column layouts a price frame may have: flat columns, where
    prices['Close'] is already a Series, and per-ticker (MultiIndex) columns,
    where prices['Close'] is a DataFrame with one column per ticker.
    """
    close = prices['Close']
    if isinstance(close, pd.DataFrame):
        # Prefer the column named after the ticker; fall back to the first column.
        close = close[ticker] if ticker in close.columns else close.iloc[:, 0]
    return close


def get_data(ticker="AAPL", benchmark="SPY", start="2020-01-01", end="2025-01-01"):
    """Download daily prices and build the feature/label frame for the classifier.

    Parameters
    ----------
    ticker : str
        Symbol whose next-day direction is predicted.
    benchmark : str
        Symbol whose 1-day return is added as a market feature.
    start, end : str
        Date range forwarded to yf.download.

    Returns
    -------
    pandas.DataFrame
        Indexed like the downloaded prices, with columns
        'Close', 'Lag_1'..'Lag_5', 'RSI', '<benchmark>_Lag_1', 'Target'.
        Rows with any missing feature or an unknown label are removed.
    """
    asset_close = _close_series(yf.download(ticker, start=start, end=end), ticker)
    bench_close = _close_series(yf.download(benchmark, start=start, end=end), benchmark)

    df = pd.DataFrame({'Close': asset_close})

    ### Feature 1 - i-day percentage change of the close, ending on the current day
    for i in range(1, 6):
        df[f'Lag_{i}'] = df['Close'].pct_change(i)

    ### Feature 2 - Rolling std (disabled)
    #df['Volatility'] = df['Close'].pct_change().rolling(5).std()

    ### Feature 3 - average distance (Price / SMA_20 - 1) (disabled)
    #df['MA_Dist'] = df['Close'] / df['Close'].rolling(20).mean() - 1

    ### Feature 4 - RSI (14-day simple moving averages of gains and losses)
    delta = df['Close'].diff()
    # clip() keeps the leading NaN of diff(), so the first RSI value is built from
    # 14 real price changes instead of counting the undefined first change as 0.
    gain = delta.clip(lower=0).rolling(window=14).mean()
    loss = (-delta.clip(upper=0)).rolling(window=14).mean()
    rs = gain / loss
    df["RSI"] = 100 - (100 / (1 + rs))

    ### Feature 5 - benchmark 1-day return (aligned on the date index)
    df[f'{benchmark}_Lag_1'] = bench_close.pct_change(1)

    ### Label
    # 1 if tomorrow's close is above today's, else 0.
    # shift(-1) moves tomorrow's close onto today's row.
    next_close = df['Close'].shift(-1)
    # Where tomorrow's close is unknown (the last row) the label is NaN, not 0,
    # so dropna() below removes the row instead of keeping a made-up "down" label.
    df['Target'] = np.where(next_close.notna(),
                            (next_close > df['Close']).astype(float),
                            np.nan)

    # Drop rows with incomplete features or an unknown label, then restore int labels.
    return df.dropna().astype({'Target': int})

In [3]:
# Download prices and build the feature/label DataFrame used by the cells below.
data = get_data()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [5]:
### Chronological train/test split
# Time-series rows must not be shuffled randomly; the split has to follow time order.
# The earliest 80% of rows form the training set, the remaining rows the test set.
split = int(0.8 * len(data))
train, test = data.iloc[:split], data.iloc[split:]

# Every column except the raw price and the label is a model input.
features = list(filter(lambda column: column not in ("Close", "Target"), data.columns))
X_train, X_test = train[features], test[features]
y_train, y_test = train["Target"], test["Target"]

In [6]:
# Show the selected input columns (every column except 'Close' and 'Target').
features

['Lag_1', 'Lag_2', 'Lag_3', 'Lag_4', 'Lag_5', 'RSI', 'SPY_Lag_1']

In [None]:
### 0. Scale the data
# SVM and Logistic Regression are sensitive to feature scale, so standardization is required.
# The scaler is fitted on the training set only; the test set is transformed with the
# training mean/std. Re-fitting on the test set would leak test statistics and put the
# two sets on different scales.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
###