In [None]:
# Install yfinance into the notebook runtime; the import cell below needs it.
!pip install yfinance

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import datetime
import yfinance as yf
import numpy as np
import pandas as pd

In [None]:
# Widen pandas' display limits so large frames print with less truncation.
for option_name, option_value in (
    ('display.max_rows', 1000),
    ('display.max_columns', 100),
    ('display.width', 200),
):
    pd.set_option(option_name, option_value)

In [None]:
class Stock_Data:
    """
    Download and cache price history from Yahoo Finance (via ``yf.download``).

    The date range is built from separate year / month / day integers.
    ``get_raw_data`` augments the downloaded frame with a scaled log-return
    column and a rolling standard deviation of that return.

    NOTE(review): the derived columns are computed as single series, so this
    looks like it expects one ticker per instance -- confirm before passing
    several tickers.
    """

    def __init__(self, tickers, start_year, start_month, start_date, end_year,
                 end_month, end_date, freq, scaling_factor, vol_window=10):
        """
        Args:
            tickers: ticker symbol(s), forwarded unchanged to ``yf.download``.
            start_year, start_month, start_date: components of the range
                start; ``start_date`` is the day of the month.
            end_year, end_month, end_date: components of the range end;
                ``end_date`` is the day of the month.
            freq: granularity of the price data, forwarded as ``interval``
                (the notebook uses "1d").
            scaling_factor: multiplier applied to the log return.
            vol_window: number of rows in the rolling window used for the
                'vol' column. Defaults to 10, the previously hard-coded value.

        Raises:
            ValueError: from ``datetime.datetime`` if a date component is
                out of range.
        """
        self.tickers = tickers
        self.start = datetime.datetime(start_year, start_month, start_date)
        self.end = datetime.datetime(end_year, end_month, end_date)
        # granularity of price data
        self.freq = freq
        # scaling factor for log return
        self.scaling_factor = scaling_factor
        # rolling window length (in rows) for the 'vol' column
        self.vol_window = vol_window
        # cache the raw data
        self.raw_data = None


    @staticmethod
    def _flatten_single_ticker_columns(frame):
        """Collapse a two-level column index that carries one ticker to flat field names."""
        columns = frame.columns
        if (isinstance(columns, pd.MultiIndex)
                and columns.nlevels == 2
                and columns.get_level_values(-1).nunique() == 1):
            # Assumes field names sit on the first level and the ticker on
            # the last (yfinance's default column grouping) -- TODO confirm
            # if a different group_by is ever requested.
            return frame.droplevel(-1, axis=1)
        return frame


    def _add_return_features(self, prices):
        """Return a copy of ``prices`` without 'Close', plus 'Log Return' and 'vol'."""
        log_adj_close = np.log(prices["Adj Close"])
        log_return = log_adj_close.diff() * self.scaling_factor
        # There is no previous price for the first row; report its return as 0.
        log_return.iloc[0] = 0.0

        features = prices.drop(columns=["Close"])
        features["Log Return"] = log_return
        # Population standard deviation (ddof=0) over the trailing window;
        # the first vol_window - 1 rows are NaN.
        features["vol"] = features["Log Return"].rolling(self.vol_window).std(ddof=0)
        return features


    def get_raw_data(self, refresh = True):
        """
        Get raw data from Yahoo Finance.

        Args:
            refresh: when True (the default) the data is downloaded again on
                every call; when False the cached frame is returned if one
                exists, otherwise the data is downloaded once.

        Returns:
            The cached DataFrame (the same object on cache hits).

        Output schema:
        ['Open', 'High', 'Low', 'Adj Close', 'Volume', 'Log Return', 'vol']

        Raises:
            ValueError: if the download comes back empty.
        """
        if refresh or self.raw_data is None:
            # Request unadjusted prices explicitly so 'Adj Close' is present.
            # NOTE(review): newer yfinance releases appear to default to
            # auto_adjust=True, which omits that column -- confirm against
            # the installed version.
            downloaded = yf.download(self.tickers, start=self.start, end=self.end,
                                     interval=self.freq, auto_adjust=False)
            if downloaded.empty:
                raise ValueError(
                    "No price data returned for {!r} between {} and {}".format(
                        self.tickers, self.start, self.end))
            downloaded = self._flatten_single_ticker_columns(downloaded)
            # cache the raw data
            self.raw_data = self._add_return_features(downloaded)

        return self.raw_data


    def prepare_train_data(self, *args):
        """
        Prepare data for training and perform normalization if needed.

        Placeholder: not implemented yet, currently ignores its arguments
        and returns None.

        Intended output schema:
        [10-day-vol, Log Return, Log Volume Chg, Log Range, VIX]
        """
        return None

In [None]:
# Daily S&P 500 index history, 1990-01-02 through 2022-12-31,
# with log returns expressed in percent (scaling factor 100).
SNP_500_data = Stock_Data(
    tickers="^GSPC",
    start_year=1990, start_month=1, start_date=2,
    end_year=2022, end_month=12, end_date=31,
    freq="1d",
    scaling_factor=100,
)

raw_data = SNP_500_data.get_raw_data()

# Show the (rows, columns) shape first, then the frame itself.
print(raw_data.shape)
print(raw_data)

[*********************100%***********************]  1 of 1 completed
(8315, 6)
                   Open         High          Low    Adj Close      Volume  Log Return
Date                                                                                  
1990-01-02   353.399994   359.690002   351.980011   359.690002   162070000    0.000000
1990-01-03   359.690002   360.589996   357.890015   358.760010   192330000   -0.258889
1990-01-04   358.760010   358.760010   352.890015   355.670013   177000000   -0.865030
1990-01-05   355.670013   355.670013   351.350006   352.200012   158530000   -0.980414
1990-01-08   352.200012   354.239990   350.540009   353.790009   140110000    0.450431
...                 ...          ...          ...          ...         ...         ...
2022-12-23  3815.110107  3845.800049  3797.010010  3844.820068  2819280000    0.585095
2022-12-27  3843.340088  3846.649902  3813.219971  3829.250000  3030300000   -0.405784
2022-12-28  3829.560059  3848.320068  3780.780029 