# Yahoo Finance Ingestion Testing

In [7]:
# Imports
import sys
sys.path.append('..')  # Add parent directory to path

import yfinance as yf
from datetime import datetime, timedelta
from decimal import Decimal
import pandas as pd
import src.models as model

In [8]:
# Pull AAPL price history from Yahoo Finance into `df`.
ticker = yf.Ticker("AAPL")

# Other windows tried while exploring (kept for reference):
#   ticker.history(start="2024-01-01", end="2024-01-07")
#   ticker.history(period="1mo")
#
# NOTE(review): only `end` is passed, so the length of the window is whatever
# yfinance defaults to when neither `start` nor `period` is given -- confirm.
df = ticker.history(end="2024-01-01")

print(df)


                                 Open        High         Low       Close  \
Date                                                                        
2023-12-01 00:00:00-05:00  188.549346  189.767835  187.459631  189.450836   
2023-12-04 00:00:00-05:00  188.202633  188.271986  185.696304  187.657776   
2023-12-05 00:00:00-05:00  188.430498  192.581286  188.400765  191.610458   
2023-12-06 00:00:00-05:00  192.630799  192.937896  190.312695  190.520737   
2023-12-07 00:00:00-05:00  191.818472  193.175650  191.778838  192.452484   
2023-12-08 00:00:00-05:00  192.383130  194.156392  191.858090  193.879013   
2023-12-11 00:00:00-05:00  191.303359  191.679808  189.629167  191.372696   
2023-12-12 00:00:00-05:00  191.273612  192.898268  189.926335  192.888367   
2023-12-13 00:00:00-05:00  193.264796  196.147575  193.027051  196.107956   
2023-12-14 00:00:00-05:00  196.167407  197.752429  194.324808  196.256561   
2023-12-15 00:00:00-05:00  195.682000  196.543856  195.156959  195.721634   

yfinance returns prices as numpy.float64, which is subject to binary floating-point rounding errors, so the numeric columns are converted to Decimal below. TODO: come back and research this.

In [9]:
def _as_decimal(value):
    """Return `value` as a Decimal built from its str() form."""
    return Decimal(str(value))


# Replace each OHLCV column with Decimal values. Going through str() means the
# Decimal is constructed from the value's printed form, not from the float itself.
for column in ("Open", "High", "Low", "Close", "Volume"):
    df[column] = df[column].map(_as_decimal)

Convert the index timestamps to UTC, then strip the timezone metadata so they are timezone-naive (yfinance returns them in New York time).

In [4]:
# Only touch the index while it is still tz-aware, so re-running this cell
# leaves an already-naive index alone.
if df.index.tz is not None:
    utc_index = df.index.tz_convert("UTC")   # shift to UTC
    df.index = utc_index.tz_localize(None)   # then drop the tz, keeping UTC wall time

print(df.index.tz)

None


In [5]:
def _row_to_record(ts, bar):
    """Build one OHLCVRecord from an index timestamp and the matching row of `df`."""
    return model.OHLCVRecord(
        timestamp=ts.to_pydatetime(),
        symbol="AAPL",
        source=model.DataSource.YAHOO_FINANCE,
        open=bar["Open"],
        high=bar["High"],
        low=bar["Low"],
        close=bar["Close"],
        volume=int(bar["Volume"]),
        # NOTE(review): yfinance may already return adjusted prices in "Close"
        # (auto-adjust); if so `close` holds adjusted values while
        # `adjusted_close` stays None -- confirm which is intended.
        adjusted_close=None,
        metadata={
            "dividends": float(bar["Dividends"]),
            "stock_splits": float(bar["Stock Splits"]),
        },
    )


# Walk the frame row by row and collect one record per row.
ohlcv_list: list[model.OHLCVRecord] = []
for index, row in df.iterrows():
    record = _row_to_record(index, row)
    ohlcv_list.append(record)

print(f"Created {len(ohlcv_list)} OHLCV records")
print(f"First record: {ohlcv_list[0]}")

Created 20 OHLCV records
First record: timestamp=datetime.datetime(2023, 12, 1, 5, 0) symbol='AAPL' source=<DataSource.YAHOO_FINANCE: 'yahoo_finance'> open=Decimal('188.54934617033052') high=Decimal('189.76783452208102') low=Decimal('187.45963130874318') close=Decimal('189.45083618164062') volume=45704800 adjusted_close=None metadata={'dividends': 0.0, 'stock_splits': 0.0}
