# 01 - Data Collection

## Objective
Collect historical OHLCV stock/ETF price data from the Yahoo Finance endpoint via `yfinance`, and save versioned raw snapshots for reproducible analysis.

## Inputs
- Ticker symbols list
- Date range
- Output version label (e.g., v1)

## Outputs
- One raw CSV per ticker saved to `data/raw/<version>/`
- Combined preview DataFrame (head) to confirm successful collection

## CRISP-DM Stage
Data Understanding (initial data collection)

In [None]:
from pathlib import Path
from datetime import datetime, timezone

import pandas as pd
import yfinance as yf

In [None]:
# Make the parent directory importable so `src/` modules can be loaded from this notebook
import sys
from pathlib import Path

# Absolute path of the directory one level above the current working directory
PROJECT_ROOT = Path("..").resolve()

# Prepend only when the entry is missing, so re-running the cell does not add duplicates
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path[:0] = [str(PROJECT_ROOT)]

print("Project root added to sys.path:", PROJECT_ROOT)

In [None]:
from src.config import (
    DEFAULT_TICKERS,
    DEFAULT_VERSION,
    DEFAULT_START_DATE,
    DEFAULT_END_DATE,
    get_paths,
)

# Run configuration: every value is taken from the defaults imported from src.config
VERSION, TICKERS = DEFAULT_VERSION, DEFAULT_TICKERS
START_DATE, END_DATE = DEFAULT_START_DATE, DEFAULT_END_DATE

# Look up the paths for this version and make sure the raw-data folder exists
paths = get_paths(VERSION)
RAW_DATA_DIR = paths.raw_dir
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Echo the effective settings so the run is easy to audit
print("Saving raw data to:", RAW_DATA_DIR)
print("Date range:", START_DATE, "to", END_DATE)
print("Tickers:", ", ".join(TICKERS))

In [None]:
def download_stock_data(ticker: str, start: str, end: str) -> pd.DataFrame:
    """
    Download historical OHLCV data for one ticker from Yahoo Finance via yfinance.

    Args:
        ticker: Symbol to request; also written into the ``Ticker`` column.
        start: Start of the date range, forwarded to ``yf.download`` unchanged.
        end: End of the date range, forwarded to ``yf.download`` unchanged.

    Returns:
        A tidy DataFrame with flat (single-level) column labels: the former
        index as a Date column, the fields returned by yfinance, and a
        Ticker column.

    Raises:
        ValueError: If yfinance returns ``None`` or an empty frame.
    """
    df = yf.download(
        ticker,
        start=start,
        end=end,
        auto_adjust=False,
        progress=False,
    )

    if df is None or df.empty:
        raise ValueError(f"No data returned for ticker: {ticker}")

    # Newer yfinance releases label columns with (field, ticker) pairs even
    # for a single symbol. Keep only the field level so the result stays a
    # flat table and the saved CSV has a single header row. On releases that
    # already return flat columns this branch is skipped.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0).tolist()

    # Reset index so Date becomes a column
    df = df.reset_index()

    # Add ticker column
    df["Ticker"] = ticker

    return df