In [1]:
# Stage 04: Data Acquisition & Ingestion


import os, pathlib, datetime as dt
import requests
import pandas as pd
from bs4 import BeautifulSoup
import yfinance as yf

# --- Output location for raw ingested files ---
# Every CSV produced below is written into this directory; create it (and any
# missing parents) up front so later writes cannot fail on a missing folder.
RAW = pathlib.Path("data") / "raw"
RAW.mkdir(parents=True, exist_ok=True)

def ts():
    """Return the current local time as a ``YYYYMMDD-HHMMSS`` string.

    The value is used as a suffix in output file names.
    """
    now = dt.datetime.now()
    return f"{now:%Y%m%d-%H%M%S}"

def save_csv(df: pd.DataFrame, prefix: str, **meta):
    """Write ``df`` to a timestamped CSV file under ``RAW`` and return its path.

    The file name has the form
    ``<prefix>_<key1>-<value1>_<key2>-<value2>_..._<timestamp>.csv`` where the
    ``key-value`` segments come from ``meta`` in the order given and the
    timestamp comes from ``ts()``.

    Args:
        df: DataFrame to write (the index is not written).
        prefix: Leading component of the file name.
        **meta: Optional metadata embedded in the file name. When no metadata
            is given the name is simply ``<prefix>_<timestamp>.csv``.

    Returns:
        The ``pathlib.Path`` of the file that was written.
    """
    # Build the name from its non-empty parts so that a call without metadata
    # does not produce an empty segment (``prefix__timestamp.csv``).
    parts = [prefix]
    parts.extend(f"{k}-{v}" for k, v in meta.items())
    parts.append(ts())
    path = RAW / ("_".join(parts) + ".csv")
    df.to_csv(path, index=False)
    print("Saved", path)
    return path

def validate(df: pd.DataFrame, required):
    """Summarise basic data-quality facts about ``df``.

    Returns a dict with three keys:
      * ``"missing"``  - names from ``required`` that are not columns of ``df``
      * ``"shape"``    - ``df.shape``
      * ``"na_total"`` - total count of NA cells across the whole frame
    """
    absent = []
    for name in required:
        if name not in df.columns:
            absent.append(name)

    # isna().sum() gives a per-column NA count; summing that gives the total.
    per_column_nulls = df.isna().sum()

    report = {}
    report["missing"] = absent
    report["shape"] = df.shape
    report["na_total"] = int(per_column_nulls.sum())
    return report


# --- API ingestion: daily prices from yfinance ---
SYMBOL = "STX"  # Seagate Technology

# auto_adjust is passed explicitly rather than left to the library default:
# the default has differed between yfinance releases, and the column is
# labelled "adj_close" below, so "Close" must be the adjusted price regardless
# of the installed version. Being explicit also avoids the default-change
# warning that yfinance emits when the argument is omitted.
df_api = yf.download(
    SYMBOL,
    period="6mo",
    interval="1d",
    auto_adjust=True,
).reset_index()[["Date", "Close"]]
df_api.columns = ["date", "adj_close"]

# Sanity-check the two expected columns and report the NA count before saving.
v_api = validate(df_api, ["date", "adj_close"])
print("Validation (yfinance):", v_api)

# Persist in chronological order; the returned path is not needed here.
_ = save_csv(df_api.sort_values("date"), prefix="api", source="yfinance", symbol=SYMBOL)



# --- Scrape ingestion: HTML table from a public page ---
SCRAPE_URL = "https://finance.yahoo.com/most-active"
headers = {"User-Agent":"Mozilla/5.0 (compatible; AFE-Homework/1.0)"}


def _table_from_html(html: str) -> pd.DataFrame:
    """Build a DataFrame from every ``<tr>`` in ``html``.

    The text of each ``<th>``/``<td>`` cell is stripped; rows with no cells are
    dropped. The first remaining row supplies the column names and the rest
    become the data. Raises ``ValueError`` if there is no non-empty row.
    """
    soup = BeautifulSoup(html, "html.parser")
    rows = [
        [c.get_text(strip=True) for c in tr.find_all(["th", "td"])]
        for tr in soup.find_all("tr")
    ]
    header, *data = [r for r in rows if r]
    return pd.DataFrame(data, columns=header)


try:
    resp = requests.get(SCRAPE_URL, headers=headers, timeout=30)
    resp.raise_for_status()
    df_scrape = _table_from_html(resp.text)
except Exception as e:
    # Deliberate best-effort: any network, HTTP or parsing failure falls back
    # to a tiny built-in table so the rest of the cell can still run.
    # NOTE(review): the fallback frame is still saved below under the
    # site="yahoo"/table="most_active" label - confirm that is intended.
    print("Scrape failed, fallback to demo:", e)
    html = "<table><tr><th>Ticker</th><th>Price</th></tr><tr><td>AAA</td><td>101.2</td></tr></table>"
    df_scrape = _table_from_html(html)

# Convert columns that are entirely numeric; leave every other column as-is.
# This replaces the deprecated ``pd.to_numeric(..., errors="ignore")`` with the
# explicit try/except form, which keeps the same "convert if possible" result.
for col in df_scrape.columns:
    try:
        df_scrape[col] = pd.to_numeric(df_scrape[col])
    except (ValueError, TypeError):
        # Column holds non-numeric text (e.g. symbols, "12.3M"); keep it unchanged.
        pass

v_scrape = validate(df_scrape, list(df_scrape.columns))
print("Validation (scrape):", v_scrape)

_ = save_csv(df_scrape, prefix="scrape", site="yahoo", table="most_active")

# Last expression of the cell: shows a preview of both frames in the notebook.
df_api.head(), df_scrape.head()


  df_api = yf.download(SYMBOL, period="6mo", interval="1d").reset_index()[["Date","Close"]]
[*********************100%***********************]  1 of 1 completed


Validation (yfinance): {'missing': [], 'shape': (125, 2), 'na_total': 0}
Saved data\raw\api_source-yfinance_symbol-STX_20250820-232100.csv
Validation (scrape): {'missing': [], 'shape': (25, 12), 'na_total': 50}
Saved data\raw\scrape_site-yahoo_table-most_active_20250820-232101.csv


  df_scrape[col] = pd.to_numeric(df_scrape[col], errors="ignore")


(        date  adj_close
 0 2025-02-21  99.502449
 1 2025-02-24  98.742737
 2 2025-02-25  98.387543
 3 2025-02-26  98.930191
 4 2025-02-27  99.324852,
   Symbol                        Name                       Price  Change  \
 0   OPEN  Opendoor Technologies Inc. NaN  3.2200-0.4000(-11.05%)   -0.40   
 1   PLTR  Palantir Technologies Inc. NaN     156.01-1.74(-1.10%)   -1.74   
 2   NVDA          NVIDIA Corporation NaN     175.40-0.24(-0.14%)   -0.24   
 3   INTC           Intel Corporation NaN      23.54-1.77(-6.99%)   -1.77   
 4   SOFI     SoFi Technologies, Inc. NaN      22.52-0.23(-1.01%)   -0.23   
 
   Change %    Volume Avg Vol (3M) Market Cap P/E Ratio(TTM) 52 WkChange %  \
 0  -11.05%  239.019M     211.012M      2.37B             --       +59.41%   
 1   -1.10%  215.789M      76.305M   370.109B         520.03      +388.75%   
 2   -0.14%    213.1M     180.756M     4.278T          56.76       +41.75%   
 3   -6.99%  158.767M     100.132M   103.035B             --       +17.11