In [1]:
# build_step1_dataset.py
# -----------------------
# Step 1: Download price data from Yahoo Finance and compute daily returns
# Output: stock_prices.csv, stock_returns_long.csv and stock_returns_wide.csv (for import into Power BI)

import os
from datetime import datetime, timedelta
import pandas as pd
import yfinance as yf

# ========= CONFIG =========
# Ticker symbols whose prices are downloaded.
TICKERS = ["AAPL", "TSLA", "AMZN", "MSFT"]
# Date window: ends today and starts a little over five years earlier.
END = datetime.today().date()
START = END - timedelta(days=365*5 + 10)   # most recent 5 years, plus 10 extra days
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
BASE_DIR = "/Users/ngocanh/Documents/Buid/Project 3"
DATA_DIR = os.path.join(BASE_DIR, "data")
# Runs at import time: make sure the output directory exists before any CSV is written.
os.makedirs(DATA_DIR, exist_ok=True)
# ==========================

def fetch_prices(tickers, start, end):
    """Download daily prices from Yahoo Finance, one price column per ticker.

    The adjusted close ("Adj Close") is used when the download contains it;
    otherwise the plain "Close" column is used.

    Args:
        tickers: A ticker symbol or a list of ticker symbols.
        start: First date to request.
        end: Last date to request (a ``date``/``datetime``); one day is added
            before calling yfinance because its ``end`` bound is exclusive.

    Returns:
        DataFrame indexed by "Date" with one column per ticker. Rows in which
        every ticker is missing are dropped.

    Raises:
        KeyError: If the download has neither "Adj Close" nor "Close".
    """
    df = yf.download(
        tickers=tickers,
        start=start,
        end=end + timedelta(days=1),
        auto_adjust=False,
        progress=False,
    )

    has_multi_columns = isinstance(df.columns, pd.MultiIndex)
    fields = df.columns.get_level_values(0) if has_multi_columns else df.columns
    field = "Adj Close" if "Adj Close" in fields else "Close"

    if has_multi_columns:
        # (field, ticker) columns: selecting the field leaves one column per ticker.
        prices = df[field].copy()
    else:
        # Flat columns (Open/High/Low/Close/...): keep ONLY the chosen price
        # field. Renaming the first column and returning the whole frame would
        # leak every OHLCV column into the "prices" table.
        if isinstance(tickers, str):
            col_name = tickers
        elif isinstance(tickers, (list, tuple)) and len(tickers) == 1:
            col_name = tickers[0]
        else:
            col_name = "Price"
        prices = df[[field]].rename(columns={field: col_name})

    prices = prices.dropna(how="all")
    prices.index.name = "Date"
    return prices

def compute_returns(prices_wide):
    """Compute period-over-period percentage returns from a wide price table.

    Args:
        prices_wide: DataFrame with one column per ticker and a date-like
            index (any index name; it is exposed as "Date" in the output).
            The input frame is not modified.

    Returns:
        Tuple ``(returns_long, returns_wide)``:
        - ``returns_long``: columns "Date", "Ticker", "Return" (one row per
          date/ticker pair).
        - ``returns_wide``: same layout as the input, indexed by "Date", with
          rows where every ticker's return is missing removed (this includes
          the first date, which has no previous price).
    """
    prices = prices_wide.copy()
    prices.index = pd.to_datetime(prices.index)
    # melt() below keys on "Date"; name the index here so an unnamed or
    # differently-named input index does not raise KeyError.
    prices.index.name = "Date"
    prices = prices.sort_index()

    # fill_method=None: do not forward-fill missing prices before differencing.
    # The implicit pad is deprecated in pandas and would report a multi-day
    # move as a single-day return after a gap.
    returns_wide = prices.pct_change(fill_method=None).dropna(how="all")

    returns_long = returns_wide.reset_index().melt(
        id_vars="Date", var_name="Ticker", value_name="Return"
    )
    return returns_long, returns_wide

def main():
    """Download prices, derive returns, and write all three CSV outputs to DATA_DIR."""
    print(f"[INFO] Downloading prices for {TICKERS} from {START} to {END}")
    price_table = fetch_prices(TICKERS, START, END)

    # Persist the raw price table.
    prices_out = os.path.join(DATA_DIR, "stock_prices.csv")
    price_table.to_csv(prices_out, index=True)
    print(f"[OK] Saved prices -> {prices_out}")

    # Derive returns and persist both layouts; only the wide one keeps its index.
    long_returns, wide_returns = compute_returns(price_table)
    outputs = (
        (long_returns, "stock_returns_long.csv", False),
        (wide_returns, "stock_returns_wide.csv", True),
    )
    for frame, filename, keep_index in outputs:
        frame.to_csv(os.path.join(DATA_DIR, filename), index=keep_index)
    print("[OK] Saved returns (long & wide)")

    print("\n[DONE] Step 1 complete. Files ready:")
    summary_lines = (
        " - data/stock_prices.csv",
        " - data/stock_returns_long.csv",
        " - data/stock_returns_wide.csv",
    )
    for line in summary_lines:
        print(line)

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

[INFO] Downloading prices for ['AAPL', 'TSLA', 'AMZN', 'MSFT'] from 2020-09-14 to 2025-09-23
[OK] Saved prices -> /Users/ngocanh/Documents/Buid/Project 3/data/stock_prices.csv
[OK] Saved returns (long & wide)

[DONE] Step 1 complete. Files ready:
 - data/stock_prices.csv
 - data/stock_returns_long.csv
 - data/stock_returns_wide.csv
