In [11]:
# --- Setup paths ---
import os, pathlib, datetime as dt
import pandas as pd
import yfinance as yf

# Resolve the project layout relative to the notebook's working directory.
ROOT = pathlib.Path("..").resolve()
DATA = ROOT.joinpath("data")
RAW = DATA.joinpath("raw")
PROC = DATA.joinpath("processed")

# Create both data folders up front; existing folders are left as they are.
for p in (RAW, PROC):
    p.mkdir(parents=True, exist_ok=True)

print("Data folders:", RAW, PROC)


Data folders: C:\Users\10341\bootcamp_Jiayuan_zhang\project\data\raw C:\Users\10341\bootcamp_Jiayuan_zhang\project\data\processed


In [12]:
# --- Utility timestamp function ---
def ts():
    """Return the current local time formatted as ``YYYYMMDD-HHMMSS``."""
    now = dt.datetime.now()
    return f"{now:%Y%m%d-%H%M%S}"


In [13]:
# --- Stage 04: Data Acquisition (Seagate stock prices) ---
TICKER = "STX"  # Seagate Technology

# auto_adjust is passed explicitly so the meaning of 'Close' does not depend on
# the installed yfinance version's default.
# NOTE(review): True is assumed to match the default of the version that
# produced the saved output (which warned on this call) — confirm.
raw_prices = yf.download(TICKER, period="6mo", interval="1d", auto_adjust=True)

# Fail here with a clear message rather than writing empty files downstream.
# NOTE(review): assumes a failed download yields an empty frame instead of
# raising — confirm for the installed yfinance version.
if raw_prices.empty:
    raise RuntimeError(f"No price data returned for {TICKER}")

# Keep only the trading date and closing price, under short lowercase names.
df = raw_prices.reset_index()[['Date', 'Close']]
df.columns = ['date', 'price']
df['ticker'] = TICKER

df.head()


  df = yf.download(TICKER, period="6mo", interval="1d").reset_index()[['Date','Close']]
[*********************100%***********************]  1 of 1 completed


Unnamed: 0,date,price,ticker
0,2025-02-21,99.502449,STX
1,2025-02-24,98.742737,STX
2,2025-02-25,98.387543,STX
3,2025-02-26,98.930191,STX
4,2025-02-27,99.324852,STX


In [14]:
# Write the downloaded frame to the raw folder under a timestamped name,
# without the positional index column.
csv_path = RAW.joinpath(f"{TICKER}_prices_{ts()}.csv")
df.to_csv(path_or_buf=csv_path, index=False)
print("Saved raw CSV:", csv_path)


Saved raw CSV: C:\Users\10341\bootcamp_Jiayuan_zhang\project\data\raw\STX_prices_20250820-234954.csv


In [15]:
# --- Stage 05: Data Storage & Validation ---
def validate_loaded(original, reloaded):
    """Check that a reloaded frame matches the saved one in shape and dtypes.

    Parameters
    ----------
    original : pandas.DataFrame
        The frame that was written to disk; only its ``shape`` is used.
    reloaded : pandas.DataFrame
        The frame read back from storage.

    Returns
    -------
    dict
        Boolean flags keyed by ``'shape_equal'``, ``'date_is_datetime'``,
        ``'price_is_numeric'`` and ``'ticker_is_string'``. A column that is
        absent from ``reloaded`` fails its check instead of raising.
    """
    def _column_passes(name, predicate):
        # A missing column is reported as a failed check, not a KeyError.
        if name not in reloaded.columns:
            return False
        return bool(predicate(reloaded[name]))

    def _is_text(column):
        # Object dtype is accepted as before; pandas' dedicated string dtypes
        # do not compare equal to 'object', so they are accepted explicitly.
        return column.dtype == 'object' or pd.api.types.is_string_dtype(column)

    return {
        'shape_equal': original.shape == reloaded.shape,
        'date_is_datetime': _column_passes('date', pd.api.types.is_datetime64_any_dtype),
        'price_is_numeric': _column_passes('price', pd.api.types.is_numeric_dtype),
        'ticker_is_string': _column_passes('ticker', _is_text),
    }


In [16]:
# Save CSV and Parquet to processed folder.
# The timestamp is taken once so both files share the same stem even if the
# clock crosses a second boundary between the two writes.
stamp = ts()
csv_proc = PROC / f"{TICKER}_prices_{stamp}.csv"
pq_proc  = PROC / f"{TICKER}_prices_{stamp}.parquet"

df.to_csv(csv_proc, index=False)
print("Saved processed CSV:", csv_proc)

# Parquet is best-effort: on any failure, report it and set pq_proc to None so
# later cells can tell that no Parquet file was written.
try:
    df.to_parquet(pq_proc)
    print("Saved processed Parquet:", pq_proc)
except Exception as e:
    print("Parquet save failed:", e)
    pq_proc = None


Saved processed CSV: C:\Users\10341\bootcamp_Jiayuan_zhang\project\data\processed\STX_prices_20250820-234954.csv
Saved processed Parquet: C:\Users\10341\bootcamp_Jiayuan_zhang\project\data\processed\STX_prices_20250820-234954.parquet


In [17]:
# Read the processed CSV back (parsing 'date' as datetimes) and report the checks.
df_csv = pd.read_csv(csv_proc, parse_dates=['date'])
csv_checks = validate_loaded(df, df_csv)
print("CSV Validation:", csv_checks)

# Do the same for Parquet, but only when the earlier save produced a file.
if pq_proc is not None:
    try:
        df_pq = pd.read_parquet(pq_proc)
    except Exception as e:
        print("Parquet read failed:", e)
    else:
        try:
            print("Parquet Validation:", validate_loaded(df, df_pq))
        except Exception as e:
            print("Parquet read failed:", e)


CSV Validation: {'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True, 'ticker_is_string': True}
Parquet Validation: {'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True, 'ticker_is_string': True}
