In [1]:
# Stage 05: Data Storage
# -----------------------
# Store Seagate (STX) sample data into CSV and Parquet

import numpy as np
import pandas as pd
import pathlib, datetime as dt
import typing as t

# --- Paths (relative to notebooks/) ---
# Raw outputs (CSV) and processed outputs (Parquet) live in sibling folders;
# both are created up front so later writes cannot fail on a missing directory.
RAW = pathlib.Path("../data/raw")
RAW.mkdir(parents=True, exist_ok=True)

PROC = pathlib.Path("../data/processed")
PROC.mkdir(parents=True, exist_ok=True)

def ts():
    """Return the current local time as a ``YYYYMMDD-HHMMSS`` string.

    Used to build filenames; resolution is one second.
    """
    now = dt.datetime.now()
    return now.strftime("%Y%m%d-%H%M%S")

# --- Step 1: Generate sample data for STX (Seagate) ---
# A fixed seed makes the synthetic prices identical on every
# "Restart Kernel -> Run All", so saved files and validations are reproducible.
SEED = 42
N_DAYS = 20  # number of consecutive daily rows to generate
rng = np.random.default_rng(SEED)

dates = pd.date_range("2024-01-01", periods=N_DAYS, freq="D")
df = pd.DataFrame({
    "date": dates,
    "ticker": ["STX"] * N_DAYS,
    # Random walk starting near 80: cumulative sum of standard-normal steps.
    "price": 80 + rng.standard_normal(N_DAYS).cumsum(),
})
df.head()


Unnamed: 0,date,ticker,price
0,2024-01-01,STX,80.871492
1,2024-01-02,STX,80.875791
2,2024-01-03,STX,81.315401
3,2024-01-04,STX,81.631618
4,2024-01-05,STX,82.622357


In [2]:
# --- Step 2: Save CSV ---
# Take the timestamp once so both files written by this cell share the same
# suffix, even if the clock ticks over between the two writes.
run_stamp = ts()
csv_path = RAW / f"sample_{run_stamp}.csv"
df.to_csv(csv_path, index=False)
print("Saved CSV:", csv_path)

# --- Step 3: Save Parquet ---
# Best effort: a failure here must not stop the notebook, but pq_path is reset
# to None so later cells can tell that no Parquet file was produced.
pq_path = PROC / f"sample_{run_stamp}.parquet"
try:
    df.to_parquet(pq_path)
    print("Saved Parquet:", pq_path)
except ImportError:
    # pandas raises ImportError when neither pyarrow nor fastparquet is installed.
    print("Parquet engine not available. Install pyarrow or fastparquet to complete this step.")
    pq_path = None
except Exception as e:
    # Any other failure (permissions, disk, unsupported dtype, ...) is reported
    # with its real cause instead of being mislabelled as a missing engine.
    print("Parquet write failed:", e)
    pq_path = None


Saved CSV: ..\data\raw\sample_20250820-234113.csv
Saved Parquet: ..\data\processed\sample_20250820-234113.parquet


In [3]:
# --- Step 4: Validation after reload ---
def validate_loaded(original, reloaded):
    """Compare a reloaded frame against the frame it was saved from.

    Parameters
    ----------
    original : pd.DataFrame
        The in-memory frame that was written to disk.
    reloaded : pd.DataFrame
        The frame read back from disk.

    Returns
    -------
    dict
        Three boolean checks:
        ``shape_equal`` - both frames have the same (rows, columns) shape;
        ``date_is_datetime`` - ``reloaded`` has a ``date`` column with a datetime dtype;
        ``price_is_numeric`` - ``reloaded`` has a ``price`` column with a numeric dtype.
        A missing column makes its check ``False``.
    """
    has_date = "date" in reloaded.columns
    has_price = "price" in reloaded.columns

    same_shape = original.shape == reloaded.shape
    # `and` short-circuits, so the column is only accessed when it exists.
    date_ok = has_date and pd.api.types.is_datetime64_any_dtype(reloaded["date"])
    price_ok = has_price and pd.api.types.is_numeric_dtype(reloaded["price"])

    return {
        "shape_equal": same_shape,
        "date_is_datetime": date_ok,
        "price_is_numeric": price_ok,
    }

# Reload the CSV (parsing "date" back into datetimes) and compare it with the
# in-memory frame.
df_csv = pd.read_csv(csv_path, parse_dates=["date"])
csv_checks = validate_loaded(df, df_csv)
print("Validation CSV:", csv_checks)

# pq_path is falsy when no Parquet file is available to check; in that case
# the Parquet validation is skipped entirely.
if pq_path:
    try:
        df_pq = pd.read_parquet(pq_path)
        pq_checks = validate_loaded(df, df_pq)
        print("Validation Parquet:", pq_checks)
    except Exception as err:
        print("Parquet read failed:", err)


Validation CSV: {'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True}
Validation Parquet: {'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True}


In [4]:
# --- Step 5: Generalized Utility Functions ---
def detect_format(path: t.Union[str, pathlib.Path]):
    """Classify a path as ``"csv"`` or ``"parquet"`` from its file-name ending.

    The comparison is case-insensitive. ``.csv`` maps to ``"csv"``;
    ``.parquet``, ``.pq`` and ``.parq`` map to ``"parquet"``.

    Raises
    ------
    ValueError
        If the path ends with none of the recognised extensions. The message
        contains the lower-cased path.
    """
    lowered = str(path).lower()

    if lowered.endswith(".csv"):
        return "csv"

    parquet_endings = (".parquet", ".pq", ".parq")
    if lowered.endswith(parquet_endings):
        return "parquet"

    raise ValueError("Unsupported format: " + lowered)

def write_df(df: pd.DataFrame, path: t.Union[str, pathlib.Path]) -> pathlib.Path:
    """Write ``df`` to ``path`` as CSV or Parquet, chosen by file extension.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to persist. CSV output is written without the index.
    path : str or pathlib.Path
        Destination file. Missing parent directories are created.

    Returns
    -------
    pathlib.Path
        The destination as a ``Path``.

    Raises
    ------
    ValueError
        If the extension is not supported (raised by ``detect_format``).
    RuntimeError
        If the Parquet write fails. The message distinguishes a missing
        Parquet engine from any other failure; the original exception is
        chained as ``__cause__``.
    """
    p = pathlib.Path(path)
    # Validate the extension first so an unsupported path does not leave
    # freshly created directories behind.
    fmt = detect_format(p)
    p.parent.mkdir(parents=True, exist_ok=True)

    if fmt == "csv":
        df.to_csv(p, index=False)
        return p

    try:
        df.to_parquet(p)
    except ImportError as e:
        # pandas raises ImportError when neither pyarrow nor fastparquet is installed.
        raise RuntimeError("Parquet engine not available. Install pyarrow or fastparquet.") from e
    except Exception as e:
        # Keep RuntimeError as the raised type for existing callers, but report
        # the real cause rather than blaming a missing engine.
        raise RuntimeError(f"Failed to write Parquet file {p}: {e}") from e
    return p

def read_df(
    path: t.Union[str, pathlib.Path],
    parse_dates: t.Sequence[str] = ("date",),
) -> pd.DataFrame:
    """Read a CSV or Parquet file into a DataFrame, chosen by file extension.

    Parameters
    ----------
    path : str or pathlib.Path
        File to read.
    parse_dates : sequence of str, default ``("date",)``
        Column names to parse as datetimes when reading CSV. Names that are
        not present in the file are ignored. Not used for Parquet.

    Returns
    -------
    pd.DataFrame

    Raises
    ------
    ValueError
        If the extension is not supported (raised by ``detect_format``).
    RuntimeError
        If the Parquet read fails. The message distinguishes a missing
        Parquet engine from any other failure; the original exception is
        chained as ``__cause__``.
    """
    p = pathlib.Path(path)
    fmt = detect_format(p)

    if fmt == "csv":
        # Read only the header row first to find out which of the requested
        # date columns actually exist in this file.
        header = pd.read_csv(p, nrows=0).columns
        date_cols = [col for col in parse_dates if col in header]
        if date_cols:
            return pd.read_csv(p, parse_dates=date_cols)
        return pd.read_csv(p)

    try:
        return pd.read_parquet(p)
    except ImportError as e:
        # pandas raises ImportError when neither pyarrow nor fastparquet is installed.
        raise RuntimeError("Parquet engine not available. Install pyarrow or fastparquet.") from e
    except Exception as e:
        # Keep RuntimeError as the raised type for existing callers, but report
        # the real cause (missing file, corrupt data, ...) instead of a missing engine.
        raise RuntimeError(f"Failed to read Parquet file {p}: {e}") from e


In [5]:
# --- Step 6: Demo of Utilities ---
# Timestamped targets: CSV goes under RAW, Parquet under PROC.
p_csv = RAW / f"util_{ts()}.csv"
p_pq = PROC / f"util_{ts()}.parquet"

# CSV round trip: write the frame, read it back, show the first rows.
write_df(df, p_csv)
csv_preview = read_df(p_csv).head()
print("Utility CSV loaded:\n", csv_preview)

# Parquet round trip: a RuntimeError from the utilities is reported and the
# demo is skipped rather than stopping the notebook.
try:
    write_df(df, p_pq)
    pq_preview = read_df(p_pq).head()
    print("Utility Parquet loaded:\n", pq_preview)
except RuntimeError as err:
    print("Skipping Parquet util demo:", err)


Utility CSV loaded:
         date ticker      price
0 2024-01-01    STX  80.871492
1 2024-01-02    STX  80.875791
2 2024-01-03    STX  81.315401
3 2024-01-04    STX  81.631618
4 2024-01-05    STX  82.622357
Utility Parquet loaded:
         date ticker      price
0 2024-01-01    STX  80.871492
1 2024-01-02    STX  80.875791
2 2024-01-03    STX  81.315401
3 2024-01-04    STX  81.631618
4 2024-01-05    STX  82.622357
