<a href="https://colab.research.google.com/github/Leo-xxx12/bootcamp_Leo_Xu/blob/main/Copy_of_stage05_data_storage_homework_starter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Homework Starter — Stage 05: Data Storage
Name:
Date:

Objectives:
- Env-driven paths to `data/raw/` and `data/processed/`
- Save CSV and Parquet; reload and validate
- Abstract IO with utility functions; document choices

In [2]:
import os, pathlib, datetime as dt
import pandas as pd
from dotenv import load_dotenv

# load_dotenv() runs first so that any DATA_DIR_* values it provides are
# visible to the environment lookups below.
load_dotenv()

# Storage roots: environment override first, repo-relative default otherwise.
RAW = pathlib.Path(os.environ.get('DATA_DIR_RAW', 'data/raw'))
PROC = pathlib.Path(os.environ.get('DATA_DIR_PROCESSED', 'data/processed'))

# Create both folders (no error if they already exist) before reporting them.
for _folder in (RAW, PROC):
    _folder.mkdir(parents=True, exist_ok=True)

# Echo the absolute locations so the reader can see where files will land.
for _label, _folder in (('RAW ->', RAW), ('PROC ->', PROC)):
    print(_label, _folder.resolve())

RAW -> /content/data/raw
PROC -> /content/data/processed


## 1) Create or Load a Sample DataFrame
You may reuse data from prior stages or create a small synthetic dataset.

In [3]:
import numpy as np
# Seed the generator so Restart & Run All reproduces the same synthetic prices.
SEED = 42
rng = np.random.default_rng(SEED)

# 20 daily observations of a single ticker; price is a random walk around 150.
dates = pd.date_range('2024-01-01', periods=20, freq='D')
df = pd.DataFrame({
    'date': dates,
    'ticker': ['AAPL'] * len(dates),
    'price': 150 + rng.standard_normal(len(dates)).cumsum(),
})
df.head()

Unnamed: 0,date,ticker,price
0,2024-01-01,AAPL,150.532656
1,2024-01-02,AAPL,148.459946
2,2024-01-03,AAPL,148.490883
3,2024-01-04,AAPL,149.079034
4,2024-01-05,AAPL,148.77038


## 2) Save CSV to data/raw/ and Parquet to data/processed/ (TODO)
- Use timestamped filenames.
- Handle missing Parquet engine gracefully.

In [4]:
def ts():
    """Return the current local time as a 'YYYYMMDD-HHMMSS' string for filenames."""
    stamp_format = '%Y%m%d-%H%M%S'
    now = dt.datetime.now()
    return now.strftime(stamp_format)

# Take the timestamp once so both files written by this cell share a suffix
# (two separate ts() calls could straddle a second boundary).
stamp = ts()

# Save CSV to the raw folder; the index is not written.
csv_path = RAW / f"sample_{stamp}.csv"
df.to_csv(csv_path, index=False)

# Save Parquet to the processed folder. pq_path is reset to None on failure so
# later cells can skip the Parquet steps.
pq_path = PROC / f"sample_{stamp}.parquet"
try:
    df.to_parquet(pq_path)
except ImportError:
    # pandas raises ImportError when no Parquet engine can be imported.
    print('Parquet engine not available. Install pyarrow or fastparquet to complete this step.')
    pq_path = None
except Exception as e:
    # Any other failure is not an engine problem; report the real cause.
    print('Parquet write failed:', e)
    pq_path = None
pq_path

PosixPath('data/processed/sample_20250821-024805.parquet')

In [5]:
import datetime as dt
from pathlib import Path

# Read the same DATA_DIR_* environment variables as the setup cell instead of
# hardcoding the locations; the literals are only fallbacks for unset variables.
RAW = Path(os.getenv("DATA_DIR_RAW", "project/data/raw"))
PROC = Path(os.getenv("DATA_DIR_PROCESSED", "project/data/processed"))
RAW.mkdir(parents=True, exist_ok=True)
PROC.mkdir(parents=True, exist_ok=True)

def ts():
    """Build a filename timestamp (YYYYMMDD-HHMMSS) from the current local time."""
    return format(dt.datetime.now(), '%Y%m%d-%H%M%S')

# Take the timestamp once so the CSV and Parquet written here share a suffix
# (two separate ts() calls could straddle a second boundary).
stamp = ts()

# --- Save CSV ---
csv_path = RAW / f"sample_{stamp}.csv"
df.to_csv(csv_path, index=False)
print(f"Saved CSV → {csv_path}")

# --- Save Parquet ---
# pq_path is reset to None on failure so later cells can skip the Parquet steps.
pq_path = PROC / f"sample_{stamp}.parquet"
try:
    df.to_parquet(pq_path, index=False)
    print(f"Saved Parquet → {pq_path}")
except ImportError:
    # pandas raises ImportError when no Parquet engine can be imported.
    print("⚠️ Parquet engine not available. Install with:")
    print("   pip install pyarrow   OR   pip install fastparquet")
    pq_path = None
except Exception as e:
    # Any other failure is not an engine problem; report the real cause.
    print(f"⚠️ Parquet write failed: {e}")
    pq_path = None


Saved CSV → project/data/raw/sample_20250821-024808.csv
Saved Parquet → project/data/processed/sample_20250821-024808.parquet


## 3) Reload and Validate (TODO)
- Compare shapes and key dtypes.

In [6]:
def validate_loaded(original, reloaded):
    """Run three basic round-trip checks on a reloaded DataFrame.

    Returns a dict with:
      - 'shape_equal': both frames have the same (rows, columns) shape.
      - 'date_is_datetime': reloaded has a 'date' column with a datetime dtype.
      - 'price_is_numeric': reloaded has a 'price' column with a numeric dtype.
    A missing 'date' or 'price' column makes its check False.
    """
    dtype_tests = pd.api.types
    has_date = 'date' in reloaded.columns
    has_price = 'price' in reloaded.columns

    date_ok = dtype_tests.is_datetime64_any_dtype(reloaded['date']) if has_date else False
    price_ok = dtype_tests.is_numeric_dtype(reloaded['price']) if has_price else False

    return {
        'shape_equal': original.shape == reloaded.shape,
        'date_is_datetime': date_ok,
        'price_is_numeric': price_ok,
    }

# Reload the CSV written earlier, asking pandas to parse 'date' as datetimes.
df_csv = pd.read_csv(filepath_or_buffer=csv_path, parse_dates=['date'])
# The checks dict is the cell's last expression, so the notebook renders it.
validate_loaded(original=df, reloaded=df_csv)

{'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True}

In [7]:
# Only attempt the Parquet round trip if the save step produced a file
# (pq_path is set to None when the Parquet write failed).
if pq_path:
    try:
        df_pq = pd.read_parquet(pq_path)
    except Exception as e:
        print('Parquet read failed:', e)
    else:
        # An expression nested inside if/try is not auto-displayed by the
        # notebook, so print the checks explicitly. Validation runs outside the
        # try so its own errors are not mislabelled as read failures.
        print('Parquet checks:', validate_loaded(df, df_pq))

In [8]:
import pandas as pd
import numpy as np

def validate_loaded(original: pd.DataFrame, reloaded: pd.DataFrame, *, rtol=1e-7, atol=1e-9):
    """Compare a reloaded DataFrame against the original it was written from.

    Args:
        original: the frame that was saved.
        reloaded: the frame read back from disk.
        rtol, atol: tolerances passed to ``np.allclose`` for numeric columns.

    Returns:
        ``(checks, diffs)`` where ``checks`` maps check name -> bool
        (``shape_equal``, ``columns_equal``, ``dtype_classes_match``,
        ``nan_pattern_equal``, ``values_match``) and ``diffs`` is a small
        report of what differed (column lists are capped at five entries).
    """
    checks = {}

    # Per-column comparisons only make sense for columns present in both
    # frames; take them in the original's order.
    common_cols = [c for c in original.columns if c in reloaded.columns]
    o = original[common_cols]
    r = reloaded[common_cols]
    same_length = len(o) == len(r)

    # Structural checks use the FULL frames. Comparing the common-column views
    # instead would make columns_equal always True and hide extra/missing columns.
    checks["shape_equal"]   = original.shape == reloaded.shape
    checks["columns_equal"] = list(original.columns) == list(reloaded.columns)

    # Dtypes (relaxed: numeric<->numeric and datetime<->datetime)
    def _dtype_class(s):
        # bool is tested first because pandas also reports bool as numeric,
        # which would otherwise make the "bool" class unreachable.
        if pd.api.types.is_bool_dtype(s): return "bool"
        if pd.api.types.is_numeric_dtype(s): return "numeric"
        if pd.api.types.is_datetime64_any_dtype(s): return "datetime"
        return "other"

    dtype_pairs = [(_dtype_class(o[c]), _dtype_class(r[c])) for c in common_cols]
    checks["dtype_classes_match"] = all(a == b for a, b in dtype_pairs)

    # NaN pattern (Series.equals is False when lengths differ, so no guard needed)
    nan_diff_cols = [c for c in common_cols if not (o[c].isna().equals(r[c].isna()))]
    checks["nan_pattern_equal"] = (len(nan_diff_cols) == 0)

    # Value equality
    float_comparable = ("numeric", "bool")
    value_mismatch_cols = []
    for c, (o_cls, r_cls) in zip(common_cols, dtype_pairs):
        if not same_length:
            # np.allclose would raise on arrays of different length.
            ok = False
        elif o_cls in float_comparable and r_cls in float_comparable:
            # Tolerant comparison, used only when BOTH sides convert to float.
            ok = np.allclose(
                o[c].to_numpy(dtype=float),
                r[c].to_numpy(dtype=float),
                rtol=rtol, atol=atol, equal_nan=True,
            )
        else:
            ok = o[c].equals(r[c])
        if not ok:
            value_mismatch_cols.append(c)
    checks["values_match"] = (len(value_mismatch_cols) == 0)

    # Small diff report
    diffs = {
        "nan_pattern_diff_cols": nan_diff_cols[:5],           # show a few
        "value_mismatch_cols": value_mismatch_cols[:5],
        "dtype_class_pairs": dict(zip(common_cols, dtype_pairs)),
        "original_only_cols": [c for c in original.columns if c not in reloaded.columns],
        "reloaded_only_cols": [c for c in reloaded.columns if c not in original.columns],
        "original_shape": original.shape,
        "reloaded_shape": reloaded.shape,
    }
    return checks, diffs

# ---- CSV round trip: only request date parsing when the frame has a 'date' column
csv_read_kwargs = {"parse_dates": ["date"]} if "date" in df.columns else {}

df_csv = pd.read_csv(csv_path, **csv_read_kwargs)
checks_csv, diffs_csv = validate_loaded(df, df_csv)
print("CSV checks:", checks_csv)
# Show the diff report only when at least one check failed.
if any(not passed for passed in checks_csv.values()):
    print("CSV diffs:", diffs_csv)

# ---- Parquet round trip: skipped when no Parquet file was written (pq_path falsy)
if pq_path:
    try:
        df_pq = pd.read_parquet(pq_path)
        checks_pq, diffs_pq = validate_loaded(df, df_pq)
        print("\nParquet checks:", checks_pq)
        if any(not passed for passed in checks_pq.values()):
            print("Parquet diffs:", diffs_pq)
    except Exception as err:
        print("Parquet read failed:", err)


CSV checks: {'shape_equal': True, 'columns_equal': True, 'dtype_classes_match': True, 'nan_pattern_equal': True, 'values_match': True}

Parquet checks: {'shape_equal': True, 'columns_equal': True, 'dtype_classes_match': True, 'nan_pattern_equal': True, 'values_match': True}


## 4) Utilities (TODO)
- Implement `detect_format`, `write_df`, `read_df`.
- Route on the file suffix; create parent directories if needed; raise a friendly error when no Parquet engine is installed.

In [9]:
import typing as t, pathlib

def detect_format(path: t.Union[str, pathlib.Path]):
    """Map a file path to 'csv' or 'parquet' from its (case-insensitive) ending.

    Raises ValueError for any other ending; the message includes the
    lowercased path.
    """
    lowered = str(path).lower()
    suffix_table = (
        ('csv', ('.csv',)),
        ('parquet', ('.parquet', '.pq', '.parq')),
    )
    for fmt, endings in suffix_table:
        if lowered.endswith(endings):
            return fmt
    raise ValueError('Unsupported format: ' + lowered)

def write_df(df: pd.DataFrame, path: t.Union[str, pathlib.Path]):
    """Write ``df`` to ``path`` as CSV or Parquet, chosen by the file suffix.

    Parent directories are created if needed. CSV is written without the index.

    Returns:
        The destination as a ``pathlib.Path``.

    Raises:
        ValueError: from ``detect_format`` when the suffix is unsupported.
        RuntimeError: when the Parquet write fails. The message distinguishes
            a missing engine from any other write error.
    """
    p = pathlib.Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)
    fmt = detect_format(p)
    if fmt == 'csv':
        df.to_csv(p, index=False)
        return p
    try:
        df.to_parquet(p)
    except ImportError as e:
        # pandas raises ImportError when no Parquet engine can be imported.
        raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
    except Exception as e:
        # Still a RuntimeError so existing handlers keep working, but the
        # message reports the real cause rather than blaming the engine.
        raise RuntimeError(f'Failed to write Parquet file {p}: {e}') from e
    return p

def read_df(path: t.Union[str, pathlib.Path]):
    """Read a CSV or Parquet file into a DataFrame, chosen by the file suffix.

    For CSV, a 'date' column is parsed as datetimes when the header has one.

    Raises:
        ValueError: from ``detect_format`` when the suffix is unsupported.
        RuntimeError: when the Parquet read fails. The message distinguishes
            a missing engine from any other read error.
    """
    p = pathlib.Path(path)
    fmt = detect_format(p)
    if fmt == 'csv':
        # Read only the header row to learn whether there is a 'date' column.
        has_date = 'date' in pd.read_csv(p, nrows=0).columns
        return pd.read_csv(p, parse_dates=['date']) if has_date else pd.read_csv(p)
    try:
        return pd.read_parquet(p)
    except ImportError as e:
        # pandas raises ImportError when no Parquet engine can be imported.
        raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
    except Exception as e:
        # Still a RuntimeError so existing handlers keep working, but the
        # message reports the real cause (e.g. a missing or corrupt file).
        raise RuntimeError(f'Failed to read Parquet file {p}: {e}') from e

# Demo: round-trip the sample frame through both formats and show the result.
p_csv = RAW / f"util_{ts()}.csv"
p_pq  = PROC / f"util_{ts()}.parquet"

write_df(df, p_csv)
# Print explicitly: an expression that is not the cell's last statement is
# evaluated but never displayed by the notebook.
print(read_df(p_csv).head())

try:
    write_df(df, p_pq)
    print(read_df(p_pq).head())
except RuntimeError as e:
    print('Skipping Parquet util demo:', e)

In [None]:
from __future__ import annotations
import typing as t
from pathlib import Path
import pandas as pd

PathLike = t.Union[str, Path]


def detect_format(path: PathLike) -> str:
    """
    Classify a path as 'csv' or 'parquet' from its case-insensitive ending.

    CSV endings: .csv, .csv.gz
    Parquet endings: .parquet, .parquet.gz, .pq, .parq

    Raises ValueError for any other ending.
    """
    lowered = str(path).lower()
    suffix_table = {
        "csv": (".csv", ".csv.gz"),
        "parquet": (".parquet", ".parquet.gz", ".pq", ".parq"),
    }
    for fmt, endings in suffix_table.items():
        if lowered.endswith(endings):
            return fmt
    raise ValueError(f"Unsupported format for {path!s}")

def _ensure_parquet_engine() -> None:
    """
    Ensure a parquet engine (pyarrow or fastparquet) is importable.
    """
    try:
        import pyarrow  # noqa: F401
    except Exception:
        try:
            import fastparquet  # noqa: F401
        except Exception as e:
            raise RuntimeError(
                "Parquet engine not available. Install one of:\n"
                "  pip install pyarrow\n"
                "  # or\n"
                "  pip install fastparquet"
            ) from e

def write_df(
    df: pd.DataFrame,
    path: PathLike,
    *,
    index: bool = False,
    csv_kwargs: dict | None = None,
    parquet_kwargs: dict | None = None,
) -> Path:
    """
    Save ``df`` at ``path`` as CSV or Parquet (chosen by suffix) and return the Path.

    Parent directories are created first. ``index`` is passed to the writer and
    can be overridden by an 'index' entry in ``csv_kwargs`` / ``parquet_kwargs``;
    those dicts are forwarded to ``to_csv`` / ``to_parquet`` respectively.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)

    if detect_format(target) == "csv":
        # Caller-supplied options win over the default index setting.
        writer_options = {"index": index, **(csv_kwargs or {})}
        df.to_csv(target, **writer_options)
        return target

    # Parquet: verify an engine is importable before attempting the write.
    _ensure_parquet_engine()
    writer_options = {"index": index, **(parquet_kwargs or {})}
    df.to_parquet(target, **writer_options)
    return target

def read_df(
    path: PathLike,
    *,
    parse_date_candidates: t.Iterable[str] = ("date", "datetime", "timestamp"),
    csv_kwargs: dict | None = None,
    parquet_kwargs: dict | None = None,
) -> pd.DataFrame:
    """
    Read CSV/Parquet. For CSV, auto-parse common date columns if present.

    Args:
        path: file to read; the format is chosen by ``detect_format``.
        parse_date_candidates: column names to parse as dates when they appear
            in the CSV header. Ignored if ``csv_kwargs`` already sets
            ``parse_dates``.
        csv_kwargs: extra keyword arguments forwarded to ``pd.read_csv``.
        parquet_kwargs: extra keyword arguments forwarded to ``pd.read_parquet``.

    Raises:
        ValueError: from ``detect_format`` when the suffix is unsupported.
        RuntimeError: from ``_ensure_parquet_engine`` when no Parquet engine
            can be imported.
    """
    p = Path(path)
    fmt = detect_format(p)

    if fmt == "csv":
        kw = dict(csv_kwargs or {})
        if "parse_dates" not in kw:
            # Peek at the header with the caller's own parsing options (sep,
            # encoding, usecols, ...) so detection sees the same columns as the
            # real read. Options that change the return type, clash with nrows,
            # or move a column into the index are left out of the peek.
            peek_excluded = {"nrows", "chunksize", "iterator", "skipfooter", "index_col"}
            peek_kw = {k: v for k, v in kw.items() if k not in peek_excluded}
            header = pd.read_csv(p, **peek_kw, nrows=0)
            cols = set(header.columns)
            parse_dates = [c for c in parse_date_candidates if c in cols]
            if parse_dates:
                kw["parse_dates"] = parse_dates
        return pd.read_csv(p, **kw)

    _ensure_parquet_engine()
    kw = dict(parquet_kwargs or {})
    return pd.read_parquet(p, **kw)


## 5) Documentation (TODO)
- Update README with a **Data Storage** section (folders, formats, env usage).
- Summarize validation checks and any assumptions.