# Homework Starter — Stage 05: Data Storage
Name: 
Date: 

Objectives:
- Env-driven paths to `data/raw/` and `data/processed/`
- Save CSV and Parquet; reload and validate
- Abstract IO with utility functions; document choices

In [3]:
import os, pathlib, datetime as dt
import pandas as pd
from dotenv import load_dotenv

# Load variables from a .env file (python-dotenv) before reading the
# environment below.
load_dotenv()

# Storage roots: use the environment variable when set, otherwise fall back
# to the relative defaults.
RAW = pathlib.Path(os.environ.get('DATA_DIR_RAW', 'data/raw'))
PROC = pathlib.Path(os.environ.get('DATA_DIR_PROCESSED', 'data/processed'))

# Create both folders (and any missing parents); existing folders are fine.
for _folder in (RAW, PROC):
    _folder.mkdir(parents=True, exist_ok=True)

# Echo the absolute locations so the reader can see where files will land.
for _label, _folder in (('RAW ->', RAW), ('PROC ->', PROC)):
    print(_label, _folder.resolve())

RAW -> C:\Users\noven\Desktop\Github Fre-5040 Bootcamp\bootcamp_Junkuang_Lai\homework\homework5\data\raw
PROC -> C:\Users\noven\Desktop\Github Fre-5040 Bootcamp\bootcamp_Junkuang_Lai\homework\homework5\data\processed


## 1) Create or Load a Sample DataFrame
You may reuse data from prior stages or create a small synthetic dataset.

In [4]:
import numpy as np
# Twenty consecutive calendar days starting 2024-01-01.
n_days = 20
dates = pd.date_range('2024-01-01', periods=n_days, freq='D')

# Synthetic price path: cumulative sum of standard-normal steps offset by 150.
# The generator is not seeded, so the prices differ on every run.
steps = np.random.randn(n_days)
df = pd.DataFrame({
    'date': dates,
    'ticker': ['NVDA'] * n_days,
    'price': 150 + np.cumsum(steps),
})
df.head()

Unnamed: 0,date,ticker,price
0,2024-01-01,NVDA,150.715531
1,2024-01-02,NVDA,152.121055
2,2024-01-03,NVDA,153.861056
3,2024-01-04,NVDA,154.006548
4,2024-01-05,NVDA,154.260478


## 2) Save CSV to data/raw/ and Parquet to data/processed/ (TODO)
- Use timestamped filenames.
- Handle missing Parquet engine gracefully.

In [5]:
def ts():
    """Return the current local time formatted as ``YYYYMMDD-HHMMSS``.

    Used to build timestamped file names. The resolution is one second, so
    two calls within the same second return the same string.
    """
    now = dt.datetime.now()
    return now.strftime('%Y%m%d-%H%M%S')

# Save CSV: timestamped snapshot under the raw-data folder, without the index.
csv_path = RAW / f"sample_{ts()}.csv"
df.to_csv(csv_path, index=False)

# Save Parquet: timestamped copy under the processed-data folder.
# pq_path is reset to None on failure so later cells can skip the reload.
pq_path = PROC / f"sample_{ts()}.parquet"
try:
    df.to_parquet(pq_path)
except ImportError:
    # pandas raises ImportError when no usable Parquet engine is installed.
    print('Parquet engine not available. Install pyarrow or fastparquet to complete this step.')
    pq_path = None
except Exception as e:
    # Any other failure (permissions, unsupported dtype, ...) is not an
    # engine problem; report the real cause instead of a misleading hint.
    print('Parquet write failed:', e)
    pq_path = None
pq_path

WindowsPath('data/processed/sample_20250818-094116.parquet')

## 3) Reload and Validate (TODO)
- Compare shapes and key dtypes.

In [6]:
def validate_loaded(original, reloaded):
    """Run basic round-trip checks on a reloaded DataFrame.

    Returns a dict with three boolean entries:

    * ``shape_equal`` -- both frames have the same ``.shape``.
    * ``date_is_datetime`` -- ``reloaded`` has a ``date`` column with a
      datetime64 dtype (False when the column is absent).
    * ``price_is_numeric`` -- ``reloaded`` has a ``price`` column with a
      numeric dtype (False when the column is absent).
    """
    def _column_passes(name, dtype_check):
        # A missing column counts as a failed check rather than an error.
        if name not in reloaded.columns:
            return False
        return dtype_check(reloaded[name])

    same_shape = original.shape == reloaded.shape
    date_ok = _column_passes('date', pd.api.types.is_datetime64_any_dtype)
    price_ok = _column_passes('price', pd.api.types.is_numeric_dtype)
    return {
        'shape_equal': same_shape,
        'date_is_datetime': date_ok,
        'price_is_numeric': price_ok,
    }

# Reload the CSV from csv_path, asking pandas to parse the 'date' column
# back into datetimes.
df_csv = pd.read_csv(csv_path, parse_dates=['date'])
# Compare the reloaded frame against the in-memory original.
validate_loaded(df, df_csv)

{'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True}

In [8]:
# pq_path is None when the earlier Parquet write was skipped or failed.
if pq_path is not None:
    try:
        df_pq = pd.read_parquet(pq_path)
    except Exception as e:
        print('Parquet read failed:', e)
    else:
        # Print explicitly: an expression nested inside if/try is not echoed
        # as cell output, so the validation result would otherwise be lost.
        print(validate_loaded(df, df_pq))

## 4) Utilities (TODO)
- Implement `detect_format`, `write_df`, `read_df`.
- Route on the file suffix, create parent directories if needed, and raise friendly errors when a Parquet read or write fails (e.g. no engine installed).

In [10]:
import typing as t, pathlib

def detect_format(path: t.Union[str, pathlib.Path]):
    """Map a file path to a storage format name based on its ending.

    The comparison is case-insensitive. Returns ``'csv'`` for ``.csv`` and
    ``'parquet'`` for ``.parquet``, ``.pq`` or ``.parq``.

    Raises:
        ValueError: if the path ends with none of the known suffixes; the
            message contains the lower-cased path.
    """
    lowered = str(path).lower()
    # Checked in order; str.endswith accepts a tuple of candidate endings.
    suffix_table = (
        ('csv', ('.csv',)),
        ('parquet', ('.parquet', '.pq', '.parq')),
    )
    for fmt, endings in suffix_table:
        if lowered.endswith(endings):
            return fmt
    raise ValueError('Unsupported format: ' + lowered)

def write_df(df: pd.DataFrame, path: t.Union[str, pathlib.Path]):
    """Write *df* to *path* as CSV or Parquet, chosen from the file suffix.

    Parent directories are created when missing. CSV output omits the index.

    Returns:
        The destination as a ``pathlib.Path``.

    Raises:
        ValueError: from ``detect_format`` when the suffix is unsupported.
        RuntimeError: when the Parquet write fails. The message distinguishes
            a missing engine from any other failure, and the original
            exception is chained as the cause.
    """
    p = pathlib.Path(path)
    # Resolve the format first so an unsupported suffix fails before any
    # directory is created on disk.
    fmt = detect_format(p)
    p.parent.mkdir(parents=True, exist_ok=True)
    if fmt == 'csv':
        df.to_csv(p, index=False)
        return p
    try:
        df.to_parquet(p)
    except ImportError as e:
        # pandas raises ImportError when no usable Parquet engine is installed.
        raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
    except Exception as e:
        # Still RuntimeError so existing callers keep working, but with an
        # accurate message for non-engine failures.
        raise RuntimeError(f'Failed to write Parquet file {p}: {e}') from e
    return p

def read_df(path: t.Union[str, pathlib.Path]):
    """Read a DataFrame from *path*, chosen as CSV or Parquet by suffix.

    For CSV files the header is inspected first; when a ``date`` column is
    present it is parsed as datetimes, otherwise the file is read as-is.

    Raises:
        ValueError: from ``detect_format`` when the suffix is unsupported.
        RuntimeError: when the Parquet read fails. The message distinguishes
            a missing engine from any other failure (e.g. a missing file),
            and the original exception is chained as the cause.
    """
    p = pathlib.Path(path)
    fmt = detect_format(p)
    if fmt == 'csv':
        # nrows=0 reads only the header row, enough to see the column names.
        header = pd.read_csv(p, nrows=0).columns
        if 'date' in header:
            return pd.read_csv(p, parse_dates=['date'])
        return pd.read_csv(p)
    try:
        return pd.read_parquet(p)
    except ImportError as e:
        # pandas raises ImportError when no usable Parquet engine is installed.
        raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
    except Exception as e:
        # Still RuntimeError so existing callers keep working, but with an
        # accurate message for non-engine failures.
        raise RuntimeError(f'Failed to read Parquet file {p}: {e}') from e

# Demo: round-trip the sample frame through write_df / read_df.
p_csv = RAW / f"util_{ts()}.csv"
p_pq = PROC / f"util_{ts()}.parquet"

# CSV path: write, then read back (the preview itself is not displayed).
write_df(df, p_csv)
read_df(p_csv).head()

# Parquet path: the utilities raise RuntimeError on Parquet failures, so
# report and move on instead of aborting the cell.
try:
    write_df(df, p_pq)
    read_df(p_pq).head()
except RuntimeError as err:
    print('Skipping Parquet util demo:', err)

## 5) Documentation (TODO)
- Update README with a **Data Storage** section (folders, formats, env usage).
- Summarize validation checks and any assumptions.

In [11]:
# 1. Load environment variables and prepare directories
import os
from dotenv import load_dotenv
from pathlib import Path

# Load variables from a .env file (python-dotenv) before reading the
# environment below.
load_dotenv()

# Directory names stay plain strings; the environment variable wins over the
# relative default.
DATA_DIR_RAW = os.environ.get('DATA_DIR_RAW', 'data/raw')
DATA_DIR_PROCESSED = os.environ.get('DATA_DIR_PROCESSED', 'data/processed')

# Create both folders (and any missing parents); existing folders are fine.
for _dir_name in (DATA_DIR_RAW, DATA_DIR_PROCESSED):
    Path(_dir_name).mkdir(parents=True, exist_ok=True)

# 2. Create sample DataFrame and save as CSV and Parquet
import pandas as pd

# Three-row sample frame with one text, one integer and one float column.
df = pd.DataFrame(dict(
    name=['Alice', 'Bob', 'Charlie'],
    age=[25, 30, 35],
    score=[88.5, 92.0, 79.5],
))

# Fixed (non-timestamped) targets: CSV under raw, Parquet under processed.
csv_path, parquet_path = (
    os.path.join(DATA_DIR_RAW, 'sample.csv'),
    os.path.join(DATA_DIR_PROCESSED, 'sample.parquet'),
)

df.to_csv(csv_path, index=False)
# A missing Parquet engine is reported, not fatal.
try:
    df.to_parquet(parquet_path, index=False)
except ImportError:
    print('Parquet engine not installed. Please install pyarrow or fastparquet.')

# 3. Reload and validate
df_csv = pd.read_csv(csv_path)
# Best-effort Parquet reload: df_parquet becomes None on any failure, but
# the reason is reported so a missing engine or missing file is not hidden.
try:
    df_parquet = pd.read_parquet(parquet_path)
except Exception as exc:
    print('Parquet reload failed:', exc)
    df_parquet = None

def validate(df1, df2, key_cols):
    """Compare two DataFrames by shape and by the dtypes of *key_cols*.

    Args:
        df1: reference frame.
        df2: frame to compare, or None when it could not be loaded.
        key_cols: column names whose dtypes must agree.

    Returns:
        A human-readable summary string. When ``df2`` is None the string
        says validation was not possible. A key column missing from either
        frame counts as a dtype mismatch rather than raising ``KeyError``.
    """
    if df2 is None:
        return 'Parquet not loaded, cannot validate.'
    shape_match = df1.shape == df2.shape
    dtype_match = all(
        c in df1.columns and c in df2.columns and df1[c].dtype == df2[c].dtype
        for c in key_cols
    )
    return f'Shape match: {shape_match}, Key columns dtype match: {dtype_match}'

# Report the CSV-vs-Parquet comparison, using 'name' and 'age' as key columns.
print(validate(df_csv, df_parquet, ['name', 'age']))

# 4. Utility functions
def write_df(df, path):
    """Write *df* to *path* as CSV or Parquet, chosen from the file suffix.

    The suffix is matched case-insensitively and the index is not written.
    Parent directories are created only after the suffix has been accepted,
    so an unsupported path leaves no directories behind.

    Returns:
        The destination as a ``Path`` on success, or None when the Parquet
        engine is missing (a hint is printed in that case).

    Raises:
        ValueError: if the suffix is neither ``.csv`` nor ``.parquet``.
    """
    target = Path(path)
    ext = target.suffix.lower()
    if ext not in ('.csv', '.parquet'):
        raise ValueError('Only .csv or .parquet suffix supported.')
    target.parent.mkdir(parents=True, exist_ok=True)
    if ext == '.csv':
        df.to_csv(path, index=False)
        return target
    try:
        df.to_parquet(path, index=False)
    except ImportError:
        print('Parquet engine not installed. Please install pyarrow or fastparquet.')
        return None
    return target

def read_df(path):
    """Load a DataFrame from a ``.csv`` or ``.parquet`` file.

    The suffix is matched case-insensitively. When the Parquet engine is
    missing, a hint is printed and None is returned.

    Raises:
        ValueError: if the suffix is neither ``.csv`` nor ``.parquet``.
    """
    suffix = Path(path).suffix.lower()
    # Guard clause: reject unknown suffixes before touching the file.
    if suffix not in ('.csv', '.parquet'):
        raise ValueError('Only .csv or .parquet suffix supported.')
    if suffix == '.csv':
        return pd.read_csv(path)
    try:
        frame = pd.read_parquet(path)
    except ImportError:
        print('Parquet engine not installed. Please install pyarrow or fastparquet.')
        frame = None
    return frame

Shape match: True, Key columns dtype match: True
