# Homework — Stage 05: Data Storage
Name: Joshua Zhong
Date: 08/19/25

Objectives:
- Env-driven paths to `data/raw/` and `data/processed/`
- Save CSV and Parquet; reload and validate
- Abstract IO with utility functions; document choices

In [1]:
import os, pathlib, datetime as dt
import pandas as pd
from dotenv import load_dotenv

# Populate the process environment from a .env file (python-dotenv) before
# the DATA_DIR_* settings are read.
load_dotenv()

# Storage roots: the environment value wins, otherwise use the relative default.
RAW = pathlib.Path(os.environ.get('DATA_DIR_RAW', '../data/raw'))
PROC = pathlib.Path(os.environ.get('DATA_DIR_PROCESSED', '../data/processed'))

# Create both folders (and any missing parents); existing folders are left alone.
for _folder in (RAW, PROC):
    _folder.mkdir(parents=True, exist_ok=True)

# Show the absolute locations the two roots resolve to.
for _label, _folder in (('RAW ->', RAW), ('PROC ->', PROC)):
    print(_label, _folder.resolve())

RAW -> /Users/joshz/bootcamp_Joshua_Zhong/project/data/raw
PROC -> /Users/joshz/bootcamp_Joshua_Zhong/project/data/processed


## 1) Create or Load a Sample DataFrame
A small synthetic dataset (20 daily AAPL prices generated as a random walk starting near 150) is created for this homework.

In [2]:
import numpy as np
# Twenty consecutive calendar days starting 2024-01-01.
dates = pd.date_range(start='2024-01-01', periods=20, freq='D')

# Single-ticker frame whose price is a random walk: the cumulative sum of
# standard-normal steps, offset by 150.
steps = np.random.randn(len(dates))
df = pd.DataFrame(
    {
        'date': dates,
        'ticker': ['AAPL'] * len(dates),
        'price': 150 + steps.cumsum(),
    }
)
df.head()

Unnamed: 0,date,ticker,price
0,2024-01-01,AAPL,149.25581
1,2024-01-02,AAPL,149.744967
2,2024-01-03,AAPL,149.288323
3,2024-01-04,AAPL,150.408315
4,2024-01-05,AAPL,151.353438


## 2) Save CSV to data/raw/ and Parquet to data/processed/
- Filenames are timestamped so that each run writes new files instead of overwriting earlier ones.
- Handle missing Parquet engine gracefully.

In [3]:
def ts():
    """Return the current local time as a ``YYYYMMDD-HHMMSS`` string for filenames."""
    now = dt.datetime.now()
    return f"{now:%Y%m%d-%H%M%S}"

# Use one timestamp for both artifacts so the CSV and Parquet files written by
# the same run share a suffix (two separate ts() calls can straddle a second).
stamp = ts()

# Save CSV
csv_path = RAW / f"sample_{stamp}.csv"
df.to_csv(csv_path, index=False)

# Save Parquet
pq_path = PROC / f"sample_{stamp}.parquet"
try:
    df.to_parquet(pq_path, engine="fastparquet")
except ImportError as e:
    # pandas raises ImportError when the requested Parquet engine is missing.
    print('Parquet engine not available, error:', e)
    print("Download fastparquet by uncommenting and running below code")
    pq_path = None
except Exception as e:
    # Stay best-effort for any other failure, but do not blame the engine.
    print('Parquet write failed, error:', e)
    pq_path = None
# pq_path is None from here on whenever no Parquet file was written.
pq_path

PosixPath('../data/processed/sample_20250820-104920.parquet')

In [8]:
# !pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (4.2 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.11.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (5.6 kB)
Collecting fsspec (from fastparquet)
  Downloading fsspec-2025.7.0-py3-none-any.whl.metadata (12 kB)
Downloading fastparquet-2024.11.0-cp311-cp311-macosx_11_0_arm64.whl (683 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m683.8/683.8 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cramjam-2.11.0-cp311-cp311-macosx_11_0_arm64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.7.0-py3-none-any.whl (199 kB)
Installing collected packages: fsspec, cramjam, fastparquet
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [fastparquet][0m [fastparquet]
[1A[2KSuccessfully installed cramjam-2.11.0 fastp

## 3) Reload and Validate
- Checked that the shapes are equal, that `date` has a datetime64 dtype, and that `price` is numeric.

In [4]:
def validate_loaded(original, reloaded):
    """Run the round-trip checks on a reloaded frame.

    Returns a dict of booleans keyed by check name:
    ``shape_equal`` (both frames have the same shape), ``date_is_datetime``
    (``reloaded['date']`` has a datetime64 dtype) and ``price_is_numeric``
    (``reloaded['price']`` has a numeric dtype). A column check is ``False``
    when that column is absent from ``reloaded``.
    """
    has_date = 'date' in reloaded.columns
    has_price = 'price' in reloaded.columns

    same_shape = original.shape == reloaded.shape
    # Short-circuit so a missing column is never indexed.
    date_ok = has_date and pd.api.types.is_datetime64_any_dtype(reloaded['date'])
    price_ok = has_price and pd.api.types.is_numeric_dtype(reloaded['price'])

    return {
        'shape_equal': same_shape,
        'date_is_datetime': date_ok,
        'price_is_numeric': price_ok,
    }

# Reload the CSV written earlier; parse_dates asks pandas to convert the
# `date` column back to datetimes, since CSV stores it as plain text.
df_csv = pd.read_csv(csv_path, parse_dates=['date'])

def itemized_validation(df1, df2) -> None:
    """Print whether ``df2`` passes every round-trip check against ``df1``.

    Prints one success line when all checks returned by ``validate_loaded``
    are true; otherwise prints one line per failed check, naming the check.
    """
    # Evaluate the checks once and reuse the result for both the summary and
    # the per-check report.
    results = validate_loaded(df1, df2)
    if all(results.values()):
        print("Validation passed successfully!")
        return
    for check, passed in results.items():
        if not passed:
            print(check, " failed validation")


# Compare the in-memory frame with its CSV round-trip.
itemized_validation(df, df_csv)

Validation passed successfully!


In [5]:
# Only attempt the Parquet round-trip when the earlier write produced a file
# (pq_path is set to None when that write fails).
if pq_path:
    try:
        df_pq = pd.read_parquet(pq_path)
    except Exception as e:
        print('Parquet read failed:', e)
    else:
        # Validate outside the try so an error raised during validation is
        # not misreported as a read failure.
        itemized_validation(df, df_pq)

Validation passed successfully!


All validation checks passed for both the CSV and the Parquet round-trips.

## 4) Utilities
- Implement `detect_format`, `write_df`, `read_df`.
- Use suffix to route; create parent dirs if needed; friendly errors for Parquet.

As the demo output below shows, the data was successfully written to and read back from CSV files in data/raw/ and Parquet files in data/processed/.

In [11]:
import typing as t, pathlib

def detect_format(path: t.Union[str, pathlib.Path]):
    """Map a file path to ``'csv'`` or ``'parquet'`` based on how it ends.

    The comparison is case-insensitive. ``.csv`` maps to ``'csv'``;
    ``.parquet``, ``.pq`` and ``.parq`` map to ``'parquet'``.

    Raises:
        ValueError: if the path ends with none of the recognised suffixes.
    """
    lowered = str(path).lower()
    endings_by_format = {
        'csv': ('.csv',),
        'parquet': ('.parquet', '.pq', '.parq'),
    }
    for fmt, endings in endings_by_format.items():
        # str.endswith accepts a tuple and matches any of its members.
        if lowered.endswith(endings):
            return fmt
    raise ValueError('Unsupported format: ' + lowered)

def write_df(df: pd.DataFrame, path: t.Union[str, pathlib.Path]) -> pathlib.Path:
    """Write ``df`` to ``path`` as CSV or Parquet, chosen by the file suffix.

    Missing parent directories are created. CSV files are written without
    the index.

    Returns:
        The destination as a ``pathlib.Path``.

    Raises:
        ValueError: if ``detect_format`` does not recognise the suffix.
        RuntimeError: if the Parquet write fails, either because no Parquet
            engine is installed or for any other reason; the message says
            which.
    """
    p = pathlib.Path(path)
    # Validate the suffix first so an unsupported path creates no directories.
    fmt = detect_format(p)
    p.parent.mkdir(parents=True, exist_ok=True)

    if fmt == 'csv':
        df.to_csv(p, index=False)
        return p

    try:
        df.to_parquet(p)
    except ImportError as e:
        # pandas raises ImportError when no usable Parquet engine is installed.
        raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
    except Exception as e:
        # Still RuntimeError so existing callers keep working, but the message
        # no longer blames a missing engine for unrelated failures.
        raise RuntimeError(f'Failed to write Parquet file {p}: {e}') from e
    return p

def read_df(path: t.Union[str, pathlib.Path]) -> pd.DataFrame:
    """Load a DataFrame from a CSV or Parquet file, chosen by the file suffix.

    For CSV input the header row is read first; if it contains a ``date``
    column, that column is parsed as datetimes.

    Raises:
        ValueError: if ``detect_format`` does not recognise the suffix.
        RuntimeError: if the Parquet read fails, either because no Parquet
            engine is installed or for any other reason; the message says
            which.
    """
    p = pathlib.Path(path)
    fmt = detect_format(p)

    if fmt == 'csv':
        # Read only the header to decide whether date parsing applies.
        header = pd.read_csv(p, nrows=0).columns
        csv_kwargs = {'parse_dates': ['date']} if 'date' in header else {}
        return pd.read_csv(p, **csv_kwargs)

    try:
        return pd.read_parquet(p)
    except ImportError as e:
        # pandas raises ImportError when no usable Parquet engine is installed.
        raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
    except Exception as e:
        # Still RuntimeError so existing callers keep working, but a missing or
        # corrupt file is no longer reported as a missing engine.
        raise RuntimeError(f'Failed to read Parquet file {p}: {e}') from e

# Demo: round-trip the sample frame through both utilities.
p_csv = RAW / f"util_{ts()}.csv"
p_pq = PROC / f"util_{ts()}.parquet"
for _target in (p_csv, p_pq):
    print(_target)

# CSV round-trip: write_df returns the path it wrote, which read_df reloads.
print(read_df(write_df(df, p_csv)).head())

# Parquet round-trip; the utilities signal a Parquet failure with
# RuntimeError, in which case this part of the demo is skipped.
try:
    print(read_df(write_df(df, p_pq)).head())
except RuntimeError as e:
    print('Skipping Parquet util demo:', e)

../data/raw/util_20250820-110450.csv
../data/processed/util_20250820-110450.parquet
        date ticker       price
0 2024-01-01   AAPL  149.255810
1 2024-01-02   AAPL  149.744967
2 2024-01-03   AAPL  149.288323
3 2024-01-04   AAPL  150.408315
4 2024-01-05   AAPL  151.353438
        date ticker       price
0 2024-01-01   AAPL  149.255810
1 2024-01-02   AAPL  149.744967
2 2024-01-03   AAPL  149.288323
3 2024-01-04   AAPL  150.408315
4 2024-01-05   AAPL  151.353438


## 5) Documentation
- The README markdown was updated with a **Data Storage** section that includes the folders, formats, env usage.
- Validation passed for both formats: the `.csv` files in `data/raw/` and the `.parquet` files in `data/processed/` were reloaded with the expected shape and with the expected `date` and `price` dtypes.