##1. Save in Two Formats

In [43]:
import os, pathlib, datetime as dt
import pandas as pd
from dotenv import load_dotenv

load_dotenv()

RAW = pathlib.Path(os.getenv("DATA_DIR_RAW", "data/raw"))
PROC = pathlib.Path(os.getenv("DATA_DIR_PROCESSED", "data/processed"))

RAW.mkdir(parents=True, exist_ok=True)
PROC.mkdir(parents=True, exist_ok=True)

In [44]:
import numpy as np

dates = pd.date_range('2024-01-01', periods=20, freq='D')
df = pd.DataFrame({
    'date': dates,
    'ticker': ['AAPL'] * 20,
    'price': 150 + np.random.randn(20).cumsum()
})
df.head()

Unnamed: 0,date,ticker,price
0,2024-01-01,AAPL,149.08961
1,2024-01-02,AAPL,147.613039
2,2024-01-03,AAPL,149.218474
3,2024-01-04,AAPL,148.996356
4,2024-01-05,AAPL,148.48311


In [45]:
def ts():
    return dt.datetime.now().strftime('%Y%m%d-%H%M%S')

csv_path = RAW / f"sample_{ts()}.csv"
parquet_path = PROC / f"sample_{ts()}.parquet"

In [47]:
# TODO: Save CSV
csv_path = RAW / f"sample_{ts()}.csv"
df.to_csv(csv_path, index=False)
csv_path

# TODO: Save Parquet
pq_path = RAW / f"sample_{ts()}.parquet"
try:
    df.to_parquet(pq_path, engine='fastparquet')
except Exception as e:
    print('Parquet engine not available. Install pyarrow or fastparquet to complete this step.')
    pq_path = None
pq_path

WindowsPath('data/raw/sample_20250817-224638.parquet')

In [48]:
import sys
print(sys.executable)

c:\Users\DELL\bootcamp_Nancy_Wang\homework\homework2\env\Scripts\python.exe


##2. Reload and Validate

In [49]:
def validate_loaded(original, reloaded):
    checks = {
        'shape_equal': original.shape == reloaded.shape,
        'date_is_datetime': pd.api.types.is_datetime64_any_dtype(reloaded['date']) if 'date' in reloaded.columns else False,
        'price_is_numeric': pd.api.types.is_numeric_dtype(reloaded['price']) if 'price' in reloaded.columns else False,
    }
    return checks

df_csv = pd.read_csv(csv_path, parse_dates=['date'])
validate_loaded(df, df_csv)

{'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True}

In [50]:
if pq_path:
    try:
        df_pq = pd.read_parquet(pq_path, engine='fastparquet')
        validate_loaded(df, df_pq)
    except Exception as e:
        print('Parquet read failed:', e)
print(validate_loaded(df, df_pq))        

{'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True}


##3. Refactor to Utilities

In [53]:
import typing as t, pathlib

def detect_format(path: t.Union[str, pathlib.Path]):
    s = str(path).lower()
    if s.endswith('.csv'): return 'csv'
    if s.endswith('.parquet') or s.endswith('.pq') or s.endswith('.parq'): return 'parquet'
    raise ValueError('Unsupported format: ' + s)

def write_df(df: pd.DataFrame, path: t.Union[str, pathlib.Path]):
    p = pathlib.Path(path); p.parent.mkdir(parents=True, exist_ok=True)
    fmt = detect_format(p)
    if fmt == 'csv':
        df.to_csv(p, index=False)
    else:
        try:
            df.to_parquet(p, engine='fastparquet')
        except Exception as e:
            raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
    return p

def read_df(path: t.Union[str, pathlib.Path]):
    p = pathlib.Path(path)
    fmt = detect_format(p)
    if fmt == 'csv':
        return pd.read_csv(p, parse_dates=['date']) if 'date' in pd.read_csv(p, nrows=0).columns else pd.read_csv(p)
    else:
        try:
            return pd.read_parquet(p, engine='fastparquet')
        except Exception as e:
            raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e

# Demo
p_csv = PROC / f"util_{ts()}.csv"
p_pq  = PROC / f"util_{ts()}.parquet"
write_df(df, p_csv); read_df(p_csv).head()
try:
    write_df(df, p_pq)
    read_df(p_pq).head()
except RuntimeError as e:
    print('Skipping Parquet util demo:', e)