In [1]:
import os, pathlib, datetime as dt
import pandas as pd
import numpy as np
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Set RAW and PROCESSED directories from .env, with defaults
RAW = pathlib.Path(os.getenv('DATA_DIR_RAW', 'data/raw'))
PROC = pathlib.Path(os.getenv('DATA_DIR_PROCESSED', 'data/processed'))

# Make sure directories exist
RAW.mkdir(parents=True, exist_ok=True)
PROC.mkdir(parents=True, exist_ok=True)

print('RAW ->', RAW.resolve())
print('PROC ->', PROC.resolve())

# Create sample DataFrame
dates = pd.date_range('2024-01-01', periods=20, freq='D')
df = pd.DataFrame({
    'date': dates,
    'ticker': ['AAPL']*20,
    'price': 150 + np.random.randn(20).cumsum()
})

df.head()


RAW -> /Users/mayurakshi/bootcamp_mayurakshi_biswas/homework/homework5/notebooks/data/raw
PROC -> /Users/mayurakshi/bootcamp_mayurakshi_biswas/homework/homework5/notebooks/data/processed


Unnamed: 0,date,ticker,price
0,2024-01-01,AAPL,149.319545
1,2024-01-02,AAPL,149.897366
2,2024-01-03,AAPL,149.833188
3,2024-01-04,AAPL,149.508149
4,2024-01-05,AAPL,151.411527


In [2]:
# Helper function for timestamp
def ts():
    return dt.datetime.now().strftime('%Y%m%d-%H%M%S')

# File paths
csv_path = RAW / f"sample_{ts()}.csv"
pq_path  = PROC / f"sample_{ts()}.parquet"

# Save CSV and Parquet
df.to_csv(csv_path, index=False)
try:
    df.to_parquet(pq_path)
except Exception as e:
    print('Parquet save failed:', e)

print("CSV saved ->", csv_path)
print("Parquet saved ->", pq_path)


Parquet save failed: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.
CSV saved -> data/raw/sample_20250820-235130.csv
Parquet saved -> data/processed/sample_20250820-235130.parquet


In [3]:
# Reload CSV
df_csv = pd.read_csv(csv_path, parse_dates=['date'])

# Validation function
def validate_loaded(original, reloaded):
    checks = {
        'shape_equal': original.shape == reloaded.shape,
        'date_is_datetime': pd.api.types.is_datetime64_any_dtype(reloaded['date']) if 'date' in reloaded.columns else False,
        'price_is_numeric': pd.api.types.is_numeric_dtype(reloaded['price']) if 'price' in reloaded.columns else False,
    }
    return checks

# Validate CSV
csv_validation = validate_loaded(df, df_csv)
print("CSV Validation:", csv_validation)

# Optionally, try Parquet if engine available
try:
    df_pq = pd.read_parquet(pq_path)
    pq_validation = validate_loaded(df, df_pq)
    print("Parquet Validation:", pq_validation)
except Exception as e:
    print("Parquet read skipped:", e)


CSV Validation: {'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True}
Parquet read skipped: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.


In [5]:
import typing as t, pathlib

# Detect format by file extension
def detect_format(path: t.Union[str, pathlib.Path]):
    s = str(path).lower()
    if s.endswith('.csv'): return 'csv'
    if s.endswith('.parquet') or s.endswith('.pq') or s.endswith('.parq'): return 'parquet'
    raise ValueError('Unsupported format: ' + s)

# Write DataFrame to CSV or Parquet
def write_df(df: pd.DataFrame, path: t.Union[str, pathlib.Path]):
    p = pathlib.Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)
    fmt = detect_format(p)
    if fmt == 'csv':
        df.to_csv(p, index=False)
    else:
        try:
            df.to_parquet(p)
        except Exception as e:
            raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
    print(f"{fmt.upper()} saved -> {p}")
    return p

# Read DataFrame from CSV or Parquet
def read_df(path: t.Union[str, pathlib.Path]):
    p = pathlib.Path(path)
    fmt = detect_format(p)
    if fmt == 'csv':
        return pd.read_csv(p, parse_dates=['date']) if 'date' in pd.read_csv(p, nrows=0).columns else pd.read_csv(p)
    else:
        try:
            return pd.read_parquet(p)
        except Exception as e:
            raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e


In [6]:
# Demo usage of the utilities
p_csv = RAW / f"util_{dt.datetime.now().strftime('%Y%m%d-%H%M%S')}.csv"
p_pq  = PROC / f"util_{dt.datetime.now().strftime('%Y%m%d-%H%M%S')}.parquet"

# Write and read CSV
write_df(df, p_csv)
df_csv_loaded = read_df(p_csv)
df_csv_loaded.head()

# Write and read Parquet (will fail if pyarrow/fastparquet missing)
try:
    write_df(df, p_pq)
    df_pq_loaded = read_df(p_pq)
    df_pq_loaded.head()
except RuntimeError as e:
    print('Skipping Parquet demo:', e)


CSV saved -> data/raw/util_20250820-235421.csv
Skipping Parquet demo: Parquet engine not available. Install pyarrow or fastparquet.


In [7]:
# Validation of CSV reload
validation_result = validate_loaded(df, df_csv_loaded)
print("CSV Validation:", validation_result)


CSV Validation: {'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True}


In [8]:
import typing as t, pathlib

def detect_format(path: t.Union[str, pathlib.Path]):
    s = str(path).lower()
    if s.endswith('.csv'): return 'csv'
    if s.endswith('.parquet') or s.endswith('.pq') or s.endswith('.parq'): return 'parquet'
    raise ValueError('Unsupported format: ' + s)

def write_df(df: pd.DataFrame, path: t.Union[str, pathlib.Path]):
    p = pathlib.Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)
    fmt = detect_format(p)
    if fmt == 'csv':
        df.to_csv(p, index=False)
    else:
        try:
            df.to_parquet(p)
        except Exception as e:
            raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
    return p

def read_df(path: t.Union[str, pathlib.Path]):
    p = pathlib.Path(path)
    fmt = detect_format(p)
    if fmt == 'csv':
        df0 = pd.read_csv(p, nrows=0)
        parse_dates = ['date'] if 'date' in df0.columns else None
        return pd.read_csv(p, parse_dates=parse_dates)
    else:
        try:
            return pd.read_parquet(p)
        except Exception as e:
            raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e


In [9]:
# Demo using utilities
p_csv = RAW / f"util_{dt.datetime.now().strftime('%Y%m%d-%H%M%S')}.csv"
p_pq  = PROC / f"util_{dt.datetime.now().strftime('%Y%m%d-%H%M%S')}.parquet"

# Write CSV and Parquet
write_df(df, p_csv)
try:
    write_df(df, p_pq)
except RuntimeError as e:
    print('Skipping Parquet util demo:', e)

# Read back CSV (Parquet read skipped if engine missing)
df_csv_loaded = read_df(p_csv)
print('CSV Validation:', validate_loaded(df, df_csv_loaded))

try:
    df_pq_loaded = read_df(p_pq)
    print('Parquet Validation:', validate_loaded(df, df_pq_loaded))
except RuntimeError as e:
    print('Parquet read skipped:', e)


Skipping Parquet util demo: Parquet engine not available. Install pyarrow or fastparquet.
CSV Validation: {'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True}
Parquet read skipped: Parquet engine not available. Install pyarrow or fastparquet.
