In [1]:
# Environment check: report the interpreter version and OS this notebook is running on
import platform
import sys

print(f"Python: {sys.version}")
print(f"Platform: {platform.platform()}")

Python: 3.13.7 (tags/v3.13.7:bcee1c3, Aug 14 2025, 14:15:11) [MSC v.1944 64 bit (AMD64)]
Platform: Windows-11-10.0.26200-SP0


In [11]:
#Imports & paths (annotated)
import pandas as pd             # Core data handling: read, clean, and transform CSVs
import numpy as np              # Numerical operations, NaN handling, and unit conversions
import re                       # Regular expressions for parsing messy strings (coords, units)
from pathlib import Path         # Cross-platform file path handling for locating input/output files
from datetime import datetime    # Working with timestamps and generating sample date ranges
from dateutil import parser as dateparser  # Flexible parsing of inconsistent date formats

# Input locations: both raw CSVs are expected inside RAW_DIR (relative to the working directory)
RAW_DIR = Path("../data")
PATH_STATIONS = RAW_DIR.joinpath("raw_station_metadata.csv")
PATH_SAMPLES = RAW_DIR.joinpath("raw_water_samples.csv")

# Display tuning: show up to 100 columns and use a 140-character output width
pd.options.display.max_columns = 100
pd.options.display.width = 140

In [None]:
#Inspect Raw Structure (diagnose junk headers, footers, columns)

#Read just a small sample from the start and end of each CSV to preview structure
def quick_peek_csv(path, n=5):
    """Print the first and last ``n`` raw lines of a text file to spot header/footer artifacts.

    Parameters
    ----------
    path : str or pathlib.Path
        File to inspect. Plain strings are accepted and converted to ``Path``.
    n : int, default 5
        Number of lines to show from each end, e.g. ``quick_peek_csv(PATH_STATIONS, n=10)``.
        ``n=0`` prints only the section banners.

    Raises
    ------
    ValueError
        If ``n`` is negative.

    Notes
    -----
    If the file has fewer than ``2 * n`` lines, the two sections overlap.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    path = Path(path)  # allow callers to pass a plain string path
    print(f"\n==={path.name} ===")

    # errors="ignore" drops bytes that are not valid UTF-8 instead of raising,
    # so a stray byte in a messy export does not abort the preview.
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        lines = f.readlines()

    print("\n--- FIRST FEW LINES ---")
    for line in lines[:n]:
        print(line.strip())

    print("\n--- LAST FEW LINES ---")
    # lines[-0:] is the *whole* list, so n == 0 has to be handled explicitly.
    tail = lines[-n:] if n > 0 else []
    for line in tail:
        print(line.strip())

# Preview both raw inputs, stations first and then samples
for csv_path in (PATH_STATIONS, PATH_SAMPLES):
    quick_peek_csv(csv_path)

# TODO: pick up here -- the cells below this point are still work in progress.


===raw_station_metadata.csv ===

--- FIRST FEW LINES ---
THIS IS A HEADER,foo, Station ID ,SiteName,Latitude,Longitude,Coordinates,Elev,Agency_Code,CRS,Notes
StationMetadata v0.1,bar,,,,,,,,,
,,S001,River Site 1,37.872701,-78.532764,,246.9 m,City Lab,,
,,S002,River Site 2,40.753572,-77.566062,,76.5 m,EPA,,Near bridge
,,S003,River Site 3,,,"39.65997,-75.801949",35.4 m,USGS,EPSG:4326,Upstream

--- LAST FEW LINES ---
,,S012,River Site 12,40.849549,-78.40261,,203.1 m,EPA,,Downstream
,,S013,River Site 13  ,"40°9'43""N","75°53'10""S",,23.4 m,City Lab,EPSG:4269,
,,S014,River Site 14,"37°3'42""N","75°15'38""S",,364.6 m,DWR,WGS84,
,,S015,River Site 15,36.909125,-79.628397,,110.9 m,USGS,EPSG:4326,Near bridge
,,S004,River Site 4,"38°59'35""N","76°32'39""S",,380.1 m,USGS,,Downstream

===raw_water_samples.csv ===

--- FIRST FEW LINES ---
Sample ID,StationCode,Sample Date,Analyte,Result,Units,Method ID,DetectLimit,Temp,Remarks,footer
WQ1000,S014,2025-10-20,Chloride,2698.40,ug/L,SM 4500-NO3,0.07,66.

In [None]:
#Robust Load (handle junk header/footer rows, everything as string)

def read_raw(path):
    """Load a raw CSV with every column as text and strip known junk columns/rows.

    Parameters
    ----------
    path : str or pathlib.Path
        CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents with:
        * all values kept as strings (no type inference),
        * only ``""`` and ``" "`` treated as missing,
        * columns whose name starts with ``"Unnamed"`` removed,
        * rows containing ``"End of File"`` or ``"StationMetadata v0.1"``
          (case-insensitive, in any column) removed,
        * the index renumbered from 0 after the junk rows are dropped.
    """
    # Do not apply pandas' default NA tokens (e.g. "NA", "null") because some may be
    # legitimate data; only "" and " " are treated as missing.
    df = pd.read_csv(path, dtype=str, keep_default_na=False, na_values=["", " "])

    # Drop the auto-named columns pandas creates for blank header cells.
    keep_cols = [c for c in df.columns if not str(c).startswith("Unnamed")]
    df = df.loc[:, keep_cols]
    if df.empty:
        # Nothing to scan for junk rows (header-only file or no usable columns).
        return df

    # Flag a row as junk when any cell contains one of the marker strings.
    # The dot is escaped so "v0.1" is matched literally.
    junk_pattern = r"End of File|StationMetadata v0\.1"
    is_junk = pd.Series(False, index=df.index, dtype=bool)
    for _, column in df.items():
        is_junk |= column.astype(str).str.contains(
            junk_pattern, case=False, regex=True, na=False
        )

    return df.loc[~is_junk].reset_index(drop=True)

In [None]:
# Cell 8 — Optional: PostGIS connection imports (annotated)

from sqlalchemy import create_engine, text  
# → SQLAlchemy is the core interface to databases.
#   `create_engine()` builds an Engine (a connection factory) from a PostgreSQL/PostGIS connection URL.
#   `text()` wraps a raw SQL string so it can be executed (e.g., creating schemas, geometry columns).

import psycopg2  
# → PostgreSQL driver used under the hood by SQLAlchemy to communicate with the PostGIS container.
#   Required for executing SQL and writing DataFrames to PostGIS tables.

# (Optional, for geospatial validation later)
# from shapely.geometry import Point  
# → If you later want to create geometry objects or verify coordinates before sending to PostGIS.

# from geopandas import GeoDataFrame  
# → Optional upgrade of pandas DataFrames into GeoDataFrames for spatial operations before export.
