# Normalize `y` by Watershed Area

Calculate normalized values of the outcome y-columns `ft` and `m3` by dividing each by the watershed (drainage) area of its gage, with the area expressed in each of three units (`m`, `km`, `miles`).

In [1]:

import os
import sys

import pandas as pd

sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "code"))
from usgs_drainage_areas import USGSDrainageArea
from utils import get_usgs_site_info


# The "data" folder sits next to the notebook's working directory, i.e. it is
# a child of the working directory's parent.
_PARENT_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(_PARENT_DIR, "data")

In [2]:
# Build a gage ID -> USGSDrainageArea lookup, constructing one object from
# each gage's basin GeoJSON file under DATA_DIR/usgs_basins:
GAGES = [
    "11402000",
    "11318500",
    "11266500",
    "11208000",
    "11202710",
    "11185500",
    "11189500",
]

drainage_areas = {
    gage: USGSDrainageArea(
        os.path.join(DATA_DIR, "usgs_basins", f"{gage}.geojson")
    )
    for gage in GAGES
}

In [3]:
# Load the y-variable CSV.
# Gage IDs are identifiers rather than quantities, so the "gage" column is
# parsed as text at read time. Letting pandas infer a numeric dtype and casting
# to str afterwards would strip any leading zeros, and such IDs would then fail
# to match the string keys used for the drainage-area lookup.
fp = os.path.join(DATA_DIR, "streamgage-full.csv")
df = pd.read_csv(fp, encoding="utf-8", dtype={"gage": str})

In [4]:
# Pull the area out of every drainage-area object once per unit, giving three
# gage ID -> area lookups (attributes area_m / area_km / area_miles).
# NOTE(review): presumably these are squared units (m^2, km^2, mi^2) -- confirm
# against USGSDrainageArea.
areas_m = {
    gage: basin.area_m for gage, basin in drainage_areas.items()
}
areas_km = {
    gage: basin.area_km for gage, basin in drainage_areas.items()
}
areas_miles = {
    gage: basin.area_miles for gage, basin in drainage_areas.items()
}

In [5]:
# Add the normalized columns to the dataframe.
# For every outcome column and every area unit, attach the gage's area as
# "area_<unit>" and the outcome divided by that area as
# "<outcome>_per_area_<unit>". Rows whose gage is absent from a lookup get NaN.
unit_lookups = (
    ("m", areas_m),
    ("km", areas_km),
    ("miles", areas_miles),
)
for y_col in ("ft", "m3"):
    for unit, area_by_gage in unit_lookups:
        label = f"area_{unit}"
        df[label] = df["gage"].map(area_by_gage)
        df[f"{y_col}_per_{label}"] = df[y_col].div(df[label])

In [6]:
# Count the missing values in each column (summing the boolean mask directly
# gives integer counts):
df.isna().sum()

gage                    0
time                    0
ft                   3709
m3                   3653
area_m                  0
ft_per_area_m        3709
area_km                 0
ft_per_area_km       3709
area_miles              0
ft_per_area_miles    3709
m3_per_area_m        3653
m3_per_area_km       3653
m3_per_area_miles    3653
dtype: int64

In [7]:
# Write the augmented dataframe back out without the index column.
# `fp` is still the path the CSV was loaded from, so the input file is
# overwritten in place.
df.to_csv(path_or_buf=fp, index=False, encoding="utf-8")