In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [None]:
from snowflake.snowpark.context import get_active_session
# Session handle used later in the notebook to read the training-feature table.
# NOTE(review): presumably this only resolves inside a Snowflake-hosted
# notebook/runtime that already has an active session — confirm.
session = get_active_session()

In [None]:
# Name of the column holding each sample's date.
date_col = "sample_date"

# Target column name -> display label used in plot titles and axis labels.
targets = dict(
    total_alkalinity="Total Alkalinity",
    electrical_conductance="Electrical Conductance",
    dissolved_reactive_phosphorus="DRP",
)

# Pull the feature table into pandas and lower-case every column name so the
# rest of the notebook can refer to columns with lower-case identifiers.
df = (
    session.table("WATER_QUALITY.FEATURES.TRAINING_FEATURES")
    .to_pandas()
    .rename(columns=str.lower)
)

# Parse the date column into pandas datetimes (needed for sorting and `.dt`).
df[date_col] = pd.to_datetime(df[date_col])

print(df.shape)
df.head()

In [None]:
# Summary statistics for the loaded frame (pandas' default describe() output).
df.describe()

In [None]:
# One 50-bin histogram per target, drawn on its own figure with missing
# values removed before binning.
for col, label in targets.items():
    fig, ax = plt.subplots()
    observed = df[col].dropna()
    observed.hist(bins=50, ax=ax)
    ax.set_title(label)
    ax.set_xlabel(label)
    ax.set_ylabel("Count")
    plt.show()

In [None]:
# Plot each target against the sample date.
# The chronological ordering does not depend on the target, so sort once
# instead of once per loop iteration.
df_by_date = df.sort_values(date_col)

for col, label in targets.items():
    # DataFrame.plot() opens a brand-new figure unless it is handed an Axes.
    # Creating the Axes explicitly and passing it in makes the 10x4 size apply
    # to the plotted figure and avoids leaving an empty figure behind.
    fig, ax = plt.subplots(figsize=(10, 4))
    df_by_date.plot(x=date_col, y=col, alpha=0.3, legend=False, ax=ax)
    ax.set_title(f"{label} over time")
    ax.set_ylabel(label)
    plt.show()

In [None]:
# Columns that are not candidate features: the targets themselves, the site
# identifier, and the date column. The date column is referenced through
# `date_col` so this list stays correct if that setting is ever changed.
non_feature_cols = set(targets) | {"site_id", date_col}

feature_cols = [c for c in df.columns if c not in non_feature_cols]

# Fraction of missing values per feature, highest first.
null_rates = df[feature_cols].isna().mean().sort_values(ascending=False)

# Show the 20 features with the most missing data.
null_rates.head(20)

In [None]:
# For every target, keep the 10 features with the largest absolute
# correlation against it.
#
# The correlation matrix is computed once for all targets: each entry is
# calculated from that pair of columns alone, so the feature-vs-target values
# are the same as when the matrix is rebuilt per target.
#
# numeric_only=True (pandas >= 1.5) skips non-numeric columns; without it,
# pandas >= 2.0 raises as soon as a string/datetime feature is present.
target_cols = list(targets)
corr_matrix = df[feature_cols + target_cols].corr(numeric_only=True)

corrs = {
    t: (
        corr_matrix[t]
        .drop(target_cols)  # leave only feature rows
        .sort_values(key=lambda s: s.abs(), ascending=False)
        .head(10)
    )
    for t in target_cols
}

corrs

In [None]:
# Scatter the three features ranked highest in the DRP correlation table
# against the DRP measurements themselves.
drp_col = "dissolved_reactive_phosphorus"
top_feats = corrs[drp_col].index[:3].tolist()

for feat in top_feats:
    fig, ax = plt.subplots()
    ax.scatter(df[feat], df[drp_col], alpha=0.2)
    ax.set_xlabel(feat)
    ax.set_ylabel("Dissolved Reactive Phosphorus")
    ax.set_title(f"{feat} vs Dissolved Reactive Phosphorus")
    plt.show()

In [None]:
# Month number of each sample, stored on the frame so it can be used as the
# grouping key below.
sample_dates = df[date_col]
df["month"] = sample_dates.dt.month

# One month-grouped box plot per target.
for col, label in targets.items():
    df.boxplot(column=col, by="month", figsize=(10, 4))
    ax = plt.gca()
    ax.set_title(f"{label} by Month")
    ax.set_xlabel("Month")
    ax.set_ylabel(label)
    # Blank out the figure-level heading so only the axes title remains.
    plt.gcf().suptitle("")
    plt.show()

In [None]:
def plot_target_map(target, title):
    """Scatter every sample at its geographic position, coloured by a target.

    Reads the notebook-level ``df`` and uses its ``longitude``, ``latitude``
    and ``target`` columns.

    Args:
        target: Name of the ``df`` column whose values set the marker colour.
        title: Label used for the colour bar and as the prefix of the title.
    """
    plt.figure(figsize=(8, 6))
    # Longitude goes on x and latitude on y so the data matches the axis
    # labels below (the two columns were previously passed the other way
    # round, which transposed the map relative to its labels).
    sc = plt.scatter(
        df["longitude"],
        df["latitude"],
        c=df[target],
        cmap="viridis",
        s=12,
        alpha=0.7,
    )
    plt.colorbar(sc, label=title)
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    # \u2014 is an em dash; written as an escape so the title cannot be
    # garbled again by a file re-encoding.
    plt.title(f"{title} \u2014 spatial distribution")
    plt.show()

# Draw one spatial map per target, labelled with its display name.
for target_col in targets:
    plot_target_map(target_col, targets[target_col])