In [1]:
# Sanity check: report which Python interpreter the kernel runs on and the pandas version in use
import sys

import pandas as pd

for label, detail in (("Python:", sys.executable), ("Pandas:", pd.__version__)):
    print(label, detail)


Python: c:\Users\chamb\env-data-analyst-oct\notebooks\.venv\Scripts\python.exe
Pandas: 2.3.3


In [2]:
# Read the sample water-quality CSV into df; the "date" column is parsed as datetimes on load
import pandas as pd

csv_path = "week1_task3_water_quality_sample.csv"
df = pd.read_csv(csv_path, parse_dates=["date"])

# Preview the first rows (last expression in the cell, so the notebook renders it)
df.head()


Unnamed: 0,site_id,date,nitrate_mgL,phosphate_mgL,turbidity_NTU,temp_C,county,state
0,WC-001,2025-09-28,2.3,0.12,3.1,14.5,Willow Creek,OR
1,WC-002,2025-09-28,,0.05,1.0,13.8,willow creek,OR
2,WC-003,2025-09-28,5.8,,8.9,12.4,Willow Creek,OR
3,WC-004,2025-09-29,0.8,0.03,0.6,,Willow Creek,OR
4,WC-005,2025-09-29,12.1,0.2,15.2,10.1,North Fork,OR


In [3]:
# ============================================
# Week 1 · Task 3 — Step 2: Missing-Values Diagnostics (READ-ONLY)
# Prerequisite: df has already been loaded with
#   df = pd.read_csv("week1_task3_water_quality_sample.csv", parse_dates=["date"])
# Nothing here modifies df; the diagnostics inform the cleanup strategy chosen next.
# ============================================

# 1) Overall size of the frame, then the dtype of every column
frame_shape = df.shape
print("Shape (rows, columns):", frame_shape)

# sep="\n" puts the heading and the dtype table on separate lines in a single print call
print("\nDtypes:", df.dtypes.to_string(), sep="\n")

Shape (rows, columns): (10, 8)

Dtypes:
site_id                  object
date             datetime64[ns]
nitrate_mgL             float64
phosphate_mgL           float64
turbidity_NTU           float64
temp_C                  float64
county                   object
state                    object


In [4]:
# Tally missing values in each column, then order the tally from most to fewest
missing_per_column = df.isna().sum()
na_counts = missing_per_column.sort_values(ascending=False)

# sep="\n" puts the heading and the table on separate lines in a single print call
print("\nMissing values per column:", na_counts.to_string(), sep="\n")


Missing values per column:
phosphate_mgL    2
nitrate_mgL      2
temp_C           1
turbidity_NTU    1
site_id          0
date             0
county           0
state            0


In [5]:
# Preview the rows that contain at least one missing value (at most 10 are shown).
# isna() flags every cell as True/False; any(axis=1) then collapses across the columns
# of each row, giving True when at least one cell in that row is missing
# (axis=0 would instead work down each column).
has_missing_value = df.isna().any(axis=1)
rows_with_na = df[has_missing_value]

print("\nRows with any missing values (show first 10):")
rows_with_na.head(10)


Rows with any missing values (show first 10):


Unnamed: 0,site_id,date,nitrate_mgL,phosphate_mgL,turbidity_NTU,temp_C,county,state
1,WC-002,2025-09-28,,0.05,1.0,13.8,willow creek,OR
2,WC-003,2025-09-28,5.8,,8.9,12.4,Willow Creek,OR
3,WC-004,2025-09-29,0.8,0.03,0.6,,Willow Creek,OR
5,WC-006,2025-09-30,,0.07,2.2,11.0,North Fork,OR
6,WC-007,2025-09-30,3.0,0.02,,9.7,South Fork,OR
9,WC-010,2025-10-02,0.5,,0.5,8.1,South Fork,OR


In [6]:
# ============================================
# Week 1 · Task 3 — Step 3: Handle Missing Values (WRITE changes)
# Prerequisite: df has already been loaded with
#   df = pd.read_csv("week1_task3_water_quality_sample.csv", parse_dates=["date"])
# Plan:
# - Numeric columns: replace NaN with that column's median
# - Text columns: standardize casing / fill simple placeholders
# ============================================

import numpy as np

# 1) Snapshot of the NaN count per column before anything is changed
nan_counts_before = df.isna().sum()
print("Before fill - NaNs per column:")
print(nan_counts_before.to_string(), "\n")

Before fill - NaNs per column:
site_id          0
date             0
nitrate_mgL      2
phosphate_mgL    2
turbidity_NTU    1
temp_C           1
county           0
state            0 



In [7]:
# 2) Median imputation for the numeric measurement columns
numeric_cols = ["nitrate_mgL", "phosphate_mgL", "turbidity_NTU", "temp_C"]

# Compute each listed column's median first (NaNs are ignored);
# any listed column that is not present in df is skipped.
column_medians = {
    name: df[name].median(skipna=True)
    for name in numeric_cols
    if name in df.columns
}

# Overwrite each column with its NaNs replaced by that column's median
for name, fill_value in column_medians.items():
    df[name] = df[name].fillna(fill_value)

In [8]:
# 3) Light cleanup for text columns
# county: trim surrounding whitespace and title-case (e.g. 'willow creek' -> 'Willow Creek').
# astype(str) alone would convert a missing value into the literal text 'nan'
# (then 'Nan' after title-casing), which hides it from isna(). The .where() call
# puts NaN back wherever the original value was missing, so gaps stay visible.
if "county" in df.columns:
    county_raw = df["county"]
    county_clean = county_raw.astype(str).str.strip().str.title()
    df["county"] = county_clean.where(county_raw.notna())

# state: fill missing values with the placeholder 'Unknown' (you can change this later),
# then coerce to str, trim whitespace and upper-case.
# Because upper-casing runs after the fill, the placeholder is stored as 'UNKNOWN'.
if "state" in df.columns:
    df["state"] = df["state"].fillna("Unknown").astype(str).str.strip().str.upper()

In [9]:
# 4) Re-count NaNs per column after the fill steps to confirm the result
nan_counts_after = df.isna().sum()
print("After fill - NaN per column:")
print(nan_counts_after.to_string())

After fill - NaN per column:
site_id          0
date             0
nitrate_mgL      0
phosphate_mgL    0
turbidity_NTU    0
temp_C           0
county           0
state            0


In [10]:
# Optional: show the first 8 rows of the cleaned frame as a spot check.
# Note this is simply the top of df, not a selection of only the rows that previously had NaNs.
print("\nSample of cleaned rows (first 8):")
df.iloc[:8]


Sample of cleaned rows (first 8):


Unnamed: 0,site_id,date,nitrate_mgL,phosphate_mgL,turbidity_NTU,temp_C,county,state
0,WC-001,2025-09-28,2.3,0.12,3.1,14.5,Willow Creek,OR
1,WC-002,2025-09-28,2.65,0.05,1.0,13.8,Willow Creek,OR
2,WC-003,2025-09-28,5.8,0.06,8.9,12.4,Willow Creek,OR
3,WC-004,2025-09-29,0.8,0.03,0.6,10.1,Willow Creek,OR
4,WC-005,2025-09-29,12.1,0.2,15.2,10.1,North Fork,OR
5,WC-006,2025-09-30,2.65,0.07,2.2,11.0,North Fork,OR
6,WC-007,2025-09-30,3.0,0.02,2.2,9.7,South Fork,OR
7,WC-008,2025-10-01,1.1,0.01,0.3,8.8,South Fork,OR


In [11]:
# ============================================
# Week 1 · Task 3 — Step 5: Filter Rows (READ-ONLY practice)
# Small, labeled subsets built purely to practice filtering logic.
# ============================================

# 1) Categorical filter: keep a single county
# The comparison yields one True/False per row; selecting with that mask keeps the True rows.
is_willow_creek = df["county"].eq("Willow Creek")
# .copy() makes df_willow an independent DataFrame, separate from df
df_willow = df.loc[is_willow_creek].copy()
# len() of a DataFrame is its number of rows
print("Rows in Willow Creek subset:", len(df_willow))

Rows in Willow Creek subset: 4


In [16]:
# 2) Numeric filter: high nitrate (> 5 mg/L)
# Step one builds a True/False mask (one entry per row); step two applies the mask
# to df and keeps only the rows marked True.
is_high_nitrate = df["nitrate_mgL"].gt(5)
df_nitrate_high = df.loc[is_high_nitrate].copy()
# len() reports how many rows matched; the subset itself holds the actual rows
print("Rows with nitrate > 5 mg/L:", len(df_nitrate_high))

Rows with nitrate > 5 mg/L: 3


In [17]:
# 3) Numeric filter: turbidity at or below 5 NTU
is_turbidity_ok = df["turbidity_NTU"].le(5)
df_turbidity_ok = df.loc[is_turbidity_ok].copy()
print("Rows with turbidity <= 5 NTU:", len(df_turbidity_ok))

Rows with turbidity <= 5 NTU: 7


In [21]:
# Print a heading followed by the first few rows of each filtered subset
subset_previews = (
    ("\n--- Willow Creek sample ---", df_willow),
    ("\n--- High Nitrate sample ---", df_nitrate_high),
    ("\n--- Low turbidity sample ---", df_turbidity_ok),
)
for heading, subset in subset_previews:
    print(heading)
    print(subset.head())


--- Willow Creek sample ---
  site_id       date  nitrate_mgL  phosphate_mgL  turbidity_NTU  temp_C  \
0  WC-001 2025-09-28         2.30           0.12            3.1    14.5   
1  WC-002 2025-09-28         2.65           0.05            1.0    13.8   
2  WC-003 2025-09-28         5.80           0.06            8.9    12.4   
3  WC-004 2025-09-29         0.80           0.03            0.6    10.1   

         county state  
0  Willow Creek    OR  
1  Willow Creek    OR  
2  Willow Creek    OR  
3  Willow Creek    OR  

--- High Nitrate sample ---
  site_id       date  nitrate_mgL  phosphate_mgL  turbidity_NTU  temp_C  \
2  WC-003 2025-09-28          5.8           0.06            8.9    12.4   
4  WC-005 2025-09-29         12.1           0.20           15.2    10.1   
8  WC-009 2025-10-01          7.4           0.15           12.0    10.0   

         county state  
2  Willow Creek    OR  
4    North Fork    OR  
8    South Fork    OR  

--- Low turbidity sample ---
  site_id       dat