# Exploratory Data Analysis of NWM and USGS Time Series

This notebook performs exploratory analysis of the National Water Model (NWM) forecasts and USGS observed runoff data.

## Objectives:
1. Understand the structure and properties of the data
2. Identify patterns and trends in NWM forecasts and USGS observations
3. Visualize forecast errors across different lead times
4. Evaluate data quality issues such as missing values and outliers
5. Identify potential features for model development

In [1]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from glob import glob

# Set plotting style
sns.set_style("whitegrid")
plt.rcParams.update({'font.size': 12})

## 1. Data Loading and Inspection

First, let's load the USGS observed data and NWM forecast data for both streams.

In [2]:
# Define paths
data_path = "../data/raw"
stream_ids = ["20380357", "21609641"]

# Function to load USGS data
def load_usgs_data(stream_id):
    usgs_files = glob(os.path.join(data_path, str(stream_id), "*_Strt_*.csv"))
    if not usgs_files:
        raise FileNotFoundError(f"No USGS data files found for stream {stream_id}")
    
    usgs_df = pd.read_csv(usgs_files[0])
    # Convert to datetime
    usgs_df['datetime'] = pd.to_datetime(usgs_df['datetime'])
    usgs_df.set_index('datetime', inplace=True)
    
    return usgs_df

# Function to load NWM data
def load_nwm_data(stream_id):
    nwm_files = glob(os.path.join(data_path, str(stream_id), "streamflow_*.csv"))
    if not nwm_files:
        raise FileNotFoundError(f"No NWM data files found for stream {stream_id}")
    
    dfs = []
    for file in nwm_files:
        df = pd.read_csv(file)
        dfs.append(df)
    
    nwm_df = pd.concat(dfs, ignore_index=True)
    nwm_df['reference_time'] = pd.to_datetime(nwm_df['reference_time'])
    nwm_df['value_time'] = pd.to_datetime(nwm_df['value_time'])
    
    return nwm_df

# Load data for each stream
data = {}
for stream_id in stream_ids:
    usgs_df = load_usgs_data(stream_id)
    nwm_df = load_nwm_data(stream_id)
    data[stream_id] = {
        "usgs": usgs_df,
        "nwm": nwm_df
    }

KeyError: 'datetime'

## 2. Examining Data Structure

Let's take a look at the structure and contents of both datasets.

In [None]:
# Stream ID to examine
stream_id = stream_ids[0]  # First stream

print("USGS Data Structure:")
print(data[stream_id]["usgs"].head())
print("\nUSGS Data Info:")
print(data[stream_id]["usgs"].info())
print("\nUSGS Data Statistics:")
print(data[stream_id]["usgs"].describe())

print("\n\nNWM Data Structure:")
print(data[stream_id]["nwm"].head())
print("\nNWM Data Info:")
print(data[stream_id]["nwm"].info())
print("\nNWM Data Statistics by Lead Time:")
print(data[stream_id]["nwm"].groupby("lead_time")["streamflow"].describe())

## 3. Temporal Coverage and Availability

Let's examine the temporal coverage and availability of the data.

In [None]:
for stream_id in stream_ids:
    usgs_df = data[stream_id]["usgs"]
    nwm_df = data[stream_id]["nwm"]
    
    print(f"Stream {stream_id} - USGS Data:")
    print(f"Start date: {usgs_df.index.min()}")
    print(f"End date: {usgs_df.index.max()}")
    print(f"Total records: {len(usgs_df)}")
    print(f"Missing values: {usgs_df.isna().sum().sum()}")
    
    print(f"\nStream {stream_id} - NWM Data:")
    print(f"Start reference time: {nwm_df['reference_time'].min()}")
    print(f"End reference time: {nwm_df['reference_time'].max()}")
    print(f"Start value time: {nwm_df['value_time'].min()}")
    print(f"End value time: {nwm_df['value_time'].max()}")
    print(f"Total records: {len(nwm_df)}")
    print(f"Missing values: {nwm_df.isna().sum().sum()}")
    print("-" * 60)

## 4. Data Distribution Analysis

Let's visualize the distribution of runoff values in both datasets.

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(18, 12))

for i, stream_id in enumerate(stream_ids):
    usgs_df = data[stream_id]["usgs"]
    nwm_df = data[stream_id]["nwm"]
    
    # Histogram of USGS observed runoff
    ax = axes[i, 0]
    ax.hist(usgs_df['value'], bins=50, alpha=0.7)
    ax.set_title(f"Stream {stream_id} - USGS Observed Runoff Distribution")
    ax.set_xlabel("Runoff")
    ax.set_ylabel("Frequency")
    
    # Histogram of NWM forecast runoff by lead time
    ax = axes[i, 1]
    lead_times = [1, 6, 12, 18]  # Representative lead times
    for lead in lead_times:
        lead_data = nwm_df[nwm_df['lead_time'] == lead]['streamflow']
        ax.hist(lead_data, bins=50, alpha=0.4, label=f"Lead {lead}h")
    
    ax.set_title(f"Stream {stream_id} - NWM Forecast Runoff Distribution by Lead Time")
    ax.set_xlabel("Runoff")
    ax.set_ylabel("Frequency")
    ax.legend()

plt.tight_layout()
plt.show()

## 5. Time Series Visualization

Let's visualize the time series data to understand patterns and relationships.

In [None]:
# Restructure NWM data to align with USGS observations
def align_nwm_with_usgs(stream_id, lead_times=[1, 6, 12, 18]):
    usgs_df = data[stream_id]["usgs"]
    nwm_df = data[stream_id]["nwm"]
    
    aligned_data = {}
    aligned_data['usgs'] = usgs_df['value']
    
    for lead in lead_times:
        # Filter for this lead time
        lead_df = nwm_df[nwm_df['lead_time'] == lead].copy()
        # Set index to value_time (when the forecast is for)
        lead_df.set_index('value_time', inplace=True)
        # Get the streamflow column
        lead_series = lead_df['streamflow']
        # Add to aligned data with a descriptive name
        aligned_data[f'nwm_lead_{lead}'] = lead_series
    
    # Combine into one DataFrame
    return pd.DataFrame(aligned_data)

In [None]:
# Create aligned data for visualization
for stream_id in stream_ids:
    aligned_df = align_nwm_with_usgs(stream_id)
    
    # Plot time series for a 3-month period
    sample_start = pd.Timestamp('2022-01-01')
    sample_end = pd.Timestamp('2022-04-01')
    sample_df = aligned_df.loc[sample_start:sample_end]
    
    plt.figure(figsize=(18, 8))
    plt.plot(sample_df.index, sample_df['usgs'], label='USGS Observed', linewidth=2)
    for lead in [1, 6, 12, 18]:
        plt.plot(sample_df.index, sample_df[f'nwm_lead_{lead}'], 
                 label=f'NWM Lead {lead}h', alpha=0.7)
    
    plt.title(f"Stream {stream_id} - Observed vs. Forecast Runoff (Jan-Mar 2022)")
    plt.xlabel("Date")
    plt.ylabel("Runoff")
    plt.legend()
    plt.grid(True)
    plt.show()

## 6. Forecast Error Analysis

Let's calculate and visualize the errors in the NWM forecasts compared to USGS observations.

In [None]:
# Calculate forecast errors
for stream_id in stream_ids:
    aligned_df = align_nwm_with_usgs(stream_id, lead_times=range(1, 19))
    
    # Add error columns
    for lead in range(1, 19):
        if f'nwm_lead_{lead}' in aligned_df.columns:
            aligned_df[f'error_lead_{lead}'] = aligned_df[f'nwm_lead_{lead}'] - aligned_df['usgs']
    
    # Plot error distributions for selected lead times
    plt.figure(figsize=(15, 8))
    lead_times = [1, 6, 12, 18]
    for i, lead in enumerate(lead_times):
        if f'error_lead_{lead}' in aligned_df.columns:
            plt.subplot(2, 2, i+1)
            sns.histplot(aligned_df[f'error_lead_{lead}'].dropna(), kde=True)
            plt.title(f"Stream {stream_id} - Error Distribution (Lead {lead}h)")
            plt.xlabel("Forecast Error (NWM - USGS)")
    
    plt.tight_layout()
    plt.show()
    
    # Plot error vs lead time
    error_means = [aligned_df[f'error_lead_{lead}'].mean() for lead in range(1, 19) if f'error_lead_{lead}' in aligned_df.columns]
    error_stds = [aligned_df[f'error_lead_{lead}'].std() for lead in range(1, 19) if f'error_lead_{lead}' in aligned_df.columns]
    
    plt.figure(figsize=(12, 6))
    plt.errorbar(range(1, len(error_means)+1), error_means, yerr=error_stds, fmt='o-')
    plt.title(f"Stream {stream_id} - Mean Forecast Error by Lead Time")
    plt.xlabel("Lead Time (hours)")
    plt.ylabel("Mean Error (NWM - USGS)")
    plt.grid(True)
    plt.show()

## 7. Correlation Analysis

Let's investigate the correlation between observed values and forecasts at different lead times.

In [None]:
import scipy.stats as stats

for stream_id in stream_ids:
    aligned_df = align_nwm_with_usgs(stream_id, lead_times=range(1, 19))
    
    correlations = []
    lead_times = []
    
    for lead in range(1, 19):
        col = f'nwm_lead_{lead}'
        if col in aligned_df.columns:
            # Drop any NaN values
            valid_data = aligned_df[['usgs', col]].dropna()
            if len(valid_data) > 0:
                corr, _ = stats.pearsonr(valid_data['usgs'], valid_data[col])
                correlations.append(corr)
                lead_times.append(lead)
    
    # Plot correlation vs lead time
    plt.figure(figsize=(12, 6))
    plt.plot(lead_times, correlations, 'o-')
    plt.title(f"Stream {stream_id} - Correlation between Observed and Forecast Runoff")
    plt.xlabel("Lead Time (hours)")
    plt.ylabel("Pearson Correlation Coefficient")
    plt.grid(True)
    plt.ylim(0, 1)
    plt.show()

## 8. Seasonal Patterns in Forecast Errors

Let's investigate if there are seasonal patterns in the forecast errors.

In [None]:
for stream_id in stream_ids:
    aligned_df = align_nwm_with_usgs(stream_id)
    
    # Add month column
    aligned_df['month'] = aligned_df.index.month
    
    # Calculate mean errors by month for different lead times
    plt.figure(figsize=(12, 8))
    for i, lead in enumerate([1, 6, 12, 18]):
        if f'error_lead_{lead}' in aligned_df.columns:
            monthly_errors = aligned_df.groupby('month')[f'error_lead_{lead}'].mean()
            plt.plot(monthly_errors.index, monthly_errors.values, 'o-', 
                     label=f'Lead {lead}h')
    
    plt.title(f"Stream {stream_id} - Monthly Mean Forecast Error")
    plt.xlabel("Month")
    plt.ylabel("Mean Error (NWM - USGS)")
    plt.xticks(range(1, 13), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
                             'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
    plt.legend()
    plt.grid(True)
    plt.show()

## 9. Autocorrelation Analysis of Forecast Errors

Let's investigate the autocorrelation in forecast errors.

In [None]:
from pandas.plotting import autocorrelation_plot

for stream_id in stream_ids:
    aligned_df = align_nwm_with_usgs(stream_id)
    
    # Plot autocorrelation for errors at different lead times
    plt.figure(figsize=(15, 12))
    for i, lead in enumerate([1, 6, 12, 18]):
        if f'error_lead_{lead}' in aligned_df.columns:
            plt.subplot(2, 2, i+1)
            autocorrelation_plot(aligned_df[f'error_lead_{lead}'].dropna())
            plt.title(f"Stream {stream_id} - Error Autocorrelation (Lead {lead}h)")
    
    plt.tight_layout()
    plt.show()

## 10. Error Persistence Analysis

Let's analyze whether errors are persistent across lead times, which would indicate potential advantages for the baseline persistence model.

In [None]:
for stream_id in stream_ids:
    aligned_df = align_nwm_with_usgs(stream_id, lead_times=range(1, 19))
    
    # Calculate errors
    for lead in range(1, 19):
        col = f'nwm_lead_{lead}'
        if col in aligned_df.columns:
            aligned_df[f'error_lead_{lead}'] = aligned_df[col] - aligned_df['usgs']
    
    # Calculate correlation between errors at different lead times
    error_cols = [col for col in aligned_df.columns if col.startswith('error_lead_')]
    if error_cols:
        error_corr = aligned_df[error_cols].corr()
        
        plt.figure(figsize=(12, 10))
        sns.heatmap(error_corr, annot=True, cmap='coolwarm', fmt='.2f', 
                   linewidths=0.5, square=True)
        plt.title(f"Stream {stream_id} - Error Correlation Matrix")
        plt.tight_layout()
        plt.show()

## 11. Summary of Findings

Based on our exploratory data analysis, we can make the following observations:

1. **Data Coverage**: [Observations about temporal coverage and completeness]
2. **Error Distribution**: [Observations about error distributions across lead times]
3. **Seasonal Patterns**: [Observations about seasonal patterns in errors]
4. **Error Persistence**: [Observations about error persistence across lead times]
5. **Correlation**: [Observations about correlation between observed and forecast values]

These findings will inform our approach to model development and feature engineering.