# Pollutant Trends EDA

This notebook analyzes temporal patterns in pollutant data to inform autoregressive forecasting models.

In [None]:
import sys
import os
sys.path.append(os.path.abspath('..'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from src.database.mongodb_client import MongoDBClient

# Notebook-wide plot defaults: seaborn's whitegrid style and a wide default figure
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (15, 6)})

## 1. Load Data from MongoDB

In [None]:
# Fetch every stored record, ordered by ascending 'timestamp'.
# try/finally guarantees the client is closed even if the query raises.
client = MongoDBClient()
try:
    data = list(client.collection.find().sort('timestamp', 1))
finally:
    client.close()

# Fail early with a clear message; an empty result would otherwise surface
# as a KeyError on 'timestamp' a few lines below.
if not data:
    raise ValueError("No records returned from MongoDB; nothing to analyze.")

# Convert to DataFrame
df = pd.DataFrame(data)
# NOTE(review): unit='s' assumes 'timestamp' holds Unix epoch seconds — confirm against the ingestion code.
df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
df = df.sort_values('datetime').reset_index(drop=True)

print(f"Total records: {len(df)}")
print(f"Date range: {df['datetime'].min()} to {df['datetime'].max()}")
print(f"\nColumns: {df.columns.tolist()}")
df.head()

## 2. Time Series Plots for Each Pollutant

In [None]:
# Pollutant columns analyzed here; later cells reuse this list.
pollutants = ['pm25', 'pm10', 'no2', 'o3', 'so2', 'co']

# One panel per pollutant on a 3x2 grid.
fig, axes = plt.subplots(3, 2, figsize=(18, 12))
axes = axes.ravel()

for ax, name in zip(axes, pollutants):
    label = name.upper()
    ax.plot(df['datetime'], df[name], linewidth=0.8)
    ax.set_title(f'{label} Over Time', fontsize=14, fontweight='bold')
    ax.set_xlabel('Date')
    ax.set_ylabel(f'{label} Level')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Correlation Analysis: Pollutants vs Weather

In [None]:
# Pairwise correlations across the pollutant and weather columns
correlation_cols = pollutants + ['temperature', 'humidity', 'pressure', 'wind_speed', 'clouds']
corr_matrix = df[correlation_cols].corr()

# Annotated heatmap with a diverging palette centred on zero
plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
heatmap_ax.set_title('Correlation: Pollutants vs Weather', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Per pollutant, list its correlation with three weather variables, strongest positive first
print("\nKey Insights:")
print("=" * 50)
weather_subset = ['temperature', 'humidity', 'wind_speed']
for name in pollutants:
    ranked = corr_matrix.loc[weather_subset, name].sort_values(ascending=False)
    print(f"\n{name.upper()}:")
    print(ranked)

## 4. Autocorrelation Analysis (Lag Effects)

In [None]:
from pandas.plotting import autocorrelation_plot

# One autocorrelation panel per pollutant on a 3x2 grid.
fig, axes = plt.subplots(3, 2, figsize=(18, 12))
axes = axes.flatten()

for idx, pollutant in enumerate(pollutants):
    # autocorrelation_plot does not skip NaN: a single missing reading makes the
    # whole curve NaN, so drop missing values before plotting.
    # NOTE(review): dropping rows closes the gaps, so lags are only approximate
    # wherever readings are missing.
    series = df[pollutant].dropna()
    autocorrelation_plot(series, ax=axes[idx])
    axes[idx].set_title(f'{pollutant.upper()} Autocorrelation', fontsize=14, fontweight='bold')
    axes[idx].set_xlabel('Lag (4-hour intervals)')
    axes[idx].set_ylabel('Autocorrelation')
    axes[idx].set_xlim(0, 50)  # Show first 50 lags (~8 days)

plt.tight_layout()
plt.show()

## 5. Hourly Patterns (Diurnal Variation)

In [None]:
# Mean of each pollutant for every distinct value of the 'hour' column.
# NOTE(review): 'hour' is not derived in the cells shown here — presumably it is
# stored with each record; confirm.
hourly_avg = df.groupby('hour')[pollutants].mean()

fig, axes = plt.subplots(3, 2, figsize=(18, 12))
axes = axes.ravel()

for ax, name in zip(axes, pollutants):
    label = name.upper()
    ax.plot(hourly_avg.index, hourly_avg[name], marker='o', linewidth=2)
    ax.set_title(f'{label} by Hour of Day', fontsize=14, fontweight='bold')
    ax.set_xlabel('Hour')
    ax.set_ylabel(f'Average {label}')
    ax.set_xticks(range(0, 24, 2))
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Day of Week Patterns

In [None]:
# Group by day of week
dow_avg = df.groupby('day_of_week')[pollutants].mean()
dow_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# bar(range(7), ...) needs exactly seven heights. If a weekday never occurs in
# the data the groupby result has fewer rows and the plot raises a shape error,
# so realign to all seven days (absent days become NaN and draw no bar).
# NOTE(review): only applied when the codes are within 0-6, i.e. the Monday=0
# encoding the labels assume — confirm how 'day_of_week' is stored.
if dow_avg.index.isin(list(range(7))).all():
    dow_avg = dow_avg.reindex(range(7))

fig, axes = plt.subplots(3, 2, figsize=(18, 12))
axes = axes.flatten()

for idx, pollutant in enumerate(pollutants):
    axes[idx].bar(range(7), dow_avg[pollutant], color='steelblue', alpha=0.7)
    axes[idx].set_title(f'{pollutant.upper()} by Day of Week', fontsize=14, fontweight='bold')
    axes[idx].set_xlabel('Day')
    axes[idx].set_ylabel(f'Average {pollutant.upper()}')
    axes[idx].set_xticks(range(7))
    axes[idx].set_xticklabels(dow_labels)
    axes[idx].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 7. Rolling Statistics (Trends)

In [None]:
# Rolling-mean window in samples: 6 consecutive readings, labelled as 24 hours below
window = 6

fig, axes = plt.subplots(3, 2, figsize=(18, 12))
axes = axes.ravel()

for ax, name in zip(axes, pollutants):
    label = name.upper()
    raw_values = df[name]
    smoothed = raw_values.rolling(window=window).mean()

    # Faint raw series underneath, bold smoothed trend on top
    ax.plot(df['datetime'], raw_values, alpha=0.3, label='Actual', linewidth=0.5)
    ax.plot(df['datetime'], smoothed,
            label=f'{window*4}h Rolling Mean', linewidth=2, color='red')
    ax.set_title(f'{label} with Rolling Mean', fontsize=14, fontweight='bold')
    ax.set_xlabel('Date')
    ax.set_ylabel(f'{label} Level')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Lag Correlation Matrix

In [None]:
# Correlation of each pollutant with its own value `lag` rows earlier.
# NOTE(review): shift() works on row positions, so reading a lag as "4 hours"
# assumes evenly spaced readings with no gaps — confirm.
lags = range(1, 13)  # Check lags 1-12 (4-48 hours)
lag_correlations = {
    pollutant: [df[pollutant].corr(df[pollutant].shift(lag)) for lag in lags]
    for pollutant in pollutants
}

# Plot
lag_df = pd.DataFrame(lag_correlations, index=lags)

plt.figure(figsize=(12, 6))
for pollutant in pollutants:
    plt.plot(lag_df.index, lag_df[pollutant], marker='o', label=pollutant.upper(), linewidth=2)

plt.title('Lag Correlation for Each Pollutant', fontsize=16, fontweight='bold')
plt.xlabel('Lag (4-hour intervals)')
plt.ylabel('Correlation with Current Value')
plt.grid(True, alpha=0.3)
plt.axhline(y=0.5, color='r', linestyle='--', alpha=0.5, label='0.5 threshold')
# Build the legend only after the threshold line exists; calling legend() earlier
# left the '0.5 threshold' entry out.
plt.legend()
plt.tight_layout()
plt.show()

print("\nLag Correlations (first 3 lags):")
print(lag_df.head(3))

## 9. Summary Statistics

In [None]:
# Descriptive statistics and per-column missing-value counts for the pollutant columns
pollutant_frame = df[pollutants]
rule = "=" * 80

print("Pollutant Summary Statistics:")
print(rule)
print(pollutant_frame.describe())

print("\n\nMissing Values:")
print(rule)
print(pollutant_frame.isna().sum())

## 10. Key Findings for Model Development

Based on this EDA, we can conclude:

1. **Lag Features**: Strong autocorrelation at lags 1-3 (the previous 4-12 hours at the 4-hour sampling interval) suggests using these lagged values as features
2. **Weather Correlation**: Temperature, humidity, and wind speed show correlation with pollutants
3. **Temporal Patterns**: Hour of day and day of week show distinct patterns
4. **Rolling Statistics**: 24-hour rolling means (a window of 6 four-hour readings) capture trends effectively

**Recommended Features for Pollutant Prediction Models**:
- Lag features: `pollutant_lag1`, `pollutant_lag2`, `pollutant_lag3`
- Weather: `temperature`, `humidity`, `wind_speed`, `pressure`, `clouds`
- Temporal: `hour`, `day_of_week`, `month`
- Rolling stats: `pollutant_rolling_mean_24h`, `pollutant_rolling_std_24h`