# 📈 Advanced Visualizations & Insights

**Publication-Ready Plots & Deep Analysis**

This notebook creates sophisticated visualizations for better understanding of AQI patterns.

## Setup & Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Global plot appearance: seaborn colour palette first, then the matplotlib
# darkgrid style sheet.
sns.set_palette('husl')
plt.style.use('seaborn-v0_8-darkgrid')

# Pollutant columns that every section of the notebook iterates over.
POLLUTANTS = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'Ozone']

# Raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/raw/final_dataset.csv')

# Build a single timestamp column from the separate Year / Month / Date
# columns. pd.to_datetime assembles dates from columns named
# year / month / day, hence the rename.
date_parts = df[['Year', 'Month', 'Date']].rename(
    columns={'Year': 'year', 'Month': 'month', 'Date': 'day'}
)
df['DateTime'] = pd.to_datetime(date_parts)

def get_season(month):
    """Return the season name for a calendar month number.

    Mapping used by this notebook:
      12, 1, 2   -> 'Winter'
      3, 4, 5    -> 'Summer'
      6, 7, 8, 9 -> 'Monsoon'
      otherwise  -> 'Autumn'

    Any value not listed above (including out-of-range or missing months)
    falls through to 'Autumn'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Summer', (3, 4, 5)),
        ('Monsoon', (6, 7, 8, 9)),
    )
    for season, months in season_months:
        if month in months:
            return season
    return 'Autumn'

# Tag every row with its season, derived from the calendar month.
df['Season'] = df['Month'].apply(get_season)

# Weekend flag: 1 when 'Days' is 6 or 7, otherwise 0.
# NOTE(review): this presumes 'Days' encodes the weekday as 1-7 with
# 6/7 = Saturday/Sunday — confirm against the dataset's documentation.
df['IsWeekend'] = df['Days'].isin([6, 7]).astype(int)

# The check mark was mis-encoded in the original message; restored here.
print("✓ Data loaded and features created")

## 📊 Multi-Panel Distribution Analysis

In [None]:
# One panel per pollutant on a 3x2 grid: a count histogram on the left
# y-axis with a kernel density estimate overlaid on a twin right y-axis.
fig, axes = plt.subplots(3, 2, figsize=(14, 10))
axes = axes.flatten()

for hist_ax, pollutant in zip(axes, POLLUTANTS):
    readings = df[pollutant]

    # Raw counts
    hist_ax.hist(readings, bins=30, alpha=0.6, color='steelblue', edgecolor='black')

    # Density curve on its own y-axis, since counts and density are on
    # different scales
    kde_ax = hist_ax.twinx()
    readings.plot(kind='kde', ax=kde_ax, color='red', linewidth=2)
    kde_ax.set_ylabel('Density')

    # Labels for the primary (histogram) axis
    hist_ax.set_title(f'{pollutant} Distribution', fontweight='bold')
    hist_ax.set_xlabel(f'{pollutant} Level')
    hist_ax.set_ylabel('Frequency')
    hist_ax.grid(alpha=0.3)

plt.suptitle('Pollutant Distributions with KDE', fontsize=14, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()

In [None]:
## 🎻 Violin Plots - Seasonal Patterns

In [None]:
# Violin plot of every pollutant and of the overall AQI, split by season.
#
# Seven series are drawn (six pollutants + AQI). A 2x3 grid has no free panel
# for AQI, which would end up drawn on top of the last pollutant's panel, so
# the grid is sized from the number of series and unused panels are hidden.
seasons_order = ['Winter', 'Summer', 'Monsoon', 'Autumn']
violin_columns = POLLUTANTS + ['AQI']

n_cols = 4
n_rows = -(-len(violin_columns) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows),
                         squeeze=False)
axes = axes.flatten()

for ax, column in zip(axes, violin_columns):
    # NOTE(review): recent seaborn releases may warn when `palette` is given
    # without `hue`; left unchanged to keep the rendering the same.
    sns.violinplot(data=df, x='Season', y=column, ax=ax,
                   palette='Set2', order=seasons_order)
    ax.set_title(f'{column} by Season', fontweight='bold', fontsize=11)
    ax.set_xlabel('')  # the season tick labels already say what the x-axis is
    ax.grid(alpha=0.3, axis='y')

# Hide any panel that received no data
for ax in axes[len(violin_columns):]:
    ax.set_visible(False)

plt.suptitle('Seasonal Variations - Violin Plots', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
## 📅 Time Series with Trend & Rolling Average

In [None]:
# Chronologically ordered copy of the data with centred rolling means of AQI.
# NOTE(review): the windows count rows (7 and 30), so they correspond to days
# only if the data holds exactly one row per day — confirm.
df_sorted = df.sort_values('DateTime')
for window in (7, 30):
    df_sorted[f'AQI_MA{window}'] = (
        df_sorted['AQI'].rolling(window=window, center=True).mean()
    )

fig, ax = plt.subplots(figsize=(15, 6))
dates = df_sorted['DateTime']

# (column, line style) for each curve, drawn in this order
curves = [
    ('AQI', dict(label='Daily AQI', alpha=0.3, color='gray', linewidth=0.8)),
    ('AQI_MA7', dict(label='7-Day MA', color='steelblue', linewidth=2)),
    ('AQI_MA30', dict(label='30-Day MA', color='darkred', linewidth=2, linestyle='--')),
]
for column, style in curves:
    ax.plot(dates, df_sorted[column], **style)

# Shade the area under the 7-row moving average (down to y = 0)
ax.fill_between(dates, df_sorted['AQI_MA7'], alpha=0.2, color='steelblue')

ax.set_title('AQI Time Series with Moving Averages', fontsize=14, fontweight='bold')
ax.set_xlabel('Date', fontweight='bold')
ax.set_ylabel('AQI Level', fontweight='bold')
ax.grid(alpha=0.3)
ax.legend(loc='best', fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
## 🔥 Correlation Heatmap (All Variables)

In [None]:
# Pairwise correlations between the pollutant columns and AQI
# (DataFrame.corr with its default settings).
corr_columns = POLLUTANTS + ['AQI']
corr_matrix = df[corr_columns].corr()

fig, ax = plt.subplots(figsize=(10, 8))

# Diverging colour map pinned to [-1, 1] and centred on 0, so the colour
# encodes the sign as well as the strength of each correlation.
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    annot_kws={'fontsize': 10, 'fontweight': 'bold'},
    cmap='RdYlGn',
    center=0,
    vmin=-1,
    vmax=1,
    square=True,
    linewidths=1,
    cbar_kws={'label': 'Correlation'},
)
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)

ax.set_title('Pollutant & AQI Correlation Matrix', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

In [None]:
## 🌐 Pair Plot - Pollutant Relationships

In [None]:
# Scatter matrix of PM2.5, PM10, NO2 and AQI with KDE curves on the diagonal.
#
# Only a subset of rows is plotted to keep rendering fast. The subset is drawn
# at random (fixed seed, so the figure is reproducible) rather than taken from
# the top of the file, which would only show whichever rows happen to come
# first in the CSV.
PAIR_SAMPLE_SIZE = 500
PAIR_SAMPLE_SEED = 42
pair_columns = ['PM2.5', 'PM10', 'NO2', 'AQI']

# Never ask for more rows than the frame holds
n_pair_rows = min(PAIR_SAMPLE_SIZE, len(df))
pair_data = df[pair_columns].sample(n=n_pair_rows, random_state=PAIR_SAMPLE_SEED)

# `fill` is the current name of the KDE shading option (formerly `shade`)
pair_plot = sns.pairplot(pair_data, diag_kind='kde', plot_kws={'alpha': 0.6, 's': 30},
                         diag_kws={'fill': True})
pair_plot.figure.suptitle(
    f'Pollutant Relationships (Sample of {n_pair_rows} observations)',
    fontsize=14, fontweight='bold', y=0.995,
)
plt.tight_layout()
plt.show()

In [None]:
## 📍 Box Plots - Outlier Detection

In [None]:
def count_iqr_outliers(values):
    """Count the entries of a Series lying outside the 1.5 * IQR fences.

    Parameters
    ----------
    values : pandas.Series
        Numeric observations; missing values never count as outliers.

    Returns
    -------
    int
        Number of entries below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    outside = (values < q1 - 1.5 * iqr) | (values > q3 + 1.5 * iqr)
    return int(outside.sum())


# Box plot of every pollutant plus the overall AQI, each titled with its
# outlier count.
#
# Seven series are drawn (six pollutants + AQI). A 2x3 grid has no free panel
# for AQI, which would end up drawn on top of the last pollutant's panel, so
# the grid is sized from the number of series and unused panels are hidden.
box_columns = POLLUTANTS + ['AQI']

n_cols = 4
n_rows = -(-len(box_columns) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(4.5 * n_cols, 4 * n_rows),
                         squeeze=False)
axes = axes.flatten()

for ax, column in zip(axes, box_columns):
    values = df[column].dropna()

    # Vertical orientation is matplotlib's default, so it is not passed explicitly
    bp = ax.boxplot(values, patch_artist=True)

    # AQI gets its own colour so it stands apart from the pollutant panels
    face_colour = 'lightcoral' if column == 'AQI' else 'lightblue'
    for patch in bp['boxes']:
        patch.set_facecolor(face_colour)

    outliers = count_iqr_outliers(values)
    ax.set_title(f'{column}\n({outliers} outliers)', fontweight='bold')
    ax.set_ylabel('Value')
    ax.grid(alpha=0.3, axis='y')

# Hide any panel that received no data
for ax in axes[len(box_columns):]:
    ax.set_visible(False)

plt.suptitle('Box Plots - Outlier Detection', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
## 📊 Monthly Trends

In [None]:
# Mean AQI and mean pollutant level per calendar month; rows are grouped by
# 'Month' only, so all years are pooled together.
monthly_data = df.groupby('Month')[['AQI'] + POLLUTANTS].mean()

fig, ax = plt.subplots(figsize=(12, 6))

# One line per pollutant, all sharing the same marker / line styling
line_style = dict(marker='o', linewidth=2, markersize=8)
months = monthly_data.index
for pollutant in POLLUTANTS:
    ax.plot(months, monthly_data[pollutant], label=pollutant, **line_style)

ax.set_title('Monthly Average Pollutant Trends', fontsize=14, fontweight='bold')
ax.set_xlabel('Month', fontweight='bold')
ax.set_ylabel('Average Concentration', fontweight='bold')
ax.set_xticks(range(1, 13))  # show all twelve months on the axis
ax.grid(alpha=0.3)
ax.legend(loc='best', ncol=3, fontsize=9)

plt.tight_layout()
plt.show()

In [None]:
## 💡 Key Insights Summary

In [None]:
# Text summary of the patterns explored in this notebook.
#
# Every statement is computed from `df` at run time, so the summary cannot
# disagree with the data, and the cell only needs the setup cell to have run
# (it does not rely on variables created by the plotting cells).
summary_columns = POLLUTANTS + ['AQI']
banner = "=" * 70

print("\n" + banner)
print("📊 VISUALIZATION INSIGHTS SUMMARY")
print(banner)

# 1. Distributions: sign of the skewness and spread of each pollutant
print("\n1️⃣ DISTRIBUTIONS:")
skewness = df[POLLUTANTS].skew()
right_skewed = skewness[skewness > 0].index.tolist()
value_ranges = (df[POLLUTANTS].max() - df[POLLUTANTS].min()).sort_values(ascending=False)
print(f"   - {len(right_skewed)} of {len(POLLUTANTS)} pollutants show right-skewed distributions")
print(f"   - Widest value ranges: {', '.join(value_ranges.index[:2])}")

# 2. Seasonal patterns: seasons with the highest / lowest mean AQI
print("\n2️⃣ SEASONAL PATTERNS:")
season_aqi = df.groupby('Season')['AQI'].mean()
worst_season = season_aqi.idxmax()
best_season = season_aqi.idxmin()
print(f"   - Worst season: {worst_season} (AQI: {season_aqi[worst_season]:.1f})")
print(f"   - Best season: {best_season} (AQI: {season_aqi[best_season]:.1f})")

# 3. Correlations with AQI; "strongest" is judged on the absolute value so a
#    strong negative relationship is not overlooked
print("\n3️⃣ CORRELATIONS:")
aqi_corr = df[POLLUTANTS].corrwith(df['AQI'])
strongest_corr = aqi_corr.abs().idxmax()
print(f"   - {strongest_corr} has strongest correlation with AQI (r = {aqi_corr[strongest_corr]:.2f})")
if (aqi_corr > 0).all():
    print("   - All pollutants positively correlate with AQI")
else:
    not_positive = aqi_corr[~(aqi_corr > 0)].index.tolist()
    print(f"   - Not positively correlated with AQI: {', '.join(not_positive)}")

# 4. Temporal trend: mean AQI of the latest rows minus mean AQI of the
#    earliest rows. The window is capped at half the data so the two ends
#    never overlap on short datasets.
print("\n4️⃣ TEMPORAL TRENDS:")
aqi_by_date = df.sort_values('DateTime')['AQI']
edge_rows = max(1, min(100, len(aqi_by_date) // 2))
aqi_trend = aqi_by_date.iloc[-edge_rows:].mean() - aqi_by_date.iloc[:edge_rows].mean()
print(f"   - Recent AQI trend: {'↑ Increasing' if aqi_trend > 0 else '↓ Decreasing'}")
print("   - 30-day moving average smooths daily variations")

# 5. Outliers: values outside the 1.5 * IQR fences, summed over all columns
print("\n5️⃣ OUTLIERS:")
summary_values = df[summary_columns]
q1 = summary_values.quantile(0.25)
q3 = summary_values.quantile(0.75)
iqr = q3 - q1
# Comparisons against the per-column fences align on column names
outside_fences = summary_values.lt(q1 - 1.5 * iqr) | summary_values.gt(q3 + 1.5 * iqr)
total_outliers = int(outside_fences.sum().sum())

print(f"   - Total outliers detected: {total_outliers}")
print("   - Consider for data cleaning in production pipelines")

print("\n" + banner)