In [None]:
# ===============================================================
# 01_eda.ipynb
# Module 1: Data Preprocessing & EDA (Adapted for your dataset)
# ===============================================================

# STEP 1: Install dependencies
!pip install pandas numpy matplotlib seaborn

In [None]:
# STEP 2: Import libraries
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# STEP 3: Load dataset
# sep=None together with the python engine asks pandas to detect the delimiter.
df = pd.read_csv(
    "../data/raw/air_quality.csv",
    sep=None,
    engine="python",
)

print(f"Dataset shape: {df.shape}")
print(df.head())

In [None]:
# STEP 4: Convert datetime column
# Parse 'Last Updated' as datetimes and use it as the (sorted) index.
# The guard keeps this cell re-runnable: once 'Last Updated' has become the
# index it is no longer a column, and df['Last Updated'] would raise KeyError.
# If the column is genuinely missing on the first run, the KeyError still surfaces.
if df.index.name != 'Last Updated':
    df['Last Updated'] = pd.to_datetime(df['Last Updated'])
    df = df.set_index('Last Updated')
df = df.sort_index()
print("After datetime index:", df.head())

In [None]:
# STEP 5: Pivot dataset so pollutants become columns
# Long -> wide: rows are keyed by the datetime index, each distinct "Pollutant"
# becomes a column, and "Value" entries sharing a key are averaged.
df_pivot = pd.pivot_table(
    df,
    values="Value",
    index=df.index,
    columns="Pollutant",
    aggfunc="mean",
)


In [None]:
# Turn every column label into a plain string
df_pivot.columns = list(map(str, df_pivot.columns))

print("Pivoted dataset (wide format):", df_pivot.head(), sep="\n")

In [None]:
# STEP 6: Resample daily averages
# Bucket the datetime-indexed rows by calendar day and take each column's mean.
df_daily = (
    df_pivot
    .resample("D")
    .mean()
)

In [None]:
# STEP 7: Handle missing values (fill gaps)
# Linear interpolation (the pandas default, spelled out here);
# limit_direction="both" allows filling in both directions along the index.
df_daily = df_daily.interpolate(
    method="linear",
    limit_direction="both",
)


In [None]:
# STEP 8: Handle outliers (clip extreme values using IQR)
# Per column, values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are pulled back
# to the nearest fence.
for col in df_daily.columns:
    q1, q3 = df_daily[col].quantile([0.25, 0.75])
    spread = 1.5 * (q3 - q1)
    df_daily[col] = df_daily[col].clip(lower=q1 - spread, upper=q3 + spread)

In [None]:
# STEP 9: Add extra features
# Calendar fields derived from the datetime index.
df_daily = df_daily.assign(
    dayofweek=df_daily.index.dayofweek,
    month=df_daily.index.month,
    year=df_daily.index.year,
)

In [None]:
# STEP 10: Save cleaned dataset
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (a no-op when it is already there).
output_path = Path("../data/processed/air_quality_cleaned.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_daily.to_csv(output_path)
print("✅ Cleaned dataset saved to data/processed/air_quality_cleaned.csv")


In [None]:
# ===============================================================
# EDA (Exploratory Data Analysis)
# ===============================================================

# 1. Line plots for pollutants
# Loop over df_pivot's columns and draw each matching df_daily series
# on one shared axes.
fig, ax = plt.subplots(figsize=(12,6))
for pollutant in df_pivot.columns:
    df_daily[pollutant].plot(ax=ax, label=pollutant)
ax.set_title("Pollutant Trends Over Time")
ax.legend()
plt.show()

In [None]:
# 2. Correlation heatmap
# Pairwise correlations across every df_daily column, annotated with their values.
corr_matrix = df_daily.corr()
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Between Pollutants and Features")
plt.show()


In [None]:
# 3. Seasonal analysis: Monthly averages (example PM2.5 if exists)
if "PM2.5" in df_daily.columns:
    # Mean PM2.5 per calendar month number, pooled across all years in the data.
    monthly_pm25 = df_daily.groupby(df_daily.index.month)["PM2.5"].mean()
    ax = monthly_pm25.plot(
        kind="bar", figsize=(8,4), title="Average Monthly PM2.5 Levels"
    )
    # Label the axes explicitly: the group keys are derived from the datetime
    # index, so the automatic x-label would not say "Month".
    ax.set_xlabel("Month")
    ax.set_ylabel("Mean PM2.5")
    plt.show()

In [None]:
# 4. Weekly pattern: Day of week averages (example PM2.5 if exists)
if "PM2.5" in df_daily.columns:
    # Mean PM2.5 per weekday number as returned by the index's dayofweek.
    weekday_pm25 = df_daily.groupby(df_daily.index.dayofweek)["PM2.5"].mean()
    ax = weekday_pm25.plot(
        kind="bar", figsize=(8,4), title="Average PM2.5 by Day of Week (0=Monday)"
    )
    # Label the axes explicitly: the group keys are derived from the datetime
    # index, so the automatic x-label would not say "Day of week".
    ax.set_xlabel("Day of week")
    ax.set_ylabel("Mean PM2.5")
    plt.show()


In [None]:
# 5. Dataset summary
# Shape first, then per-column descriptive statistics.
print(f"Final cleaned dataset shape: {df_daily.shape}")
summary_stats = df_daily.describe()
print(summary_stats)