# 🌍 Exploratory Data Analysis (EDA) - AQI Prediction

This notebook covers the exploratory data analysis for the Air Quality Index (AQI) prediction system. We analyze historical AQI data, pollutant distributions, temporal patterns, and feature correlations.


In [None]:
import hopsworks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
from dotenv import load_dotenv
import warnings

# Suppress every warning so library chatter does not clutter cell output.
# NOTE(review): this blanket filter also hides deprecation warnings.
warnings.filterwarnings('ignore')
# Load environment variables via python-dotenv; HOPSWORKS_API_KEY is read
# from the environment later when logging in.
load_dotenv()

# Set plot style
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    # matplotlib raises OSError for a style name it does not know (e.g. a
    # release without the 'seaborn-v0_8-*' names). Catching only that keeps
    # KeyboardInterrupt and genuine bugs visible instead of swallowing them.
    plt.style.use('default')

sns.set_palette("husl")

## 1. Load Data from Hopsworks
We fetch the latest data from the feature group `aqi_feature_group` version 1.

In [None]:
# Log in to the Hopsworks project; the API key is looked up in the environment.
project = hopsworks.login(
    api_key_value=os.getenv("HOPSWORKS_API_KEY"),
    project="aqi_predicton",
)
fs = project.get_feature_store()

# Read version 1 of the feature group, parse the timestamp column and
# put the rows in chronological order.
fg = fs.get_feature_group(name="aqi_feature_group", version=1)
df = (
    fg.read()
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .sort_values('datetime')
)

n_rows, n_cols = df.shape[0], df.shape[1]
print(f"Data Loaded: {n_rows} rows, {n_cols} columns")
df.head()

## 2. Basic Statistics
Overview of the dataset structure and summary statistics.

In [None]:
# Report the time span covered by the data, then show the summary
# statistics transposed (one row per column of df).
first_timestamp = df['datetime'].min()
last_timestamp = df['datetime'].max()
print(f"Date Range: {first_timestamp} to {last_timestamp}")
df.describe().T

In [None]:
# Print pandas' concise frame summary (column dtypes and non-null counts)
# to spot missing values and unexpected types before plotting.
df.info()

## 3. Univariate Analysis
Analyzing the distribution of the target variable (AQI) and key pollutants.

In [None]:
# Histogram of the target variable with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['aqi'], bins=20, kde=True, ax=ax)
ax.set(
    title='AQI Distribution in Karachi',
    xlabel='AQI Value',
    ylabel='Frequency',
)
plt.show()

In [None]:
pollutants = ['co', 'no2', 'o3', 'so2', 'pm2_5', 'pm10', 'nh3']

# Reshape to long format (one row per pollutant reading) so seaborn can
# draw one box per pollutant from a single frame.
df_melted = df.melt(
    value_vars=pollutants,
    var_name='Pollutant',
    value_name='Concentration',
)

fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=df_melted, x='Pollutant', y='Concentration', ax=ax)
ax.set_title('Pollutant Concentration Distributions')
plt.xticks(rotation=45)
ax.set_yscale('log')  # Log scale to handle varying ranges
ax.set_ylabel('Concentration (Log Scale)')
plt.show()

## 4. Temporal Analysis
Analyzing trends over time: the average AQI by hour of day and the trend over the most recent 30 days.

In [None]:
# Hourly Pattern
# Mean AQI for each value of the 'hour' column.
hourly_avg = df.groupby('hour')['aqi'].mean()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(hourly_avg.index, hourly_avg.values, marker='o', linewidth=2)
ax.set(
    title='Average AQI by Hour of Day',
    xlabel='Hour (0-23)',
    ylabel='Average AQI',
)
ax.set_xticks(range(0, 24))
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Recent Trend (Last 30 Days)
RECENT_WINDOW_DAYS = 30

# Select the window by timestamp rather than by row count: a fixed
# tail(720) only equals 30 days when the feed is strictly hourly with no
# gaps or duplicate readings, whereas a time cutoff always matches the title.
window_start = df['datetime'].max() - pd.Timedelta(days=RECENT_WINDOW_DAYS)
recent_data = df.loc[df['datetime'] > window_start].sort_values('datetime')

plt.figure(figsize=(15, 6))
plt.plot(recent_data['datetime'], recent_data['aqi'], label='AQI', alpha=0.8)
plt.title(f'AQI Trend - Last {RECENT_WINDOW_DAYS} Days')
plt.xlabel('Date')
plt.ylabel('AQI')
plt.legend()
plt.xticks(rotation=45)
plt.show()

## 5. Correlation Analysis
Relationships between different pollutants and the AQI.

In [None]:
# Pairwise correlations between AQI, the pollutant columns and the
# hour/month columns, rendered as an annotated heatmap.
features = ['aqi', 'co', 'no2', 'o3', 'so2', 'pm2_5', 'pm10', 'nh3', 'hour', 'month']
corr_matrix = df[features].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    fmt='.2f',
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

## 6. Health Risk Analysis
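Categorizing AQI values on the OpenWeather 1–5 index scale. Note that the category labels used below are custom health-risk names; OpenWeather's own labels for levels 1–5 are Good, Fair, Moderate, Poor, and Very Poor.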

In [None]:
def get_aqi_category(aqi):
    """Map an AQI reading to its health-risk category label.

    Args:
        aqi: Numeric AQI value. Each threshold is an inclusive upper bound
            (<= 1, <= 2, <= 3, <= 4); anything larger is "Hazardous".

    Returns:
        One of "Good", "Moderate", "Unhealthy (Sensitive)", "Unhealthy" or
        "Hazardous", or None when the reading is missing (None/NaN).
    """
    # NaN fails every `<=` comparison, so without this guard a missing
    # reading would fall through to the final branch and be counted as
    # "Hazardous".
    if pd.isna(aqi):
        return None

    category_bounds = (
        (1, "Good"),
        (2, "Moderate"),
        (3, "Unhealthy (Sensitive)"),
        (4, "Unhealthy"),
    )
    for upper_bound, label in category_bounds:
        if aqi <= upper_bound:
            return label
    return "Hazardous"

# Label every row with its category, then count rows per category with the
# bars laid out from least to most severe.
df['aqi_category'] = df['aqi'].apply(get_aqi_category)

category_order = ["Good", "Moderate", "Unhealthy (Sensitive)", "Unhealthy", "Hazardous"]
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    data=df,
    x='aqi_category',
    order=category_order,
    palette="viridis",
    ax=ax,
)
ax.set(
    title='Distribution of Air Quality Categories',
    xlabel='Category',
    ylabel='Count',
)
plt.show()