# Exploratory Data Analysis (EDA) - Karachi AQI

This notebook performs EDA on the historical air quality data for Karachi.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Append the parent of the current working directory to sys.path so that the
# `src` package imported below can be resolved.
# NOTE(review): this relies on the notebook being run from a directory that
# sits directly under the one containing `src` — confirm the launch directory.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.database.mongodb_client import MongoDBClient

## 1. Load Data from MongoDB

In [None]:
# Pull the records through the project's MongoDB client and load them into a
# DataFrame. The fetch is wrapped in try/finally so that close() is always
# called, even when fetch_all() raises.
db_client = MongoDBClient()
try:
    data = db_client.fetch_all()
finally:
    db_client.close()

df = pd.DataFrame(data)
print(f"Loaded {len(df)} records.")
# Last expression of the cell: the notebook renders the first rows.
df.head()

## 2. Data Overview

In [None]:
# Print a concise summary of df: column names, dtypes, non-null counts and
# memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max). With the
# default arguments pandas restricts this to numeric columns when any exist.
df.describe()

## 3. AQI Distribution

In [None]:
# Histogram of the 'aqi' column (30 bins) with a kernel density estimate
# overlaid, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['aqi'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of AQI')
ax.set_xlabel('AQI')
ax.set_ylabel('Frequency')
plt.show()

## 4. Correlation Matrix

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
# include='number' selects every numeric dtype (int32, float32, unsigned
# ints, ...) rather than only float64/int64, so numeric columns stored with a
# narrower dtype are not silently left out of the matrix.
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(12, 10))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

## 5. Time Series Analysis

In [None]:
# Line plot of AQI against time; the whole cell is a no-op when df has no
# 'timestamp' column.
if 'timestamp' in df.columns:
    # 'timestamp' is parsed as seconds since the Unix epoch (unit='s').
    df['dt'] = pd.to_datetime(df['timestamp'], unit='s')
    # Rebind df to a chronologically ordered copy so the line is drawn in order.
    df = df.sort_values(by='dt')

    fig, ax = plt.subplots(figsize=(14, 6))
    ax.plot(df['dt'], df['aqi'], label='AQI')
    ax.set(title='AQI over Time', xlabel='Date', ylabel='AQI')
    ax.legend()
    plt.show()