# Exploratory Data Analysis (EDA) and Model Prototyping

This notebook covers the initial exploration of the Urban Air Quality dataset and prototypes a simple model for predicting PM2.5 concentration (`pm25_ugm3`).

## 1. Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

# Load the air-quality measurements into a DataFrame.
df = pd.read_csv('../data/air_quality_global.csv')

# Load the accompanying metadata. The encoding is given explicitly because
# open() otherwise falls back to the platform's locale encoding, which is not
# UTF-8 on every system, while JSON exchanged between systems is UTF-8.
with open('../data/metadata.json', 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# Column dtypes, non-null counts and memory usage (df.info() prints directly).
print('Dataset Info:')
df.info()

# Top-level keys available in the metadata document.
print('\nMetadata Keys:')
print(metadata.keys())

## 2. Initial Data Exploration

In [None]:
# Peek at the leading rows of the loaded frame.
print('First 5 rows of the dataset:')
head_rows = df.head()
display(head_rows)

# Summary statistics as produced by DataFrame.describe().
print('\nDescriptive Statistics:')
summary_stats = df.describe()
display(summary_stats)

### Checking for Missing Values

In [None]:
# Count nulls per column, then express each count as a percentage of all rows.
missing_values = df.isna().sum()
missing_percent = missing_values.div(len(df)).mul(100)

# The sorted table is left as the cell's final expression so it is rendered,
# ordered from the most- to the least-affected column.
missing_summary = pd.DataFrame({'count': missing_values, 'percentage': missing_percent})
missing_summary.sort_values(by='percentage', ascending=False)

## 3. Data Visualization

In [None]:
# Use seaborn's 'whitegrid' style for the plots.
sns.set_style('whitegrid')

# Line chart of the mean PM2.5 value for each year, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 6))
yearly_pm25 = df.groupby('year')['pm25_ugm3'].mean()
yearly_pm25.plot(kind='line', marker='o', ax=ax)
ax.set_title('Average PM2.5 Concentration Over Years')
ax.set_ylabel('PM2.5 (µg/m³)')
ax.set_xlabel('Year')
ax.grid(True)
plt.show()

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
fig, ax = plt.subplots(figsize=(10, 8))
numeric_cols = df.select_dtypes(include=np.number).columns
corr_matrix = df[numeric_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix of Numeric Features')
plt.show()

## 4. Model Prototyping

Let's build a simple model to predict `pm25_ugm3`.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Define features and target
features = ['latitude', 'longitude', 'year', 'month', 'no2_ugm3']
target = 'pm25_ugm3'

# Keep only rows with a known target; copy so `df` itself is never modified.
model_df = df.dropna(subset=[target]).copy()

# NOTE: X (and model_df) are deliberately left un-imputed here. Missing
# feature values are filled only after the split, on X_train / X_test below.
X = model_df[features]
y = model_df[target]

# Split BEFORE imputing so that no statistic computed from the held-out rows
# can influence the training features (avoids train/test leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the per-feature medians from the training split only, then apply those
# same medians to the test split. The arrays returned by the imputer are
# wrapped back into DataFrames so column names and row indices are preserved.
imputer = SimpleImputer(strategy='median')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=features, index=X_train.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=features, index=X_test.index)

# Train a simple model (seeded for reproducibility, using all available cores).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse:.4f}')
print(f'R-squared Score: {r2:.4f}')

## 5. Feature Importance

In [None]:
# Pair each feature name with the importance score reported by the fitted
# forest, ordered from the highest score to the lowest.
feature_importances = pd.DataFrame(
    {'feature': features, 'importance': rf_model.feature_importances_}
).sort_values(by='importance', ascending=False)

# Horizontal bar chart of the ranked importances on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importances, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importances for PM2.5 Prediction')
plt.show()