In [3]:
# Cell 1: Setup
"""
Air Quality Index - Exploratory Data Analysis
Notebook 1: Understanding the Data
"""

# First, install the required packages if they're not already installed
# Uncomment and run these lines if you need to install the packages
!pip install numpy pandas matplotlib seaborn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("✅ Libraries imported successfully!")

Collecting matplotlib
  Downloading matplotlib-3.10.7-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.60.1-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (112 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.9-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (6.3 kB)
Collecting pyparsing>=3 (from matplotlib)
  Downloading pyparsing-3.2.5-py3-none-any.whl.metadata (5.0 kB)
Downloading matplotlib-3.10.7-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (8.7 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m789.7 kB/s[0m  [33m0:00:11[0m

In [4]:
# Cell 2: Load Data
"""
Load the dataset for analysis
"""

import sys
from pathlib import Path

# `src` is a project-local package. When the notebook is started from a
# sub-directory, the project root is not on sys.path and the import below
# raises ModuleNotFoundError. Walk up from the current working directory and
# register the first folder that contains a `src/` directory.
# NOTE(review): assumes the project keeps its helpers in `<root>/src/` - confirm.
_cwd = Path.cwd().resolve()
for _candidate in (_cwd, *_cwd.parents):
    if (_candidate / 'src').is_dir():
        if str(_candidate) not in sys.path:
            sys.path.insert(0, str(_candidate))
        break

# Option 1: Generate synthetic data for testing
from src.utils import generate_synthetic_aqi_data
df = generate_synthetic_aqi_data(n_samples=1000)

# Option 2: Load real data (uncomment when you have it)
# df = pd.read_csv('data/raw/aqi_data.csv')

print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows:")
# Last expression of the cell: rendered by Jupyter as a rich table.
df.head()

ModuleNotFoundError: No module named 'src'

In [None]:
# Cell 3: Basic Information
"""
Inspect the DataFrame: dimensions, column names, dtypes and memory footprint
"""

banner = "=" * 60
n_rows, n_cols = df.shape

# Section header
print(banner)
print("DATASET INFORMATION")
print(banner)

# Dimensions
print(f"\nShape: {df.shape}")
print(f"Rows: {n_rows}")
print(f"Columns: {n_cols}")

# Each heading is followed by its value on the next line
print("\nColumn Names:", df.columns.tolist(), sep="\n")
print("\nData Types:", df.dtypes, sep="\n")
# deep=True also measures the contents of object columns
print("\nMemory Usage:", df.memory_usage(deep=True), sep="\n")


In [None]:
# Cell 4: Statistical Summary
"""
Descriptive statistics of the dataset
"""

# Three-line section header emitted with a single call
print("=" * 60, "STATISTICAL SUMMARY", "=" * 60, sep="\n")

# Last expression of the cell, so Jupyter renders the summary table
df.describe()

In [None]:
# Cell 5: Missing Values
"""
Check for missing values
"""

print("="*60)
print("MISSING VALUES ANALYSIS")
print("="*60)

# Per-column count of nulls and the share of rows they represent
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100

missing_df = pd.DataFrame({
    'Column': missing.index,
    'Missing_Count': missing.values,
    'Percentage': missing_pct.values
}).sort_values('Missing_Count', ascending=False)

print(missing_df)

# Visualize only the columns that actually contain missing values
columns_with_missing = missing_df[missing_df['Missing_Count'] > 0]

if columns_with_missing.empty:
    # Nothing to draw: plotting an empty frame would fail or show a blank chart
    print("\nNo missing values found - nothing to plot.")
else:
    # Create the axes explicitly and hand them to pandas. Without `ax=`,
    # DataFrame.plot opens its own figure and a separately created
    # plt.figure() is left behind as an empty plot.
    fig, ax = plt.subplots(figsize=(10, 6))
    columns_with_missing.plot(
        x='Column', y='Percentage', kind='bar', color='coral', rot=45, ax=ax
    )
    ax.set_title('Missing Values by Column', fontsize=14, fontweight='bold')
    ax.set_ylabel('Percentage Missing (%)')
    ax.set_xlabel('Column')
    fig.tight_layout()
    plt.show()


In [None]:
# Cell 6: Distribution Analysis
"""
Histogram overview of the numerical features
"""

print("=" * 60, "DISTRIBUTION ANALYSIS", "=" * 60, sep="\n")

# One histogram panel per numerical column
hist_options = dict(figsize=(15, 10), bins=30, edgecolor='black')
df.hist(**hist_options)

# Figure-level title, then tidy the layout and render
title_options = dict(fontsize=16, fontweight='bold', y=1.00)
plt.suptitle('Distribution of All Features', **title_options)
plt.tight_layout()
plt.show()

In [None]:
# Cell 7: Target Variable Analysis
"""
Summarise and plot the target variable (AQI)
"""

print("=" * 60, "TARGET VARIABLE (AQI) ANALYSIS", "=" * 60, sep="\n")

# Headline statistics, printed with two decimals each
aqi = df['AQI']
aqi_mean = aqi.mean()
aqi_stats = (
    ("Mean AQI", aqi_mean),
    ("Median AQI", aqi.median()),
    ("Std Dev", aqi.std()),
    ("Min AQI", aqi.min()),
    ("Max AQI", aqi.max()),
)
for stat_label, stat_value in aqi_stats:
    print(f"{stat_label}: {stat_value:.2f}")

# Derive a categorical label for every AQI reading (project helper);
# the new column is stored on df.
from src.utils import categorize_aqi
df['AQI_Category'] = aqi.apply(categorize_aqi)
category_counts = df['AQI_Category'].value_counts()

print("\nAQI Category Distribution:")
print(category_counts)

# Three side-by-side panels: histogram, box plot, category shares
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
hist_ax, box_ax, pie_ax = axes

# Panel 1 - histogram with the mean marked
hist_ax.hist(aqi, bins=30, color='steelblue', edgecolor='black')
hist_ax.axvline(aqi_mean, color='red', linestyle='--', linewidth=2, label='Mean')
hist_ax.set(xlabel='AQI Value', ylabel='Frequency', title='AQI Distribution')
hist_ax.legend()
hist_ax.grid(alpha=0.3)

# Panel 2 - vertical box plot
box_ax.boxplot(aqi, vert=True)
box_ax.set(ylabel='AQI Value', title='AQI Box Plot')
box_ax.grid(alpha=0.3)

# Panel 3 - share of each category
pie_ax.pie(
    category_counts.values,
    labels=category_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
pie_ax.set_title('AQI Categories')

plt.tight_layout()
plt.show()


In [None]:
# Cell 8: Correlation Analysis (Fixed)
"""
Analyze correlations between features
"""

print("="*60)
print("CORRELATION ANALYSIS")
print("="*60)

# Select every numeric column. 'number' covers all integer/float widths
# (float32, int32, ...), not just float64/int64, so no feature is dropped
# merely because of its storage type.
numerical_cols = df.select_dtypes(include='number').columns.tolist()
if len(numerical_cols) < 2:
    print("Not enough numerical columns for correlation analysis")
else:
    # Best-effort cell: any failure is reported as a message instead of a
    # traceback so the rest of the notebook can still be run.
    try:
        # Calculate correlation matrix
        correlation_matrix = df[numerical_cols].corr()

        # Display correlations with AQI (if exists).
        # `aqi_correlations` is read again by the "Key Insights" cell.
        if 'AQI' in numerical_cols:
            aqi_correlations = correlation_matrix['AQI'].sort_values(ascending=False)
            print("\nCorrelations with AQI:")
            print(aqi_correlations)

        # Visualize correlation matrix; the mask hides the redundant upper
        # triangle (including the diagonal) of the symmetric matrix.
        plt.figure(figsize=(10, 8))
        mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
        sns.heatmap(correlation_matrix, mask=mask, annot=True, fmt=".2f",
                    cmap='coolwarm', linewidths=0.5, cbar_kws={"shrink": .8})
        plt.title('Feature Correlation Matrix', fontsize=14, fontweight='bold')
        plt.tight_layout()
        plt.show()

        # Identify highly correlated features: scan the strict lower triangle
        # so each pair is reported once and self-correlations are skipped.
        high_corr_threshold = 0.7
        columns = correlation_matrix.columns
        high_corr_features = [
            (columns[i], columns[j], correlation_matrix.iloc[i, j])
            for i in range(len(columns))
            for j in range(i)
            if abs(correlation_matrix.iloc[i, j]) > high_corr_threshold
        ]

        if high_corr_features:
            # Report the threshold actually used instead of a hard-coded value
            print(f"\nHighly correlated feature pairs (|r| > {high_corr_threshold}):")
            for feat1, feat2, corr in high_corr_features:
                print(f"{feat1} — {feat2}: {corr:.3f}")
        else:
            print(f"\nNo feature pairs with |r| > {high_corr_threshold}")
    except Exception as e:
        print(f"Error in correlation analysis: {e}")

In [None]:
# Cell 9: Summary and Next Steps
"""
Recap of what the notebook covered and what should follow
"""
print("=" * 60, "SUMMARY AND NEXT STEPS", "=" * 60, sep="\n")

findings = [
    "Dataset shape and structure analyzed",
    "Missing values identified and visualized",
    "Distribution of features examined",
    "AQI distribution and categories analyzed",
    "Feature correlations with AQI identified",
]

next_steps = [
    "Handle missing values",
    "Address outliers if necessary",
    "Feature engineering based on correlations",
    "Prepare data for modeling",
    "Develop predictive models for AQI",
]

# Both lists are printed as 1-based numbered items
print("Key Findings:")
for number, finding in enumerate(findings, start=1):
    print(f"{number}. {finding}")

print("\nNext Steps:")
for number, step in enumerate(next_steps, start=1):
    print(f"{number}. {step}")

In [None]:
# Cell 10: Feature Relationships
"""
Visualize relationships between key features and AQI
"""

print("="*60)
print("FEATURE RELATIONSHIPS WITH AQI")
print("="*60)

# Key pollutants vs AQI
pollutants = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# Indices of the panels that actually received a plot
used_axes = set()

for idx, pollutant in enumerate(pollutants):
    if pollutant not in df.columns:
        continue

    ax = axes[idx]
    used_axes.add(idx)

    ax.scatter(df[pollutant], df['AQI'], alpha=0.5, s=20)
    ax.set_xlabel(pollutant)
    ax.set_ylabel('AQI')
    ax.set_title(f'{pollutant} vs AQI')
    ax.grid(alpha=0.3)

    # Add trend line. np.polyfit cannot handle NaNs, so fit only on rows where
    # both values are present, and skip the fit when there are fewer than two
    # distinct x values (a line is not defined in that case).
    paired = df[[pollutant, 'AQI']].dropna()
    if paired[pollutant].nunique() >= 2:
        slope, intercept = np.polyfit(paired[pollutant], paired['AQI'], 1)
        # A straight line only needs its two end points
        x_ends = np.array([paired[pollutant].min(), paired[pollutant].max()])
        ax.plot(x_ends, slope * x_ends + intercept, "r--", alpha=0.8, linewidth=2)

# Remove every panel that stayed empty: the spare sixth one and any panel
# whose pollutant column is absent from the dataset.
for idx, ax in enumerate(axes):
    if idx not in used_axes:
        fig.delaxes(ax)

plt.suptitle('Pollutant Relationships with AQI', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Cell 11: Key Insights
"""
Summary of Key Findings

Relies on state produced by earlier cells: the `AQI_Category` column added in
the target-variable cell and `aqi_correlations` from the correlation cell.
"""

print("="*60)
print("KEY INSIGHTS FROM EDA")
print("="*60)

# Neither the target nor the category label derived from it is an input
# feature, so both are excluded from the feature count.
target_columns = ['AQI', 'AQI_Category']
n_features = sum(col not in target_columns for col in df.columns)

# Rank by absolute correlation so strong negative relationships are not
# hidden, and drop AQI's trivial self-correlation of 1.0.
strongest_correlations = (
    aqi_correlations
    .drop(labels='AQI', errors='ignore')
    .sort_values(key=lambda s: s.abs(), ascending=False)
    .head(5)
)

insights = f"""
1. DATASET OVERVIEW:
   - Total samples: {len(df)}
   - Features: {n_features}
   - Target variable: AQI (range: {df['AQI'].min():.1f} - {df['AQI'].max():.1f})

2. DATA QUALITY:
   - Missing values: {df.isnull().sum().sum()}
   - Duplicate rows: {df.duplicated().sum()}

3. AQI DISTRIBUTION:
   - Mean: {df['AQI'].mean():.2f}
   - Most common category: {df['AQI_Category'].mode()[0]}
   
4. STRONGEST CORRELATIONS WITH AQI:
{strongest_correlations.to_string()}

5. NEXT STEPS:
   - Handle outliers if necessary
   - Create additional features
   - Prepare data for modeling
"""

print(insights)

In [None]:
# Cell 12: Save Processed Data
"""
Save the explored dataset (including the derived AQI_Category column, if it
was added) for the modeling notebooks
"""

from pathlib import Path

# Save to processed folder. DataFrame.to_csv does not create missing
# directories, so make sure the target folder exists first.
output_path = Path('data/processed/aqi_data_explored.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)
# as_posix() keeps the forward-slash form of the path on every platform
print(f"✅ Explored data saved to {output_path.as_posix()}")