In [8]:
import os
import pandas as pd
import numpy as np
import sub_index
# import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

In [9]:
# Folder the per-station input CSVs are read from (relative path, resolved
# against the notebook's working directory).
data_dir = "../data/cleaneddata/"
# Folder the generated hourly/daily AQI CSVs are written to.
output_dir = "../data/AQIdata/"
# Create the output folder up front; exist_ok=True means re-running this cell
# does not fail when the folder already exists.
os.makedirs(output_dir, exist_ok=True)

In [10]:
# Stations to process. Each name is used to build the input file name
# "<station>.csv" inside data_dir and the "<station>_*_AQI.csv" output names.
stations = ["Chennai", "Delhi", "Koonimedu","Mawlynnong"]

In [11]:
from sub_index import get_PM25_subindex, get_PM10_subindex, get_CO_subindex, get_O3_subindex, get_SO2_subindex, get_NOx_subindex

In [12]:
# Function to calculate hourly AQI
def calculate_hourly_aqi(df, sub_indices=None, verbose=True):
    """Add per-pollutant sub-index columns and an overall ``AQI`` column.

    For every mapped pollutant column present in ``df`` the values are coerced
    to numeric, converted to a sub-index one value at a time, and stored in a
    new ``"<column>_subindex"`` column. ``AQI`` is the row-wise maximum over
    all columns whose name contains ``"subindex"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of pollutant concentrations. Mapped columns that are absent are
        skipped. The caller's frame is not modified; a copy is returned.
    sub_indices : dict, optional
        Maps a concentration column name to a callable that turns a single
        concentration value into a sub-index. When omitted, the six pollutant
        columns and the helpers imported from ``sub_index`` are used.
    verbose : bool, default True
        When true, print the first rows of the result as a sanity check.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the mapped columns coerced to numeric, the
        sub-index columns, and the ``AQI`` column added.
    """
    if sub_indices is None:
        sub_indices = {
            "pm2_5 (μg/m³)": get_PM25_subindex,
            "pm10 (μg/m³)": get_PM10_subindex,
            "carbon_monoxide (μg/m³)": get_CO_subindex,
            "ozone (μg/m³)": get_O3_subindex,
            "sulphur_dioxide (μg/m³)": get_SO2_subindex,
            "nitrogen_dioxide (μg/m³)": get_NOx_subindex,
        }

    # Work on a copy so the caller's DataFrame is never changed behind its back.
    result = df.copy()

    for pollutant, func in sub_indices.items():
        if pollutant not in result.columns:
            continue
        # Unparseable readings become NaN instead of raising.
        result[pollutant] = pd.to_numeric(result[pollutant], errors="coerce")
        # na_action="ignore" skips missing readings without calling func and
        # leaves them as NaN (not None), so an all-missing pollutant still
        # yields a numeric sub-index column.
        result[f"{pollutant}_subindex"] = result[pollutant].map(func, na_action="ignore")

    # The overall AQI of a row is its worst (largest) sub-index; rows with no
    # sub-index at all get NaN.
    subindex_cols = [col for col in result.columns if "subindex" in col]
    result["AQI"] = result[subindex_cols].max(axis=1)

    if verbose:
        print(result.head())  # Debugging: first few rows to check calculations

    return result

In [13]:
# Function to calculate daily AQI from calendar-day mean concentrations
def calculate_daily_aqi(df):
    """Average the readings per calendar day and compute AQI on the daily means.

    The frame is indexed by its ``time`` column, resampled to daily bins
    (``'D'``) and averaged; this is a per-day mean, not a rolling 24-hour
    window. The daily table is then passed to ``calculate_hourly_aqi``, which
    adds the sub-index and ``AQI`` columns.

    NOTE(review): every pollutant uses the same daily mean; confirm that this
    matches the averaging periods required by the AQI standard in use.

    Parameters
    ----------
    df : pandas.DataFrame
        Readings with a datetime ``time`` column (resampling requires a
        datetime-like index). The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per calendar day with ``time``, the mean of every numeric
        column, the sub-index columns and ``AQI``. Columns that hold no
        numeric data are dropped.
    """
    # set_index returns a new frame, so the edits below never touch `df`.
    readings = df.set_index("time")

    # Text-typed columns (e.g. numbers read as strings because of a stray
    # token) must be made numeric *before* averaging: .mean() raises on object
    # columns in pandas >= 2.0.
    for col in readings.columns:
        if readings[col].dtype == object:
            coerced = pd.to_numeric(readings[col], errors="coerce")
            if coerced.notna().any():
                readings[col] = coerced

    # Anything still non-numeric (pure text columns) cannot be averaged.
    readings = readings.select_dtypes(include=["number", "bool"])

    df_daily = readings.resample("D").mean().reset_index()

    return calculate_hourly_aqi(df_daily)

In [14]:
# Run the hourly and daily AQI calculations for every configured station
for station in stations:
    input_file = os.path.join(data_dir, f"{station}.csv")

    # Guard clause: report a missing input file and move on to the next station.
    if not os.path.exists(input_file):
        print(f"Data file for {station} not found!")
        continue

    # 'time' is parsed as datetime on load; rows are then put in chronological order.
    df = pd.read_csv(input_file, parse_dates=["time"]).sort_values(by="time")

    # Debugging: show the first rows to confirm the data loaded properly.
    print(f"Processing {station} - First few rows:")
    print(df.head())

    # Hourly AQI, computed on a copy so `df` stays raw for the daily pass.
    hourly_aqi_df = calculate_hourly_aqi(df.copy())
    hourly_aqi_df.to_csv(
        os.path.join(output_dir, f"{station}_hourly_AQI.csv"),
        index=False,
    )

    # Daily AQI; the resampled 'time' column is exported under the name 'date'.
    daily_aqi_df = calculate_daily_aqi(df.copy()).rename(columns={"time": "date"})
    daily_aqi_df.to_csv(
        os.path.join(output_dir, f"{station}_daily_AQI.csv"),
        index=False,
    )

    print(f"AQI computed and saved for {station}")

Processing Chennai - First few rows:
                 time  pm10 (μg/m³)  pm2_5 (μg/m³)  carbon_monoxide (μg/m³)  \
0 2022-09-01 00:00:00     21.691429      14.634286               303.028571   
1 2022-09-01 01:00:00     20.940000      14.240000               292.028571   
2 2022-09-01 02:00:00     18.751429      12.802857               270.285714   
3 2022-09-01 03:00:00     15.928571      10.820000               247.228571   
4 2022-09-01 04:00:00     14.751429       9.914286               226.571429   

   nitrogen_dioxide (μg/m³)  sulphur_dioxide (μg/m³)  ozone (μg/m³)  
0                 25.082857                30.522857      12.342857  
1                 23.211429                30.034286      15.085714  
2                 19.668571                29.022857      20.000000  
3                 16.268571                27.911429      24.742857  
4                 13.840000                26.925714      27.914286  
                 time  pm10 (μg/m³)  pm2_5 (μg/m³)  carbon_monoxide 