In [26]:
import os
import pandas as pd
import numpy as np
import sub_index
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Folder holding the per-station "<station>_preprocessed.csv" inputs read by the processing loop.
data_dir = "../data/preprocessed/"
# Folder that receives the per-station "<station>_hourly_aqi.csv" results.
output_dir = "../data/AQIdata/"
# Create the output folder if it is missing; exist_ok=True keeps re-runs of this cell harmless.
os.makedirs(output_dir, exist_ok=True)

In [28]:
# Station names to process; each one is looked up as "<station>_preprocessed.csv"
# inside data_dir, and stations without a file are reported and skipped.
stations = [
    "Chennai",
    "Delhi",
    "Koonimedu",
    "Mawlynnong",
]

In [29]:
from sub_index import get_PM25_subindex, get_PM10_subindex, get_CO_subindex, get_O3_subindex, get_SO2_subindex, get_NOx_subindex

In [30]:
import os
import pandas as pd
import numpy as np

# Process each station separately.
#
# For every name in `stations` this cell reads "<station>_preprocessed.csv" from
# `data_dir`, derives one sub-index per pollutant with the helpers imported from
# `sub_index`, takes the row-wise maximum of those sub-indices as the AQI, and
# writes the enriched frame to "<station>_hourly_aqi.csv" in `output_dir`.
# Stations whose input file is missing are reported and skipped.

# Measurement columns that must be numeric before the sub-index helpers see them.
pollutant_columns = [
    "pm2_5 (μg/m³)",
    "pm10 (μg/m³)",
    "carbon_monoxide (μg/m³)",
    "ozone (μg/m³)",
    "sulphur_dioxide (μg/m³)",
    "nitrogen_dioxide (μg/m³)",
]

# Sub-index columns in the order they are created (and shown in the statistics table).
subindex_columns = [
    "PM2.5_SubIndex",
    "PM10_SubIndex",
    "CO_SubIndex",
    "Ozone_SubIndex",
    "SO2_SubIndex",
    "NOx_SubIndex",
]

# Column order used when listing suspiciously high AQI rows.
high_aqi_columns = [
    "PM2.5_SubIndex",
    "PM10_SubIndex",
    "NOx_SubIndex",
    "CO_SubIndex",
    "Ozone_SubIndex",
    "SO2_SubIndex",
    "AQI",
]

for station in stations:
    file_path = os.path.join(data_dir, f"{station}_preprocessed.csv")
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    # Load data
    df = pd.read_csv(file_path)

    print(f"\n==== {station.upper()} - RAW DATA HEAD ====")
    print(df.head())

    # Parse the timestamp column and use it as the index.
    df["time"] = pd.to_datetime(df["time"])
    df = df.set_index("time")

    # Coerce only the pollutant measurements to numbers (unparseable values
    # become NaN). Coercing every column would also wipe text columns such as
    # "City", turning them into all-NaN columns in the saved CSV.
    for column in pollutant_columns:
        df[column] = pd.to_numeric(df[column], errors="coerce")

    print(f"\n==== {station.upper()} - DATA INFO ====")
    # DataFrame.info() prints its report itself and returns None, so it must not
    # be wrapped in print() (that would emit a stray "None" line).
    df.info()

    # AQI Sub-Index Calculation
    df["PM2.5_SubIndex"] = df["pm2_5 (μg/m³)"].apply(get_PM25_subindex)
    df["PM10_SubIndex"] = df["pm10 (μg/m³)"].apply(get_PM10_subindex)
    # CO is divided by 1000 (μg/m³ -> mg/m³) before the lookup; presumably
    # get_CO_subindex expects mg/m³ — confirm against sub_index.
    df["CO_SubIndex"] = (df["carbon_monoxide (μg/m³)"] / 1000).apply(get_CO_subindex)
    df["Ozone_SubIndex"] = df["ozone (μg/m³)"].apply(get_O3_subindex)
    df["SO2_SubIndex"] = df["sulphur_dioxide (μg/m³)"].apply(get_SO2_subindex)
    df["NOx_SubIndex"] = df["nitrogen_dioxide (μg/m³)"].apply(get_NOx_subindex)

    # Debug sub-index values
    print(f"\n==== {station.upper()} - SUB-INDEX STATISTICS ====")
    print(df[subindex_columns].describe())

    # AQI Calculation (Max Sub-Index Method)
    # "Checks" counts, per row, how many pollutants produced a positive
    # sub-index; NaN and zero sub-indices do not count.
    df["Checks"] = df[subindex_columns].gt(0).sum(axis=1)
    df["AQI"] = df[subindex_columns].max(axis=1)

    # Remove invalid AQI values (where less than 3 pollutants are available)
    # NOTE(review): the CPCB method also appears to require PM2.5 or PM10 to be
    # among the available pollutants; that is not enforced here — confirm
    # whether it should be.
    df.loc[df["Checks"] < 3, "AQI"] = np.nan

    # Debug high AQI values
    print(f"\n⚠️ Debugging High AQI Values for {station} ⚠️")
    print(df.loc[df["AQI"] > 500, high_aqi_columns])

    # Save hourly AQI data
    output_path = os.path.join(output_dir, f"{station}_hourly_aqi.csv")
    df.to_csv(output_path)
    print(f"✅ Saved hourly AQI data for {station}: {output_path}")

print("\n🎉 Processing complete!")



==== CHENNAI - RAW DATA HEAD ====
                  time  pm10 (μg/m³)  pm2_5 (μg/m³)  carbon_monoxide (μg/m³)  \
0  2022-09-01 00:00:00          21.7           14.6                      305   
1  2022-09-01 01:00:00          21.0           14.3                      291   
2  2022-09-01 02:00:00          19.0           13.0                      272   
3  2022-09-01 03:00:00          15.8           10.7                      245   
4  2022-09-01 04:00:00          14.4            9.7                      227   

   nitrogen_dioxide (μg/m³)  sulphur_dioxide (μg/m³)  ozone (μg/m³)     City  
0                      25.4                     30.6             12  Chennai  
1                      23.1                     30.0             15  Chennai  
2                      19.9                     29.1             20  Chennai  
3                      15.9                     27.8             25  Chennai  
4                      13.9                     26.9             28  Chennai  

==== CHEN