In [9]:
import pandas as pd

In [10]:
def preprocess_air_quality(file_path):
    # Load the dataset
    df = pd.read_csv(file_path)
    
    # Clean column names
    df.columns = df.columns.str.strip()
    
    # Convert date column to datetime format
    df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
    
    # Convert pollutant columns to numeric
    pollutants = ["pm25", "pm10", "o3", "no2", "so2", "co"]
    df[pollutants] = df[pollutants].apply(pd.to_numeric, errors="coerce")
    
    # Handle missing values using forward fill
    df.fillna(method='ffill', inplace=True)
    
    # Define AQI calculation function
    def calculate_aqi(pm25, pm10, o3, no2, so2, co):
        # Using the worst-case approach: AQI is determined by the highest individual sub-index
        aqi_values = {
            'pm25': [0, 30, 60, 90, 120, 250],
            'pm10': [0, 50, 100, 250, 350, 430],
            'o3': [0, 50, 100, 168, 208, 748],
            'no2': [0, 40, 80, 180, 280, 400],
            'so2': [0, 40, 80, 380, 800, 1600],
            'co': [0, 1, 2, 10, 17, 34]
        }
        
        aqi_sub_indices = []
        for pollutant, breakpoints in aqi_values.items():
            value = locals()[pollutant]
            for i in range(len(breakpoints) - 1):
                if breakpoints[i] <= value < breakpoints[i+1]:
                    aqi_sub_index = ((value - breakpoints[i]) / (breakpoints[i+1] - breakpoints[i])) * 100
                    aqi_sub_indices.append(aqi_sub_index)
                    break
        
        return max(aqi_sub_indices) if aqi_sub_indices else None
    
    # Calculate AQI for each row
    df['aqi'] = df.apply(lambda row: calculate_aqi(row['pm25'], row['pm10'], row['o3'], row['no2'], row['so2'], row['co']), axis=1)
    
    # Reorder columns to have 'date' first
    df = df[['date', 'pm25', 'pm10', 'o3', 'no2', 'so2', 'co', 'aqi']]
    
    return df


In [11]:
# File path
file_path = "maninagar_ahmedabad_air_quality_csv.csv"

In [12]:
# Preprocess the data
df_cleaned = preprocess_air_quality(file_path)

In [13]:
# Display the cleaned dataset
df_cleaned.head()

Unnamed: 0,date,pm25,pm10,o3,no2,so2,co,aqi
0,2025-02-01,65.0,91.0,11.0,11.0,14.0,4.0,82.0
1,2025-02-02,95.0,100.0,14.0,10.0,13.0,4.0,32.5
2,2025-02-03,113.0,82.0,9.0,11.0,15.0,4.0,76.666667
3,2025-02-04,98.0,63.0,7.0,10.0,5.0,4.0,26.666667
4,2025-02-05,45.0,76.0,8.0,10.0,12.0,3.0,52.0


In [15]:
df_cleaned.tail()

Unnamed: 0,date,pm25,pm10,o3,no2,so2,co,aqi
2994,2017-10-17,130.0,93.0,22.0,59.0,41.0,2.0,86.0
2995,2016-07-13,130.0,93.0,22.0,45.0,27.0,10.0,86.0
2996,2016-07-24,130.0,93.0,22.0,43.0,27.0,10.0,86.0
2997,2015-11-02,130.0,93.0,22.0,11.0,20.0,9.0,87.5
2998,2014-12-25,130.0,93.0,22.0,72.0,63.0,88.0,86.0


In [16]:
df_cleaned.isnull().sum()

date    0
pm25    0
pm10    0
o3      0
no2     0
so2     0
co      0
aqi     0
dtype: int64

In [14]:
# Save cleaned dataset to CSV
df_cleaned.to_csv("cleaned_air_quality_data.csv", index=False)

print("Cleaned dataset saved successfully in 'cleaned_air_quality_data.csv'")


Cleaned dataset saved successfully in 'data/cleaned_air_quality_data.csv'
